caching_file_system.py revision 03b57e008b61dfcb1fbad3aea950ae0e001748b0
1# Copyright (c) 2012 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import posixpath
6import sys
7
8from file_system import FileSystem, StatInfo, FileNotFoundError
9from future import Future
10from path_util import IsDirectory, ToDirectory
11from third_party.json_schema_compiler.memoize import memoize
12
13
class CachingFileSystem(FileSystem):
  '''FileSystem which implements a caching layer on top of |file_system|. It's
  smart, using Stat() to decide whether to skip Read()ing from |file_system|,
  and only Stat()ing directories, never files.

  Two object stores back the cache:
    - the 'stat' store, keyed by directory path, holding directory stats;
    - the 'read' store, keyed by file path, holding (data, version) tuples.
  '''
  def __init__(self, file_system, object_store_creator):
    '''Wraps |file_system|, creating the stat and read caches with
    |object_store_creator|.
    '''
    self._file_system = file_system
    def create_object_store(category, **optargs):
      # The category is prefixed with the wrapped file system's identity, so
      # the stores are keyed per underlying file system.
      return object_store_creator.Create(
          CachingFileSystem,
          category='%s/%s' % (file_system.GetIdentity(), category),
          **optargs)
    self._stat_object_store = create_object_store('stat')
    # The read caches can start populated (start_empty=False) because file
    # updates are picked up by the stat, so it doesn't need the force-refresh
    # which starting empty is designed for. Without this optimisation, cron
    # runs are extra slow.
    self._read_object_store = create_object_store('read', start_empty=False)

  def Refresh(self):
    '''Delegates to the underlying file system's Refresh().
    '''
    return self._file_system.Refresh()

  def StatAsync(self, path):
    '''Stats the directory given, or if a file is given, stats the file's parent
    directory to get info about the file.

    Returns a Future to the StatInfo for |path|. Note that if the parent
    directory's stat is already cached and has no entry for the file, the
    FileNotFoundError is raised immediately from this call rather than from
    the returned Future.
    '''
    # Always stat the parent directory, since it will have the stat of the child
    # anyway, and this gives us an entire directory's stat info at once.
    dir_path, file_path = posixpath.split(path)
    dir_path = ToDirectory(dir_path)

    def make_stat_info(dir_stat):
      '''Converts a dir stat into the correct resulting StatInfo; if the Stat
      was for a file, the StatInfo should just contain that file.
      '''
      if path == dir_path:
        return dir_stat
      # Was a file stat. Extract that file.
      file_version = dir_stat.child_versions.get(file_path)
      if file_version is None:
        raise FileNotFoundError('No stat found for %s in %s (found %s)' %
                                (path, dir_path, dir_stat.child_versions))
      return StatInfo(file_version)

    cached_dir_stat = self._stat_object_store.Get(dir_path).Get()
    if cached_dir_stat is not None:
      return Future(value=make_stat_info(cached_dir_stat))

    def cache_and_convert(dir_stat):
      assert dir_stat is not None  # should have raised a FileNotFoundError
      # We only ever need to cache the dir stat.
      self._stat_object_store.Set(dir_path, dir_stat)
      return make_stat_info(dir_stat)
    return self._MemoizedStatAsyncFromFileSystem(dir_path).Then(
        cache_and_convert)

  @memoize
  def _MemoizedStatAsyncFromFileSystem(self, dir_path):
    '''This is a simple wrapper to memoize Futures to directory stats, since
    StatAsync makes heavy use of it. Only cache directories so that the
    memoized cache doesn't blow up.
    '''
    assert IsDirectory(dir_path)
    return self._file_system.StatAsync(dir_path)

  def Read(self, paths, skip_not_found=False):
    '''Reads a list of files. If a file is in the read cache and its cached
    version matches the latest stat, the cached data is returned. Otherwise,
    the file is retrieved from the file system and the read cache is updated.

    Returns a Future to a dict mapping path -> data.

    If |skip_not_found| is True, a path whose stat raises FileNotFoundError is
    treated as having no stat: any cached data for it is considered stale and
    nothing is written to the read cache for it. Whether such a path appears
    in the result is left to the underlying file system's Read().
    '''
    cached_read_values = self._read_object_store.GetMulti(paths).Get()
    cached_stat_values = self._stat_object_store.GetMulti(paths).Get()

    def ignore_not_found(error):
      if isinstance(error, FileNotFoundError):
        return None
      raise error

    # Populate a map of paths to Futures to their stat. They may have already
    # been cached in which case their Future will already have been constructed
    # with a value. A None entry (rather than a Future) records a path whose
    # stat failed immediately with FileNotFoundError while |skip_not_found|.
    stat_futures = {}
    for path in paths:
      stat_value = cached_stat_values.get(path)
      if stat_value is not None:
        stat_futures[path] = Future(value=stat_value)
        continue
      try:
        stat_future = self.StatAsync(path)
      except FileNotFoundError:
        # StatAsync raises straight away when the parent directory's stat is
        # cached but lacks the file; honour |skip_not_found| in that case too.
        if not skip_not_found:
          raise
        stat_futures[path] = None
        continue
      if skip_not_found:
        stat_future = stat_future.Then(lambda x: x, ignore_not_found)
      stat_futures[path] = stat_future

    def get_stat(path):
      '''Returns the resolved stat for |path|, or None if it was not found.
      '''
      stat_future = stat_futures[path]
      if stat_future is None:
        return None
      return stat_future.Get()

    def is_fresh(path, cached_version):
      stat = get_stat(path)
      # A missing stat can never validate cached data.
      return stat is not None and stat.version == cached_version

    # Filter only the cached data which is fresh by comparing to the latest
    # stat. The cached read data includes the cached version. Remove it for
    # the result returned to callers.
    fresh_data = dict(
        (path, data) for path, (data, version) in cached_read_values.iteritems()
        if is_fresh(path, version))

    # Use a set so that duplicate entries in |paths| don't force a needless
    # read of the underlying file system.
    stale_paths = set(paths) - set(fresh_data)
    if not stale_paths:
      # Everything was cached and up-to-date.
      return Future(value=fresh_data)

    def update_cache(new_results):
      # Update the cache. This is a path -> (data, version) mapping. Results
      # without a stat have no version to validate against later, so they are
      # returned to the caller but not cached.
      cache_entries = {}
      for path, new_result in new_results.iteritems():
        stat = get_stat(path)
        if stat is not None:
          cache_entries[path] = (new_result, stat.version)
      self._read_object_store.SetMulti(cache_entries)
      new_results.update(fresh_data)
      return new_results
    # Read in the values that were uncached or old.
    return self._file_system.Read(
        stale_paths, skip_not_found=skip_not_found).Then(update_cache)

  def GetIdentity(self):
    '''Returns the identity of the underlying file system.
    '''
    return self._file_system.GetIdentity()

  def __repr__(self):
    return '%s of <%s>' % (type(self).__name__, repr(self._file_system))
132