caching_file_system.py revision 03b57e008b61dfcb1fbad3aea950ae0e001748b0
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import posixpath
import sys

from file_system import FileSystem, StatInfo, FileNotFoundError
from future import Future
from path_util import IsDirectory, ToDirectory
from third_party.json_schema_compiler.memoize import memoize


class CachingFileSystem(FileSystem):
  '''FileSystem which implements a caching layer on top of |file_system|. It's
  smart, using Stat() to decide whether to skip Read()ing from |file_system|,
  and only Stat()ing directories never files.
  '''
  def __init__(self, file_system, object_store_creator):
    '''Wraps |file_system|, creating one object store for stat results and one
    for read results via |object_store_creator|. Both store categories are
    prefixed with |file_system|'s identity.
    '''
    self._file_system = file_system
    def create_object_store(category, **optargs):
      return object_store_creator.Create(
          CachingFileSystem,
          category='%s/%s' % (file_system.GetIdentity(), category),
          **optargs)
    self._stat_object_store = create_object_store('stat')
    # The read caches can start populated (start_empty=False) because file
    # updates are picked up by the stat, so it doesn't need the force-refresh
    # which starting empty is designed for. Without this optimisation, cron
    # runs are extra slow.
    self._read_object_store = create_object_store('read', start_empty=False)

  def Refresh(self):
    '''Delegates to the wrapped file system's Refresh().
    '''
    return self._file_system.Refresh()

  def StatAsync(self, path):
    '''Stats the directory given, or if a file is given, stats the file's parent
    directory to get info about the file.

    Returns a Future. For a directory |path| it resolves to the directory's
    stat; for a file it resolves to a StatInfo built from that file's entry in
    the parent's |child_versions|. Raises FileNotFoundError if the parent's
    stat has no entry for the file.
    '''
    # Always stat the parent directory, since it will have the stat of the child
    # anyway, and this gives us an entire directory's stat info at once.
    dir_path, file_path = posixpath.split(path)
    dir_path = ToDirectory(dir_path)

    def make_stat_info(dir_stat):
      '''Converts a dir stat into the correct resulting StatInfo; if the Stat
      was for a file, the StatInfo should just contain that file.
      '''
      if path == dir_path:
        return dir_stat
      # Was a file stat. Extract that file.
      file_version = dir_stat.child_versions.get(file_path)
      if file_version is None:
        raise FileNotFoundError('No stat found for %s in %s (found %s)' %
                                (path, dir_path, dir_stat.child_versions))
      return StatInfo(file_version)

    # Serve from the stat cache when the parent directory is already there.
    cached_dir_stat = self._stat_object_store.Get(dir_path).Get()
    if cached_dir_stat is not None:
      return Future(value=make_stat_info(cached_dir_stat))

    def cache_and_convert(dir_stat):
      assert dir_stat is not None  # should have raised a FileNotFoundError
      # We only ever need to cache the dir stat.
      self._stat_object_store.Set(dir_path, dir_stat)
      return make_stat_info(dir_stat)
    return self._MemoizedStatAsyncFromFileSystem(dir_path).Then(
        cache_and_convert)

  @memoize
  def _MemoizedStatAsyncFromFileSystem(self, dir_path):
    '''This is a simple wrapper to memoize Futures to directory stats, since
    StatAsync makes heavy use of it. Only cache directories so that the
    memoized cache doesn't blow up.
    '''
    assert IsDirectory(dir_path)
    return self._file_system.StatAsync(dir_path)

  def Read(self, paths, skip_not_found=False):
    '''Reads a list of files. If a file is in the read cache and its cached
    version matches the latest stat, the cached data is returned. Otherwise,
    the file is retrieved from the wrapped file system and the read cache is
    updated.

    When |skip_not_found| is True a FileNotFoundError raised while stat'ing a
    path is converted into a missing (None) stat; such a path is treated as
    stale and left for the wrapped file system's Read() to skip.

    Returns a Future resolving to a path -> data mapping.
    '''
    cached_read_values = self._read_object_store.GetMulti(paths).Get()
    cached_stat_values = self._stat_object_store.GetMulti(paths).Get()

    # Populate a map of paths to Futures to their stat. They may have already
    # been cached in which case their Future will already have been constructed
    # with a value.
    stat_futures = {}

    def handle(error):
      if isinstance(error, FileNotFoundError):
        return None
      raise error

    for path in paths:
      stat_value = cached_stat_values.get(path)
      if stat_value is None:
        stat_future = self.StatAsync(path)
        if skip_not_found:
          stat_future = stat_future.Then(lambda x: x, handle)
      else:
        stat_future = Future(value=stat_value)
      stat_futures[path] = stat_future

    def is_fresh(path, version):
      # With |skip_not_found| the stat resolves to None for a file that no
      # longer exists; that must count as stale rather than be dereferenced.
      stat = stat_futures[path].Get()
      return stat is not None and stat.version == version

    # Filter only the cached data which is fresh by comparing to the latest
    # stat. The cached read data includes the cached version. Remove it for
    # the result returned to callers.
    fresh_data = dict(
        (path, data) for path, (data, version) in cached_read_values.items()
        if is_fresh(path, version))

    # Comparing sets (rather than lengths) also copes with duplicate entries
    # in |paths|.
    stale_paths = set(paths) - set(fresh_data)
    if not stale_paths:
      # Everything was cached and up-to-date.
      return Future(value=fresh_data)

    def update_cache(new_results):
      # Update the cache. This is a path -> (data, version) mapping. Results
      # without a stat have no version to record, so they are not cached.
      versioned_results = {}
      for path, new_result in new_results.items():
        stat = stat_futures[path].Get()
        if stat is not None:
          versioned_results[path] = (new_result, stat.version)
      self._read_object_store.SetMulti(versioned_results)
      new_results.update(fresh_data)
      return new_results
    # Read in the values that were uncached or old.
    return self._file_system.Read(
        stale_paths, skip_not_found=skip_not_found).Then(update_cache)

  def GetIdentity(self):
    '''Returns the identity of the wrapped file system.
    '''
    return self._file_system.GetIdentity()

  def __repr__(self):
    return '%s of <%s>' % (type(self).__name__, repr(self._file_system))