# dumbdbm.py revision 1aa2c0f073bdbed4fa824591d53e20bbf3d01add
"""A dumb and slow but simple dbm clone.

For database spam, spam.dir contains the index (a text file),
spam.bak *may* contain a backup of the index (also a text file),
while spam.dat contains the data (a binary file).

Keys must be ``str`` and values must be ``bytes`` (on Python 2 these are
the same type, so both are plain 8-bit strings).

XXX TO DO:

- seems to contain a bug when updating...

- reclaim free space (currently, space once occupied by deleted or expanded
items is never reused)

- support concurrent access (currently, if two processes take turns making
updates, they can mess up the index)

- support efficient access to large databases (currently, the whole index
is read when the database is opened, and some updates rewrite the whole index)

- support opening for read-only (flag = 'm')

"""

import ast as _ast
import os as _os

try:
    import __builtin__
    import UserDict
except ImportError:
    # Python 3: __builtin__ was renamed and UserDict.DictMixin was removed.
    import builtins as __builtin__
    from collections.abc import MutableMapping as _MappingBase
else:
    _MappingBase = UserDict.DictMixin

_open = __builtin__.open

# ASCII-only repr of a key.  Python 2's repr() of a str is already ASCII-only;
# on Python 3 ascii() is used so the index file never depends on the locale's
# text encoding.
_repr = getattr(__builtin__, 'ascii', repr)

_BLOCKSIZE = 512

error = IOError                         # For anydbm


class _Database(_MappingBase):
    """Mapping from string keys to byte-string values, backed by three files.

    The in-memory index (``self._index``) maps each key to a ``(pos, siz)``
    pair locating the value inside the data file.  The index is loaded when
    the object is created and written back by _commit().
    """

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _open = _open   # for _commit()
    _repr = _repr   # for _commit()

    def __init__(self, filebasename, mode):
        """Open (creating the data file if needed) the database *filebasename*.

        *mode* is the permission mode passed to os.chmod() for every file
        this object creates or rewrites.
        """
        # Set first so that close()/__del__() work even if a later statement
        # in this constructor raises.
        # The index is an in-memory dict, mirroring the directory file.
        self._index = None  # maps keys to (pos, siz) pairs

        self._mode = mode

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
        self._dirfile = filebasename + _os.extsep + 'dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + _os.extsep + 'dat'
        self._bakfile = filebasename + _os.extsep + 'bak'

        # Mod by Jack: create data file if needed
        try:
            f = _open(self._datfile, 'r')
        except IOError:
            with _open(self._datfile, 'w') as f:
                self._chmod(self._datfile)
        else:
            f.close()
        self._update()

    # Read directory file into the in-memory index dict.
    def _update(self):
        self._index = {}
        try:
            f = _open(self._dirfile)
        except IOError:
            # No directory file yet: start with an empty index.
            pass
        else:
            with f:
                for line in f:
                    line = line.rstrip()
                    key, pos_and_siz_pair = _ast.literal_eval(line)
                    self._index[key] = pos_and_siz_pair

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except self._os.error:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except self._os.error:
            pass

        with self._open(self._dirfile, 'w') as f:
            self._chmod(self._dirfile)
            for key in self._index:
                f.write("%s, %r\n" % (self._repr(key), self._index[key]))

    sync = _commit

    # Raise error if the database has been closed (close() sets the index
    # to None).  Not used by _commit(), which must not touch globals.
    def _verify_open(self):
        if self._index is None:
            raise error('DBM object has already been closed')

    def __getitem__(self, key):
        """Return the value stored under *key*; raise KeyError if absent."""
        self._verify_open()
        pos, siz = self._index[key]     # may raise KeyError
        with _open(self._datfile, 'rb') as f:
            f.seek(pos)
            return f.read(siz)

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
    def _addval(self, val):
        with _open(self._datfile, 'rb+') as f:
            f.seek(0, 2)
            pos = int(f.tell())
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write(b'\0' * (npos - pos))
            pos = npos
            f.write(val)
        return (pos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        with _open(self._datfile, 'rb+') as f:
            f.seek(pos)
            f.write(val)
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        self._index[key] = pos_and_siz_pair
        with _open(self._dirfile, 'a') as f:
            self._chmod(self._dirfile)
            f.write("%s, %r\n" % (self._repr(key), pos_and_siz_pair))

    def __setitem__(self, key, val):
        """Store *val* under *key*.

        Raises TypeError unless key is exactly ``str`` and val is exactly
        ``bytes`` (the same type on Python 2).
        """
        # Exact type tests (not isinstance): keys are serialised with repr,
        # so a subclass with its own __repr__ could corrupt the index file.
        if not (type(key) is str and type(val) is bytes):
            raise TypeError("keys and values must be strings")
        self._verify_open()
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).

    def __delitem__(self, key):
        """Remove *key* from the index and rewrite the directory file."""
        self._verify_open()
        # The blocks used by the associated value are lost.
        del self._index[key]
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        """Return a list of all keys in the index."""
        self._verify_open()
        return list(self._index)

    def has_key(self, key):
        """Return True if *key* is in the index."""
        self._verify_open()
        return key in self._index

    def __contains__(self, key):
        self._verify_open()
        return key in self._index

    def iterkeys(self):
        """Return an iterator over the keys in the index."""
        self._verify_open()
        return iter(self._index)
    __iter__ = iterkeys

    def __len__(self):
        self._verify_open()
        return len(self._index)

    def close(self):
        """Write the index to disk and mark the database closed.

        Calling close() again is a no-op, because _commit() returns early
        once the index has been set to None.
        """
        try:
            self._commit()
        finally:
            self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close

    # Apply the mode given at construction time to `file`, on platforms
    # whose os module provides chmod.
    def _chmod(self, file):
        if hasattr(self._os, 'chmod'):
            self._os.chmod(file, self._mode)


def open(file, flag=None, mode=0o666):
    """Open the database with base name *file* and return the database object.

    The flag argument, used to control how the database is opened in the
    other DBM implementations, is ignored in the dumbdbm module; the
    database is always opened for update, and will be created if it does
    not exist.

    The optional mode argument is the UNIX mode of the file, used only when
    the database has to be created.  It defaults to octal code 0666 (and
    will be modified by the prevailing umask).

    """
    # flag argument is currently ignored

    # Modify mode depending on the umask
    try:
        # os.umask() can only be read by setting it, so set it to 0 and
        # immediately restore the previous value.
        um = _os.umask(0)
        _os.umask(um)
    except AttributeError:
        pass
    else:
        # Turn off any bits that are set in the umask
        mode = mode & (~um)

    return _Database(file, mode)