revoke.c revision 53ef44c40a3e425d2c700d8fd77a6b655aa121fe
1/* 2 * linux/fs/revoke.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 5 * 6 * Copyright 2000 Red Hat corp --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Journal revoke routines for the generic filesystem journaling code; 13 * part of the ext2fs journaling system. 14 * 15 * Revoke is the mechanism used to prevent old log records for deleted 16 * metadata from being replayed on top of newer data using the same 17 * blocks. The revoke mechanism is used in two separate places: 18 * 19 * + Commit: during commit we write the entire list of the current 20 * transaction's revoked blocks to the journal 21 * 22 * + Recovery: during recovery we record the transaction ID of all 23 * revoked blocks. If there are multiple revoke records in the log 24 * for a single block, only the last one counts, and if there is a log 25 * entry for a block beyond the last revoke, then that log entry still 26 * gets replayed. 27 * 28 * We can get interactions between revokes and new log data within a 29 * single transaction: 30 * 31 * Block is revoked and then journaled: 32 * The desired end result is the journaling of the new block, so we 33 * cancel the revoke before the transaction commits. 34 * 35 * Block is journaled and then revoked: 36 * The revoke must take precedence over the write of the block, so 37 * we need either to cancel the journal entry or to write the revoke 38 * later in the log than the log block. In this case, we choose the 39 * former: the commit code must skip any block that has the Revoke bit 40 * set. 41 * 42 * Block is revoked and then written as data: 43 * The data write is allowed to succeed, but the revoke is _not_ 44 * cancelled. We still need to prevent old log records from 45 * overwriting the new data. We don't even need to clear the revoke 46 * bit here. 47 * 48 * Revoke information on buffers is a tri-state value: 49 * 50 * RevokeValid clear: no cached revoke status, need to look it up 51 * RevokeValid set, Revoke clear: 52 * buffer has not been revoked, and cancel_revoke 53 * need do nothing. 54 * RevokeValid set, Revoke set: 55 * buffer has been revoked. 56 */ 57 58#ifndef __KERNEL__ 59#include "jfs_user.h" 60#else 61#include <linux/sched.h> 62#include <linux/fs.h> 63#include <linux/jfs.h> 64#include <linux/errno.h> 65#include <linux/slab.h> 66#include <linux/locks.h> 67#include <linux/buffer.h> 68#include <linux/list.h> 69#endif 70 71static kmem_cache_t *revoke_record_cache; 72static kmem_cache_t *revoke_table_cache; 73 74/* Each revoke record represents one single revoked block. During 75 journal replay, this involves recording the transaction ID of the 76 last transaction to revoke this block. */ 77 78struct jfs_revoke_record_s 79{ 80 struct list_head hash; 81 tid_t sequence; /* Used for recovery only */ 82 unsigned long blocknr; 83}; 84 85 86/* The revoke table is just a simple hash table of revoke records. */ 87struct jfs_revoke_table_s 88{ 89 /* It is conceivable that we might want a larger hash table 90 * for recovery. Must be a power of two. */ 91 int hash_size; 92 int hash_shift; 93 struct list_head *hash_table; 94}; 95 96 97#ifdef __KERNEL__ 98static void write_one_revoke_record(journal_t *, transaction_t *, 99 struct buffer_head **, int *, 100 struct jfs_revoke_record_s *); 101static void flush_descriptor(journal_t *, struct buffer_head *, int); 102#endif 103 104/* Utility functions to maintain the revoke table */ 105 106/* Borrowed from buffer.c: this is a tried and tested block hash function */ 107static inline int hash(journal_t *journal, unsigned long block) 108{ 109 struct jfs_revoke_table_s *table = journal->j_revoke; 110 int hash_shift = table->hash_shift; 111 112 return ((block << (hash_shift - 6)) ^ 113 (block >> 13) ^ 114 (block << (hash_shift - 12))) & (table->hash_size - 1); 115} 116 117static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, 118 tid_t seq) 119{ 120 struct list_head *hash_list; 121 struct jfs_revoke_record_s *record; 122 123 record = kmem_cache_alloc(revoke_record_cache, GFP_KERNEL); 124 if (!record) 125 return -ENOMEM; 126 127 record->sequence = seq; 128 record->blocknr = blocknr; 129 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; 130 list_add(&record->hash, hash_list); 131 return 0; 132} 133 134/* Find a revoke record in the journal's hash table. */ 135 136static struct jfs_revoke_record_s *find_revoke_record(journal_t *journal, 137 unsigned long blocknr) 138{ 139 struct list_head *hash_list; 140 struct jfs_revoke_record_s *record; 141 142 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; 143 144 record = (struct jfs_revoke_record_s *) hash_list->next; 145 while (&(record->hash) != hash_list) { 146 if (record->blocknr == blocknr) 147 return record; 148 record = (struct jfs_revoke_record_s *) record->hash.next; 149 } 150 return NULL; 151} 152 153 154 155/* Initialise the revoke table for a given journal to a given size. */ 156 157int journal_init_revoke(journal_t *journal, int hash_size) 158{ 159 int shift, tmp; 160 161 J_ASSERT (journal->j_revoke == NULL); 162 163 if (!revoke_record_cache) 164 revoke_record_cache = 165 kmem_cache_create ("revoke_record", 166 sizeof(struct jfs_revoke_record_s), 167 0, SLAB_HWCACHE_ALIGN, NULL, NULL); 168 169 if (!revoke_table_cache) 170 revoke_table_cache = 171 kmem_cache_create ("revoke_table", 172 sizeof(struct jfs_revoke_table_s), 173 0, 0, NULL, NULL); 174 175 if (!revoke_record_cache || !revoke_table_cache) 176 return -ENOMEM; 177 178 journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); 179 if (!journal->j_revoke) 180 return -ENOMEM; 181 182 /* Check that the hash_size is a power of two */ 183 J_ASSERT ((hash_size & (hash_size-1)) == 0); 184 185 journal->j_revoke->hash_size = hash_size; 186 187 shift = 0; 188 tmp = hash_size; 189 while((tmp >>= 1UL) != 0UL) 190 shift++; 191 journal->j_revoke->hash_shift = shift; 192 193 journal->j_revoke->hash_table = 194 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); 195 if (!journal->j_revoke->hash_table) { 196 kmem_cache_free(revoke_table_cache, journal->j_revoke); 197 journal->j_revoke = NULL; 198 return -ENOMEM; 199 } 200 201 for (tmp = 0; tmp < hash_size; tmp++) 202 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); 203 204 return 0; 205} 206 207/* Destoy a journal's revoke table. The table must already be empty! */ 208 209void journal_destroy_revoke(journal_t *journal) 210{ 211 struct jfs_revoke_table_s *table; 212 struct list_head *hash_list; 213 int i; 214 215 table = journal->j_revoke; 216 if (!table) 217 return; 218 219 for (i=0; i<table->hash_size; i++) { 220 hash_list = &table->hash_table[i]; 221 J_ASSERT (list_empty(hash_list)); 222 } 223 224 kfree(table->hash_table); 225 kmem_cache_free(revoke_table_cache, table); 226 journal->j_revoke = NULL; 227} 228 229 230#ifdef __KERNEL__ 231 232/* 233 * journal_revoke: revoke a given buffer_head from the journal. This 234 * prevents the block from being replayed during recovery if we take a 235 * crash after this current transaction commits. Any subsequent 236 * metadata writes of the buffer in this transaction cancel the 237 * revoke. 238 * 239 * Note that this call may block --- it is up to the caller to make 240 * sure that there are no further calls to journal_write_metadata 241 * before the revoke is complete. In ext3, this implies calling the 242 * revoke before clearing the block bitmap when we are deleting 243 * metadata. 244 * 245 * Revoke performs a journal_forget on any buffer_head passed in as a 246 * parameter, but does _not_ forget the buffer_head if the bh was only 247 * found implicitly. 248 * 249 * Revoke must observe the same synchronisation rules as bforget: it 250 * must not discard the buffer once it has blocked. 251 */ 252 253int journal_revoke(handle_t *handle, unsigned long blocknr, 254 struct buffer_head *bh_in) 255{ 256 struct buffer_head *bh; 257 journal_t *journal; 258 kdev_t dev; 259 int err; 260 261 journal = handle->h_transaction->t_journal; 262 if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) 263 return -EINVAL; 264 265 dev = journal->j_dev; 266 bh = bh_in; 267 268 if (!bh) 269 bh = get_hash_table(dev, blocknr, journal->j_blocksize); 270 271 /* We really ought not ever to revoke twice in a row without 272 first having the revoke cancelled: it's illegal to free a 273 block twice without allocating it in between! */ 274 if (bh) { 275 J_ASSERT (!test_and_set_bit(BH_Revoked, &bh->b_state)); 276 set_bit(BH_RevokeValid, &bh->b_state); 277 if (bh_in) 278 journal_forget(handle, bh_in); 279 else 280 brelse(bh); 281 } 282 283 lock_journal(journal); 284 err = insert_revoke_hash(journal, blocknr, 285 handle->h_transaction->t_tid); 286 unlock_journal(journal); 287 288 return err; 289} 290 291 292/* 293 * Cancel an outstanding revoke. For use only internally by the 294 * journaling code (called from journal_get_write_access). 295 * 296 * We trust the BH_Revoked bit on the buffer if the buffer is already 297 * being journaled: if there is no revoke pending on the buffer, then we 298 * don't do anything here. 299 * 300 * This would break if it were possible for a buffer to be revoked and 301 * discarded, and then reallocated within the same transaction. In such 302 * a case we would have lost the revoked bit, but when we arrived here 303 * the second time we would still have a pending revoke to cancel. So, 304 * do not trust the Revoked bit on buffers unless RevokeValid is also 305 * set. 306 * 307 * The caller must have the journal locked. 308 * */ 309 310void journal_cancel_revoke(handle_t *handle, struct buffer_head *bh) 311{ 312 struct jfs_revoke_record_s *record; 313 journal_t *journal = handle->h_transaction->t_journal; 314 int need_cancel; 315 316 J_ASSERT (journal->j_locked); 317 318 /* Is the existing Revoke bit valid? If so, we trust it, and 319 * only perform the full cancel if the revoke bit is set. If 320 * not, we can't trust the revoke bit, and we need to do the 321 * full search for a revoke record. */ 322 if (test_and_set_bit(BH_RevokeValid, &bh->b_state)) 323 need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state)); 324 else { 325 need_cancel = 1; 326 clear_bit(BH_Revoked, &bh->b_state); 327 } 328 329 if (need_cancel) { 330 record = find_revoke_record(journal, bh->b_blocknr); 331 if (record) { 332 list_del(&record->hash); 333 kmem_cache_free(revoke_record_cache, record); 334 } 335 } 336} 337 338 339/* 340 * Write revoke records to the journal for all entries in the current 341 * revoke hash, deleting the entries as we go. 342 * 343 * Called with the journal lock held. 344 */ 345 346void journal_write_revoke_records(journal_t *journal, 347 transaction_t *transaction) 348{ 349 struct buffer_head *descriptor; 350 struct jfs_revoke_record_s *record; 351 struct jfs_revoke_table_s *revoke; 352 struct list_head *hash_list; 353 int i, offset; 354 355 descriptor = NULL; 356 offset = 0; 357 revoke = journal->j_revoke; 358 359 for (i = 0; i < revoke->hash_size; i++) { 360 hash_list = &revoke->hash_table[i]; 361 362 while (!list_empty(hash_list)) { 363 record = (struct jfs_revoke_record_s *) 364 hash_list->next; 365 write_one_revoke_record(journal, transaction, 366 &descriptor, &offset, 367 record); 368 list_del(&record->hash); 369 kmem_cache_free(revoke_record_cache, record); 370 } 371 } 372 if (descriptor) 373 flush_descriptor(journal, descriptor, offset); 374} 375 376/* 377 * Write out one revoke record. We need to create a new descriptor 378 * block if the old one is full or if we have not already created one. 379 */ 380 381static void write_one_revoke_record(journal_t *journal, 382 transaction_t *transaction, 383 struct buffer_head **descriptorp, 384 int *offsetp, 385 struct jfs_revoke_record_s *record) 386{ 387 struct buffer_head *descriptor; 388 int offset; 389 journal_header_t *header; 390 391 /* If we are already aborting, this all becomes a noop. We 392 still need to go round the loop in 393 journal_write_revoke_records in order to free all of the 394 revoke records: only the IO to the journal is omitted. */ 395 if (is_journal_abort(journal)) 396 return; 397 398 descriptor = *descriptorp; 399 offset = *offsetp; 400 401 /* Make sure we have a descriptor with space left for the record */ 402 if (descriptor) { 403 if (offset == journal->j_blocksize) { 404 flush_descriptor(journal, descriptor, offset); 405 descriptor = NULL; 406 } 407 } 408 409 if (!descriptor) { 410 descriptor = journal_get_descriptor_buffer(journal); 411 header = (journal_header_t *) &descriptor->b_data[0]; 412 header->h_magic = htonl(JFS_MAGIC_NUMBER); 413 header->h_blocktype = htonl(JFS_REVOKE_BLOCK); 414 header->h_sequence = htonl(transaction->t_tid); 415 416 /* Record it so that we can wait for IO completion later */ 417 journal_file_buffer(descriptor, transaction, BJ_LogCtl); 418 419 offset = sizeof(journal_revoke_header_t); 420 *descriptorp = descriptor; 421 } 422 423 * ((unsigned int *)(&descriptor->b_data[offset])) = 424 htonl(record->blocknr); 425 offset += 4; 426 *offsetp = offset; 427} 428 429/* 430 * Flush a revoke descriptor out to the journal. If we are aborting, 431 * this is a noop; otherwise we are generating a buffer which needs to 432 * be waited for during commit, so it has to go onto the appropriate 433 * journal buffer list. 434 */ 435 436static void flush_descriptor(journal_t *journal, 437 struct buffer_head *descriptor, 438 int offset) 439{ 440 journal_revoke_header_t *header; 441 442 if (is_journal_abort(journal)) { 443 brelse(descriptor); 444 return; 445 } 446 447 header = (journal_revoke_header_t *) descriptor->b_data; 448 header->r_count = htonl(offset); 449 set_bit(BH_JWrite, &descriptor->b_state); 450 ll_rw_block (WRITE, 1, &descriptor); 451} 452 453#endif 454 455/* 456 * Revoke support for recovery. 457 * 458 * Recovery needs to be able to: 459 * 460 * record all revoke records, including the tid of the latest instance 461 * of each revoke in the journal 462 * 463 * check whether a given block in a given transaction should be replayed 464 * (ie. has not been revoked by a revoke record in that or a subsequent 465 * transaction) 466 * 467 * empty the revoke table after recovery. 468 */ 469 470/* 471 * First, setting revoke records. We create a new revoke record for 472 * every block ever revoked in the log as we scan it for recovery, and 473 * we update the existing records if we find multiple revokes for a 474 * single block. 475 */ 476 477int journal_set_revoke(journal_t *journal, 478 unsigned long blocknr, 479 tid_t sequence) 480{ 481 struct jfs_revoke_record_s *record; 482 483 record = find_revoke_record(journal, blocknr); 484 if (record) { 485 /* If we have multiple occurences, only record the 486 * latest sequence number in the hashed record */ 487 if (tid_ge(sequence, record->sequence)) 488 record->sequence = sequence; 489 return 0; 490 } 491 return insert_revoke_hash(journal, blocknr, sequence); 492} 493 494/* 495 * Test revoke records. For a given block referenced in the log, has 496 * that block been revoked? A revoke record with a given transaction 497 * sequence number revokes all blocks in that transaction and earlier 498 * ones, but later transactions still need replayed. 499 */ 500 501int journal_test_revoke(journal_t *journal, 502 unsigned long blocknr, 503 tid_t sequence) 504{ 505 struct jfs_revoke_record_s *record; 506 507 record = find_revoke_record(journal, blocknr); 508 if (!record) 509 return 0; 510 if (tid_ge(sequence, record->sequence)) 511 return 0; 512 return 1; 513} 514 515/* 516 * Finally, once recovery is over, we need to clear the revoke table so 517 * that it can be reused by the running filesystem. 518 */ 519 520void journal_clear_revoke(journal_t *journal) 521{ 522 int i; 523 struct list_head *hash_list; 524 struct jfs_revoke_record_s *record; 525 struct jfs_revoke_table_s *revoke; 526 527 revoke = journal->j_revoke; 528 529 for (i = 0; i < revoke->hash_size; i++) { 530 hash_list = &revoke->hash_table[i]; 531 while (!list_empty(hash_list)) { 532 record = (struct jfs_revoke_record_s*) hash_list->next; 533 list_del(&record->hash); 534 kmem_cache_free(revoke_record_cache, record); 535 } 536 } 537} 538 539