revoke.c revision 53ef44c40a3e425d2c700d8fd77a6b655aa121fe
1/*
2 * linux/fs/revoke.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal revoke routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 *
15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same
17 * blocks.  The revoke mechanism is used in two separate places:
18 *
19 * + Commit: during commit we write the entire list of the current
20 *   transaction's revoked blocks to the journal
21 *
22 * + Recovery: during recovery we record the transaction ID of all
23 *   revoked blocks.  If there are multiple revoke records in the log
24 *   for a single block, only the last one counts, and if there is a log
25 *   entry for a block beyond the last revoke, then that log entry still
26 *   gets replayed.
27 *
28 * We can get interactions between revokes and new log data within a
29 * single transaction:
30 *
31 * Block is revoked and then journaled:
32 *   The desired end result is the journaling of the new block, so we
33 *   cancel the revoke before the transaction commits.
34 *
35 * Block is journaled and then revoked:
36 *   The revoke must take precedence over the write of the block, so
37 *   we need either to cancel the journal entry or to write the revoke
38 *   later in the log than the log block.  In this case, we choose the
39 *   former: the commit code must skip any block that has the Revoke bit
40 *   set.
41 *
42 * Block is revoked and then written as data:
43 *   The data write is allowed to succeed, but the revoke is _not_
44 *   cancelled.  We still need to prevent old log records from
45 *   overwriting the new data.  We don't even need to clear the revoke
46 *   bit here.
47 *
48 * Revoke information on buffers is a tri-state value:
49 *
50 * RevokeValid clear:	no cached revoke status, need to look it up
51 * RevokeValid set, Revoke clear:
52 *			buffer has not been revoked, and cancel_revoke
53 *			need do nothing.
54 * RevokeValid set, Revoke set:
55 *			buffer has been revoked.
56 */
57
58#ifndef __KERNEL__
59#include "jfs_user.h"
60#else
61#include <linux/sched.h>
62#include <linux/fs.h>
63#include <linux/jfs.h>
64#include <linux/errno.h>
65#include <linux/slab.h>
66#include <linux/locks.h>
67#include <linux/buffer.h>
68#include <linux/list.h>
69#endif
70
71static kmem_cache_t *revoke_record_cache;
72static kmem_cache_t *revoke_table_cache;
73
74/* Each revoke record represents one single revoked block.  During
75   journal replay, this involves recording the transaction ID of the
76   last transaction to revoke this block. */
77
78struct jfs_revoke_record_s
79{
80	struct list_head  hash;
81	tid_t		  sequence;	/* Used for recovery only */
82	unsigned long	  blocknr;
83};
84
85
86/* The revoke table is just a simple hash table of revoke records. */
87struct jfs_revoke_table_s
88{
89	/* It is conceivable that we might want a larger hash table
90	 * for recovery.  Must be a power of two. */
91	int		  hash_size;
92	int		  hash_shift;
93	struct list_head *hash_table;
94};
95
96
97#ifdef __KERNEL__
98static void write_one_revoke_record(journal_t *, transaction_t *,
99				    struct buffer_head **, int *,
100				    struct jfs_revoke_record_s *);
101static void flush_descriptor(journal_t *, struct buffer_head *, int);
102#endif
103
104/* Utility functions to maintain the revoke table */
105
106/* Borrowed from buffer.c: this is a tried and tested block hash function */
107static inline int hash(journal_t *journal, unsigned long block)
108{
109	struct jfs_revoke_table_s *table = journal->j_revoke;
110	int hash_shift = table->hash_shift;
111
112	return ((block << (hash_shift - 6)) ^
113		(block >> 13) ^
114		(block << (hash_shift - 12))) & (table->hash_size - 1);
115}
116
117static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
118			      tid_t seq)
119{
120	struct list_head *hash_list;
121	struct jfs_revoke_record_s *record;
122
123	record = kmem_cache_alloc(revoke_record_cache, GFP_KERNEL);
124	if (!record)
125		return -ENOMEM;
126
127	record->sequence = seq;
128	record->blocknr = blocknr;
129	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
130	list_add(&record->hash, hash_list);
131	return 0;
132}
133
134/* Find a revoke record in the journal's hash table. */
135
136static struct jfs_revoke_record_s *find_revoke_record(journal_t *journal,
137						      unsigned long blocknr)
138{
139	struct list_head *hash_list;
140	struct jfs_revoke_record_s *record;
141
142	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
143
144	record = (struct jfs_revoke_record_s *) hash_list->next;
145	while (&(record->hash) != hash_list) {
146		if (record->blocknr == blocknr)
147			return record;
148		record = (struct jfs_revoke_record_s *) record->hash.next;
149	}
150	return NULL;
151}
152
153
154
155/* Initialise the revoke table for a given journal to a given size. */
156
157int journal_init_revoke(journal_t *journal, int hash_size)
158{
159	int shift, tmp;
160
161	J_ASSERT (journal->j_revoke == NULL);
162
163	if (!revoke_record_cache)
164		revoke_record_cache =
165			kmem_cache_create ("revoke_record",
166					   sizeof(struct jfs_revoke_record_s),
167					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
168
169	if (!revoke_table_cache)
170		revoke_table_cache =
171			kmem_cache_create ("revoke_table",
172					   sizeof(struct jfs_revoke_table_s),
173					   0, 0, NULL, NULL);
174
175	if (!revoke_record_cache || !revoke_table_cache)
176		return -ENOMEM;
177
178	journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
179	if (!journal->j_revoke)
180		return -ENOMEM;
181
182	/* Check that the hash_size is a power of two */
183	J_ASSERT ((hash_size & (hash_size-1)) == 0);
184
185	journal->j_revoke->hash_size = hash_size;
186
187	shift = 0;
188	tmp = hash_size;
189	while((tmp >>= 1UL) != 0UL)
190		shift++;
191	journal->j_revoke->hash_shift = shift;
192
193	journal->j_revoke->hash_table =
194		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
195	if (!journal->j_revoke->hash_table) {
196		kmem_cache_free(revoke_table_cache, journal->j_revoke);
197		journal->j_revoke = NULL;
198		return -ENOMEM;
199	}
200
201	for (tmp = 0; tmp < hash_size; tmp++)
202		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
203
204	return 0;
205}
206
207/* Destoy a journal's revoke table.  The table must already be empty! */
208
209void journal_destroy_revoke(journal_t *journal)
210{
211	struct jfs_revoke_table_s *table;
212	struct list_head *hash_list;
213	int i;
214
215	table = journal->j_revoke;
216	if (!table)
217		return;
218
219	for (i=0; i<table->hash_size; i++) {
220		hash_list = &table->hash_table[i];
221		J_ASSERT (list_empty(hash_list));
222	}
223
224	kfree(table->hash_table);
225	kmem_cache_free(revoke_table_cache, table);
226	journal->j_revoke = NULL;
227}
228
229
230#ifdef __KERNEL__
231
232/*
233 * journal_revoke: revoke a given buffer_head from the journal.  This
234 * prevents the block from being replayed during recovery if we take a
235 * crash after this current transaction commits.  Any subsequent
236 * metadata writes of the buffer in this transaction cancel the
237 * revoke.
238 *
239 * Note that this call may block --- it is up to the caller to make
240 * sure that there are no further calls to journal_write_metadata
241 * before the revoke is complete.  In ext3, this implies calling the
242 * revoke before clearing the block bitmap when we are deleting
243 * metadata.
244 *
245 * Revoke performs a journal_forget on any buffer_head passed in as a
246 * parameter, but does _not_ forget the buffer_head if the bh was only
247 * found implicitly.
248 *
249 * Revoke must observe the same synchronisation rules as bforget: it
250 * must not discard the buffer once it has blocked.
251 */
252
253int journal_revoke(handle_t *handle, unsigned long blocknr,
254		   struct buffer_head *bh_in)
255{
256	struct buffer_head *bh;
257	journal_t *journal;
258	kdev_t dev;
259	int err;
260
261	journal = handle->h_transaction->t_journal;
262	if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
263		return -EINVAL;
264
265	dev = journal->j_dev;
266	bh = bh_in;
267
268	if (!bh)
269		bh = get_hash_table(dev, blocknr, journal->j_blocksize);
270
271	/* We really ought not ever to revoke twice in a row without
272           first having the revoke cancelled: it's illegal to free a
273           block twice without allocating it in between! */
274	if (bh) {
275		J_ASSERT (!test_and_set_bit(BH_Revoked, &bh->b_state));
276		set_bit(BH_RevokeValid, &bh->b_state);
277		if (bh_in)
278			journal_forget(handle, bh_in);
279		else
280			brelse(bh);
281	}
282
283	lock_journal(journal);
284	err = insert_revoke_hash(journal, blocknr,
285				 handle->h_transaction->t_tid);
286	unlock_journal(journal);
287
288	return err;
289}
290
291
292/*
293 * Cancel an outstanding revoke.  For use only internally by the
294 * journaling code (called from journal_get_write_access).
295 *
296 * We trust the BH_Revoked bit on the buffer if the buffer is already
297 * being journaled: if there is no revoke pending on the buffer, then we
298 * don't do anything here.
299 *
300 * This would break if it were possible for a buffer to be revoked and
301 * discarded, and then reallocated within the same transaction.  In such
302 * a case we would have lost the revoked bit, but when we arrived here
303 * the second time we would still have a pending revoke to cancel.  So,
304 * do not trust the Revoked bit on buffers unless RevokeValid is also
305 * set.
306 *
307 * The caller must have the journal locked.
308 * */
309
310void journal_cancel_revoke(handle_t *handle, struct buffer_head *bh)
311{
312	struct jfs_revoke_record_s *record;
313	journal_t *journal = handle->h_transaction->t_journal;
314	int need_cancel;
315
316	J_ASSERT (journal->j_locked);
317
318	/* Is the existing Revoke bit valid?  If so, we trust it, and
319	 * only perform the full cancel if the revoke bit is set.  If
320	 * not, we can't trust the revoke bit, and we need to do the
321	 * full search for a revoke record. */
322	if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
323		need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
324	else {
325		need_cancel = 1;
326		clear_bit(BH_Revoked, &bh->b_state);
327	}
328
329	if (need_cancel) {
330		record = find_revoke_record(journal, bh->b_blocknr);
331		if (record) {
332			list_del(&record->hash);
333			kmem_cache_free(revoke_record_cache, record);
334		}
335	}
336}
337
338
339/*
340 * Write revoke records to the journal for all entries in the current
341 * revoke hash, deleting the entries as we go.
342 *
343 * Called with the journal lock held.
344 */
345
346void journal_write_revoke_records(journal_t *journal,
347				  transaction_t *transaction)
348{
349	struct buffer_head *descriptor;
350	struct jfs_revoke_record_s *record;
351	struct jfs_revoke_table_s *revoke;
352	struct list_head *hash_list;
353	int i, offset;
354
355	descriptor = NULL;
356	offset = 0;
357	revoke = journal->j_revoke;
358
359	for (i = 0; i < revoke->hash_size; i++) {
360		hash_list = &revoke->hash_table[i];
361
362		while (!list_empty(hash_list)) {
363			record = (struct jfs_revoke_record_s *)
364				hash_list->next;
365			write_one_revoke_record(journal, transaction,
366						&descriptor, &offset,
367						record);
368			list_del(&record->hash);
369			kmem_cache_free(revoke_record_cache, record);
370		}
371	}
372	if (descriptor)
373		flush_descriptor(journal, descriptor, offset);
374}
375
376/*
377 * Write out one revoke record.  We need to create a new descriptor
378 * block if the old one is full or if we have not already created one.
379 */
380
381static void write_one_revoke_record(journal_t *journal,
382				    transaction_t *transaction,
383				    struct buffer_head **descriptorp,
384				    int *offsetp,
385				    struct jfs_revoke_record_s *record)
386{
387	struct buffer_head *descriptor;
388	int offset;
389	journal_header_t *header;
390
391	/* If we are already aborting, this all becomes a noop.  We
392           still need to go round the loop in
393           journal_write_revoke_records in order to free all of the
394           revoke records: only the IO to the journal is omitted. */
395	if (is_journal_abort(journal))
396		return;
397
398	descriptor = *descriptorp;
399	offset = *offsetp;
400
401	/* Make sure we have a descriptor with space left for the record */
402	if (descriptor) {
403		if (offset == journal->j_blocksize) {
404			flush_descriptor(journal, descriptor, offset);
405			descriptor = NULL;
406		}
407	}
408
409	if (!descriptor) {
410		descriptor = journal_get_descriptor_buffer(journal);
411		header = (journal_header_t *) &descriptor->b_data[0];
412		header->h_magic     = htonl(JFS_MAGIC_NUMBER);
413		header->h_blocktype = htonl(JFS_REVOKE_BLOCK);
414		header->h_sequence  = htonl(transaction->t_tid);
415
416		/* Record it so that we can wait for IO completion later */
417		journal_file_buffer(descriptor, transaction, BJ_LogCtl);
418
419		offset = sizeof(journal_revoke_header_t);
420		*descriptorp = descriptor;
421	}
422
423	* ((unsigned int *)(&descriptor->b_data[offset])) =
424		htonl(record->blocknr);
425	offset += 4;
426	*offsetp = offset;
427}
428
429/*
430 * Flush a revoke descriptor out to the journal.  If we are aborting,
431 * this is a noop; otherwise we are generating a buffer which needs to
432 * be waited for during commit, so it has to go onto the appropriate
433 * journal buffer list.
434 */
435
436static void flush_descriptor(journal_t *journal,
437			     struct buffer_head *descriptor,
438			     int offset)
439{
440	journal_revoke_header_t *header;
441
442	if (is_journal_abort(journal)) {
443		brelse(descriptor);
444		return;
445	}
446
447	header = (journal_revoke_header_t *) descriptor->b_data;
448	header->r_count = htonl(offset);
449	set_bit(BH_JWrite, &descriptor->b_state);
450	ll_rw_block (WRITE, 1, &descriptor);
451}
452
453#endif
454
455/*
456 * Revoke support for recovery.
457 *
458 * Recovery needs to be able to:
459 *
460 *  record all revoke records, including the tid of the latest instance
461 *  of each revoke in the journal
462 *
463 *  check whether a given block in a given transaction should be replayed
464 *  (ie. has not been revoked by a revoke record in that or a subsequent
465 *  transaction)
466 *
467 *  empty the revoke table after recovery.
468 */
469
470/*
471 * First, setting revoke records.  We create a new revoke record for
472 * every block ever revoked in the log as we scan it for recovery, and
473 * we update the existing records if we find multiple revokes for a
474 * single block.
475 */
476
477int journal_set_revoke(journal_t *journal,
478		       unsigned long blocknr,
479		       tid_t sequence)
480{
481	struct jfs_revoke_record_s *record;
482
483	record = find_revoke_record(journal, blocknr);
484	if (record) {
485		/* If we have multiple occurences, only record the
486		 * latest sequence number in the hashed record */
487		if (tid_ge(sequence, record->sequence))
488			record->sequence = sequence;
489		return 0;
490	}
491	return insert_revoke_hash(journal, blocknr, sequence);
492}
493
494/*
495 * Test revoke records.  For a given block referenced in the log, has
496 * that block been revoked?  A revoke record with a given transaction
497 * sequence number revokes all blocks in that transaction and earlier
498 * ones, but later transactions still need replayed.
499 */
500
501int journal_test_revoke(journal_t *journal,
502			unsigned long blocknr,
503			tid_t sequence)
504{
505	struct jfs_revoke_record_s *record;
506
507	record = find_revoke_record(journal, blocknr);
508	if (!record)
509		return 0;
510	if (tid_ge(sequence, record->sequence))
511		return 0;
512	return 1;
513}
514
515/*
516 * Finally, once recovery is over, we need to clear the revoke table so
517 * that it can be reused by the running filesystem.
518 */
519
520void journal_clear_revoke(journal_t *journal)
521{
522	int i;
523	struct list_head *hash_list;
524	struct jfs_revoke_record_s *record;
525	struct jfs_revoke_table_s *revoke;
526
527	revoke = journal->j_revoke;
528
529	for (i = 0; i < revoke->hash_size; i++) {
530		hash_list = &revoke->hash_table[i];
531		while (!list_empty(hash_list)) {
532			record = (struct jfs_revoke_record_s*) hash_list->next;
533			list_del(&record->hash);
534			kmem_cache_free(revoke_record_cache, record);
535		}
536	}
537}
538
539