journal.c revision 25985edcedea6396277003854657b5f3cb31a628
1/*
2** Write ahead logging implementation copyright Chris Mason 2000
3**
4** The background commits make this code very interrelated, and
5** overly complex.  I need to rethink things a bit....The major players:
6**
7** journal_begin -- call with the number of blocks you expect to log.
8**                  If the current transaction is too
9** 		    old, it will block until the current transaction is
10** 		    finished, and then start a new one.
11**		    Usually, your transaction will get joined in with
12**                  previous ones for speed.
13**
14** journal_join  -- same as journal_begin, but won't block on the current
15**                  transaction regardless of age.  Don't ever call
16**                  this.  Ever.  There are only two places it should be
17**                  called from, and they are both inside this file.
18**
19** journal_mark_dirty -- adds blocks into this transaction.  clears any flags
20**                       that might make them get sent to disk
21**                       and then marks them BH_JDirty.  Puts the buffer head
22**                       into the current transaction hash.
23**
24** journal_end -- if the current transaction is batchable, it does nothing
25**                   otherwise, it could do an async/synchronous commit, or
26**                   a full flush of all log and real blocks in the
27**                   transaction.
28**
29** flush_old_commits -- if the current transaction is too old, it is ended and
30**                      commit blocks are sent to disk.  Forces commit blocks
31**                      to disk for all backgrounded commits that have been
32**                      around too long.
33**		     -- Note, if you call this as an immediate flush from
34**		        within kupdate, it will ignore the immediate flag
35*/
36
37#include <linux/time.h>
38#include <linux/semaphore.h>
39#include <linux/vmalloc.h>
40#include <linux/reiserfs_fs.h>
41#include <linux/kernel.h>
42#include <linux/errno.h>
43#include <linux/fcntl.h>
44#include <linux/stat.h>
45#include <linux/string.h>
46#include <linux/buffer_head.h>
47#include <linux/workqueue.h>
48#include <linux/writeback.h>
49#include <linux/blkdev.h>
50#include <linux/backing-dev.h>
51#include <linux/uaccess.h>
52#include <linux/slab.h>
53
54#include <asm/system.h>
55
/*
 * Convert a pointer to a list_head embedded in a struct
 * reiserfs_journal_list back into a pointer to that journal list.
 * JOURNAL_LIST_ENTRY is for the j_list member, JOURNAL_WORK_ENTRY for
 * the j_working_list member.
 */
#define JOURNAL_LIST_ENTRY(h) \
	(list_entry((h), struct reiserfs_journal_list, j_list))
#define JOURNAL_WORK_ENTRY(h) \
	(list_entry((h), struct reiserfs_journal_list, j_working_list))
61
/*
 * Count of mounted reiserfs filesystems.  It is consulted to decide when
 * the commit workqueue has to be started and when it can be killed.
 */
static int reiserfs_mounted_fs_count;

/* the commit workqueue itself */
static struct workqueue_struct *commit_wq;
68
/* must be correct to keep the desc and commit structs at 4k */
#define JOURNAL_TRANS_HALF	1018
/* read ahead */
#define BUFNR			64

/* cnode stat bits.  Move these into reiserfs_fs.h */

/* this block was freed, and can't be written. */
#define BLOCK_FREED		2
/* this block was freed during this transaction, and can't be written */
#define BLOCK_FREED_HOLDER	3

/* used in flush_journal_list */
#define BLOCK_NEEDS_FLUSH	4
#define BLOCK_DIRTIED		5

/* journal list state bits */
#define LIST_TOUCHED		1
#define LIST_DIRTY		2
/* someone will commit this list */
#define LIST_COMMIT_PENDING	4

/* flags for do_journal_end */
/* flush commit and real blocks */
#define FLUSH_ALL		1
/* end and commit this transaction */
#define COMMIT_NOW		2
/* wait for the log blocks to hit the disk */
#define WAIT			4
90
/* forward declarations for helpers defined later in this file */
static int do_journal_end(struct reiserfs_transaction_handle *th,
			  struct super_block *sb,
			  unsigned long nblocks, int flags);
static int flush_journal_list(struct super_block *s,
			      struct reiserfs_journal_list *jl,
			      int flushall);
static int flush_commit_list(struct super_block *s,
			     struct reiserfs_journal_list *jl,
			     int flushall);
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th,
			struct super_block *sb,
			unsigned long nblocks);
static int release_journal_dev(struct super_block *super,
			       struct reiserfs_journal *journal);
static int dirty_one_transaction(struct super_block *s,
				 struct reiserfs_journal_list *jl);
static void flush_async_commits(struct work_struct *work);
static void queue_log_writer(struct super_block *s);
107
108/* values for join in do_journal_begin_r */
enum {
	/* regular journal begin */
	JBEGIN_REG = 0,
	/* join the running transaction if at all possible */
	JBEGIN_JOIN = 1,
	/* called from cleanup code, ignores aborted flag */
	JBEGIN_ABORT = 2,
};
114
/* @join takes one of the JBEGIN_* values */
static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
			      struct super_block *sb, unsigned long nblocks,
			      int join);
118
119static void init_journal_hash(struct super_block *sb)
120{
121	struct reiserfs_journal *journal = SB_JOURNAL(sb);
122	memset(journal->j_hash_table, 0,
123	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
124}
125
/*
 * Clear the dirty bit and the journal_test bit on @bh.  A NULL @bh is
 * accepted and left alone.  Exists because refile_buffer must not be
 * allowed to schedule after a block has been freed; see
 * remove_from_transaction and journal_mark_freed for more details.
 *
 * Always returns 0.
 */
static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
{
	if (!bh)
		return 0;

	clear_buffer_dirty(bh);
	clear_buffer_journal_test(bh);
	return 0;
}
139
140static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
141							 *sb)
142{
143	struct reiserfs_bitmap_node *bn;
144	static int id;
145
146	bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
147	if (!bn) {
148		return NULL;
149	}
150	bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
151	if (!bn->data) {
152		kfree(bn);
153		return NULL;
154	}
155	bn->id = id++;
156	INIT_LIST_HEAD(&bn->list);
157	return bn;
158}
159
160static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
161{
162	struct reiserfs_journal *journal = SB_JOURNAL(sb);
163	struct reiserfs_bitmap_node *bn = NULL;
164	struct list_head *entry = journal->j_bitmap_nodes.next;
165
166	journal->j_used_bitmap_nodes++;
167      repeat:
168
169	if (entry != &journal->j_bitmap_nodes) {
170		bn = list_entry(entry, struct reiserfs_bitmap_node, list);
171		list_del(entry);
172		memset(bn->data, 0, sb->s_blocksize);
173		journal->j_free_bitmap_nodes--;
174		return bn;
175	}
176	bn = allocate_bitmap_node(sb);
177	if (!bn) {
178		yield();
179		goto repeat;
180	}
181	return bn;
182}
183static inline void free_bitmap_node(struct super_block *sb,
184				    struct reiserfs_bitmap_node *bn)
185{
186	struct reiserfs_journal *journal = SB_JOURNAL(sb);
187	journal->j_used_bitmap_nodes--;
188	if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
189		kfree(bn->data);
190		kfree(bn);
191	} else {
192		list_add(&bn->list, &journal->j_bitmap_nodes);
193		journal->j_free_bitmap_nodes++;
194	}
195}
196
197static void allocate_bitmap_nodes(struct super_block *sb)
198{
199	int i;
200	struct reiserfs_journal *journal = SB_JOURNAL(sb);
201	struct reiserfs_bitmap_node *bn = NULL;
202	for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
203		bn = allocate_bitmap_node(sb);
204		if (bn) {
205			list_add(&bn->list, &journal->j_bitmap_nodes);
206			journal->j_free_bitmap_nodes++;
207		} else {
208			break;	/* this is ok, we'll try again when more are needed */
209		}
210	}
211}
212
213static int set_bit_in_list_bitmap(struct super_block *sb,
214				  b_blocknr_t block,
215				  struct reiserfs_list_bitmap *jb)
216{
217	unsigned int bmap_nr = block / (sb->s_blocksize << 3);
218	unsigned int bit_nr = block % (sb->s_blocksize << 3);
219
220	if (!jb->bitmaps[bmap_nr]) {
221		jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
222	}
223	set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
224	return 0;
225}
226
227static void cleanup_bitmap_list(struct super_block *sb,
228				struct reiserfs_list_bitmap *jb)
229{
230	int i;
231	if (jb->bitmaps == NULL)
232		return;
233
234	for (i = 0; i < reiserfs_bmap_count(sb); i++) {
235		if (jb->bitmaps[i]) {
236			free_bitmap_node(sb, jb->bitmaps[i]);
237			jb->bitmaps[i] = NULL;
238		}
239	}
240}
241
242/*
243** only call this on FS unmount.
244*/
245static int free_list_bitmaps(struct super_block *sb,
246			     struct reiserfs_list_bitmap *jb_array)
247{
248	int i;
249	struct reiserfs_list_bitmap *jb;
250	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
251		jb = jb_array + i;
252		jb->journal_list = NULL;
253		cleanup_bitmap_list(sb, jb);
254		vfree(jb->bitmaps);
255		jb->bitmaps = NULL;
256	}
257	return 0;
258}
259
260static int free_bitmap_nodes(struct super_block *sb)
261{
262	struct reiserfs_journal *journal = SB_JOURNAL(sb);
263	struct list_head *next = journal->j_bitmap_nodes.next;
264	struct reiserfs_bitmap_node *bn;
265
266	while (next != &journal->j_bitmap_nodes) {
267		bn = list_entry(next, struct reiserfs_bitmap_node, list);
268		list_del(next);
269		kfree(bn->data);
270		kfree(bn);
271		next = journal->j_bitmap_nodes.next;
272		journal->j_free_bitmap_nodes--;
273	}
274
275	return 0;
276}
277
278/*
279** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
280** jb_array is the array to be filled in.
281*/
282int reiserfs_allocate_list_bitmaps(struct super_block *sb,
283				   struct reiserfs_list_bitmap *jb_array,
284				   unsigned int bmap_nr)
285{
286	int i;
287	int failed = 0;
288	struct reiserfs_list_bitmap *jb;
289	int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
290
291	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
292		jb = jb_array + i;
293		jb->journal_list = NULL;
294		jb->bitmaps = vmalloc(mem);
295		if (!jb->bitmaps) {
296			reiserfs_warning(sb, "clm-2000", "unable to "
297					 "allocate bitmaps for journal lists");
298			failed = 1;
299			break;
300		}
301		memset(jb->bitmaps, 0, mem);
302	}
303	if (failed) {
304		free_list_bitmaps(sb, jb_array);
305		return -1;
306	}
307	return 0;
308}
309
310/*
311** find an available list bitmap.  If you can't find one, flush a commit list
312** and try again
313*/
314static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
315						    struct reiserfs_journal_list
316						    *jl)
317{
318	int i, j;
319	struct reiserfs_journal *journal = SB_JOURNAL(sb);
320	struct reiserfs_list_bitmap *jb = NULL;
321
322	for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
323		i = journal->j_list_bitmap_index;
324		journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
325		jb = journal->j_list_bitmap + i;
326		if (journal->j_list_bitmap[i].journal_list) {
327			flush_commit_list(sb,
328					  journal->j_list_bitmap[i].
329					  journal_list, 1);
330			if (!journal->j_list_bitmap[i].journal_list) {
331				break;
332			}
333		} else {
334			break;
335		}
336	}
337	if (jb->journal_list) {	/* double check to make sure if flushed correctly */
338		return NULL;
339	}
340	jb->journal_list = jl;
341	return jb;
342}
343
344/*
345** allocates a new chunk of X nodes, and links them all together as a list.
346** Uses the cnode->next and cnode->prev pointers
347** returns NULL on failure
348*/
349static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
350{
351	struct reiserfs_journal_cnode *head;
352	int i;
353	if (num_cnodes <= 0) {
354		return NULL;
355	}
356	head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
357	if (!head) {
358		return NULL;
359	}
360	memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode));
361	head[0].prev = NULL;
362	head[0].next = head + 1;
363	for (i = 1; i < num_cnodes; i++) {
364		head[i].prev = head + (i - 1);
365		head[i].next = head + (i + 1);	/* if last one, overwrite it after the if */
366	}
367	head[num_cnodes - 1].next = NULL;
368	return head;
369}
370
371/*
372** pulls a cnode off the free list, or returns NULL on failure
373*/
374static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
375{
376	struct reiserfs_journal_cnode *cn;
377	struct reiserfs_journal *journal = SB_JOURNAL(sb);
378
379	reiserfs_check_lock_depth(sb, "get_cnode");
380
381	if (journal->j_cnode_free <= 0) {
382		return NULL;
383	}
384	journal->j_cnode_used++;
385	journal->j_cnode_free--;
386	cn = journal->j_cnode_free_list;
387	if (!cn) {
388		return cn;
389	}
390	if (cn->next) {
391		cn->next->prev = NULL;
392	}
393	journal->j_cnode_free_list = cn->next;
394	memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
395	return cn;
396}
397
398/*
399** returns a cnode to the free list
400*/
401static void free_cnode(struct super_block *sb,
402		       struct reiserfs_journal_cnode *cn)
403{
404	struct reiserfs_journal *journal = SB_JOURNAL(sb);
405
406	reiserfs_check_lock_depth(sb, "free_cnode");
407
408	journal->j_cnode_used--;
409	journal->j_cnode_free++;
410	/* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
411	cn->next = journal->j_cnode_free_list;
412	if (journal->j_cnode_free_list) {
413		journal->j_cnode_free_list->prev = cn;
414	}
415	cn->prev = NULL;	/* not needed with the memset, but I might kill the memset, and forget to do this */
416	journal->j_cnode_free_list = cn;
417}
418
/*
 * Drop the journal_prepared and journal_restore_dirty state bits
 * from @bh.
 */
static void clear_prepared_bits(struct buffer_head *bh)
{
	clear_buffer_journal_prepared(bh);
	clear_buffer_journal_restore_dirty(bh);
}
424
425/* return a cnode with same dev, block number and size in table, or null if not found */
426static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
427								  super_block
428								  *sb,
429								  struct
430								  reiserfs_journal_cnode
431								  **table,
432								  long bl)
433{
434	struct reiserfs_journal_cnode *cn;
435	cn = journal_hash(table, sb, bl);
436	while (cn) {
437		if (cn->blocknr == bl && cn->sb == sb)
438			return cn;
439		cn = cn->hnext;
440	}
441	return (struct reiserfs_journal_cnode *)0;
442}
443
444/*
445** this actually means 'can this block be reallocated yet?'.  If you set search_all, a block can only be allocated
446** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever
447** being overwritten by a replay after crashing.
448**
449** If you don't set search_all, a block can only be allocated if it is not in the current transaction.  Since deleting
450** a block removes it from the current transaction, this case should never happen.  If you don't set search_all, make
451** sure you never write the block without logging it.
452**
453** next_zero_bit is a suggestion about the next block to try for find_forward.
454** when bl is rejected because it is set in a journal list bitmap, we search
455** for the next zero bit in the bitmap that rejected bl.  Then, we return that
456** through next_zero_bit for find_forward to try.
457**
458** Just because we return something in next_zero_bit does not mean we won't
459** reject it on the next call to reiserfs_in_journal
460**
461*/
462int reiserfs_in_journal(struct super_block *sb,
463			unsigned int bmap_nr, int bit_nr, int search_all,
464			b_blocknr_t * next_zero_bit)
465{
466	struct reiserfs_journal *journal = SB_JOURNAL(sb);
467	struct reiserfs_journal_cnode *cn;
468	struct reiserfs_list_bitmap *jb;
469	int i;
470	unsigned long bl;
471
472	*next_zero_bit = 0;	/* always start this at zero. */
473
474	PROC_INFO_INC(sb, journal.in_journal);
475	/* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
476	 ** if we crash before the transaction that freed it commits,  this transaction won't
477	 ** have committed either, and the block will never be written
478	 */
479	if (search_all) {
480		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
481			PROC_INFO_INC(sb, journal.in_journal_bitmap);
482			jb = journal->j_list_bitmap + i;
483			if (jb->journal_list && jb->bitmaps[bmap_nr] &&
484			    test_bit(bit_nr,
485				     (unsigned long *)jb->bitmaps[bmap_nr]->
486				     data)) {
487				*next_zero_bit =
488				    find_next_zero_bit((unsigned long *)
489						       (jb->bitmaps[bmap_nr]->
490							data),
491						       sb->s_blocksize << 3,
492						       bit_nr + 1);
493				return 1;
494			}
495		}
496	}
497
498	bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
499	/* is it in any old transactions? */
500	if (search_all
501	    && (cn =
502		get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
503		return 1;
504	}
505
506	/* is it in the current transaction.  This should never happen */
507	if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
508		BUG();
509		return 1;
510	}
511
512	PROC_INFO_INC(sb, journal.in_journal_reusable);
513	/* safe for reuse */
514	return 0;
515}
516
517/* insert cn into table
518*/
519static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
520				       struct reiserfs_journal_cnode *cn)
521{
522	struct reiserfs_journal_cnode *cn_orig;
523
524	cn_orig = journal_hash(table, cn->sb, cn->blocknr);
525	cn->hnext = cn_orig;
526	cn->hprev = NULL;
527	if (cn_orig) {
528		cn_orig->hprev = cn;
529	}
530	journal_hash(table, cn->sb, cn->blocknr) = cn;
531}
532
533/* lock the current transaction */
534static inline void lock_journal(struct super_block *sb)
535{
536	PROC_INFO_INC(sb, journal.lock_journal);
537
538	reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
539}
540
541/* unlock the current transaction */
542static inline void unlock_journal(struct super_block *sb)
543{
544	mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
545}
546
547static inline void get_journal_list(struct reiserfs_journal_list *jl)
548{
549	jl->j_refcount++;
550}
551
552static inline void put_journal_list(struct super_block *s,
553				    struct reiserfs_journal_list *jl)
554{
555	if (jl->j_refcount < 1) {
556		reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
557			       jl->j_trans_id, jl->j_refcount);
558	}
559	if (--jl->j_refcount == 0)
560		kfree(jl);
561}
562
563/*
564** this used to be much more involved, and I'm keeping it just in case things get ugly again.
565** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a
566** transaction.
567*/
568static void cleanup_freed_for_journal_list(struct super_block *sb,
569					   struct reiserfs_journal_list *jl)
570{
571
572	struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
573	if (jb) {
574		cleanup_bitmap_list(sb, jb);
575	}
576	jl->j_list_bitmap->journal_list = NULL;
577	jl->j_list_bitmap = NULL;
578}
579
580static int journal_list_still_alive(struct super_block *s,
581				    unsigned int trans_id)
582{
583	struct reiserfs_journal *journal = SB_JOURNAL(s);
584	struct list_head *entry = &journal->j_journal_list;
585	struct reiserfs_journal_list *jl;
586
587	if (!list_empty(entry)) {
588		jl = JOURNAL_LIST_ENTRY(entry->next);
589		if (jl->j_trans_id <= trans_id) {
590			return 1;
591		}
592	}
593	return 0;
594}
595
596/*
597 * If page->mapping was null, we failed to truncate this page for
598 * some reason.  Most likely because it was truncated after being
599 * logged via data=journal.
600 *
601 * This does a check to see if the buffer belongs to one of these
602 * lost pages before doing the final put_bh.  If page->mapping was
603 * null, it tries to free buffers on the page, which should make the
604 * final page_cache_release drop the page from the lru.
605 */
606static void release_buffer_page(struct buffer_head *bh)
607{
608	struct page *page = bh->b_page;
609	if (!page->mapping && trylock_page(page)) {
610		page_cache_get(page);
611		put_bh(bh);
612		if (!page->mapping)
613			try_to_free_buffers(page);
614		unlock_page(page);
615		page_cache_release(page);
616	} else {
617		put_bh(bh);
618	}
619}
620
621static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
622{
623	char b[BDEVNAME_SIZE];
624
625	if (buffer_journaled(bh)) {
626		reiserfs_warning(NULL, "clm-2084",
627				 "pinned buffer %lu:%s sent to disk",
628				 bh->b_blocknr, bdevname(bh->b_bdev, b));
629	}
630	if (uptodate)
631		set_buffer_uptodate(bh);
632	else
633		clear_buffer_uptodate(bh);
634
635	unlock_buffer(bh);
636	release_buffer_page(bh);
637}
638
/*
 * I/O completion handler installed by submit_ordered_buffer(): record
 * the outcome in the uptodate bit, unlock the buffer and drop the
 * reference taken at submission.
 */
static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
{
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
	put_bh(bh);
}
648
649static void submit_logged_buffer(struct buffer_head *bh)
650{
651	get_bh(bh);
652	bh->b_end_io = reiserfs_end_buffer_io_sync;
653	clear_buffer_journal_new(bh);
654	clear_buffer_dirty(bh);
655	if (!test_clear_buffer_journal_test(bh))
656		BUG();
657	if (!buffer_uptodate(bh))
658		BUG();
659	submit_bh(WRITE, bh);
660}
661
662static void submit_ordered_buffer(struct buffer_head *bh)
663{
664	get_bh(bh);
665	bh->b_end_io = reiserfs_end_ordered_io;
666	clear_buffer_dirty(bh);
667	if (!buffer_uptodate(bh))
668		BUG();
669	submit_bh(WRITE, bh);
670}
671
/* capacity of one buffer_chunk */
#define CHUNK_SIZE 32

/*
 * A small batch of buffers collected by add_to_chunk() and submitted
 * together by write_chunk() / write_ordered_chunk().
 */
struct buffer_chunk {
	struct buffer_head *bh[CHUNK_SIZE];	/* the collected buffers */
	int nr;					/* number of used slots in bh[] */
};
677
678static void write_chunk(struct buffer_chunk *chunk)
679{
680	int i;
681	get_fs_excl();
682	for (i = 0; i < chunk->nr; i++) {
683		submit_logged_buffer(chunk->bh[i]);
684	}
685	chunk->nr = 0;
686	put_fs_excl();
687}
688
689static void write_ordered_chunk(struct buffer_chunk *chunk)
690{
691	int i;
692	get_fs_excl();
693	for (i = 0; i < chunk->nr; i++) {
694		submit_ordered_buffer(chunk->bh[i]);
695	}
696	chunk->nr = 0;
697	put_fs_excl();
698}
699
700static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
701			spinlock_t * lock, void (fn) (struct buffer_chunk *))
702{
703	int ret = 0;
704	BUG_ON(chunk->nr >= CHUNK_SIZE);
705	chunk->bh[chunk->nr++] = bh;
706	if (chunk->nr >= CHUNK_SIZE) {
707		ret = 1;
708		if (lock)
709			spin_unlock(lock);
710		fn(chunk);
711		if (lock)
712			spin_lock(lock);
713	}
714	return ret;
715}
716
717static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
718static struct reiserfs_jh *alloc_jh(void)
719{
720	struct reiserfs_jh *jh;
721	while (1) {
722		jh = kmalloc(sizeof(*jh), GFP_NOFS);
723		if (jh) {
724			atomic_inc(&nr_reiserfs_jh);
725			return jh;
726		}
727		yield();
728	}
729}
730
731/*
732 * we want to free the jh when the buffer has been written
733 * and waited on
734 */
735void reiserfs_free_jh(struct buffer_head *bh)
736{
737	struct reiserfs_jh *jh;
738
739	jh = bh->b_private;
740	if (jh) {
741		bh->b_private = NULL;
742		jh->bh = NULL;
743		list_del_init(&jh->list);
744		kfree(jh);
745		if (atomic_read(&nr_reiserfs_jh) <= 0)
746			BUG();
747		atomic_dec(&nr_reiserfs_jh);
748		put_bh(bh);
749	}
750}
751
752static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
753			   int tail)
754{
755	struct reiserfs_jh *jh;
756
757	if (bh->b_private) {
758		spin_lock(&j->j_dirty_buffers_lock);
759		if (!bh->b_private) {
760			spin_unlock(&j->j_dirty_buffers_lock);
761			goto no_jh;
762		}
763		jh = bh->b_private;
764		list_del_init(&jh->list);
765	} else {
766	      no_jh:
767		get_bh(bh);
768		jh = alloc_jh();
769		spin_lock(&j->j_dirty_buffers_lock);
770		/* buffer must be locked for __add_jh, should be able to have
771		 * two adds at the same time
772		 */
773		BUG_ON(bh->b_private);
774		jh->bh = bh;
775		bh->b_private = jh;
776	}
777	jh->jl = j->j_current_jl;
778	if (tail)
779		list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
780	else {
781		list_add_tail(&jh->list, &jh->jl->j_bh_list);
782	}
783	spin_unlock(&j->j_dirty_buffers_lock);
784	return 0;
785}
786
787int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
788{
789	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
790}
791int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
792{
793	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
794}
795
796#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
797static int write_ordered_buffers(spinlock_t * lock,
798				 struct reiserfs_journal *j,
799				 struct reiserfs_journal_list *jl,
800				 struct list_head *list)
801{
802	struct buffer_head *bh;
803	struct reiserfs_jh *jh;
804	int ret = j->j_errno;
805	struct buffer_chunk chunk;
806	struct list_head tmp;
807	INIT_LIST_HEAD(&tmp);
808
809	chunk.nr = 0;
810	spin_lock(lock);
811	while (!list_empty(list)) {
812		jh = JH_ENTRY(list->next);
813		bh = jh->bh;
814		get_bh(bh);
815		if (!trylock_buffer(bh)) {
816			if (!buffer_dirty(bh)) {
817				list_move(&jh->list, &tmp);
818				goto loop_next;
819			}
820			spin_unlock(lock);
821			if (chunk.nr)
822				write_ordered_chunk(&chunk);
823			wait_on_buffer(bh);
824			cond_resched();
825			spin_lock(lock);
826			goto loop_next;
827		}
828		/* in theory, dirty non-uptodate buffers should never get here,
829		 * but the upper layer io error paths still have a few quirks.
830		 * Handle them here as gracefully as we can
831		 */
832		if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
833			clear_buffer_dirty(bh);
834			ret = -EIO;
835		}
836		if (buffer_dirty(bh)) {
837			list_move(&jh->list, &tmp);
838			add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
839		} else {
840			reiserfs_free_jh(bh);
841			unlock_buffer(bh);
842		}
843	      loop_next:
844		put_bh(bh);
845		cond_resched_lock(lock);
846	}
847	if (chunk.nr) {
848		spin_unlock(lock);
849		write_ordered_chunk(&chunk);
850		spin_lock(lock);
851	}
852	while (!list_empty(&tmp)) {
853		jh = JH_ENTRY(tmp.prev);
854		bh = jh->bh;
855		get_bh(bh);
856		reiserfs_free_jh(bh);
857
858		if (buffer_locked(bh)) {
859			spin_unlock(lock);
860			wait_on_buffer(bh);
861			spin_lock(lock);
862		}
863		if (!buffer_uptodate(bh)) {
864			ret = -EIO;
865		}
866		/* ugly interaction with invalidatepage here.
867		 * reiserfs_invalidate_page will pin any buffer that has a valid
868		 * journal head from an older transaction.  If someone else sets
869		 * our buffer dirty after we write it in the first loop, and
870		 * then someone truncates the page away, nobody will ever write
871		 * the buffer. We're safe if we write the page one last time
872		 * after freeing the journal header.
873		 */
874		if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
875			spin_unlock(lock);
876			ll_rw_block(WRITE, 1, &bh);
877			spin_lock(lock);
878		}
879		put_bh(bh);
880		cond_resched_lock(lock);
881	}
882	spin_unlock(lock);
883	return ret;
884}
885
886static int flush_older_commits(struct super_block *s,
887			       struct reiserfs_journal_list *jl)
888{
889	struct reiserfs_journal *journal = SB_JOURNAL(s);
890	struct reiserfs_journal_list *other_jl;
891	struct reiserfs_journal_list *first_jl;
892	struct list_head *entry;
893	unsigned int trans_id = jl->j_trans_id;
894	unsigned int other_trans_id;
895	unsigned int first_trans_id;
896
897      find_first:
898	/*
899	 * first we walk backwards to find the oldest uncommitted transation
900	 */
901	first_jl = jl;
902	entry = jl->j_list.prev;
903	while (1) {
904		other_jl = JOURNAL_LIST_ENTRY(entry);
905		if (entry == &journal->j_journal_list ||
906		    atomic_read(&other_jl->j_older_commits_done))
907			break;
908
909		first_jl = other_jl;
910		entry = other_jl->j_list.prev;
911	}
912
913	/* if we didn't find any older uncommitted transactions, return now */
914	if (first_jl == jl) {
915		return 0;
916	}
917
918	first_trans_id = first_jl->j_trans_id;
919
920	entry = &first_jl->j_list;
921	while (1) {
922		other_jl = JOURNAL_LIST_ENTRY(entry);
923		other_trans_id = other_jl->j_trans_id;
924
925		if (other_trans_id < trans_id) {
926			if (atomic_read(&other_jl->j_commit_left) != 0) {
927				flush_commit_list(s, other_jl, 0);
928
929				/* list we were called with is gone, return */
930				if (!journal_list_still_alive(s, trans_id))
931					return 1;
932
933				/* the one we just flushed is gone, this means all
934				 * older lists are also gone, so first_jl is no longer
935				 * valid either.  Go back to the beginning.
936				 */
937				if (!journal_list_still_alive
938				    (s, other_trans_id)) {
939					goto find_first;
940				}
941			}
942			entry = entry->next;
943			if (entry == &journal->j_journal_list)
944				return 0;
945		} else {
946			return 0;
947		}
948	}
949	return 0;
950}
951
952static int reiserfs_async_progress_wait(struct super_block *s)
953{
954	struct reiserfs_journal *j = SB_JOURNAL(s);
955
956	if (atomic_read(&j->j_async_throttle)) {
957		reiserfs_write_unlock(s);
958		congestion_wait(BLK_RW_ASYNC, HZ / 10);
959		reiserfs_write_lock(s);
960	}
961
962	return 0;
963}
964
965/*
966** if this journal list still has commit blocks unflushed, send them to disk.
967**
968** log areas must be flushed in order (transaction 2 can't commit before transaction 1)
969** Before the commit block can be written, every other log block must be safely on disk
970**
971*/
972static int flush_commit_list(struct super_block *s,
973			     struct reiserfs_journal_list *jl, int flushall)
974{
975	int i;
976	b_blocknr_t bn;
977	struct buffer_head *tbh = NULL;
978	unsigned int trans_id = jl->j_trans_id;
979	struct reiserfs_journal *journal = SB_JOURNAL(s);
980	int retval = 0;
981	int write_len;
982
983	reiserfs_check_lock_depth(s, "flush_commit_list");
984
985	if (atomic_read(&jl->j_older_commits_done)) {
986		return 0;
987	}
988
989	get_fs_excl();
990
991	/* before we can put our commit blocks on disk, we have to make sure everyone older than
992	 ** us is on disk too
993	 */
994	BUG_ON(jl->j_len <= 0);
995	BUG_ON(trans_id == journal->j_trans_id);
996
997	get_journal_list(jl);
998	if (flushall) {
999		if (flush_older_commits(s, jl) == 1) {
1000			/* list disappeared during flush_older_commits.  return */
1001			goto put_jl;
1002		}
1003	}
1004
1005	/* make sure nobody is trying to flush this one at the same time */
1006	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
1007
1008	if (!journal_list_still_alive(s, trans_id)) {
1009		mutex_unlock(&jl->j_commit_mutex);
1010		goto put_jl;
1011	}
1012	BUG_ON(jl->j_trans_id == 0);
1013
1014	/* this commit is done, exit */
1015	if (atomic_read(&(jl->j_commit_left)) <= 0) {
1016		if (flushall) {
1017			atomic_set(&(jl->j_older_commits_done), 1);
1018		}
1019		mutex_unlock(&jl->j_commit_mutex);
1020		goto put_jl;
1021	}
1022
1023	if (!list_empty(&jl->j_bh_list)) {
1024		int ret;
1025
1026		/*
1027		 * We might sleep in numerous places inside
1028		 * write_ordered_buffers. Relax the write lock.
1029		 */
1030		reiserfs_write_unlock(s);
1031		ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
1032					    journal, jl, &jl->j_bh_list);
1033		if (ret < 0 && retval == 0)
1034			retval = ret;
1035		reiserfs_write_lock(s);
1036	}
1037	BUG_ON(!list_empty(&jl->j_bh_list));
1038	/*
1039	 * for the description block and all the log blocks, submit any buffers
1040	 * that haven't already reached the disk.  Try to write at least 256
1041	 * log blocks. later on, we will only wait on blocks that correspond
1042	 * to this transaction, but while we're unplugging we might as well
1043	 * get a chunk of data on there.
1044	 */
1045	atomic_inc(&journal->j_async_throttle);
1046	write_len = jl->j_len + 1;
1047	if (write_len < 256)
1048		write_len = 256;
1049	for (i = 0 ; i < write_len ; i++) {
1050		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
1051		    SB_ONDISK_JOURNAL_SIZE(s);
1052		tbh = journal_find_get_block(s, bn);
1053		if (tbh) {
1054			if (buffer_dirty(tbh)) {
1055		            reiserfs_write_unlock(s);
1056			    ll_rw_block(WRITE, 1, &tbh);
1057			    reiserfs_write_lock(s);
1058			}
1059			put_bh(tbh) ;
1060		}
1061	}
1062	atomic_dec(&journal->j_async_throttle);
1063
1064	for (i = 0; i < (jl->j_len + 1); i++) {
1065		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1066		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
1067		tbh = journal_find_get_block(s, bn);
1068
1069		reiserfs_write_unlock(s);
1070		wait_on_buffer(tbh);
1071		reiserfs_write_lock(s);
1072		// since we're using ll_rw_blk above, it might have skipped over
1073		// a locked buffer.  Double check here
1074		//
1075		/* redundant, sync_dirty_buffer() checks */
1076		if (buffer_dirty(tbh)) {
1077			reiserfs_write_unlock(s);
1078			sync_dirty_buffer(tbh);
1079			reiserfs_write_lock(s);
1080		}
1081		if (unlikely(!buffer_uptodate(tbh))) {
1082#ifdef CONFIG_REISERFS_CHECK
1083			reiserfs_warning(s, "journal-601",
1084					 "buffer write failed");
1085#endif
1086			retval = -EIO;
1087		}
1088		put_bh(tbh);	/* once for journal_find_get_block */
1089		put_bh(tbh);	/* once due to original getblk in do_journal_end */
1090		atomic_dec(&(jl->j_commit_left));
1091	}
1092
1093	BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
1094
1095	/* If there was a write error in the journal - we can't commit
1096	 * this transaction - it will be invalid and, if successful,
1097	 * will just end up propagating the write error out to
1098	 * the file system. */
1099	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
1100		if (buffer_dirty(jl->j_commit_bh))
1101			BUG();
1102		mark_buffer_dirty(jl->j_commit_bh) ;
1103		reiserfs_write_unlock(s);
1104		if (reiserfs_barrier_flush(s))
1105			__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
1106		else
1107			sync_dirty_buffer(jl->j_commit_bh);
1108		reiserfs_write_lock(s);
1109	}
1110
1111	/* If there was a write error in the journal - we can't commit this
1112	 * transaction - it will be invalid and, if successful, will just end
1113	 * up propagating the write error out to the filesystem. */
1114	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
1115#ifdef CONFIG_REISERFS_CHECK
1116		reiserfs_warning(s, "journal-615", "buffer write failed");
1117#endif
1118		retval = -EIO;
1119	}
1120	bforget(jl->j_commit_bh);
1121	if (journal->j_last_commit_id != 0 &&
1122	    (jl->j_trans_id - journal->j_last_commit_id) != 1) {
1123		reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
1124				 journal->j_last_commit_id, jl->j_trans_id);
1125	}
1126	journal->j_last_commit_id = jl->j_trans_id;
1127
1128	/* now, every commit block is on the disk.  It is safe to allow blocks freed during this transaction to be reallocated */
1129	cleanup_freed_for_journal_list(s, jl);
1130
1131	retval = retval ? retval : journal->j_errno;
1132
1133	/* mark the metadata dirty */
1134	if (!retval)
1135		dirty_one_transaction(s, jl);
1136	atomic_dec(&(jl->j_commit_left));
1137
1138	if (flushall) {
1139		atomic_set(&(jl->j_older_commits_done), 1);
1140	}
1141	mutex_unlock(&jl->j_commit_mutex);
1142      put_jl:
1143	put_journal_list(s, jl);
1144
1145	if (retval)
1146		reiserfs_abort(s, retval, "Journal write error in %s",
1147			       __func__);
1148	put_fs_excl();
1149	return retval;
1150}
1151
1152/*
1153** flush_journal_list frequently needs to find a newer transaction for a given block.  This does that, or
1154** returns NULL if it can't find anything
1155*/
1156static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
1157							  reiserfs_journal_cnode
1158							  *cn)
1159{
1160	struct super_block *sb = cn->sb;
1161	b_blocknr_t blocknr = cn->blocknr;
1162
1163	cn = cn->hprev;
1164	while (cn) {
1165		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
1166			return cn->jlist;
1167		}
1168		cn = cn->hprev;
1169	}
1170	return NULL;
1171}
1172
1173static int newer_jl_done(struct reiserfs_journal_cnode *cn)
1174{
1175	struct super_block *sb = cn->sb;
1176	b_blocknr_t blocknr = cn->blocknr;
1177
1178	cn = cn->hprev;
1179	while (cn) {
1180		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist &&
1181		    atomic_read(&cn->jlist->j_commit_left) != 0)
1182				    return 0;
1183		cn = cn->hprev;
1184	}
1185	return 1;
1186}
1187
1188static void remove_journal_hash(struct super_block *,
1189				struct reiserfs_journal_cnode **,
1190				struct reiserfs_journal_list *, unsigned long,
1191				int);
1192
1193/*
1194** once all the real blocks have been flushed, it is safe to remove them from the
1195** journal list for this transaction.  Aside from freeing the cnode, this also allows the
1196** block to be reallocated for data blocks if it had been deleted.
1197*/
1198static void remove_all_from_journal_list(struct super_block *sb,
1199					 struct reiserfs_journal_list *jl,
1200					 int debug)
1201{
1202	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1203	struct reiserfs_journal_cnode *cn, *last;
1204	cn = jl->j_realblock;
1205
1206	/* which is better, to lock once around the whole loop, or
1207	 ** to lock for each call to remove_journal_hash?
1208	 */
1209	while (cn) {
1210		if (cn->blocknr != 0) {
1211			if (debug) {
1212				reiserfs_warning(sb, "reiserfs-2201",
1213						 "block %u, bh is %d, state %ld",
1214						 cn->blocknr, cn->bh ? 1 : 0,
1215						 cn->state);
1216			}
1217			cn->state = 0;
1218			remove_journal_hash(sb, journal->j_list_hash_table,
1219					    jl, cn->blocknr, 1);
1220		}
1221		last = cn;
1222		cn = cn->next;
1223		free_cnode(sb, last);
1224	}
1225	jl->j_realblock = NULL;
1226}
1227
1228/*
1229** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block.
1230** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start
1231** releasing blocks in this transaction for reuse as data blocks.
1232** called by flush_journal_list, before it calls remove_all_from_journal_list
1233**
1234*/
1235static int _update_journal_header_block(struct super_block *sb,
1236					unsigned long offset,
1237					unsigned int trans_id)
1238{
1239	struct reiserfs_journal_header *jh;
1240	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1241
1242	if (reiserfs_is_journal_aborted(journal))
1243		return -EIO;
1244
1245	if (trans_id >= journal->j_last_flush_trans_id) {
1246		if (buffer_locked((journal->j_header_bh))) {
1247			reiserfs_write_unlock(sb);
1248			wait_on_buffer((journal->j_header_bh));
1249			reiserfs_write_lock(sb);
1250			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
1251#ifdef CONFIG_REISERFS_CHECK
1252				reiserfs_warning(sb, "journal-699",
1253						 "buffer write failed");
1254#endif
1255				return -EIO;
1256			}
1257		}
1258		journal->j_last_flush_trans_id = trans_id;
1259		journal->j_first_unflushed_offset = offset;
1260		jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
1261							b_data);
1262		jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
1263		jh->j_first_unflushed_offset = cpu_to_le32(offset);
1264		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
1265
1266		set_buffer_dirty(journal->j_header_bh);
1267		reiserfs_write_unlock(sb);
1268
1269		if (reiserfs_barrier_flush(sb))
1270			__sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
1271		else
1272			sync_dirty_buffer(journal->j_header_bh);
1273
1274		reiserfs_write_lock(sb);
1275		if (!buffer_uptodate(journal->j_header_bh)) {
1276			reiserfs_warning(sb, "journal-837",
1277					 "IO error during journal replay");
1278			return -EIO;
1279		}
1280	}
1281	return 0;
1282}
1283
/*
 * Thin wrapper: passes sb, offset and trans_id unchanged to
 * _update_journal_header_block() and returns that function's result.
 */
1284static int update_journal_header_block(struct super_block *sb,
1285				       unsigned long offset,
1286				       unsigned int trans_id)
1287{
1288	return _update_journal_header_block(sb, offset, trans_id);
1289}
1290
1291/*
1292** flush any and all journal lists older than you are
1293** can only be called from flush_journal_list
1294*/
1295static int flush_older_journal_lists(struct super_block *sb,
1296				     struct reiserfs_journal_list *jl)
1297{
1298	struct list_head *entry;
1299	struct reiserfs_journal_list *other_jl;
1300	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1301	unsigned int trans_id = jl->j_trans_id;
1302
1303	/* we know we are the only ones flushing things, no extra race
1304	 * protection is required.
1305	 */
1306      restart:
1307	entry = journal->j_journal_list.next;
1308	/* Did we wrap? */
1309	if (entry == &journal->j_journal_list)
1310		return 0;
1311	other_jl = JOURNAL_LIST_ENTRY(entry);
1312	if (other_jl->j_trans_id < trans_id) {
1313		BUG_ON(other_jl->j_refcount <= 0);
1314		/* do not flush all */
1315		flush_journal_list(sb, other_jl, 0);
1316
1317		/* other_jl is now deleted from the list */
1318		goto restart;
1319	}
1320	return 0;
1321}
1322
1323static void del_from_work_list(struct super_block *s,
1324			       struct reiserfs_journal_list *jl)
1325{
1326	struct reiserfs_journal *journal = SB_JOURNAL(s);
1327	if (!list_empty(&jl->j_working_list)) {
1328		list_del_init(&jl->j_working_list);
1329		journal->j_num_work_lists--;
1330	}
1331}
1332
1333/* flush a journal list, both commit and real blocks
1334**
1335** always set flushall to 1, unless you are calling from inside
1336** flush_journal_list
1337**
1338** IMPORTANT.  This can only be called while there are no journal writers,
1339** and the journal is locked.  That means it can only be called from
1340** do_journal_end, or by journal_release
1341*/
/*
 * Parameters:
 *   s        - super block whose journal owns jl
 *   jl       - journal list to flush; at the end its fields are reset and
 *              put_journal_list() is called on it
 *   flushall - non-zero: take j_flush_mutex here, flush all older journal
 *              lists as well and update the journal header block;
 *              zero: j_flush_mutex must already be held (a successful
 *              mutex_trylock() is treated as a BUG)
 *
 * Returns journal->j_errno, or the result of update_journal_header_block()
 * when that call is made.  The oversized-length path returns 0 directly.
 */
1342static int flush_journal_list(struct super_block *s,
1343			      struct reiserfs_journal_list *jl, int flushall)
1344{
1345	struct reiserfs_journal_list *pjl;
1346	struct reiserfs_journal_cnode *cn, *last;
1347	int count;
1348	int was_jwait = 0;
1349	int was_dirty = 0;
1350	struct buffer_head *saved_bh;
1351	unsigned long j_len_saved = jl->j_len;
1352	struct reiserfs_journal *journal = SB_JOURNAL(s);
1353	int err = 0;
1354
	/* j_len_saved is unsigned, so this only fires for a length of 0 */
1355	BUG_ON(j_len_saved <= 0);
1356
1357	if (atomic_read(&journal->j_wcount) != 0) {
1358		reiserfs_warning(s, "clm-2048", "called with wcount %d",
1359				 atomic_read(&journal->j_wcount));
1360	}
1361	BUG_ON(jl->j_trans_id == 0);
1362
1363	/* if flushall == 0, the lock is already held */
1364	if (flushall) {
1365		reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1366	} else if (mutex_trylock(&journal->j_flush_mutex)) {
1367		BUG();
1368	}
1369
1370	count = 0;
1371	if (j_len_saved > journal->j_trans_max) {
1372		reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
1373			       j_len_saved, jl->j_trans_id);
		/*
		 * NOTE(review): this return would leave j_flush_mutex held
		 * when flushall is set; presumably reiserfs_panic() never
		 * returns - confirm.
		 */
1374		return 0;
1375	}
1376
1377	get_fs_excl();
1378
1379	/* if all the work is already done, get out of here */
1380	if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
1381	    atomic_read(&(jl->j_commit_left)) <= 0) {
1382		goto flush_older_and_return;
1383	}
1384
1385	/* start by putting the commit list on disk.  This will also flush
1386	 ** the commit lists of any olders transactions
1387	 */
1388	flush_commit_list(s, jl, 1);
1389
	/* after the commit the list must be LIST_DIRTY unless the journal aborted */
1390	if (!(jl->j_state & LIST_DIRTY)
1391	    && !reiserfs_is_journal_aborted(journal))
1392		BUG();
1393
1394	/* are we done now? */
1395	if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
1396	    atomic_read(&(jl->j_commit_left)) <= 0) {
1397		goto flush_older_and_return;
1398	}
1399
1400	/* loop through each cnode, see if we need to write it,
1401	 ** or wait on a more recent transaction, or just ignore it
1402	 */
1403	if (atomic_read(&(journal->j_wcount)) != 0) {
1404		reiserfs_panic(s, "journal-844", "journal list is flushing, "
1405			       "wcount is not 0");
1406	}
	/*
	 * First pass: submit every buffer this list is responsible for and
	 * tag its cnode with BLOCK_NEEDS_FLUSH; count tracks how many
	 * cnodes were tagged.
	 */
1407	cn = jl->j_realblock;
1408	while (cn) {
1409		was_jwait = 0;
1410		was_dirty = 0;
1411		saved_bh = NULL;
1412		/* blocknr of 0 is no longer in the hash, ignore it */
1413		if (cn->blocknr == 0) {
1414			goto free_cnode;
1415		}
1416
1417		/* This transaction failed commit. Don't write out to the disk */
1418		if (!(jl->j_state & LIST_DIRTY))
1419			goto free_cnode;
1420
1421		pjl = find_newer_jl_for_cn(cn);
1422		/* the order is important here.  We check pjl to make sure we
1423		 ** don't clear BH_JDirty_wait if we aren't the one writing this
1424		 ** block to disk
1425		 */
1426		if (!pjl && cn->bh) {
1427			saved_bh = cn->bh;
1428
1429			/* we do this to make sure nobody releases the buffer while
1430			 ** we are working with it
1431			 */
1432			get_bh(saved_bh);
1433
1434			if (buffer_journal_dirty(saved_bh)) {
1435				BUG_ON(!can_dirty(cn));
1436				was_jwait = 1;
1437				was_dirty = 1;
1438			} else if (can_dirty(cn)) {
1439				/* everything with !pjl && jwait should be writable */
1440				BUG();
1441			}
1442		}
1443
1444		/* if someone has this block in a newer transaction, just make
1445		 ** sure they are committed, and don't try writing it to disk
1446		 */
1447		if (pjl) {
1448			if (atomic_read(&pjl->j_commit_left))
1449				flush_commit_list(s, pjl, 1);
1450			goto free_cnode;
1451		}
1452
1453		/* bh == NULL when the block got to disk on its own, OR,
1454		 ** the block got freed in a future transaction
1455		 */
1456		if (saved_bh == NULL) {
1457			goto free_cnode;
1458		}
1459
1460		/* this should never happen.  kupdate_one_transaction has this list
1461		 ** locked while it works, so we should never see a buffer here that
1462		 ** is not marked JDirty_wait
1463		 */
1464		if ((!was_jwait) && !buffer_locked(saved_bh)) {
1465			reiserfs_warning(s, "journal-813",
1466					 "BAD! buffer %llu %cdirty %cjwait, "
1467					 "not in a newer tranasction",
1468					 (unsigned long long)saved_bh->
1469					 b_blocknr, was_dirty ? ' ' : '!',
1470					 was_jwait ? ' ' : '!');
1471		}
1472		if (was_dirty) {
1473			/* we inc again because saved_bh gets decremented at free_cnode */
1474			get_bh(saved_bh);
1475			set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
1476			lock_buffer(saved_bh);
1477			BUG_ON(cn->blocknr != saved_bh->b_blocknr);
			/* only submit if still dirty once we hold the buffer lock */
1478			if (buffer_dirty(saved_bh))
1479				submit_logged_buffer(saved_bh);
1480			else
1481				unlock_buffer(saved_bh);
1482			count++;
1483		} else {
1484			reiserfs_warning(s, "clm-2082",
1485					 "Unable to flush buffer %llu in %s",
1486					 (unsigned long long)saved_bh->
1487					 b_blocknr, __func__);
1488		}
		/*
		 * Despite the label name nothing is freed here: this only
		 * advances to the next cnode and drops the reference taken
		 * on saved_bh above.
		 */
1489	      free_cnode:
1490		last = cn;
1491		cn = cn->next;
1492		if (saved_bh) {
1493			/* we incremented this to keep others from taking the buffer head away */
1494			put_bh(saved_bh);
1495			if (atomic_read(&(saved_bh->b_count)) < 0) {
1496				reiserfs_warning(s, "journal-945",
1497						 "saved_bh->b_count < 0");
1498			}
1499		}
1500	}
	/*
	 * Second pass: wait for every buffer tagged BLOCK_NEEDS_FLUSH in the
	 * first pass and record -EIO in err if one is not uptodate afterwards.
	 */
1501	if (count > 0) {
1502		cn = jl->j_realblock;
1503		while (cn) {
1504			if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
1505				if (!cn->bh) {
1506					reiserfs_panic(s, "journal-1011",
1507						       "cn->bh is NULL");
1508				}
1509
				/* the write lock is dropped while waiting */
1510				reiserfs_write_unlock(s);
1511				wait_on_buffer(cn->bh);
1512				reiserfs_write_lock(s);
1513
				/* cn->bh is checked again after the lock was dropped */
1514				if (!cn->bh) {
1515					reiserfs_panic(s, "journal-1012",
1516						       "cn->bh is NULL");
1517				}
1518				if (unlikely(!buffer_uptodate(cn->bh))) {
1519#ifdef CONFIG_REISERFS_CHECK
1520					reiserfs_warning(s, "journal-949",
1521							 "buffer write failed");
1522#endif
1523					err = -EIO;
1524				}
1525				/* note, we must clear the JDirty_wait bit after the up to date
1526				 ** check, otherwise we race against our flushpage routine
1527				 */
1528				BUG_ON(!test_clear_buffer_journal_dirty
1529				       (cn->bh));
1530
1531				/* drop one ref for us */
1532				put_bh(cn->bh);
1533				/* drop one ref for journal_mark_dirty */
1534				release_buffer_page(cn->bh);
1535			}
1536			cn = cn->next;
1537		}
1538	}
1539
1540	if (err)
1541		reiserfs_abort(s, -EIO,
1542			       "Write error while pushing transaction to disk in %s",
1543			       __func__);
1544      flush_older_and_return:
1545
1546	/* before we can update the journal header block, we _must_ flush all
1547	 ** real blocks from all older transactions to disk.  This is because
1548	 ** once the header block is updated, this transaction will not be
1549	 ** replayed after a crash
1550	 */
1551	if (flushall) {
1552		flush_older_journal_lists(s, jl);
1553	}
1554
	/*
	 * Whatever err held so far is replaced by journal->j_errno here; the
	 * earlier value was only used to decide whether to call
	 * reiserfs_abort() above.
	 */
1555	err = journal->j_errno;
1556	/* before we can remove everything from the hash tables for this
1557	 ** transaction, we must make sure it can never be replayed
1558	 **
1559	 ** since we are only called from do_journal_end, we know for sure there
1560	 ** are no allocations going on while we are flushing journal lists.  So,
1561	 ** we only need to update the journal header block for the last list
1562	 ** being flushed
1563	 */
1564	if (!err && flushall) {
		/* offset passed: (j_start + j_len + 2) modulo the on-disk journal size */
1565		err =
1566		    update_journal_header_block(s,
1567						(jl->j_start + jl->j_len +
1568						 2) % SB_ONDISK_JOURNAL_SIZE(s),
1569						jl->j_trans_id);
1570		if (err)
1571			reiserfs_abort(s, -EIO,
1572				       "Write error while updating journal header in %s",
1573				       __func__);
1574	}
	/* free the cnodes, then unlink jl from the journal and working lists */
1575	remove_all_from_journal_list(s, jl, 0);
1576	list_del_init(&jl->j_list);
1577	journal->j_num_lists--;
1578	del_from_work_list(s, jl);
1579
	/* warn when lists are not flushed in consecutive trans id order */
1580	if (journal->j_last_flush_id != 0 &&
1581	    (jl->j_trans_id - journal->j_last_flush_id) != 1) {
1582		reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
1583				 journal->j_last_flush_id, jl->j_trans_id);
1584	}
1585	journal->j_last_flush_id = jl->j_trans_id;
1586
1587	/* not strictly required since we are freeing the list, but it should
1588	 * help find code using dead lists later on
1589	 */
1590	jl->j_len = 0;
1591	atomic_set(&(jl->j_nonzerolen), 0);
1592	jl->j_start = 0;
1593	jl->j_realblock = NULL;
1594	jl->j_commit_bh = NULL;
1595	jl->j_trans_id = 0;
1596	jl->j_state = 0;
1597	put_journal_list(s, jl);
	/* only release j_flush_mutex if this call acquired it */
1598	if (flushall)
1599		mutex_unlock(&journal->j_flush_mutex);
1600	put_fs_excl();
1601	return err;
1602}
1603
1604static int test_transaction(struct super_block *s,
1605                            struct reiserfs_journal_list *jl)
1606{
1607	struct reiserfs_journal_cnode *cn;
1608
1609	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0)
1610		return 1;
1611
1612	cn = jl->j_realblock;
1613	while (cn) {
1614		/* if the blocknr == 0, this has been cleared from the hash,
1615		 ** skip it
1616		 */
1617		if (cn->blocknr == 0) {
1618			goto next;
1619		}
1620		if (cn->bh && !newer_jl_done(cn))
1621			return 0;
1622	      next:
1623		cn = cn->next;
1624		cond_resched();
1625	}
1626	return 0;
1627}
1628
1629static int write_one_transaction(struct super_block *s,
1630				 struct reiserfs_journal_list *jl,
1631				 struct buffer_chunk *chunk)
1632{
1633	struct reiserfs_journal_cnode *cn;
1634	int ret = 0;
1635
1636	jl->j_state |= LIST_TOUCHED;
1637	del_from_work_list(s, jl);
1638	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
1639		return 0;
1640	}
1641
1642	cn = jl->j_realblock;
1643	while (cn) {
1644		/* if the blocknr == 0, this has been cleared from the hash,
1645		 ** skip it
1646		 */
1647		if (cn->blocknr == 0) {
1648			goto next;
1649		}
1650		if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
1651			struct buffer_head *tmp_bh;
1652			/* we can race against journal_mark_freed when we try
1653			 * to lock_buffer(cn->bh), so we have to inc the buffer
1654			 * count, and recheck things after locking
1655			 */
1656			tmp_bh = cn->bh;
1657			get_bh(tmp_bh);
1658			lock_buffer(tmp_bh);
1659			if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
1660				if (!buffer_journal_dirty(tmp_bh) ||
1661				    buffer_journal_prepared(tmp_bh))
1662					BUG();
1663				add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
1664				ret++;
1665			} else {
1666				/* note, cn->bh might be null now */
1667				unlock_buffer(tmp_bh);
1668			}
1669			put_bh(tmp_bh);
1670		}
1671	      next:
1672		cn = cn->next;
1673		cond_resched();
1674	}
1675	return ret;
1676}
1677
1678/* used by flush_commit_list */
1679static int dirty_one_transaction(struct super_block *s,
1680				 struct reiserfs_journal_list *jl)
1681{
1682	struct reiserfs_journal_cnode *cn;
1683	struct reiserfs_journal_list *pjl;
1684	int ret = 0;
1685
1686	jl->j_state |= LIST_DIRTY;
1687	cn = jl->j_realblock;
1688	while (cn) {
1689		/* look for a more recent transaction that logged this
1690		 ** buffer.  Only the most recent transaction with a buffer in
1691		 ** it is allowed to send that buffer to disk
1692		 */
1693		pjl = find_newer_jl_for_cn(cn);
1694		if (!pjl && cn->blocknr && cn->bh
1695		    && buffer_journal_dirty(cn->bh)) {
1696			BUG_ON(!can_dirty(cn));
1697			/* if the buffer is prepared, it will either be logged
1698			 * or restored.  If restored, we need to make sure
1699			 * it actually gets marked dirty
1700			 */
1701			clear_buffer_journal_new(cn->bh);
1702			if (buffer_journal_prepared(cn->bh)) {
1703				set_buffer_journal_restore_dirty(cn->bh);
1704			} else {
1705				set_buffer_journal_test(cn->bh);
1706				mark_buffer_dirty(cn->bh);
1707			}
1708		}
1709		cn = cn->next;
1710	}
1711	return ret;
1712}
1713
/*
 * Starting at jl, call write_one_transaction() on successive journal
 * lists while holding j_flush_mutex, then write out whatever is left in
 * the local buffer chunk.
 *
 * When num_trans is non-zero the loop is bounded by the number of lists
 * processed (num_trans); otherwise it is bounded by the number of buffers
 * queued (num_blocks).
 *
 * next_jl and next_trans_id are not used anywhere in this body.
 *
 * Returns the value of the last write_one_transaction() call, or 0 when
 * none was made.
 */
1714static int kupdate_transactions(struct super_block *s,
1715				struct reiserfs_journal_list *jl,
1716				struct reiserfs_journal_list **next_jl,
1717				unsigned int *next_trans_id,
1718				int num_blocks, int num_trans)
1719{
1720	int ret = 0;
1721	int written = 0;
1722	int transactions_flushed = 0;
1723	unsigned int orig_trans_id = jl->j_trans_id;
1724	struct buffer_chunk chunk;
1725	struct list_head *entry;
1726	struct reiserfs_journal *journal = SB_JOURNAL(s);
1727	chunk.nr = 0;
1728
1729	reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
	/* jl may have gone away before the mutex was acquired */
1730	if (!journal_list_still_alive(s, orig_trans_id)) {
1731		goto done;
1732	}
1733
1734	/* we've got j_flush_mutex held, nobody is going to delete any
1735	 * of these lists out from underneath us
1736	 */
1737	while ((num_trans && transactions_flushed < num_trans) ||
1738	       (!num_trans && written < num_blocks)) {
1739
		/*
		 * stop at the first list that is empty, already touched,
		 * not fully committed, or not marked LIST_DIRTY
		 */
1740		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
1741		    atomic_read(&jl->j_commit_left)
1742		    || !(jl->j_state & LIST_DIRTY)) {
1743			del_from_work_list(s, jl);
1744			break;
1745		}
1746		ret = write_one_transaction(s, jl, &chunk);
1747
		/* note: this path skips the write_chunk() call below */
1748		if (ret < 0)
1749			goto done;
1750		transactions_flushed++;
1751		written += ret;
1752		entry = jl->j_list.next;
1753
1754		/* did we wrap? */
1755		if (entry == &journal->j_journal_list) {
1756			break;
1757		}
1758		jl = JOURNAL_LIST_ENTRY(entry);
1759
1760		/* don't bother with older transactions */
1761		if (jl->j_trans_id <= orig_trans_id)
1762			break;
1763	}
	/* push out whatever is still queued in the chunk */
1764	if (chunk.nr) {
1765		write_chunk(&chunk);
1766	}
1767
1768      done:
1769	mutex_unlock(&journal->j_flush_mutex);
1770	return ret;
1771}
1772
1773/* for o_sync and fsync heavy applications, they tend to use
1774** all the journa list slots with tiny transactions.  These
1775** trigger lots and lots of calls to update the header block, which
1776** adds seeks and slows things down.
1777**
1778** This function tries to clear out a large chunk of the journal lists
1779** at once, which makes everything faster since only the newest journal
1780** list updates the header block
1781*/
1782static int flush_used_journal_lists(struct super_block *s,
1783				    struct reiserfs_journal_list *jl)
1784{
1785	unsigned long len = 0;
1786	unsigned long cur_len;
1787	int ret;
1788	int i;
1789	int limit = 256;
1790	struct reiserfs_journal_list *tjl;
1791	struct reiserfs_journal_list *flush_jl;
1792	unsigned int trans_id;
1793	struct reiserfs_journal *journal = SB_JOURNAL(s);
1794
1795	flush_jl = tjl = jl;
1796
1797	/* in data logging mode, try harder to flush a lot of blocks */
1798	if (reiserfs_data_log(s))
1799		limit = 1024;
1800	/* flush for 256 transactions or limit blocks, whichever comes first */
1801	for (i = 0; i < 256 && len < limit; i++) {
1802		if (atomic_read(&tjl->j_commit_left) ||
1803		    tjl->j_trans_id < jl->j_trans_id) {
1804			break;
1805		}
1806		cur_len = atomic_read(&tjl->j_nonzerolen);
1807		if (cur_len > 0) {
1808			tjl->j_state &= ~LIST_TOUCHED;
1809		}
1810		len += cur_len;
1811		flush_jl = tjl;
1812		if (tjl->j_list.next == &journal->j_journal_list)
1813			break;
1814		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
1815	}
1816	/* try to find a group of blocks we can flush across all the
1817	 ** transactions, but only bother if we've actually spanned
1818	 ** across multiple lists
1819	 */
1820	if (flush_jl != jl) {
1821		ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
1822	}
1823	flush_journal_list(s, flush_jl, 1);
1824	return 0;
1825}
1826
1827/*
1828** removes any nodes in table with name block and dev as bh.
1829** only touchs the hnext and hprev pointers.
1830*/
1831void remove_journal_hash(struct super_block *sb,
1832			 struct reiserfs_journal_cnode **table,
1833			 struct reiserfs_journal_list *jl,
1834			 unsigned long block, int remove_freed)
1835{
1836	struct reiserfs_journal_cnode *cur;
1837	struct reiserfs_journal_cnode **head;
1838
1839	head = &(journal_hash(table, sb, block));
1840	if (!head) {
1841		return;
1842	}
1843	cur = *head;
1844	while (cur) {
1845		if (cur->blocknr == block && cur->sb == sb
1846		    && (jl == NULL || jl == cur->jlist)
1847		    && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
1848			if (cur->hnext) {
1849				cur->hnext->hprev = cur->hprev;
1850			}
1851			if (cur->hprev) {
1852				cur->hprev->hnext = cur->hnext;
1853			} else {
1854				*head = cur->hnext;
1855			}
1856			cur->blocknr = 0;
1857			cur->sb = NULL;
1858			cur->state = 0;
1859			if (cur->bh && cur->jlist)	/* anybody who clears the cur->bh will also dec the nonzerolen */
1860				atomic_dec(&(cur->jlist->j_nonzerolen));
1861			cur->bh = NULL;
1862			cur->jlist = NULL;
1863		}
1864		cur = cur->hnext;
1865	}
1866}
1867
1868static void free_journal_ram(struct super_block *sb)
1869{
1870	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1871	kfree(journal->j_current_jl);
1872	journal->j_num_lists--;
1873
1874	vfree(journal->j_cnode_free_orig);
1875	free_list_bitmaps(sb, journal->j_list_bitmap);
1876	free_bitmap_nodes(sb);	/* must be after free_list_bitmaps */
1877	if (journal->j_header_bh) {
1878		brelse(journal->j_header_bh);
1879	}
1880	/* j_header_bh is on the journal dev, make sure not to release the journal
1881	 * dev until we brelse j_header_bh
1882	 */
1883	release_journal_dev(sb, journal);
1884	vfree(journal);
1885}
1886
1887/*
1888** call on unmount.  Only set error to 1 if you haven't made your way out
1889** of read_super() yet.  Any other caller must keep error at 0.
1890*/
1891static int do_journal_release(struct reiserfs_transaction_handle *th,
1892			      struct super_block *sb, int error)
1893{
1894	struct reiserfs_transaction_handle myth;
1895	int flushed = 0;
1896	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1897
1898	/* we only want to flush out transactions if we were called with error == 0
1899	 */
1900	if (!error && !(sb->s_flags & MS_RDONLY)) {
1901		/* end the current trans */
1902		BUG_ON(!th->t_trans_id);
1903		do_journal_end(th, sb, 10, FLUSH_ALL);
1904
1905		/* make sure something gets logged to force our way into the flush code */
1906		if (!journal_join(&myth, sb, 1)) {
1907			reiserfs_prepare_for_journal(sb,
1908						     SB_BUFFER_WITH_SB(sb),
1909						     1);
1910			journal_mark_dirty(&myth, sb,
1911					   SB_BUFFER_WITH_SB(sb));
1912			do_journal_end(&myth, sb, 1, FLUSH_ALL);
1913			flushed = 1;
1914		}
1915	}
1916
1917	/* this also catches errors during the do_journal_end above */
1918	if (!error && reiserfs_is_journal_aborted(journal)) {
1919		memset(&myth, 0, sizeof(myth));
1920		if (!journal_join_abort(&myth, sb, 1)) {
1921			reiserfs_prepare_for_journal(sb,
1922						     SB_BUFFER_WITH_SB(sb),
1923						     1);
1924			journal_mark_dirty(&myth, sb,
1925					   SB_BUFFER_WITH_SB(sb));
1926			do_journal_end(&myth, sb, 1, FLUSH_ALL);
1927		}
1928	}
1929
1930	reiserfs_mounted_fs_count--;
1931	/* wait for all commits to finish */
1932	cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
1933
1934	/*
1935	 * We must release the write lock here because
1936	 * the workqueue job (flush_async_commit) needs this lock
1937	 */
1938	reiserfs_write_unlock(sb);
1939	flush_workqueue(commit_wq);
1940
1941	if (!reiserfs_mounted_fs_count) {
1942		destroy_workqueue(commit_wq);
1943		commit_wq = NULL;
1944	}
1945
1946	free_journal_ram(sb);
1947
1948	reiserfs_write_lock(sb);
1949
1950	return 0;
1951}
1952
1953/*
1954** call on unmount.  flush all journal trans, release all alloc'd ram
1955*/
/*
 * Thin wrapper: calls do_journal_release() with error == 0 and returns
 * its result.
 */
1956int journal_release(struct reiserfs_transaction_handle *th,
1957		    struct super_block *sb)
1958{
1959	return do_journal_release(th, sb, 0);
1960}
1961
1962/*
1963** only call from an error condition inside reiserfs_read_super!
1964*/
/*
 * Thin wrapper: calls do_journal_release() with error == 1, which skips
 * the transaction flushing done for error == 0, and returns its result.
 */
1965int journal_release_error(struct reiserfs_transaction_handle *th,
1966			  struct super_block *sb)
1967{
1968	return do_journal_release(th, sb, 1);
1969}
1970
1971/* compares description block with commit block.  returns 1 if they differ, 0 if they are the same */
1972static int journal_compare_desc_commit(struct super_block *sb,
1973				       struct reiserfs_journal_desc *desc,
1974				       struct reiserfs_journal_commit *commit)
1975{
1976	if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
1977	    get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
1978	    get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
1979	    get_commit_trans_len(commit) <= 0) {
1980		return 1;
1981	}
1982	return 0;
1983}
1984
1985/* returns 0 if it did not find a description block
1986** returns -1 if it found a corrupt commit block
1987** returns 1 if both desc and commit were valid
1988*/
1989static int journal_transaction_is_valid(struct super_block *sb,
1990					struct buffer_head *d_bh,
1991					unsigned int *oldest_invalid_trans_id,
1992					unsigned long *newest_mount_id)
1993{
1994	struct reiserfs_journal_desc *desc;
1995	struct reiserfs_journal_commit *commit;
1996	struct buffer_head *c_bh;
1997	unsigned long offset;
1998
1999	if (!d_bh)
2000		return 0;
2001
2002	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
2003	if (get_desc_trans_len(desc) > 0
2004	    && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
2005		if (oldest_invalid_trans_id && *oldest_invalid_trans_id
2006		    && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
2007			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2008				       "journal-986: transaction "
2009				       "is valid returning because trans_id %d is greater than "
2010				       "oldest_invalid %lu",
2011				       get_desc_trans_id(desc),
2012				       *oldest_invalid_trans_id);
2013			return 0;
2014		}
2015		if (newest_mount_id
2016		    && *newest_mount_id > get_desc_mount_id(desc)) {
2017			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2018				       "journal-1087: transaction "
2019				       "is valid returning because mount_id %d is less than "
2020				       "newest_mount_id %lu",
2021				       get_desc_mount_id(desc),
2022				       *newest_mount_id);
2023			return -1;
2024		}
2025		if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
2026			reiserfs_warning(sb, "journal-2018",
2027					 "Bad transaction length %d "
2028					 "encountered, ignoring transaction",
2029					 get_desc_trans_len(desc));
2030			return -1;
2031		}
2032		offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2033
2034		/* ok, we have a journal description block, lets see if the transaction was valid */
2035		c_bh =
2036		    journal_bread(sb,
2037				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2038				  ((offset + get_desc_trans_len(desc) +
2039				    1) % SB_ONDISK_JOURNAL_SIZE(sb)));
2040		if (!c_bh)
2041			return 0;
2042		commit = (struct reiserfs_journal_commit *)c_bh->b_data;
2043		if (journal_compare_desc_commit(sb, desc, commit)) {
2044			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2045				       "journal_transaction_is_valid, commit offset %ld had bad "
2046				       "time %d or length %d",
2047				       c_bh->b_blocknr -
2048				       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2049				       get_commit_trans_id(commit),
2050				       get_commit_trans_len(commit));
2051			brelse(c_bh);
2052			if (oldest_invalid_trans_id) {
2053				*oldest_invalid_trans_id =
2054				    get_desc_trans_id(desc);
2055				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2056					       "journal-1004: "
2057					       "transaction_is_valid setting oldest invalid trans_id "
2058					       "to %d",
2059					       get_desc_trans_id(desc));
2060			}
2061			return -1;
2062		}
2063		brelse(c_bh);
2064		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2065			       "journal-1006: found valid "
2066			       "transaction start offset %llu, len %d id %d",
2067			       d_bh->b_blocknr -
2068			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2069			       get_desc_trans_len(desc),
2070			       get_desc_trans_id(desc));
2071		return 1;
2072	} else {
2073		return 0;
2074	}
2075}
2076
/* Call brelse() on heads[0] .. heads[num - 1], in that order. */
static void brelse_array(struct buffer_head **heads, int num)
{
	int idx = 0;

	while (idx < num) {
		brelse(heads[idx]);
		idx++;
	}
}
2084
2085/*
2086** given the start, and values for the oldest acceptable transactions,
2087** this either reads in a replays a transaction, or returns because the transaction
2088** is invalid, or too old.
2089*/
2090static int journal_read_transaction(struct super_block *sb,
2091				    unsigned long cur_dblock,
2092				    unsigned long oldest_start,
2093				    unsigned int oldest_trans_id,
2094				    unsigned long newest_mount_id)
2095{
2096	struct reiserfs_journal *journal = SB_JOURNAL(sb);
2097	struct reiserfs_journal_desc *desc;
2098	struct reiserfs_journal_commit *commit;
2099	unsigned int trans_id = 0;
2100	struct buffer_head *c_bh;
2101	struct buffer_head *d_bh;
2102	struct buffer_head **log_blocks = NULL;
2103	struct buffer_head **real_blocks = NULL;
2104	unsigned int trans_offset;
2105	int i;
2106	int trans_half;
2107
2108	d_bh = journal_bread(sb, cur_dblock);
2109	if (!d_bh)
2110		return 1;
2111	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
2112	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2113	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
2114		       "journal_read_transaction, offset %llu, len %d mount_id %d",
2115		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2116		       get_desc_trans_len(desc), get_desc_mount_id(desc));
2117	if (get_desc_trans_id(desc) < oldest_trans_id) {
2118		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
2119			       "journal_read_trans skipping because %lu is too old",
2120			       cur_dblock -
2121			       SB_ONDISK_JOURNAL_1st_BLOCK(sb));
2122		brelse(d_bh);
2123		return 1;
2124	}
2125	if (get_desc_mount_id(desc) != newest_mount_id) {
2126		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
2127			       "journal_read_trans skipping because %d is != "
2128			       "newest_mount_id %lu", get_desc_mount_id(desc),
2129			       newest_mount_id);
2130		brelse(d_bh);
2131		return 1;
2132	}
2133	c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2134			     ((trans_offset + get_desc_trans_len(desc) + 1) %
2135			      SB_ONDISK_JOURNAL_SIZE(sb)));
2136	if (!c_bh) {
2137		brelse(d_bh);
2138		return 1;
2139	}
2140	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
2141	if (journal_compare_desc_commit(sb, desc, commit)) {
2142		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2143			       "journal_read_transaction, "
2144			       "commit offset %llu had bad time %d or length %d",
2145			       c_bh->b_blocknr -
2146			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2147			       get_commit_trans_id(commit),
2148			       get_commit_trans_len(commit));
2149		brelse(c_bh);
2150		brelse(d_bh);
2151		return 1;
2152	}
2153
2154	if (bdev_read_only(sb->s_bdev)) {
2155		reiserfs_warning(sb, "clm-2076",
2156				 "device is readonly, unable to replay log");
2157		brelse(c_bh);
2158		brelse(d_bh);
2159		return -EROFS;
2160	}
2161
2162	trans_id = get_desc_trans_id(desc);
2163	/* now we know we've got a good transaction, and it was inside the valid time ranges */
2164	log_blocks = kmalloc(get_desc_trans_len(desc) *
2165			     sizeof(struct buffer_head *), GFP_NOFS);
2166	real_blocks = kmalloc(get_desc_trans_len(desc) *
2167			      sizeof(struct buffer_head *), GFP_NOFS);
2168	if (!log_blocks || !real_blocks) {
2169		brelse(c_bh);
2170		brelse(d_bh);
2171		kfree(log_blocks);
2172		kfree(real_blocks);
2173		reiserfs_warning(sb, "journal-1169",
2174				 "kmalloc failed, unable to mount FS");
2175		return -1;
2176	}
2177	/* get all the buffer heads */
2178	trans_half = journal_trans_half(sb->s_blocksize);
2179	for (i = 0; i < get_desc_trans_len(desc); i++) {
2180		log_blocks[i] =
2181		    journal_getblk(sb,
2182				   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2183				   (trans_offset + 1 +
2184				    i) % SB_ONDISK_JOURNAL_SIZE(sb));
2185		if (i < trans_half) {
2186			real_blocks[i] =
2187			    sb_getblk(sb,
2188				      le32_to_cpu(desc->j_realblock[i]));
2189		} else {
2190			real_blocks[i] =
2191			    sb_getblk(sb,
2192				      le32_to_cpu(commit->
2193						  j_realblock[i - trans_half]));
2194		}
2195		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
2196			reiserfs_warning(sb, "journal-1207",
2197					 "REPLAY FAILURE fsck required! "
2198					 "Block to replay is outside of "
2199					 "filesystem");
2200			goto abort_replay;
2201		}
2202		/* make sure we don't try to replay onto log or reserved area */
2203		if (is_block_in_log_or_reserved_area
2204		    (sb, real_blocks[i]->b_blocknr)) {
2205			reiserfs_warning(sb, "journal-1204",
2206					 "REPLAY FAILURE fsck required! "
2207					 "Trying to replay onto a log block");
2208		      abort_replay:
2209			brelse_array(log_blocks, i);
2210			brelse_array(real_blocks, i);
2211			brelse(c_bh);
2212			brelse(d_bh);
2213			kfree(log_blocks);
2214			kfree(real_blocks);
2215			return -1;
2216		}
2217	}
2218	/* read in the log blocks, memcpy to the corresponding real block */
2219	ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
2220	for (i = 0; i < get_desc_trans_len(desc); i++) {
2221
2222		reiserfs_write_unlock(sb);
2223		wait_on_buffer(log_blocks[i]);
2224		reiserfs_write_lock(sb);
2225
2226		if (!buffer_uptodate(log_blocks[i])) {
2227			reiserfs_warning(sb, "journal-1212",
2228					 "REPLAY FAILURE fsck required! "
2229					 "buffer write failed");
2230			brelse_array(log_blocks + i,
2231				     get_desc_trans_len(desc) - i);
2232			brelse_array(real_blocks, get_desc_trans_len(desc));
2233			brelse(c_bh);
2234			brelse(d_bh);
2235			kfree(log_blocks);
2236			kfree(real_blocks);
2237			return -1;
2238		}
2239		memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
2240		       real_blocks[i]->b_size);
2241		set_buffer_uptodate(real_blocks[i]);
2242		brelse(log_blocks[i]);
2243	}
2244	/* flush out the real blocks */
2245	for (i = 0; i < get_desc_trans_len(desc); i++) {
2246		set_buffer_dirty(real_blocks[i]);
2247		write_dirty_buffer(real_blocks[i], WRITE);
2248	}
2249	for (i = 0; i < get_desc_trans_len(desc); i++) {
2250		wait_on_buffer(real_blocks[i]);
2251		if (!buffer_uptodate(real_blocks[i])) {
2252			reiserfs_warning(sb, "journal-1226",
2253					 "REPLAY FAILURE, fsck required! "
2254					 "buffer write failed");
2255			brelse_array(real_blocks + i,
2256				     get_desc_trans_len(desc) - i);
2257			brelse(c_bh);
2258			brelse(d_bh);
2259			kfree(log_blocks);
2260			kfree(real_blocks);
2261			return -1;
2262		}
2263		brelse(real_blocks[i]);
2264	}
2265	cur_dblock =
2266	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2267	    ((trans_offset + get_desc_trans_len(desc) +
2268	      2) % SB_ONDISK_JOURNAL_SIZE(sb));
2269	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2270		       "journal-1095: setting journal " "start to offset %ld",
2271		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
2272
2273	/* init starting values for the first transaction, in case this is the last transaction to be replayed. */
2274	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2275	journal->j_last_flush_trans_id = trans_id;
2276	journal->j_trans_id = trans_id + 1;
2277	/* check for trans_id overflow */
2278	if (journal->j_trans_id == 0)
2279		journal->j_trans_id = 10;
2280	brelse(c_bh);
2281	brelse(d_bh);
2282	kfree(log_blocks);
2283	kfree(real_blocks);
2284	return 0;
2285}
2286
2287/* This function reads blocks starting from block and to max_block of bufsize
2288   size (but no more than BUFNR blocks at a time). This proved to improve
2289   mounting speed on self-rebuilding raid5 arrays at least.
2290   Right now it is only used from journal code. But later we might use it
2291   from other places.
2292   Note: Do not use journal_getblk/sb_getblk functions here! */
2293static struct buffer_head *reiserfs_breada(struct block_device *dev,
2294					   b_blocknr_t block, int bufsize,
2295					   b_blocknr_t max_block)
2296{
2297	struct buffer_head *bhlist[BUFNR];
2298	unsigned int blocks = BUFNR;
2299	struct buffer_head *bh;
2300	int i, j;
2301
2302	bh = __getblk(dev, block, bufsize);
2303	if (buffer_uptodate(bh))
2304		return (bh);
2305
2306	if (block + BUFNR > max_block) {
2307		blocks = max_block - block;
2308	}
2309	bhlist[0] = bh;
2310	j = 1;
2311	for (i = 1; i < blocks; i++) {
2312		bh = __getblk(dev, block + i, bufsize);
2313		if (buffer_uptodate(bh)) {
2314			brelse(bh);
2315			break;
2316		} else
2317			bhlist[j++] = bh;
2318	}
2319	ll_rw_block(READ, j, bhlist);
2320	for (i = 1; i < j; i++)
2321		brelse(bhlist[i]);
2322	bh = bhlist[0];
2323	wait_on_buffer(bh);
2324	if (buffer_uptodate(bh))
2325		return bh;
2326	brelse(bh);
2327	return NULL;
2328}
2329
2330/*
2331** read and replay the log
2332** on a clean unmount, the journal header's next unflushed pointer will be to an invalid
2333** transaction.  This tests that before finding all the transactions in the log, which makes normal mount times fast.
2334**
2335** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid.
2336**
2337** On exit, it sets things up so the first transaction will work correctly.
2338*/
2339static int journal_read(struct super_block *sb)
2340{
2341	struct reiserfs_journal *journal = SB_JOURNAL(sb);
2342	struct reiserfs_journal_desc *desc;
2343	unsigned int oldest_trans_id = 0;
2344	unsigned int oldest_invalid_trans_id = 0;
2345	time_t start;
2346	unsigned long oldest_start = 0;
2347	unsigned long cur_dblock = 0;
2348	unsigned long newest_mount_id = 9;
2349	struct buffer_head *d_bh;
2350	struct reiserfs_journal_header *jh;
2351	int valid_journal_header = 0;
2352	int replay_count = 0;
2353	int continue_replay = 1;
2354	int ret;
2355	char b[BDEVNAME_SIZE];
2356
2357	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2358	reiserfs_info(sb, "checking transaction log (%s)\n",
2359		      bdevname(journal->j_dev_bd, b));
2360	start = get_seconds();
2361
2362	/* step 1, read in the journal header block.  Check the transaction it says
2363	 ** is the first unflushed, and if that transaction is not valid,
2364	 ** replay is done
2365	 */
2366	journal->j_header_bh = journal_bread(sb,
2367					     SB_ONDISK_JOURNAL_1st_BLOCK(sb)
2368					     + SB_ONDISK_JOURNAL_SIZE(sb));
2369	if (!journal->j_header_bh) {
2370		return 1;
2371	}
2372	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
2373	if (le32_to_cpu(jh->j_first_unflushed_offset) <
2374	    SB_ONDISK_JOURNAL_SIZE(sb)
2375	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
2376		oldest_start =
2377		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2378		    le32_to_cpu(jh->j_first_unflushed_offset);
2379		oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
2380		newest_mount_id = le32_to_cpu(jh->j_mount_id);
2381		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2382			       "journal-1153: found in "
2383			       "header: first_unflushed_offset %d, last_flushed_trans_id "
2384			       "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
2385			       le32_to_cpu(jh->j_last_flush_trans_id));
2386		valid_journal_header = 1;
2387
2388		/* now, we try to read the first unflushed offset.  If it is not valid,
2389		 ** there is nothing more we can do, and it makes no sense to read
2390		 ** through the whole log.
2391		 */
2392		d_bh =
2393		    journal_bread(sb,
2394				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2395				  le32_to_cpu(jh->j_first_unflushed_offset));
2396		ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
2397		if (!ret) {
2398			continue_replay = 0;
2399		}
2400		brelse(d_bh);
2401		goto start_log_replay;
2402	}
2403
2404	/* ok, there are transactions that need to be replayed.  start with the first log block, find
2405	 ** all the valid transactions, and pick out the oldest.
2406	 */
2407	while (continue_replay
2408	       && cur_dblock <
2409	       (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2410		SB_ONDISK_JOURNAL_SIZE(sb))) {
2411		/* Note that it is required for blocksize of primary fs device and journal
2412		   device to be the same */
2413		d_bh =
2414		    reiserfs_breada(journal->j_dev_bd, cur_dblock,
2415				    sb->s_blocksize,
2416				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2417				    SB_ONDISK_JOURNAL_SIZE(sb));
2418		ret =
2419		    journal_transaction_is_valid(sb, d_bh,
2420						 &oldest_invalid_trans_id,
2421						 &newest_mount_id);
2422		if (ret == 1) {
2423			desc = (struct reiserfs_journal_desc *)d_bh->b_data;
2424			if (oldest_start == 0) {	/* init all oldest_ values */
2425				oldest_trans_id = get_desc_trans_id(desc);
2426				oldest_start = d_bh->b_blocknr;
2427				newest_mount_id = get_desc_mount_id(desc);
2428				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2429					       "journal-1179: Setting "
2430					       "oldest_start to offset %llu, trans_id %lu",
2431					       oldest_start -
2432					       SB_ONDISK_JOURNAL_1st_BLOCK
2433					       (sb), oldest_trans_id);
2434			} else if (oldest_trans_id > get_desc_trans_id(desc)) {
2435				/* one we just read was older */
2436				oldest_trans_id = get_desc_trans_id(desc);
2437				oldest_start = d_bh->b_blocknr;
2438				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2439					       "journal-1180: Resetting "
2440					       "oldest_start to offset %lu, trans_id %lu",
2441					       oldest_start -
2442					       SB_ONDISK_JOURNAL_1st_BLOCK
2443					       (sb), oldest_trans_id);
2444			}
2445			if (newest_mount_id < get_desc_mount_id(desc)) {
2446				newest_mount_id = get_desc_mount_id(desc);
2447				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2448					       "journal-1299: Setting "
2449					       "newest_mount_id to %d",
2450					       get_desc_mount_id(desc));
2451			}
2452			cur_dblock += get_desc_trans_len(desc) + 2;
2453		} else {
2454			cur_dblock++;
2455		}
2456		brelse(d_bh);
2457	}
2458
2459      start_log_replay:
2460	cur_dblock = oldest_start;
2461	if (oldest_trans_id) {
2462		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2463			       "journal-1206: Starting replay "
2464			       "from offset %llu, trans_id %lu",
2465			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2466			       oldest_trans_id);
2467
2468	}
2469	replay_count = 0;
2470	while (continue_replay && oldest_trans_id > 0) {
2471		ret =
2472		    journal_read_transaction(sb, cur_dblock, oldest_start,
2473					     oldest_trans_id, newest_mount_id);
2474		if (ret < 0) {
2475			return ret;
2476		} else if (ret != 0) {
2477			break;
2478		}
2479		cur_dblock =
2480		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
2481		replay_count++;
2482		if (cur_dblock == oldest_start)
2483			break;
2484	}
2485
2486	if (oldest_trans_id == 0) {
2487		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2488			       "journal-1225: No valid " "transactions found");
2489	}
2490	/* j_start does not get set correctly if we don't replay any transactions.
2491	 ** if we had a valid journal_header, set j_start to the first unflushed transaction value,
2492	 ** copy the trans_id from the header
2493	 */
2494	if (valid_journal_header && replay_count == 0) {
2495		journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
2496		journal->j_trans_id =
2497		    le32_to_cpu(jh->j_last_flush_trans_id) + 1;
2498		/* check for trans_id overflow */
2499		if (journal->j_trans_id == 0)
2500			journal->j_trans_id = 10;
2501		journal->j_last_flush_trans_id =
2502		    le32_to_cpu(jh->j_last_flush_trans_id);
2503		journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
2504	} else {
2505		journal->j_mount_id = newest_mount_id + 1;
2506	}
2507	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
2508		       "newest_mount_id to %lu", journal->j_mount_id);
2509	journal->j_first_unflushed_offset = journal->j_start;
2510	if (replay_count > 0) {
2511		reiserfs_info(sb,
2512			      "replayed %d transactions in %lu seconds\n",
2513			      replay_count, get_seconds() - start);
2514	}
2515	if (!bdev_read_only(sb->s_bdev) &&
2516	    _update_journal_header_block(sb, journal->j_start,
2517					 journal->j_last_flush_trans_id)) {
2518		/* replay failed, caller must call free_journal_ram and abort
2519		 ** the mount
2520		 */
2521		return -1;
2522	}
2523	return 0;
2524}
2525
2526static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
2527{
2528	struct reiserfs_journal_list *jl;
2529	jl = kzalloc(sizeof(struct reiserfs_journal_list),
2530		     GFP_NOFS | __GFP_NOFAIL);
2531	INIT_LIST_HEAD(&jl->j_list);
2532	INIT_LIST_HEAD(&jl->j_working_list);
2533	INIT_LIST_HEAD(&jl->j_tail_bh_list);
2534	INIT_LIST_HEAD(&jl->j_bh_list);
2535	mutex_init(&jl->j_commit_mutex);
2536	SB_JOURNAL(s)->j_num_lists++;
2537	get_journal_list(jl);
2538	return jl;
2539}
2540
2541static void journal_list_init(struct super_block *sb)
2542{
2543	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
2544}
2545
2546static int release_journal_dev(struct super_block *super,
2547			       struct reiserfs_journal *journal)
2548{
2549	int result;
2550
2551	result = 0;
2552
2553	if (journal->j_dev_bd != NULL) {
2554		result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
2555		journal->j_dev_bd = NULL;
2556	}
2557
2558	if (result != 0) {
2559		reiserfs_warning(super, "sh-457",
2560				 "Cannot release journal device: %i", result);
2561	}
2562	return result;
2563}
2564
2565static int journal_init_dev(struct super_block *super,
2566			    struct reiserfs_journal *journal,
2567			    const char *jdev_name)
2568{
2569	int result;
2570	dev_t jdev;
2571	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
2572	char b[BDEVNAME_SIZE];
2573
2574	result = 0;
2575
2576	journal->j_dev_bd = NULL;
2577	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
2578	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
2579
2580	if (bdev_read_only(super->s_bdev))
2581		blkdev_mode = FMODE_READ;
2582
2583	/* there is no "jdev" option and journal is on separate device */
2584	if ((!jdev_name || !jdev_name[0])) {
2585		if (jdev == super->s_dev)
2586			blkdev_mode &= ~FMODE_EXCL;
2587		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
2588						      journal);
2589		journal->j_dev_mode = blkdev_mode;
2590		if (IS_ERR(journal->j_dev_bd)) {
2591			result = PTR_ERR(journal->j_dev_bd);
2592			journal->j_dev_bd = NULL;
2593			reiserfs_warning(super, "sh-458",
2594					 "cannot init journal device '%s': %i",
2595					 __bdevname(jdev, b), result);
2596			return result;
2597		} else if (jdev != super->s_dev)
2598			set_blocksize(journal->j_dev_bd, super->s_blocksize);
2599
2600		return 0;
2601	}
2602
2603	journal->j_dev_mode = blkdev_mode;
2604	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
2605	if (IS_ERR(journal->j_dev_bd)) {
2606		result = PTR_ERR(journal->j_dev_bd);
2607		journal->j_dev_bd = NULL;
2608		reiserfs_warning(super,
2609				 "journal_init_dev: Cannot open '%s': %i",
2610				 jdev_name, result);
2611		return result;
2612	}
2613
2614	set_blocksize(journal->j_dev_bd, super->s_blocksize);
2615	reiserfs_info(super,
2616		      "journal_init_dev: journal device: %s\n",
2617		      bdevname(journal->j_dev_bd, b));
2618	return 0;
2619}
2620
2621/**
2622 * When creating/tuning a file system user can assign some
2623 * journal params within boundaries which depend on the ratio
2624 * blocksize/standard_blocksize.
2625 *
 * For blocks >= standard_blocksize transaction size should
 * be not less than JOURNAL_TRANS_MIN_DEFAULT, and not more
 * than JOURNAL_TRANS_MAX_DEFAULT.
2629 *
2630 * For blocks < standard_blocksize these boundaries should be
2631 * decreased proportionally.
2632 */
2633#define REISERFS_STANDARD_BLKSIZE (4096)
2634
2635static int check_advise_trans_params(struct super_block *sb,
2636				     struct reiserfs_journal *journal)
2637{
2638        if (journal->j_trans_max) {
2639	        /* Non-default journal params.
2640		   Do sanity check for them. */
2641	        int ratio = 1;
2642		if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
2643		        ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
2644
2645		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
2646		    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
2647		    SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
2648		    JOURNAL_MIN_RATIO) {
2649			reiserfs_warning(sb, "sh-462",
2650					 "bad transaction max size (%u). "
2651					 "FSCK?", journal->j_trans_max);
2652			return 1;
2653		}
2654		if (journal->j_max_batch != (journal->j_trans_max) *
2655		        JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
2656			reiserfs_warning(sb, "sh-463",
2657					 "bad transaction max batch (%u). "
2658					 "FSCK?", journal->j_max_batch);
2659			return 1;
2660		}
2661	} else {
2662		/* Default journal params.
2663                   The file system was created by old version
2664		   of mkreiserfs, so some fields contain zeros,
2665		   and we need to advise proper values for them */
2666		if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
2667			reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
2668					 sb->s_blocksize);
2669			return 1;
2670		}
2671		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
2672		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
2673		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
2674	}
2675	return 0;
2676}
2677
2678/*
2679** must be called once on fs mount.  calls journal_read for you
2680*/
2681int journal_init(struct super_block *sb, const char *j_dev_name,
2682		 int old_format, unsigned int commit_max_age)
2683{
2684	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
2685	struct buffer_head *bhjh;
2686	struct reiserfs_super_block *rs;
2687	struct reiserfs_journal_header *jh;
2688	struct reiserfs_journal *journal;
2689	struct reiserfs_journal_list *jl;
2690	char b[BDEVNAME_SIZE];
2691	int ret;
2692
2693	/*
2694	 * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
2695	 * dependency inversion warnings.
2696	 */
2697	reiserfs_write_unlock(sb);
2698	journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
2699	if (!journal) {
2700		reiserfs_warning(sb, "journal-1256",
2701				 "unable to get memory for journal structure");
2702		reiserfs_write_lock(sb);
2703		return 1;
2704	}
2705	memset(journal, 0, sizeof(struct reiserfs_journal));
2706	INIT_LIST_HEAD(&journal->j_bitmap_nodes);
2707	INIT_LIST_HEAD(&journal->j_prealloc_list);
2708	INIT_LIST_HEAD(&journal->j_working_list);
2709	INIT_LIST_HEAD(&journal->j_journal_list);
2710	journal->j_persistent_trans = 0;
2711	ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
2712					   reiserfs_bmap_count(sb));
2713	reiserfs_write_lock(sb);
2714	if (ret)
2715		goto free_and_return;
2716
2717	allocate_bitmap_nodes(sb);
2718
2719	/* reserved for journal area support */
2720	SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
2721						 REISERFS_OLD_DISK_OFFSET_IN_BYTES
2722						 / sb->s_blocksize +
2723						 reiserfs_bmap_count(sb) +
2724						 1 :
2725						 REISERFS_DISK_OFFSET_IN_BYTES /
2726						 sb->s_blocksize + 2);
2727
2728	/* Sanity check to see is the standard journal fitting within first bitmap
2729	   (actual for small blocksizes) */
2730	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
2731	    (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
2732	     SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
2733		reiserfs_warning(sb, "journal-1393",
2734				 "journal does not fit for area addressed "
2735				 "by first of bitmap blocks. It starts at "
2736				 "%u and its size is %u. Block size %ld",
2737				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
2738				 SB_ONDISK_JOURNAL_SIZE(sb),
2739				 sb->s_blocksize);
2740		goto free_and_return;
2741	}
2742
2743	/*
2744	 * We need to unlock here to avoid creating the following
2745	 * dependency:
2746	 * reiserfs_lock -> sysfs_mutex
2747	 * Because the reiserfs mmap path creates the following dependency:
2748	 * mm->mmap -> reiserfs_lock, hence we have
2749	 * mm->mmap -> reiserfs_lock ->sysfs_mutex
2750	 * This would ends up in a circular dependency with sysfs readdir path
2751	 * which does sysfs_mutex -> mm->mmap_sem
2752	 * This is fine because the reiserfs lock is useless in mount path,
2753	 * at least until we call journal_begin. We keep it for paranoid
2754	 * reasons.
2755	 */
2756	reiserfs_write_unlock(sb);
2757	if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2758		reiserfs_write_lock(sb);
2759		reiserfs_warning(sb, "sh-462",
2760				 "unable to initialize jornal device");
2761		goto free_and_return;
2762	}
2763	reiserfs_write_lock(sb);
2764
2765	rs = SB_DISK_SUPER_BLOCK(sb);
2766
2767	/* read journal header */
2768	bhjh = journal_bread(sb,
2769			     SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2770			     SB_ONDISK_JOURNAL_SIZE(sb));
2771	if (!bhjh) {
2772		reiserfs_warning(sb, "sh-459",
2773				 "unable to read journal header");
2774		goto free_and_return;
2775	}
2776	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
2777
2778	/* make sure that journal matches to the super block */
2779	if (is_reiserfs_jr(rs)
2780	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
2781		sb_jp_journal_magic(rs))) {
2782		reiserfs_warning(sb, "sh-460",
2783				 "journal header magic %x (device %s) does "
2784				 "not match to magic found in super block %x",
2785				 jh->jh_journal.jp_journal_magic,
2786				 bdevname(journal->j_dev_bd, b),
2787				 sb_jp_journal_magic(rs));
2788		brelse(bhjh);
2789		goto free_and_return;
2790	}
2791
2792	journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
2793	journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
2794	journal->j_max_commit_age =
2795	    le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
2796	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
2797
2798	if (check_advise_trans_params(sb, journal) != 0)
2799	        goto free_and_return;
2800	journal->j_default_max_commit_age = journal->j_max_commit_age;
2801
2802	if (commit_max_age != 0) {
2803		journal->j_max_commit_age = commit_max_age;
2804		journal->j_max_trans_age = commit_max_age;
2805	}
2806
2807	reiserfs_info(sb, "journal params: device %s, size %u, "
2808		      "journal first block %u, max trans len %u, max batch %u, "
2809		      "max commit age %u, max trans age %u\n",
2810		      bdevname(journal->j_dev_bd, b),
2811		      SB_ONDISK_JOURNAL_SIZE(sb),
2812		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2813		      journal->j_trans_max,
2814		      journal->j_max_batch,
2815		      journal->j_max_commit_age, journal->j_max_trans_age);
2816
2817	brelse(bhjh);
2818
2819	journal->j_list_bitmap_index = 0;
2820	journal_list_init(sb);
2821
2822	memset(journal->j_list_hash_table, 0,
2823	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
2824
2825	INIT_LIST_HEAD(&journal->j_dirty_buffers);
2826	spin_lock_init(&journal->j_dirty_buffers_lock);
2827
2828	journal->j_start = 0;
2829	journal->j_len = 0;
2830	journal->j_len_alloc = 0;
2831	atomic_set(&(journal->j_wcount), 0);
2832	atomic_set(&(journal->j_async_throttle), 0);
2833	journal->j_bcount = 0;
2834	journal->j_trans_start_time = 0;
2835	journal->j_last = NULL;
2836	journal->j_first = NULL;
2837	init_waitqueue_head(&(journal->j_join_wait));
2838	mutex_init(&journal->j_mutex);
2839	mutex_init(&journal->j_flush_mutex);
2840
2841	journal->j_trans_id = 10;
2842	journal->j_mount_id = 10;
2843	journal->j_state = 0;
2844	atomic_set(&(journal->j_jlock), 0);
2845	reiserfs_write_unlock(sb);
2846	journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
2847	reiserfs_write_lock(sb);
2848	journal->j_cnode_free_orig = journal->j_cnode_free_list;
2849	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
2850	journal->j_cnode_used = 0;
2851	journal->j_must_wait = 0;
2852
2853	if (journal->j_cnode_free == 0) {
2854		reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
2855		                 "allocation failed (%ld bytes). Journal is "
2856		                 "too large for available memory. Usually "
2857		                 "this is due to a journal that is too large.",
2858		                 sizeof (struct reiserfs_journal_cnode) * num_cnodes);
2859        	goto free_and_return;
2860	}
2861
2862	init_journal_hash(sb);
2863	jl = journal->j_current_jl;
2864	jl->j_list_bitmap = get_list_bitmap(sb, jl);
2865	if (!jl->j_list_bitmap) {
2866		reiserfs_warning(sb, "journal-2005",
2867				 "get_list_bitmap failed for journal list 0");
2868		goto free_and_return;
2869	}
2870	if (journal_read(sb) < 0) {
2871		reiserfs_warning(sb, "reiserfs-2006",
2872				 "Replay Failure, unable to mount");
2873		goto free_and_return;
2874	}
2875
2876	reiserfs_mounted_fs_count++;
2877	if (reiserfs_mounted_fs_count <= 1) {
2878		reiserfs_write_unlock(sb);
2879		commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
2880		reiserfs_write_lock(sb);
2881	}
2882
2883	INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
2884	journal->j_work_sb = sb;
2885	return 0;
2886      free_and_return:
2887	free_journal_ram(sb);
2888	return 1;
2889}
2890
2891/*
2892** test for a polite end of the current transaction.  Used by file_write, and should
2893** be used by delete to make sure they don't write more than can fit inside a single
2894** transaction
2895*/
2896int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
2897				   int new_alloc)
2898{
2899	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
2900	time_t now = get_seconds();
2901	/* cannot restart while nested */
2902	BUG_ON(!th->t_trans_id);
2903	if (th->t_refcount > 1)
2904		return 0;
2905	if (journal->j_must_wait > 0 ||
2906	    (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
2907	    atomic_read(&(journal->j_jlock)) ||
2908	    (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
2909	    journal->j_cnode_free < (journal->j_trans_max * 3)) {
2910		return 1;
2911	}
2912	/* protected by the BKL here */
2913	journal->j_len_alloc += new_alloc;
2914	th->t_blocks_allocated += new_alloc ;
2915	return 0;
2916}
2917
2918/* this must be called inside a transaction, and requires the
2919** kernel_lock to be held
2920*/
2921void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
2922{
2923	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
2924	BUG_ON(!th->t_trans_id);
2925	journal->j_must_wait = 1;
2926	set_bit(J_WRITERS_BLOCKED, &journal->j_state);
2927	return;
2928}
2929
2930/* this must be called without a transaction started, and does not
2931** require BKL
2932*/
2933void reiserfs_allow_writes(struct super_block *s)
2934{
2935	struct reiserfs_journal *journal = SB_JOURNAL(s);
2936	clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
2937	wake_up(&journal->j_join_wait);
2938}
2939
2940/* this must be called without a transaction started, and does not
2941** require BKL
2942*/
2943void reiserfs_wait_on_write_block(struct super_block *s)
2944{
2945	struct reiserfs_journal *journal = SB_JOURNAL(s);
2946	wait_event(journal->j_join_wait,
2947		   !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
2948}
2949
2950static void queue_log_writer(struct super_block *s)
2951{
2952	wait_queue_t wait;
2953	struct reiserfs_journal *journal = SB_JOURNAL(s);
2954	set_bit(J_WRITERS_QUEUED, &journal->j_state);
2955
2956	/*
2957	 * we don't want to use wait_event here because
2958	 * we only want to wait once.
2959	 */
2960	init_waitqueue_entry(&wait, current);
2961	add_wait_queue(&journal->j_join_wait, &wait);
2962	set_current_state(TASK_UNINTERRUPTIBLE);
2963	if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
2964		reiserfs_write_unlock(s);
2965		schedule();
2966		reiserfs_write_lock(s);
2967	}
2968	__set_current_state(TASK_RUNNING);
2969	remove_wait_queue(&journal->j_join_wait, &wait);
2970}
2971
2972static void wake_queued_writers(struct super_block *s)
2973{
2974	struct reiserfs_journal *journal = SB_JOURNAL(s);
2975	if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
2976		wake_up(&journal->j_join_wait);
2977}
2978
2979static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
2980{
2981	struct reiserfs_journal *journal = SB_JOURNAL(sb);
2982	unsigned long bcount = journal->j_bcount;
2983	while (1) {
2984		reiserfs_write_unlock(sb);
2985		schedule_timeout_uninterruptible(1);
2986		reiserfs_write_lock(sb);
2987		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
2988		while ((atomic_read(&journal->j_wcount) > 0 ||
2989			atomic_read(&journal->j_jlock)) &&
2990		       journal->j_trans_id == trans_id) {
2991			queue_log_writer(sb);
2992		}
2993		if (journal->j_trans_id != trans_id)
2994			break;
2995		if (bcount == journal->j_bcount)
2996			break;
2997		bcount = journal->j_bcount;
2998	}
2999}
3000
3001/* join == true if you must join an existing transaction.
3002** join == false if you can deal with waiting for others to finish
3003**
3004** this will block until the transaction is joinable.  send the number of blocks you
3005** expect to use in nblocks.
3006*/
3007static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3008			      struct super_block *sb, unsigned long nblocks,
3009			      int join)
3010{
3011	time_t now = get_seconds();
3012	unsigned int old_trans_id;
3013	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3014	struct reiserfs_transaction_handle myth;
3015	int sched_count = 0;
3016	int retval;
3017
3018	reiserfs_check_lock_depth(sb, "journal_begin");
3019	BUG_ON(nblocks > journal->j_trans_max);
3020
3021	PROC_INFO_INC(sb, journal.journal_being);
3022	/* set here for journal_join */
3023	th->t_refcount = 1;
3024	th->t_super = sb;
3025
3026      relock:
3027	lock_journal(sb);
3028	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
3029		unlock_journal(sb);
3030		retval = journal->j_errno;
3031		goto out_fail;
3032	}
3033	journal->j_bcount++;
3034
3035	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
3036		unlock_journal(sb);
3037		reiserfs_write_unlock(sb);
3038		reiserfs_wait_on_write_block(sb);
3039		reiserfs_write_lock(sb);
3040		PROC_INFO_INC(sb, journal.journal_relock_writers);
3041		goto relock;
3042	}
3043	now = get_seconds();
3044
3045	/* if there is no room in the journal OR
3046	 ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
3047	 ** we don't sleep if there aren't other writers
3048	 */
3049
3050	if ((!join && journal->j_must_wait > 0) ||
3051	    (!join
3052	     && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
3053	    || (!join && atomic_read(&journal->j_wcount) > 0
3054		&& journal->j_trans_start_time > 0
3055		&& (now - journal->j_trans_start_time) >
3056		journal->j_max_trans_age) || (!join
3057					      && atomic_read(&journal->j_jlock))
3058	    || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
3059
3060		old_trans_id = journal->j_trans_id;
3061		unlock_journal(sb);	/* allow others to finish this transaction */
3062
3063		if (!join && (journal->j_len_alloc + nblocks + 2) >=
3064		    journal->j_max_batch &&
3065		    ((journal->j_len + nblocks + 2) * 100) <
3066		    (journal->j_len_alloc * 75)) {
3067			if (atomic_read(&journal->j_wcount) > 10) {
3068				sched_count++;
3069				queue_log_writer(sb);
3070				goto relock;
3071			}
3072		}
3073		/* don't mess with joining the transaction if all we have to do is
3074		 * wait for someone else to do a commit
3075		 */
3076		if (atomic_read(&journal->j_jlock)) {
3077			while (journal->j_trans_id == old_trans_id &&
3078			       atomic_read(&journal->j_jlock)) {
3079				queue_log_writer(sb);
3080			}
3081			goto relock;
3082		}
3083		retval = journal_join(&myth, sb, 1);
3084		if (retval)
3085			goto out_fail;
3086
3087		/* someone might have ended the transaction while we joined */
3088		if (old_trans_id != journal->j_trans_id) {
3089			retval = do_journal_end(&myth, sb, 1, 0);
3090		} else {
3091			retval = do_journal_end(&myth, sb, 1, COMMIT_NOW);
3092		}
3093
3094		if (retval)
3095			goto out_fail;
3096
3097		PROC_INFO_INC(sb, journal.journal_relock_wcount);
3098		goto relock;
3099	}
3100	/* we are the first writer, set trans_id */
3101	if (journal->j_trans_start_time == 0) {
3102		journal->j_trans_start_time = get_seconds();
3103	}
3104	atomic_inc(&(journal->j_wcount));
3105	journal->j_len_alloc += nblocks;
3106	th->t_blocks_logged = 0;
3107	th->t_blocks_allocated = nblocks;
3108	th->t_trans_id = journal->j_trans_id;
3109	unlock_journal(sb);
3110	INIT_LIST_HEAD(&th->t_list);
3111	get_fs_excl();
3112	return 0;
3113
3114      out_fail:
3115	memset(th, 0, sizeof(*th));
3116	/* Re-set th->t_super, so we can properly keep track of how many
3117	 * persistent transactions there are. We need to do this so if this
3118	 * call is part of a failed restart_transaction, we can free it later */
3119	th->t_super = sb;
3120	return retval;
3121}
3122
3123struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
3124								    super_block
3125								    *s,
3126								    int nblocks)
3127{
3128	int ret;
3129	struct reiserfs_transaction_handle *th;
3130
3131	/* if we're nesting into an existing transaction.  It will be
3132	 ** persistent on its own
3133	 */
3134	if (reiserfs_transaction_running(s)) {
3135		th = current->journal_info;
3136		th->t_refcount++;
3137		BUG_ON(th->t_refcount < 2);
3138
3139		return th;
3140	}
3141	th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
3142	if (!th)
3143		return NULL;
3144	ret = journal_begin(th, s, nblocks);
3145	if (ret) {
3146		kfree(th);
3147		return NULL;
3148	}
3149
3150	SB_JOURNAL(s)->j_persistent_trans++;
3151	return th;
3152}
3153
3154int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
3155{
3156	struct super_block *s = th->t_super;
3157	int ret = 0;
3158	if (th->t_trans_id)
3159		ret = journal_end(th, th->t_super, th->t_blocks_allocated);
3160	else
3161		ret = -EIO;
3162	if (th->t_refcount == 0) {
3163		SB_JOURNAL(s)->j_persistent_trans--;
3164		kfree(th);
3165	}
3166	return ret;
3167}
3168
3169static int journal_join(struct reiserfs_transaction_handle *th,
3170			struct super_block *sb, unsigned long nblocks)
3171{
3172	struct reiserfs_transaction_handle *cur_th = current->journal_info;
3173
3174	/* this keeps do_journal_end from NULLing out the current->journal_info
3175	 ** pointer
3176	 */
3177	th->t_handle_save = cur_th;
3178	BUG_ON(cur_th && cur_th->t_refcount > 1);
3179	return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN);
3180}
3181
3182int journal_join_abort(struct reiserfs_transaction_handle *th,
3183		       struct super_block *sb, unsigned long nblocks)
3184{
3185	struct reiserfs_transaction_handle *cur_th = current->journal_info;
3186
3187	/* this keeps do_journal_end from NULLing out the current->journal_info
3188	 ** pointer
3189	 */
3190	th->t_handle_save = cur_th;
3191	BUG_ON(cur_th && cur_th->t_refcount > 1);
3192	return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT);
3193}
3194
3195int journal_begin(struct reiserfs_transaction_handle *th,
3196		  struct super_block *sb, unsigned long nblocks)
3197{
3198	struct reiserfs_transaction_handle *cur_th = current->journal_info;
3199	int ret;
3200
3201	th->t_handle_save = NULL;
3202	if (cur_th) {
3203		/* we are nesting into the current transaction */
3204		if (cur_th->t_super == sb) {
3205			BUG_ON(!cur_th->t_refcount);
3206			cur_th->t_refcount++;
3207			memcpy(th, cur_th, sizeof(*th));
3208			if (th->t_refcount <= 1)
3209				reiserfs_warning(sb, "reiserfs-2005",
3210						 "BAD: refcount <= 1, but "
3211						 "journal_info != 0");
3212			return 0;
3213		} else {
3214			/* we've ended up with a handle from a different filesystem.
3215			 ** save it and restore on journal_end.  This should never
3216			 ** really happen...
3217			 */
3218			reiserfs_warning(sb, "clm-2100",
3219					 "nesting info a different FS");
3220			th->t_handle_save = current->journal_info;
3221			current->journal_info = th;
3222		}
3223	} else {
3224		current->journal_info = th;
3225	}
3226	ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
3227	BUG_ON(current->journal_info != th);
3228
3229	/* I guess this boils down to being the reciprocal of clm-2100 above.
3230	 * If do_journal_begin_r fails, we need to put it back, since journal_end
3231	 * won't be called to do it. */
3232	if (ret)
3233		current->journal_info = th->t_handle_save;
3234	else
3235		BUG_ON(!th->t_refcount);
3236
3237	return ret;
3238}
3239
3240/*
3241** puts bh into the current transaction.  If it was already there, reorders removes the
3242** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order).
3243**
3244** if it was dirty, cleans and files onto the clean list.  I can't let it be dirty again until the
3245** transaction is committed.
3246**
3247** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
3248*/
3249int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3250		       struct super_block *sb, struct buffer_head *bh)
3251{
3252	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3253	struct reiserfs_journal_cnode *cn = NULL;
3254	int count_already_incd = 0;
3255	int prepared = 0;
3256	BUG_ON(!th->t_trans_id);
3257
3258	PROC_INFO_INC(sb, journal.mark_dirty);
3259	if (th->t_trans_id != journal->j_trans_id) {
3260		reiserfs_panic(th->t_super, "journal-1577",
3261			       "handle trans id %ld != current trans id %ld",
3262			       th->t_trans_id, journal->j_trans_id);
3263	}
3264
3265	sb->s_dirt = 1;
3266
3267	prepared = test_clear_buffer_journal_prepared(bh);
3268	clear_buffer_journal_restore_dirty(bh);
3269	/* already in this transaction, we are done */
3270	if (buffer_journaled(bh)) {
3271		PROC_INFO_INC(sb, journal.mark_dirty_already);
3272		return 0;
3273	}
3274
3275	/* this must be turned into a panic instead of a warning.  We can't allow
3276	 ** a dirty or journal_dirty or locked buffer to be logged, as some changes
3277	 ** could get to disk too early.  NOT GOOD.
3278	 */
3279	if (!prepared || buffer_dirty(bh)) {
3280		reiserfs_warning(sb, "journal-1777",
3281				 "buffer %llu bad state "
3282				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
3283				 (unsigned long long)bh->b_blocknr,
3284				 prepared ? ' ' : '!',
3285				 buffer_locked(bh) ? ' ' : '!',
3286				 buffer_dirty(bh) ? ' ' : '!',
3287				 buffer_journal_dirty(bh) ? ' ' : '!');
3288	}
3289
3290	if (atomic_read(&(journal->j_wcount)) <= 0) {
3291		reiserfs_warning(sb, "journal-1409",
3292				 "returning because j_wcount was %d",
3293				 atomic_read(&(journal->j_wcount)));
3294		return 1;
3295	}
3296	/* this error means I've screwed up, and we've overflowed the transaction.
3297	 ** Nothing can be done here, except make the FS readonly or panic.
3298	 */
3299	if (journal->j_len >= journal->j_trans_max) {
3300		reiserfs_panic(th->t_super, "journal-1413",
3301			       "j_len (%lu) is too big",
3302			       journal->j_len);
3303	}
3304
3305	if (buffer_journal_dirty(bh)) {
3306		count_already_incd = 1;
3307		PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
3308		clear_buffer_journal_dirty(bh);
3309	}
3310
3311	if (journal->j_len > journal->j_len_alloc) {
3312		journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
3313	}
3314
3315	set_buffer_journaled(bh);
3316
3317	/* now put this guy on the end */
3318	if (!cn) {
3319		cn = get_cnode(sb);
3320		if (!cn) {
3321			reiserfs_panic(sb, "journal-4", "get_cnode failed!");
3322		}
3323
3324		if (th->t_blocks_logged == th->t_blocks_allocated) {
3325			th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
3326			journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
3327		}
3328		th->t_blocks_logged++;
3329		journal->j_len++;
3330
3331		cn->bh = bh;
3332		cn->blocknr = bh->b_blocknr;
3333		cn->sb = sb;
3334		cn->jlist = NULL;
3335		insert_journal_hash(journal->j_hash_table, cn);
3336		if (!count_already_incd) {
3337			get_bh(bh);
3338		}
3339	}
3340	cn->next = NULL;
3341	cn->prev = journal->j_last;
3342	cn->bh = bh;
3343	if (journal->j_last) {
3344		journal->j_last->next = cn;
3345		journal->j_last = cn;
3346	} else {
3347		journal->j_first = cn;
3348		journal->j_last = cn;
3349	}
3350	return 0;
3351}
3352
3353int journal_end(struct reiserfs_transaction_handle *th,
3354		struct super_block *sb, unsigned long nblocks)
3355{
3356	if (!current->journal_info && th->t_refcount > 1)
3357		reiserfs_warning(sb, "REISER-NESTING",
3358				 "th NULL, refcount %d", th->t_refcount);
3359
3360	if (!th->t_trans_id) {
3361		WARN_ON(1);
3362		return -EIO;
3363	}
3364
3365	th->t_refcount--;
3366	if (th->t_refcount > 0) {
3367		struct reiserfs_transaction_handle *cur_th =
3368		    current->journal_info;
3369
3370		/* we aren't allowed to close a nested transaction on a different
3371		 ** filesystem from the one in the task struct
3372		 */
3373		BUG_ON(cur_th->t_super != th->t_super);
3374
3375		if (th != cur_th) {
3376			memcpy(current->journal_info, th, sizeof(*th));
3377			th->t_trans_id = 0;
3378		}
3379		return 0;
3380	} else {
3381		return do_journal_end(th, sb, nblocks, 0);
3382	}
3383}
3384
3385/* removes from the current transaction, relsing and descrementing any counters.
3386** also files the removed buffer directly onto the clean list
3387**
3388** called by journal_mark_freed when a block has been deleted
3389**
3390** returns 1 if it cleaned and relsed the buffer. 0 otherwise
3391*/
3392static int remove_from_transaction(struct super_block *sb,
3393				   b_blocknr_t blocknr, int already_cleaned)
3394{
3395	struct buffer_head *bh;
3396	struct reiserfs_journal_cnode *cn;
3397	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3398	int ret = 0;
3399
3400	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
3401	if (!cn || !cn->bh) {
3402		return ret;
3403	}
3404	bh = cn->bh;
3405	if (cn->prev) {
3406		cn->prev->next = cn->next;
3407	}
3408	if (cn->next) {
3409		cn->next->prev = cn->prev;
3410	}
3411	if (cn == journal->j_first) {
3412		journal->j_first = cn->next;
3413	}
3414	if (cn == journal->j_last) {
3415		journal->j_last = cn->prev;
3416	}
3417	if (bh)
3418		remove_journal_hash(sb, journal->j_hash_table, NULL,
3419				    bh->b_blocknr, 0);
3420	clear_buffer_journaled(bh);	/* don't log this one */
3421
3422	if (!already_cleaned) {
3423		clear_buffer_journal_dirty(bh);
3424		clear_buffer_dirty(bh);
3425		clear_buffer_journal_test(bh);
3426		put_bh(bh);
3427		if (atomic_read(&(bh->b_count)) < 0) {
3428			reiserfs_warning(sb, "journal-1752",
3429					 "b_count < 0");
3430		}
3431		ret = 1;
3432	}
3433	journal->j_len--;
3434	journal->j_len_alloc--;
3435	free_cnode(sb, cn);
3436	return ret;
3437}
3438
3439/*
3440** for any cnode in a journal list, it can only be dirtied of all the
3441** transactions that include it are committed to disk.
3442** this checks through each transaction, and returns 1 if you are allowed to dirty,
3443** and 0 if you aren't
3444**
3445** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log
3446** blocks for a given transaction on disk
3447**
3448*/
3449static int can_dirty(struct reiserfs_journal_cnode *cn)
3450{
3451	struct super_block *sb = cn->sb;
3452	b_blocknr_t blocknr = cn->blocknr;
3453	struct reiserfs_journal_cnode *cur = cn->hprev;
3454	int can_dirty = 1;
3455
3456	/* first test hprev.  These are all newer than cn, so any node here
3457	 ** with the same block number and dev means this node can't be sent
3458	 ** to disk right now.
3459	 */
3460	while (cur && can_dirty) {
3461		if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
3462		    cur->blocknr == blocknr) {
3463			can_dirty = 0;
3464		}
3465		cur = cur->hprev;
3466	}
3467	/* then test hnext.  These are all older than cn.  As long as they
3468	 ** are committed to the log, it is safe to write cn to disk
3469	 */
3470	cur = cn->hnext;
3471	while (cur && can_dirty) {
3472		if (cur->jlist && cur->jlist->j_len > 0 &&
3473		    atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh &&
3474		    cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
3475			can_dirty = 0;
3476		}
3477		cur = cur->hnext;
3478	}
3479	return can_dirty;
3480}
3481
3482/* syncs the commit blocks, but does not force the real buffers to disk
3483** will wait until the current transaction is done/committed before returning
3484*/
3485int journal_end_sync(struct reiserfs_transaction_handle *th,
3486		     struct super_block *sb, unsigned long nblocks)
3487{
3488	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3489
3490	BUG_ON(!th->t_trans_id);
3491	/* you can sync while nested, very, very bad */
3492	BUG_ON(th->t_refcount > 1);
3493	if (journal->j_len == 0) {
3494		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3495					     1);
3496		journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
3497	}
3498	return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT);
3499}
3500
3501/*
3502** writeback the pending async commits to disk
3503*/
3504static void flush_async_commits(struct work_struct *work)
3505{
3506	struct reiserfs_journal *journal =
3507		container_of(work, struct reiserfs_journal, j_work.work);
3508	struct super_block *sb = journal->j_work_sb;
3509	struct reiserfs_journal_list *jl;
3510	struct list_head *entry;
3511
3512	reiserfs_write_lock(sb);
3513	if (!list_empty(&journal->j_journal_list)) {
3514		/* last entry is the youngest, commit it and you get everything */
3515		entry = journal->j_journal_list.prev;
3516		jl = JOURNAL_LIST_ENTRY(entry);
3517		flush_commit_list(sb, jl, 1);
3518	}
3519	reiserfs_write_unlock(sb);
3520}
3521
3522/*
3523** flushes any old transactions to disk
3524** ends the current transaction if it is too old
3525*/
3526int reiserfs_flush_old_commits(struct super_block *sb)
3527{
3528	time_t now;
3529	struct reiserfs_transaction_handle th;
3530	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3531
3532	now = get_seconds();
3533	/* safety check so we don't flush while we are replaying the log during
3534	 * mount
3535	 */
3536	if (list_empty(&journal->j_journal_list)) {
3537		return 0;
3538	}
3539
3540	/* check the current transaction.  If there are no writers, and it is
3541	 * too old, finish it, and force the commit blocks to disk
3542	 */
3543	if (atomic_read(&journal->j_wcount) <= 0 &&
3544	    journal->j_trans_start_time > 0 &&
3545	    journal->j_len > 0 &&
3546	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
3547		if (!journal_join(&th, sb, 1)) {
3548			reiserfs_prepare_for_journal(sb,
3549						     SB_BUFFER_WITH_SB(sb),
3550						     1);
3551			journal_mark_dirty(&th, sb,
3552					   SB_BUFFER_WITH_SB(sb));
3553
3554			/* we're only being called from kreiserfsd, it makes no sense to do
3555			 ** an async commit so that kreiserfsd can do it later
3556			 */
3557			do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
3558		}
3559	}
3560	return sb->s_dirt;
3561}
3562
3563/*
3564** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit
3565**
3566** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all
3567** the writers are done.  By the time it wakes up, the transaction it was called has already ended, so it just
3568** flushes the commit list and returns 0.
3569**
3570** Won't batch when flush or commit_now is set.  Also won't batch when others are waiting on j_join_wait.
3571**
3572** Note, we can't allow the journal_end to proceed while there are still writers in the log.
3573*/
3574static int check_journal_end(struct reiserfs_transaction_handle *th,
3575			     struct super_block *sb, unsigned long nblocks,
3576			     int flags)
3577{
3578
3579	time_t now;
3580	int flush = flags & FLUSH_ALL;
3581	int commit_now = flags & COMMIT_NOW;
3582	int wait_on_commit = flags & WAIT;
3583	struct reiserfs_journal_list *jl;
3584	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3585
3586	BUG_ON(!th->t_trans_id);
3587
3588	if (th->t_trans_id != journal->j_trans_id) {
3589		reiserfs_panic(th->t_super, "journal-1577",
3590			       "handle trans id %ld != current trans id %ld",
3591			       th->t_trans_id, journal->j_trans_id);
3592	}
3593
3594	journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
3595	if (atomic_read(&(journal->j_wcount)) > 0) {	/* <= 0 is allowed.  unmounting might not call begin */
3596		atomic_dec(&(journal->j_wcount));
3597	}
3598
3599	/* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released
3600	 ** will be dealt with by next transaction that actually writes something, but should be taken
3601	 ** care of in this trans
3602	 */
3603	BUG_ON(journal->j_len == 0);
3604
3605	/* if wcount > 0, and we are called to with flush or commit_now,
3606	 ** we wait on j_join_wait.  We will wake up when the last writer has
3607	 ** finished the transaction, and started it on its way to the disk.
3608	 ** Then, we flush the commit or journal list, and just return 0
3609	 ** because the rest of journal end was already done for this transaction.
3610	 */
3611	if (atomic_read(&(journal->j_wcount)) > 0) {
3612		if (flush || commit_now) {
3613			unsigned trans_id;
3614
3615			jl = journal->j_current_jl;
3616			trans_id = jl->j_trans_id;
3617			if (wait_on_commit)
3618				jl->j_state |= LIST_COMMIT_PENDING;
3619			atomic_set(&(journal->j_jlock), 1);
3620			if (flush) {
3621				journal->j_next_full_flush = 1;
3622			}
3623			unlock_journal(sb);
3624
3625			/* sleep while the current transaction is still j_jlocked */
3626			while (journal->j_trans_id == trans_id) {
3627				if (atomic_read(&journal->j_jlock)) {
3628					queue_log_writer(sb);
3629				} else {
3630					lock_journal(sb);
3631					if (journal->j_trans_id == trans_id) {
3632						atomic_set(&(journal->j_jlock),
3633							   1);
3634					}
3635					unlock_journal(sb);
3636				}
3637			}
3638			BUG_ON(journal->j_trans_id == trans_id);
3639
3640			if (commit_now
3641			    && journal_list_still_alive(sb, trans_id)
3642			    && wait_on_commit) {
3643				flush_commit_list(sb, jl, 1);
3644			}
3645			return 0;
3646		}
3647		unlock_journal(sb);
3648		return 0;
3649	}
3650
3651	/* deal with old transactions where we are the last writers */
3652	now = get_seconds();
3653	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
3654		commit_now = 1;
3655		journal->j_next_async_flush = 1;
3656	}
3657	/* don't batch when someone is waiting on j_join_wait */
3658	/* don't batch when syncing the commit or flushing the whole trans */
3659	if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock)))
3660	    && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
3661	    && journal->j_len_alloc < journal->j_max_batch
3662	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
3663		journal->j_bcount++;
3664		unlock_journal(sb);
3665		return 0;
3666	}
3667
3668	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
3669		reiserfs_panic(sb, "journal-003",
3670			       "j_start (%ld) is too high",
3671			       journal->j_start);
3672	}
3673	return 1;
3674}
3675
3676/*
3677** Does all the work that makes deleting blocks safe.
3678** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on.
3679**
3680** otherwise:
3681** set a bit for the block in the journal bitmap.  That will prevent it from being allocated for unformatted nodes
3682** before this transaction has finished.
3683**
3684** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.  That will prevent any old transactions with
3685** this block from trying to flush to the real location.  Since we aren't removing the cnode from the journal_list_hash,
3686** the block can't be reallocated yet.
3687**
3688** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
3689*/
3690int journal_mark_freed(struct reiserfs_transaction_handle *th,
3691		       struct super_block *sb, b_blocknr_t blocknr)
3692{
3693	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3694	struct reiserfs_journal_cnode *cn = NULL;
3695	struct buffer_head *bh = NULL;
3696	struct reiserfs_list_bitmap *jb = NULL;
3697	int cleaned = 0;
3698	BUG_ON(!th->t_trans_id);
3699
3700	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
3701	if (cn && cn->bh) {
3702		bh = cn->bh;
3703		get_bh(bh);
3704	}
3705	/* if it is journal new, we just remove it from this transaction */
3706	if (bh && buffer_journal_new(bh)) {
3707		clear_buffer_journal_new(bh);
3708		clear_prepared_bits(bh);
3709		reiserfs_clean_and_file_buffer(bh);
3710		cleaned = remove_from_transaction(sb, blocknr, cleaned);
3711	} else {
3712		/* set the bit for this block in the journal bitmap for this transaction */
3713		jb = journal->j_current_jl->j_list_bitmap;
3714		if (!jb) {
3715			reiserfs_panic(sb, "journal-1702",
3716				       "journal_list_bitmap is NULL");
3717		}
3718		set_bit_in_list_bitmap(sb, blocknr, jb);
3719
3720		/* Note, the entire while loop is not allowed to schedule.  */
3721
3722		if (bh) {
3723			clear_prepared_bits(bh);
3724			reiserfs_clean_and_file_buffer(bh);
3725		}
3726		cleaned = remove_from_transaction(sb, blocknr, cleaned);
3727
3728		/* find all older transactions with this block, make sure they don't try to write it out */
3729		cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
3730					  blocknr);
3731		while (cn) {
3732			if (sb == cn->sb && blocknr == cn->blocknr) {
3733				set_bit(BLOCK_FREED, &cn->state);
3734				if (cn->bh) {
3735					if (!cleaned) {
3736						/* remove_from_transaction will brelse the buffer if it was
3737						 ** in the current trans
3738						 */
3739						clear_buffer_journal_dirty(cn->
3740									   bh);
3741						clear_buffer_dirty(cn->bh);
3742						clear_buffer_journal_test(cn->
3743									  bh);
3744						cleaned = 1;
3745						put_bh(cn->bh);
3746						if (atomic_read
3747						    (&(cn->bh->b_count)) < 0) {
3748							reiserfs_warning(sb,
3749								 "journal-2138",
3750								 "cn->bh->b_count < 0");
3751						}
3752					}
3753					if (cn->jlist) {	/* since we are clearing the bh, we MUST dec nonzerolen */
3754						atomic_dec(&
3755							   (cn->jlist->
3756							    j_nonzerolen));
3757					}
3758					cn->bh = NULL;
3759				}
3760			}
3761			cn = cn->hnext;
3762		}
3763	}
3764
3765	if (bh)
3766		release_buffer_page(bh); /* get_hash grabs the buffer */
3767	return 0;
3768}
3769
3770void reiserfs_update_inode_transaction(struct inode *inode)
3771{
3772	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
3773	REISERFS_I(inode)->i_jl = journal->j_current_jl;
3774	REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
3775}
3776
3777/*
3778 * returns -1 on error, 0 if no commits/barriers were done and 1
3779 * if a transaction was actually committed and the barrier was done
3780 */
3781static int __commit_trans_jl(struct inode *inode, unsigned long id,
3782			     struct reiserfs_journal_list *jl)
3783{
3784	struct reiserfs_transaction_handle th;
3785	struct super_block *sb = inode->i_sb;
3786	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3787	int ret = 0;
3788
3789	/* is it from the current transaction, or from an unknown transaction? */
3790	if (id == journal->j_trans_id) {
3791		jl = journal->j_current_jl;
3792		/* try to let other writers come in and grow this transaction */
3793		let_transaction_grow(sb, id);
3794		if (journal->j_trans_id != id) {
3795			goto flush_commit_only;
3796		}
3797
3798		ret = journal_begin(&th, sb, 1);
3799		if (ret)
3800			return ret;
3801
3802		/* someone might have ended this transaction while we joined */
3803		if (journal->j_trans_id != id) {
3804			reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3805						     1);
3806			journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb));
3807			ret = journal_end(&th, sb, 1);
3808			goto flush_commit_only;
3809		}
3810
3811		ret = journal_end_sync(&th, sb, 1);
3812		if (!ret)
3813			ret = 1;
3814
3815	} else {
3816		/* this gets tricky, we have to make sure the journal list in
3817		 * the inode still exists.  We know the list is still around
3818		 * if we've got a larger transaction id than the oldest list
3819		 */
3820	      flush_commit_only:
3821		if (journal_list_still_alive(inode->i_sb, id)) {
3822			/*
3823			 * we only set ret to 1 when we know for sure
3824			 * the barrier hasn't been started yet on the commit
3825			 * block.
3826			 */
3827			if (atomic_read(&jl->j_commit_left) > 1)
3828				ret = 1;
3829			flush_commit_list(sb, jl, 1);
3830			if (journal->j_errno)
3831				ret = journal->j_errno;
3832		}
3833	}
3834	/* otherwise the list is gone, and long since committed */
3835	return ret;
3836}
3837
3838int reiserfs_commit_for_inode(struct inode *inode)
3839{
3840	unsigned int id = REISERFS_I(inode)->i_trans_id;
3841	struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
3842
3843	/* for the whole inode, assume unset id means it was
3844	 * changed in the current transaction.  More conservative
3845	 */
3846	if (!id || !jl) {
3847		reiserfs_update_inode_transaction(inode);
3848		id = REISERFS_I(inode)->i_trans_id;
3849		/* jl will be updated in __commit_trans_jl */
3850	}
3851
3852	return __commit_trans_jl(inode, id, jl);
3853}
3854
3855void reiserfs_restore_prepared_buffer(struct super_block *sb,
3856				      struct buffer_head *bh)
3857{
3858	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3859	PROC_INFO_INC(sb, journal.restore_prepared);
3860	if (!bh) {
3861		return;
3862	}
3863	if (test_clear_buffer_journal_restore_dirty(bh) &&
3864	    buffer_journal_dirty(bh)) {
3865		struct reiserfs_journal_cnode *cn;
3866		cn = get_journal_hash_dev(sb,
3867					  journal->j_list_hash_table,
3868					  bh->b_blocknr);
3869		if (cn && can_dirty(cn)) {
3870			set_buffer_journal_test(bh);
3871			mark_buffer_dirty(bh);
3872		}
3873	}
3874	clear_buffer_journal_prepared(bh);
3875}
3876
3877extern struct tree_balance *cur_tb;
3878/*
3879** before we can change a metadata block, we have to make sure it won't
3880** be written to disk while we are altering it.  So, we must:
3881** clean it
3882** wait on it.
3883**
3884*/
3885int reiserfs_prepare_for_journal(struct super_block *sb,
3886				 struct buffer_head *bh, int wait)
3887{
3888	PROC_INFO_INC(sb, journal.prepare);
3889
3890	if (!trylock_buffer(bh)) {
3891		if (!wait)
3892			return 0;
3893		lock_buffer(bh);
3894	}
3895	set_buffer_journal_prepared(bh);
3896	if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
3897		clear_buffer_journal_test(bh);
3898		set_buffer_journal_restore_dirty(bh);
3899	}
3900	unlock_buffer(bh);
3901	return 1;
3902}
3903
3904static void flush_old_journal_lists(struct super_block *s)
3905{
3906	struct reiserfs_journal *journal = SB_JOURNAL(s);
3907	struct reiserfs_journal_list *jl;
3908	struct list_head *entry;
3909	time_t now = get_seconds();
3910
3911	while (!list_empty(&journal->j_journal_list)) {
3912		entry = journal->j_journal_list.next;
3913		jl = JOURNAL_LIST_ENTRY(entry);
3914		/* this check should always be run, to send old lists to disk */
3915		if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4)) &&
3916		    atomic_read(&jl->j_commit_left) == 0 &&
3917		    test_transaction(s, jl)) {
3918			flush_used_journal_lists(s, jl);
3919		} else {
3920			break;
3921		}
3922	}
3923}
3924
3925/*
3926** long and ugly.  If flush, will not return until all commit
3927** blocks and all real buffers in the trans are on disk.
3928** If no_async, won't return until all commit blocks are on disk.
3929**
3930** keep reading, there are comments as you go along
3931**
3932** If the journal is aborted, we just clean up. Things like flushing
3933** journal lists, etc just won't happen.
**
** th      -- handle of the transaction being ended.  It must carry a
**            non-zero t_trans_id and at most one reference.  On return
**            the handle has been zeroed and only t_super is set again.
** sb      -- super block of the filesystem owning the journal.
** nblocks -- handed unchanged to check_journal_end().
** flags   -- bit mask built from FLUSH_ALL, COMMIT_NOW and WAIT.  The
**            function may add bits on its own (trans id about to
**            overflow, j_next_full_flush, j_next_async_flush).
**
** Returns journal->j_errno.
3934*/
3935static int do_journal_end(struct reiserfs_transaction_handle *th,
3936			  struct super_block *sb, unsigned long nblocks,
3937			  int flags)
3938{
3939	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3940	struct reiserfs_journal_cnode *cn, *next, *jl_cn;
3941	struct reiserfs_journal_cnode *last_cn = NULL;
3942	struct reiserfs_journal_desc *desc;
3943	struct reiserfs_journal_commit *commit;
3944	struct buffer_head *c_bh;	/* commit bh */
3945	struct buffer_head *d_bh;	/* desc bh */
3946	int cur_write_start = 0;	/* start index of current log write */
	/* NOTE(review): old_start is assigned below but never read in this
	 * function.
	 */
3947	int old_start;
3948	int i;
3949	int flush;
3950	int wait_on_commit;
3951	struct reiserfs_journal_list *jl, *temp_jl;
3952	struct list_head *entry, *safe;
3953	unsigned long jindex;
3954	unsigned int commit_trans_id;
3955	int trans_half;
3956
	/* a handle with more than one reference, or without a transaction
	 * id, must never get here
	 */
3957	BUG_ON(th->t_refcount > 1);
3958	BUG_ON(!th->t_trans_id);
3959
3960	/* protect flush_older_commits from making mistakes if the
3961           transaction ID counter overflows.  */
3962	if (th->t_trans_id == ~0U)
3963		flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
3964	flush = flags & FLUSH_ALL;
3965	wait_on_commit = flags & WAIT;
3966
3967	put_fs_excl();
3968	current->journal_info = th->t_handle_save;
3969	reiserfs_check_lock_depth(sb, "journal end");
	/* nothing was logged in this transaction: log the super block buffer.
	 * Presumably this keeps the transaction from being empty (see the
	 * BUG_ON on j_len further down) -- confirm.
	 */
3970	if (journal->j_len == 0) {
3971		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3972					     1);
3973		journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
3974	}
3975
3976	lock_journal(sb);
	/* flush requests recorded in the journal are merged into both the
	 * flags passed on to check_journal_end() and the local copies
	 */
3977	if (journal->j_next_full_flush) {
3978		flags |= FLUSH_ALL;
3979		flush = 1;
3980	}
3981	if (journal->j_next_async_flush) {
3982		flags |= COMMIT_NOW | WAIT;
3983		wait_on_commit = 1;
3984	}
3985
3986	/* check_journal_end locks the journal, and unlocks if it does not return 1
3987	 ** it tells us if we should continue with the journal_end, or just return
3988	 */
3989	if (!check_journal_end(th, sb, nblocks, flags)) {
3990		sb->s_dirt = 1;
3991		wake_queued_writers(sb);
3992		reiserfs_async_progress_wait(sb);
3993		goto out;
3994	}
3995
3996	/* check_journal_end might set these, check again */
3997	if (journal->j_next_full_flush) {
3998		flush = 1;
3999	}
4000
4001	/*
4002	 ** j must wait means we have to flush the log blocks, and the real blocks for
4003	 ** this transaction
4004	 */
4005	if (journal->j_must_wait > 0) {
4006		flush = 1;
4007	}
4008#ifdef REISERFS_PREALLOCATE
4009	/* quota ops might need to nest, setup the journal_info pointer for them
4010	 * and raise the refcount so that it is > 0. */
4011	current->journal_info = th;
4012	th->t_refcount++;
4013	reiserfs_discard_all_prealloc(th);	/* it should not involve new blocks into
4014						 * the transaction */
4015	th->t_refcount--;
4016	current->journal_info = th->t_handle_save;
4017#endif
4018
4019	/* setup description block */
	/* it is the log block at offset j_start; it is zeroed, then stamped
	 * with the magic string and the transaction id
	 */
4020	d_bh =
4021	    journal_getblk(sb,
4022			   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4023			   journal->j_start);
4024	set_buffer_uptodate(d_bh);
4025	desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
4026	memset(d_bh->b_data, 0, d_bh->b_size);
4027	memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
4028	set_desc_trans_id(desc, journal->j_trans_id);
4029
4030	/* setup commit block.  Don't write (keep it clean too) this one until after everyone else is written */
	/* it sits at offset j_start + j_len + 1, wrapped around the log size */
4031	c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4032			      ((journal->j_start + journal->j_len +
4033				1) % SB_ONDISK_JOURNAL_SIZE(sb)));
4034	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
4035	memset(c_bh->b_data, 0, c_bh->b_size);
4036	set_commit_trans_id(commit, journal->j_trans_id);
4037	set_buffer_uptodate(c_bh);
4038
4039	/* init this journal list */
4040	jl = journal->j_current_jl;
4041
4042	/* we lock the commit before doing anything because
4043	 * we want to make sure nobody tries to run flush_commit_list until
4044	 * the new transaction is fully setup, and we've already flushed the
4045	 * ordered bh list
4046	 */
4047	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
4048
4049	/* save the transaction id in case we need to commit it later */
	/* (taken before jl->j_trans_id is overwritten just below) */
4050	commit_trans_id = jl->j_trans_id;
4051
4052	atomic_set(&jl->j_older_commits_done, 0);
4053	jl->j_trans_id = journal->j_trans_id;
4054	jl->j_timestamp = journal->j_trans_start_time;
4055	jl->j_commit_bh = c_bh;
4056	jl->j_start = journal->j_start;
4057	jl->j_len = journal->j_len;
4058	atomic_set(&jl->j_nonzerolen, journal->j_len);
	/* NOTE(review): the "+ 2" presumably accounts for the description
	 * and commit blocks -- confirm against flush_commit_list.
	 */
4059	atomic_set(&jl->j_commit_left, journal->j_len + 2);
4060	jl->j_realblock = NULL;
4061
4062	/* The ENTIRE FOR LOOP MUST not cause schedule to occur.
4063	 **  for each real block, add it to the journal list hash,
4064	 ** copy into real block index array in the commit or desc block
4065	 */
	/* i counts only the buffers that are actually logged: the first
	 * trans_half block numbers are stored in the description block, the
	 * remaining ones in the commit block.
	 */
4066	trans_half = journal_trans_half(sb->s_blocksize);
4067	for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
4068		if (buffer_journaled(cn->bh)) {
4069			jl_cn = get_cnode(sb);
4070			if (!jl_cn) {
4071				reiserfs_panic(sb, "journal-1676",
4072					       "get_cnode returned NULL");
4073			}
			/* the first logged cnode is the head of jl's list */
4074			if (i == 0) {
4075				jl->j_realblock = jl_cn;
4076			}
			/* append jl_cn to the doubly linked list being built */
4077			jl_cn->prev = last_cn;
4078			jl_cn->next = NULL;
4079			if (last_cn) {
4080				last_cn->next = jl_cn;
4081			}
4082			last_cn = jl_cn;
4083			/* make sure the block we are trying to log is not a block
4084			   of journal or reserved area */
4085
4086			if (is_block_in_log_or_reserved_area
4087			    (sb, cn->bh->b_blocknr)) {
4088				reiserfs_panic(sb, "journal-2332",
4089					       "Trying to log block %lu, "
4090					       "which is a log block",
4091					       cn->bh->b_blocknr);
4092			}
4093			jl_cn->blocknr = cn->bh->b_blocknr;
4094			jl_cn->state = 0;
4095			jl_cn->sb = sb;
4096			jl_cn->bh = cn->bh;
4097			jl_cn->jlist = jl;
4098			insert_journal_hash(journal->j_list_hash_table, jl_cn);
4099			if (i < trans_half) {
4100				desc->j_realblock[i] =
4101				    cpu_to_le32(cn->bh->b_blocknr);
4102			} else {
4103				commit->j_realblock[i - trans_half] =
4104				    cpu_to_le32(cn->bh->b_blocknr);
4105			}
4106		} else {
			/* not logged: cancel the i++ of the for statement */
4107			i--;
4108		}
4109	}
4110	set_desc_trans_len(desc, journal->j_len);
4111	set_desc_mount_id(desc, journal->j_mount_id);
4112	set_desc_trans_id(desc, journal->j_trans_id);
4113	set_commit_trans_len(commit, journal->j_len);
4114
4115	/* special check in case all buffers in the journal were marked for not logging */
4116	BUG_ON(journal->j_len == 0);
4117
4118	/* we're about to dirty all the log blocks, mark the description block
4119	 * dirty now too.  Don't mark the commit block dirty until all the
4120	 * others are on disk
4121	 */
4122	mark_buffer_dirty(d_bh);
4123
4124	/* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
4125	cur_write_start = journal->j_start;
4126	cn = journal->j_first;
4127	jindex = 1;		/* start at one so we don't get the desc again */
	/* second pass over the cnodes: copy the logged buffers into the log
	 * area and free every cnode of the finished transaction.  Unlike the
	 * loop above this one drops the write lock and may reschedule after
	 * each cnode.
	 */
4128	while (cn) {
4129		clear_buffer_journal_new(cn->bh);
4130		/* copy all the real blocks into log area.  dirty log blocks */
4131		if (buffer_journaled(cn->bh)) {
4132			struct buffer_head *tmp_bh;
4133			char *addr;
4134			struct page *page;
4135			tmp_bh =
4136			    journal_getblk(sb,
4137					   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4138					   ((cur_write_start +
4139					     jindex) %
4140					    SB_ONDISK_JOURNAL_SIZE(sb)));
4141			set_buffer_uptodate(tmp_bh);
			/* the source buffer is read through a kmap of its page */
4142			page = cn->bh->b_page;
4143			addr = kmap(page);
4144			memcpy(tmp_bh->b_data,
4145			       addr + offset_in_page(cn->bh->b_data),
4146			       cn->bh->b_size);
4147			kunmap(page);
4148			mark_buffer_dirty(tmp_bh);
4149			jindex++;
			/* the buffer changes from "journaled" to "journal dirty" */
4150			set_buffer_journal_dirty(cn->bh);
4151			clear_buffer_journaled(cn->bh);
4152		} else {
4153			/* JDirty cleared sometime during transaction.  don't log this one */
4154			reiserfs_warning(sb, "journal-2048",
4155					 "BAD, buffer in journal hash, "
4156					 "but not JDirty!");
4157			brelse(cn->bh);
4158		}
		/* fetch the successor before the cnode is given back */
4159		next = cn->next;
4160		free_cnode(sb, cn);
4161		cn = next;
4162		reiserfs_write_unlock(sb);
4163		cond_resched();
4164		reiserfs_write_lock(sb);
4165	}
4166
4167	/* we are done  with both the c_bh and d_bh, but
4168	 ** c_bh must be written after all other commit blocks,
4169	 ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
4170	 */
4171
	/* from here on jl is the finished transaction and j_current_jl is a
	 * freshly allocated list for the next one
	 */
4172	journal->j_current_jl = alloc_journal_list(sb);
4173
4174	/* now it is safe to insert this transaction on the main list */
4175	list_add_tail(&jl->j_list, &journal->j_journal_list);
4176	list_add_tail(&jl->j_working_list, &journal->j_working_list);
4177	journal->j_num_work_lists++;
4178
4179	/* reset journal values for the next transaction */
4180	old_start = journal->j_start;
	/* the next transaction starts j_len + 2 blocks further on, wrapped
	 * around the log size
	 */
4181	journal->j_start =
4182	    (journal->j_start + journal->j_len +
4183	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
4184	atomic_set(&(journal->j_wcount), 0);
4185	journal->j_bcount = 0;
4186	journal->j_last = NULL;
4187	journal->j_first = NULL;
4188	journal->j_len = 0;
4189	journal->j_trans_start_time = 0;
4190	/* check for trans_id overflow */
	/* when the counter wraps to 0 it restarts at 10 */
4191	if (++journal->j_trans_id == 0)
4192		journal->j_trans_id = 10;
4193	journal->j_current_jl->j_trans_id = journal->j_trans_id;
4194	journal->j_must_wait = 0;
4195	journal->j_len_alloc = 0;
4196	journal->j_next_full_flush = 0;
4197	journal->j_next_async_flush = 0;
4198	init_journal_hash(sb);
4199
4200	// make sure reiserfs_add_jh sees the new current_jl before we
4201	// write out the tails
4202	smp_mb();
4203
4204	/* tail conversion targets have to hit the disk before we end the
4205	 * transaction.  Otherwise a later transaction might repack the tail
4206	 * before this transaction commits, leaving the data block unflushed and
4207	 * clean, if we crash before the later transaction commits, the data block
4208	 * is lost.
4209	 */
	/* the write lock is dropped while the ordered buffers are written */
4210	if (!list_empty(&jl->j_tail_bh_list)) {
4211		reiserfs_write_unlock(sb);
4212		write_ordered_buffers(&journal->j_dirty_buffers_lock,
4213				      journal, jl, &jl->j_tail_bh_list);
4214		reiserfs_write_lock(sb);
4215	}
4216	BUG_ON(!list_empty(&jl->j_tail_bh_list));
4217	mutex_unlock(&jl->j_commit_mutex);
4218
4219	/* honor the flush wishes from the caller, simple commits can
4220	 ** be done outside the journal lock, they are done below
4221	 **
4222	 ** if we don't flush the commit list right now, we put it into
4223	 ** the work queue so the people waiting on the async progress work
4224	 ** queue don't wait for this proc to flush journal lists and such.
4225	 */
4226	if (flush) {
4227		flush_commit_list(sb, jl, 1);
4228		flush_journal_list(sb, jl, 1);
4229	} else if (!(jl->j_state & LIST_COMMIT_PENDING))
4230		queue_delayed_work(commit_wq, &journal->j_work, HZ / 10);
4231
4232	/* if the next transaction has any chance of wrapping, flush
4233	 ** transactions that might get overwritten.  If any journal lists are very
4234	 ** old flush them as well.
4235	 */
	/* the scan starts over from the list head after every flush.
	 * NOTE(review): presumably because flush_used_journal_lists() can
	 * change j_journal_list -- confirm.
	 */
4236      first_jl:
4237	list_for_each_safe(entry, safe, &journal->j_journal_list) {
4238		temp_jl = JOURNAL_LIST_ENTRY(entry);
4239		if (journal->j_start <= temp_jl->j_start) {
4240			if ((journal->j_start + journal->j_trans_max + 1) >=
4241			    temp_jl->j_start) {
4242				flush_used_journal_lists(sb, temp_jl);
4243				goto first_jl;
4244			} else if ((journal->j_start +
4245				    journal->j_trans_max + 1) <
4246				   SB_ONDISK_JOURNAL_SIZE(sb)) {
4247				/* if we don't cross into the next transaction and we don't
4248				 * wrap, there is no way we can overlap any later transactions
4249				 * break now
4250				 */
4251				break;
4252			}
4253		} else if ((journal->j_start +
4254			    journal->j_trans_max + 1) >
4255			   SB_ONDISK_JOURNAL_SIZE(sb)) {
4256			if (((journal->j_start + journal->j_trans_max + 1) %
4257			     SB_ONDISK_JOURNAL_SIZE(sb)) >=
4258			    temp_jl->j_start) {
4259				flush_used_journal_lists(sb, temp_jl);
4260				goto first_jl;
4261			} else {
4262				/* we don't overlap anything from our start to the end of the
4263				 * log, and our wrapped portion doesn't overlap anything at
4264				 * the start of the log.  We can break
4265				 */
4266				break;
4267			}
4268		}
4269	}
4270	flush_old_journal_lists(sb);
4271
4272	journal->j_current_jl->j_list_bitmap =
4273	    get_list_bitmap(sb, journal->j_current_jl);
4274
4275	if (!(journal->j_current_jl->j_list_bitmap)) {
4276		reiserfs_panic(sb, "journal-1996",
4277			       "could not get a list bitmap");
4278	}
4279
4280	atomic_set(&(journal->j_jlock), 0);
4281	unlock_journal(sb);
4282	/* wake up anybody waiting to join. */
4283	clear_bit(J_WRITERS_QUEUED, &journal->j_state);
4284	wake_up(&(journal->j_join_wait));
4285
	/* this is the "simple commit done outside the journal lock" case:
	 * no full flush was done above, but the caller wants to wait for
	 * the commit
	 */
4286	if (!flush && wait_on_commit &&
4287	    journal_list_still_alive(sb, commit_trans_id)) {
4288		flush_commit_list(sb, jl, 1);
4289	}
4290      out:
4291	reiserfs_check_lock_depth(sb, "journal end2");
4292
	/* the handle is wiped on both the early-exit and the normal path */
4293	memset(th, 0, sizeof(*th));
4294	/* Re-set th->t_super, so we can properly keep track of how many
4295	 * persistent transactions there are. We need to do this so if this
4296	 * call is part of a failed restart_transaction, we can free it later */
4297	th->t_super = sb;
4298
4299	return journal->j_errno;
4300}
4302/* Send the file system read only and refuse new transactions */
4303void reiserfs_abort_journal(struct super_block *sb, int errno)
4304{
4305	struct reiserfs_journal *journal = SB_JOURNAL(sb);
4306	if (test_bit(J_ABORTED, &journal->j_state))
4307		return;
4308
4309	if (!journal->j_errno)
4310		journal->j_errno = errno;
4311
4312	sb->s_flags |= MS_RDONLY;
4313	set_bit(J_ABORTED, &journal->j_state);
4314
4315#ifdef CONFIG_REISERFS_CHECK
4316	dump_stack();
4317#endif
4318}
4319
4320