inode.c revision 27b3a5c51b50a234fb4a21146841e6723b5934ce
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5#include <linux/time.h>
6#include <linux/fs.h>
7#include <linux/reiserfs_fs.h>
8#include <linux/reiserfs_acl.h>
9#include <linux/reiserfs_xattr.h>
10#include <linux/exportfs.h>
11#include <linux/smp_lock.h>
12#include <linux/pagemap.h>
13#include <linux/highmem.h>
14#include <asm/uaccess.h>
15#include <asm/unaligned.h>
16#include <linux/buffer_head.h>
17#include <linux/mpage.h>
18#include <linux/writeback.h>
19#include <linux/quotaops.h>
20#include <linux/swap.h>
21
/*
 * Forward declarations of the page write helpers; convert_tail_for_hole()
 * calls both of them further down in this file.
 */
int reiserfs_prepare_write(struct file *f, struct page *page,
			   unsigned int from, unsigned int to);
int reiserfs_commit_write(struct file *f, struct page *page,
			  unsigned int from, unsigned int to);
26
27void reiserfs_delete_inode(struct inode *inode)
28{
29	/* We need blocks for transaction + (user+group) quota update (possibly delete) */
30	int jbegin_count =
31	    JOURNAL_PER_BALANCE_CNT * 2 +
32	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
33	struct reiserfs_transaction_handle th;
34	int err;
35
36	truncate_inode_pages(&inode->i_data, 0);
37
38	reiserfs_write_lock(inode->i_sb);
39
40	/* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
41	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {	/* also handles bad_inode case */
42		reiserfs_delete_xattrs(inode);
43
44		if (journal_begin(&th, inode->i_sb, jbegin_count))
45			goto out;
46		reiserfs_update_inode_transaction(inode);
47
48		reiserfs_discard_prealloc(&th, inode);
49
50		err = reiserfs_delete_object(&th, inode);
51
52		/* Do quota update inside a transaction for journaled quotas. We must do that
53		 * after delete_object so that quota updates go into the same transaction as
54		 * stat data deletion */
55		if (!err)
56			vfs_dq_free_inode(inode);
57
58		if (journal_end(&th, inode->i_sb, jbegin_count))
59			goto out;
60
61		/* check return value from reiserfs_delete_object after
62		 * ending the transaction
63		 */
64		if (err)
65		    goto out;
66
67		/* all items of file are deleted, so we can remove "save" link */
68		remove_save_link(inode, 0 /* not truncate */ );	/* we can't do anything
69								 * about an error here */
70	} else {
71		/* no object items are in the tree */
72		;
73	}
74      out:
75	clear_inode(inode);	/* note this must go after the journal_end to prevent deadlock */
76	inode->i_blocks = 0;
77	reiserfs_write_unlock(inode->i_sb);
78}
79
80static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
81			  __u32 objectid, loff_t offset, int type, int length)
82{
83	key->version = version;
84
85	key->on_disk_key.k_dir_id = dirid;
86	key->on_disk_key.k_objectid = objectid;
87	set_cpu_key_k_offset(key, offset);
88	set_cpu_key_k_type(key, type);
89	key->key_length = length;
90}
91
92/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
93   offset and type of key */
94void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
95		  int type, int length)
96{
97	_make_cpu_key(key, get_inode_item_key_version(inode),
98		      le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
99		      le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
100		      length);
101}
102
103//
104// when key is 0, do not set version and short key
105//
106inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
107			      int version,
108			      loff_t offset, int type, int length,
109			      int entry_count /*or ih_free_space */ )
110{
111	if (key) {
112		ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
113		ih->ih_key.k_objectid =
114		    cpu_to_le32(key->on_disk_key.k_objectid);
115	}
116	put_ih_version(ih, version);
117	set_le_ih_k_offset(ih, offset);
118	set_le_ih_k_type(ih, type);
119	put_ih_item_len(ih, length);
120	/*    set_ih_free_space (ih, 0); */
121	// for directory items it is entry count, for directs and stat
122	// datas - 0xffff, for indirects - 0
123	put_ih_entry_count(ih, entry_count);
124}
125
126//
127// FIXME: we might cache recently accessed indirect item
128
129// Ugh.  Not too eager for that....
130//  I cut the code until such time as I see a convincing argument (benchmark).
131// I don't want a bloated inode struct..., and I don't like code complexity....
132
/* Cutting the code is fine, since it really isn't in use yet and is easy
** to add back in.  But Vladimir has a really good idea here.  Think
** about what happens when reading a file.  For each page, the VFS layer
** calls reiserfs_readpage, which searches the tree to find an indirect
** item.  This indirect item has X pointers, where X is a big number if
** we've done the block allocation right.  But we only use one or two of
** these pointers during each call to readpage, and then needlessly
** search for the same item again later on.
**
** The size of the cache could be dynamic based on the size of the file.
**
** I'd also like to see us cache the location of the stat data item, since
** we are needlessly re-searching for that frequently.
**
** --chris
*/
149
150/* If this page has a file tail in it, and
151** it was read in by get_block_create_0, the page data is valid,
152** but tail is still sitting in a direct item, and we can't write to
153** it.  So, look through this page, and check all the mapped buffers
154** to make sure they have valid block numbers.  Any that don't need
155** to be unmapped, so that block_prepare_write will correctly call
156** reiserfs_get_block to convert the tail into an unformatted node
157*/
158static inline void fix_tail_page_for_writing(struct page *page)
159{
160	struct buffer_head *head, *next, *bh;
161
162	if (page && page_has_buffers(page)) {
163		head = page_buffers(page);
164		bh = head;
165		do {
166			next = bh->b_this_page;
167			if (buffer_mapped(bh) && bh->b_blocknr == 0) {
168				reiserfs_unmap_buffer(bh);
169			}
170			bh = next;
171		} while (bh != head);
172	}
173}
174
175/* reiserfs_get_block does not need to allocate a block only if it has been
176   done already or non-hole position has been found in the indirect item */
177static inline int allocation_needed(int retval, b_blocknr_t allocated,
178				    struct item_head *ih,
179				    __le32 * item, int pos_in_item)
180{
181	if (allocated)
182		return 0;
183	if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
184	    get_block_num(item, pos_in_item))
185		return 0;
186	return 1;
187}
188
189static inline int indirect_item_found(int retval, struct item_head *ih)
190{
191	return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
192}
193
194static inline void set_block_dev_mapped(struct buffer_head *bh,
195					b_blocknr_t block, struct inode *inode)
196{
197	map_bh(bh, inode->i_sb, block);
198}
199
200//
201// files which were created in the earlier version can not be longer,
202// than 2 gb
203//
204static int file_capable(struct inode *inode, sector_t block)
205{
206	if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||	// it is new file.
207	    block < (1 << (31 - inode->i_sb->s_blocksize_bits)))	// old file, but 'block' is inside of 2gb
208		return 1;
209
210	return 0;
211}
212
213static int restart_transaction(struct reiserfs_transaction_handle *th,
214			       struct inode *inode, struct treepath *path)
215{
216	struct super_block *s = th->t_super;
217	int len = th->t_blocks_allocated;
218	int err;
219
220	BUG_ON(!th->t_trans_id);
221	BUG_ON(!th->t_refcount);
222
223	pathrelse(path);
224
225	/* we cannot restart while nested */
226	if (th->t_refcount > 1) {
227		return 0;
228	}
229	reiserfs_update_sd(th, inode);
230	err = journal_end(th, s, len);
231	if (!err) {
232		err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
233		if (!err)
234			reiserfs_update_inode_transaction(inode);
235	}
236	return err;
237}
238
239// it is called by get_block when create == 0. Returns block number
240// for 'block'-th logical block of file. When it hits direct item it
241// returns 0 (being called from bmap) or read direct item into piece
242// of page (bh_result)
243
244// Please improve the english/clarity in the comment above, as it is
245// hard to understand.
246
247static int _get_block_create_0(struct inode *inode, sector_t block,
248			       struct buffer_head *bh_result, int args)
249{
250	INITIALIZE_PATH(path);
251	struct cpu_key key;
252	struct buffer_head *bh;
253	struct item_head *ih, tmp_ih;
254	b_blocknr_t blocknr;
255	char *p = NULL;
256	int chars;
257	int ret;
258	int result;
259	int done = 0;
260	unsigned long offset;
261
262	// prepare the key to look for the 'block'-th block of file
263	make_cpu_key(&key, inode,
264		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
265		     3);
266
267	result = search_for_position_by_key(inode->i_sb, &key, &path);
268	if (result != POSITION_FOUND) {
269		pathrelse(&path);
270		if (p)
271			kunmap(bh_result->b_page);
272		if (result == IO_ERROR)
273			return -EIO;
274		// We do not return -ENOENT if there is a hole but page is uptodate, because it means
275		// That there is some MMAPED data associated with it that is yet to be written to disk.
276		if ((args & GET_BLOCK_NO_HOLE)
277		    && !PageUptodate(bh_result->b_page)) {
278			return -ENOENT;
279		}
280		return 0;
281	}
282	//
283	bh = get_last_bh(&path);
284	ih = get_ih(&path);
285	if (is_indirect_le_ih(ih)) {
286		__le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
287
288		/* FIXME: here we could cache indirect item or part of it in
289		   the inode to avoid search_by_key in case of subsequent
290		   access to file */
291		blocknr = get_block_num(ind_item, path.pos_in_item);
292		ret = 0;
293		if (blocknr) {
294			map_bh(bh_result, inode->i_sb, blocknr);
295			if (path.pos_in_item ==
296			    ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
297				set_buffer_boundary(bh_result);
298			}
299		} else
300			// We do not return -ENOENT if there is a hole but page is uptodate, because it means
301			// That there is some MMAPED data associated with it that is yet to  be written to disk.
302		if ((args & GET_BLOCK_NO_HOLE)
303			    && !PageUptodate(bh_result->b_page)) {
304			ret = -ENOENT;
305		}
306
307		pathrelse(&path);
308		if (p)
309			kunmap(bh_result->b_page);
310		return ret;
311	}
312	// requested data are in direct item(s)
313	if (!(args & GET_BLOCK_READ_DIRECT)) {
314		// we are called by bmap. FIXME: we can not map block of file
315		// when it is stored in direct item(s)
316		pathrelse(&path);
317		if (p)
318			kunmap(bh_result->b_page);
319		return -ENOENT;
320	}
321
322	/* if we've got a direct item, and the buffer or page was uptodate,
323	 ** we don't want to pull data off disk again.  skip to the
324	 ** end, where we map the buffer and return
325	 */
326	if (buffer_uptodate(bh_result)) {
327		goto finished;
328	} else
329		/*
330		 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
331		 ** pages without any buffers.  If the page is up to date, we don't want
332		 ** read old data off disk.  Set the up to date bit on the buffer instead
333		 ** and jump to the end
334		 */
335	if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
336		set_buffer_uptodate(bh_result);
337		goto finished;
338	}
339	// read file tail into part of page
340	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
341	copy_item_head(&tmp_ih, ih);
342
343	/* we only want to kmap if we are reading the tail into the page.
344	 ** this is not the common case, so we don't kmap until we are
345	 ** sure we need to.  But, this means the item might move if
346	 ** kmap schedules
347	 */
348	if (!p)
349		p = (char *)kmap(bh_result->b_page);
350
351	p += offset;
352	memset(p, 0, inode->i_sb->s_blocksize);
353	do {
354		if (!is_direct_le_ih(ih)) {
355			BUG();
356		}
357		/* make sure we don't read more bytes than actually exist in
358		 ** the file.  This can happen in odd cases where i_size isn't
359		 ** correct, and when direct item padding results in a few
360		 ** extra bytes at the end of the direct item
361		 */
362		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
363			break;
364		if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
365			chars =
366			    inode->i_size - (le_ih_k_offset(ih) - 1) -
367			    path.pos_in_item;
368			done = 1;
369		} else {
370			chars = ih_item_len(ih) - path.pos_in_item;
371		}
372		memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
373
374		if (done)
375			break;
376
377		p += chars;
378
379		if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
380			// we done, if read direct item is not the last item of
381			// node FIXME: we could try to check right delimiting key
382			// to see whether direct item continues in the right
383			// neighbor or rely on i_size
384			break;
385
386		// update key to look for the next piece
387		set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
388		result = search_for_position_by_key(inode->i_sb, &key, &path);
389		if (result != POSITION_FOUND)
390			// i/o error most likely
391			break;
392		bh = get_last_bh(&path);
393		ih = get_ih(&path);
394	} while (1);
395
396	flush_dcache_page(bh_result->b_page);
397	kunmap(bh_result->b_page);
398
399      finished:
400	pathrelse(&path);
401
402	if (result == IO_ERROR)
403		return -EIO;
404
405	/* this buffer has valid data, but isn't valid for io.  mapping it to
406	 * block #0 tells the rest of reiserfs it just has a tail in it
407	 */
408	map_bh(bh_result, inode->i_sb, 0);
409	set_buffer_uptodate(bh_result);
410	return 0;
411}
412
413// this is called to create file map. So, _get_block_create_0 will not
414// read direct item
415static int reiserfs_bmap(struct inode *inode, sector_t block,
416			 struct buffer_head *bh_result, int create)
417{
418	if (!file_capable(inode, block))
419		return -EFBIG;
420
421	reiserfs_write_lock(inode->i_sb);
422	/* do not read the direct item */
423	_get_block_create_0(inode, block, bh_result, 0);
424	reiserfs_write_unlock(inode->i_sb);
425	return 0;
426}
427
428/* special version of get_block that is only used by grab_tail_page right
429** now.  It is sent to block_prepare_write, and when you try to get a
430** block past the end of the file (or a block from a hole) it returns
431** -ENOENT instead of a valid buffer.  block_prepare_write expects to
432** be able to do i/o on the buffers returned, unless an error value
433** is also returned.
434**
435** So, this allows block_prepare_write to be used for reading a single block
436** in a page.  Where it does not produce a valid page for holes, or past the
437** end of the file.  This turns out to be exactly what we need for reading
438** tails for conversion.
439**
440** The point of the wrapper is forcing a certain value for create, even
441** though the VFS layer is calling this function with create==1.  If you
442** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
443** don't use this function.
444*/
445static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
446				       struct buffer_head *bh_result,
447				       int create)
448{
449	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
450}
451
452/* This is special helper for reiserfs_get_block in case we are executing
453   direct_IO request. */
454static int reiserfs_get_blocks_direct_io(struct inode *inode,
455					 sector_t iblock,
456					 struct buffer_head *bh_result,
457					 int create)
458{
459	int ret;
460
461	bh_result->b_page = NULL;
462
463	/* We set the b_size before reiserfs_get_block call since it is
464	   referenced in convert_tail_for_hole() that may be called from
465	   reiserfs_get_block() */
466	bh_result->b_size = (1 << inode->i_blkbits);
467
468	ret = reiserfs_get_block(inode, iblock, bh_result,
469				 create | GET_BLOCK_NO_DANGLE);
470	if (ret)
471		goto out;
472
473	/* don't allow direct io onto tail pages */
474	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
475		/* make sure future calls to the direct io funcs for this offset
476		 ** in the file fail by unmapping the buffer
477		 */
478		clear_buffer_mapped(bh_result);
479		ret = -EINVAL;
480	}
481	/* Possible unpacked tail. Flush the data before pages have
482	   disappeared */
483	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
484		int err;
485
486		reiserfs_write_lock(inode->i_sb);
487
488		err = reiserfs_commit_for_inode(inode);
489		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
490
491		reiserfs_write_unlock(inode->i_sb);
492
493		if (err < 0)
494			ret = err;
495	}
496      out:
497	return ret;
498}
499
500/*
501** helper function for when reiserfs_get_block is called for a hole
502** but the file tail is still in a direct item
503** bh_result is the buffer head for the hole
504** tail_offset is the offset of the start of the tail in the file
505**
506** This calls prepare_write, which will start a new transaction
507** you should not be in a transaction, or have any paths held when you
508** call this.
509*/
510static int convert_tail_for_hole(struct inode *inode,
511				 struct buffer_head *bh_result,
512				 loff_t tail_offset)
513{
514	unsigned long index;
515	unsigned long tail_end;
516	unsigned long tail_start;
517	struct page *tail_page;
518	struct page *hole_page = bh_result->b_page;
519	int retval = 0;
520
521	if ((tail_offset & (bh_result->b_size - 1)) != 1)
522		return -EIO;
523
524	/* always try to read until the end of the block */
525	tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
526	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
527
528	index = tail_offset >> PAGE_CACHE_SHIFT;
529	/* hole_page can be zero in case of direct_io, we are sure
530	   that we cannot get here if we write with O_DIRECT into
531	   tail page */
532	if (!hole_page || index != hole_page->index) {
533		tail_page = grab_cache_page(inode->i_mapping, index);
534		retval = -ENOMEM;
535		if (!tail_page) {
536			goto out;
537		}
538	} else {
539		tail_page = hole_page;
540	}
541
542	/* we don't have to make sure the conversion did not happen while
543	 ** we were locking the page because anyone that could convert
544	 ** must first take i_mutex.
545	 **
546	 ** We must fix the tail page for writing because it might have buffers
547	 ** that are mapped, but have a block number of 0.  This indicates tail
548	 ** data that has been read directly into the page, and block_prepare_write
549	 ** won't trigger a get_block in this case.
550	 */
551	fix_tail_page_for_writing(tail_page);
552	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
553	if (retval)
554		goto unlock;
555
556	/* tail conversion might change the data in the page */
557	flush_dcache_page(tail_page);
558
559	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
560
561      unlock:
562	if (tail_page != hole_page) {
563		unlock_page(tail_page);
564		page_cache_release(tail_page);
565	}
566      out:
567	return retval;
568}
569
570static inline int _allocate_block(struct reiserfs_transaction_handle *th,
571				  sector_t block,
572				  struct inode *inode,
573				  b_blocknr_t * allocated_block_nr,
574				  struct treepath *path, int flags)
575{
576	BUG_ON(!th->t_trans_id);
577
578#ifdef REISERFS_PREALLOCATE
579	if (!(flags & GET_BLOCK_NO_IMUX)) {
580		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
581						  path, block);
582	}
583#endif
584	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
585					 block);
586}
587
588int reiserfs_get_block(struct inode *inode, sector_t block,
589		       struct buffer_head *bh_result, int create)
590{
591	int repeat, retval = 0;
592	b_blocknr_t allocated_block_nr = 0;	// b_blocknr_t is (unsigned) 32 bit int
593	INITIALIZE_PATH(path);
594	int pos_in_item;
595	struct cpu_key key;
596	struct buffer_head *bh, *unbh = NULL;
597	struct item_head *ih, tmp_ih;
598	__le32 *item;
599	int done;
600	int fs_gen;
601	int lock_depth;
602	struct reiserfs_transaction_handle *th = NULL;
603	/* space reserved in transaction batch:
604	   . 3 balancings in direct->indirect conversion
605	   . 1 block involved into reiserfs_update_sd()
606	   XXX in practically impossible worst case direct2indirect()
607	   can incur (much) more than 3 balancings.
608	   quota update for user, group */
609	int jbegin_count =
610	    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
611	    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
612	int version;
613	int dangle = 1;
614	loff_t new_offset =
615	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
616
617	lock_depth = reiserfs_write_lock_once(inode->i_sb);
618	version = get_inode_item_key_version(inode);
619
620	if (!file_capable(inode, block)) {
621		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
622		return -EFBIG;
623	}
624
625	/* if !create, we aren't changing the FS, so we don't need to
626	 ** log anything, so we don't need to start a transaction
627	 */
628	if (!(create & GET_BLOCK_CREATE)) {
629		int ret;
630		/* find number of block-th logical block of the file */
631		ret = _get_block_create_0(inode, block, bh_result,
632					  create | GET_BLOCK_READ_DIRECT);
633		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
634		return ret;
635	}
636	/*
637	 * if we're already in a transaction, make sure to close
638	 * any new transactions we start in this func
639	 */
640	if ((create & GET_BLOCK_NO_DANGLE) ||
641	    reiserfs_transaction_running(inode->i_sb))
642		dangle = 0;
643
644	/* If file is of such a size, that it might have a tail and tails are enabled
645	 ** we should mark it as possibly needing tail packing on close
646	 */
647	if ((have_large_tails(inode->i_sb)
648	     && inode->i_size < i_block_size(inode) * 4)
649	    || (have_small_tails(inode->i_sb)
650		&& inode->i_size < i_block_size(inode)))
651		REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
652
653	/* set the key of the first byte in the 'block'-th block of file */
654	make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
655	if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
656	      start_trans:
657		th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
658		if (!th) {
659			retval = -ENOMEM;
660			goto failure;
661		}
662		reiserfs_update_inode_transaction(inode);
663	}
664      research:
665
666	retval = search_for_position_by_key(inode->i_sb, &key, &path);
667	if (retval == IO_ERROR) {
668		retval = -EIO;
669		goto failure;
670	}
671
672	bh = get_last_bh(&path);
673	ih = get_ih(&path);
674	item = get_item(&path);
675	pos_in_item = path.pos_in_item;
676
677	fs_gen = get_generation(inode->i_sb);
678	copy_item_head(&tmp_ih, ih);
679
680	if (allocation_needed
681	    (retval, allocated_block_nr, ih, item, pos_in_item)) {
682		/* we have to allocate block for the unformatted node */
683		if (!th) {
684			pathrelse(&path);
685			goto start_trans;
686		}
687
688		repeat =
689		    _allocate_block(th, block, inode, &allocated_block_nr,
690				    &path, create);
691
692		if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
693			/* restart the transaction to give the journal a chance to free
694			 ** some blocks.  releases the path, so we have to go back to
695			 ** research if we succeed on the second try
696			 */
697			SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
698			retval = restart_transaction(th, inode, &path);
699			if (retval)
700				goto failure;
701			repeat =
702			    _allocate_block(th, block, inode,
703					    &allocated_block_nr, NULL, create);
704
705			if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
706				goto research;
707			}
708			if (repeat == QUOTA_EXCEEDED)
709				retval = -EDQUOT;
710			else
711				retval = -ENOSPC;
712			goto failure;
713		}
714
715		if (fs_changed(fs_gen, inode->i_sb)
716		    && item_moved(&tmp_ih, &path)) {
717			goto research;
718		}
719	}
720
721	if (indirect_item_found(retval, ih)) {
722		b_blocknr_t unfm_ptr;
723		/* 'block'-th block is in the file already (there is
724		   corresponding cell in some indirect item). But it may be
725		   zero unformatted node pointer (hole) */
726		unfm_ptr = get_block_num(item, pos_in_item);
727		if (unfm_ptr == 0) {
728			/* use allocated block to plug the hole */
729			reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
730			if (fs_changed(fs_gen, inode->i_sb)
731			    && item_moved(&tmp_ih, &path)) {
732				reiserfs_restore_prepared_buffer(inode->i_sb,
733								 bh);
734				goto research;
735			}
736			set_buffer_new(bh_result);
737			if (buffer_dirty(bh_result)
738			    && reiserfs_data_ordered(inode->i_sb))
739				reiserfs_add_ordered_list(inode, bh_result);
740			put_block_num(item, pos_in_item, allocated_block_nr);
741			unfm_ptr = allocated_block_nr;
742			journal_mark_dirty(th, inode->i_sb, bh);
743			reiserfs_update_sd(th, inode);
744		}
745		set_block_dev_mapped(bh_result, unfm_ptr, inode);
746		pathrelse(&path);
747		retval = 0;
748		if (!dangle && th)
749			retval = reiserfs_end_persistent_transaction(th);
750
751		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
752
753		/* the item was found, so new blocks were not added to the file
754		 ** there is no need to make sure the inode is updated with this
755		 ** transaction
756		 */
757		return retval;
758	}
759
760	if (!th) {
761		pathrelse(&path);
762		goto start_trans;
763	}
764
765	/* desired position is not found or is in the direct item. We have
766	   to append file with holes up to 'block'-th block converting
767	   direct items to indirect one if necessary */
768	done = 0;
769	do {
770		if (is_statdata_le_ih(ih)) {
771			__le32 unp = 0;
772			struct cpu_key tmp_key;
773
774			/* indirect item has to be inserted */
775			make_le_item_head(&tmp_ih, &key, version, 1,
776					  TYPE_INDIRECT, UNFM_P_SIZE,
777					  0 /* free_space */ );
778
779			if (cpu_key_k_offset(&key) == 1) {
780				/* we are going to add 'block'-th block to the file. Use
781				   allocated block for that */
782				unp = cpu_to_le32(allocated_block_nr);
783				set_block_dev_mapped(bh_result,
784						     allocated_block_nr, inode);
785				set_buffer_new(bh_result);
786				done = 1;
787			}
788			tmp_key = key;	// ;)
789			set_cpu_key_k_offset(&tmp_key, 1);
790			PATH_LAST_POSITION(&path)++;
791
792			retval =
793			    reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
794						 inode, (char *)&unp);
795			if (retval) {
796				reiserfs_free_block(th, inode,
797						    allocated_block_nr, 1);
798				goto failure;	// retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
799			}
800			//mark_tail_converted (inode);
801		} else if (is_direct_le_ih(ih)) {
802			/* direct item has to be converted */
803			loff_t tail_offset;
804
805			tail_offset =
806			    ((le_ih_k_offset(ih) -
807			      1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
808			if (tail_offset == cpu_key_k_offset(&key)) {
809				/* direct item we just found fits into block we have
810				   to map. Convert it into unformatted node: use
811				   bh_result for the conversion */
812				set_block_dev_mapped(bh_result,
813						     allocated_block_nr, inode);
814				unbh = bh_result;
815				done = 1;
816			} else {
817				/* we have to padd file tail stored in direct item(s)
818				   up to block size and convert it to unformatted
819				   node. FIXME: this should also get into page cache */
820
821				pathrelse(&path);
822				/*
823				 * ugly, but we can only end the transaction if
824				 * we aren't nested
825				 */
826				BUG_ON(!th->t_refcount);
827				if (th->t_refcount == 1) {
828					retval =
829					    reiserfs_end_persistent_transaction
830					    (th);
831					th = NULL;
832					if (retval)
833						goto failure;
834				}
835
836				retval =
837				    convert_tail_for_hole(inode, bh_result,
838							  tail_offset);
839				if (retval) {
840					if (retval != -ENOSPC)
841						reiserfs_error(inode->i_sb,
842							"clm-6004",
843							"convert tail failed "
844							"inode %lu, error %d",
845							inode->i_ino,
846							retval);
847					if (allocated_block_nr) {
848						/* the bitmap, the super, and the stat data == 3 */
849						if (!th)
850							th = reiserfs_persistent_transaction(inode->i_sb, 3);
851						if (th)
852							reiserfs_free_block(th,
853									    inode,
854									    allocated_block_nr,
855									    1);
856					}
857					goto failure;
858				}
859				goto research;
860			}
861			retval =
862			    direct2indirect(th, inode, &path, unbh,
863					    tail_offset);
864			if (retval) {
865				reiserfs_unmap_buffer(unbh);
866				reiserfs_free_block(th, inode,
867						    allocated_block_nr, 1);
868				goto failure;
869			}
870			/* it is important the set_buffer_uptodate is done after
871			 ** the direct2indirect.  The buffer might contain valid
872			 ** data newer than the data on disk (read by readpage, changed,
873			 ** and then sent here by writepage).  direct2indirect needs
874			 ** to know if unbh was already up to date, so it can decide
875			 ** if the data in unbh needs to be replaced with data from
876			 ** the disk
877			 */
878			set_buffer_uptodate(unbh);
879
880			/* unbh->b_page == NULL in case of DIRECT_IO request, this means
881			   buffer will disappear shortly, so it should not be added to
882			 */
883			if (unbh->b_page) {
884				/* we've converted the tail, so we must
885				 ** flush unbh before the transaction commits
886				 */
887				reiserfs_add_tail_list(inode, unbh);
888
889				/* mark it dirty now to prevent commit_write from adding
890				 ** this buffer to the inode's dirty buffer list
891				 */
892				/*
893				 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
894				 * It's still atomic, but it sets the page dirty too,
895				 * which makes it eligible for writeback at any time by the
896				 * VM (which was also the case with __mark_buffer_dirty())
897				 */
898				mark_buffer_dirty(unbh);
899			}
900		} else {
901			/* append indirect item with holes if needed, when appending
902			   pointer to 'block'-th block use block, which is already
903			   allocated */
904			struct cpu_key tmp_key;
905			unp_t unf_single = 0;	// We use this in case we need to allocate only
906			// one block which is a fastpath
907			unp_t *un;
908			__u64 max_to_insert =
909			    MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
910			    UNFM_P_SIZE;
911			__u64 blocks_needed;
912
913			RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
914			       "vs-804: invalid position for append");
915			/* indirect item has to be appended, set up key of that position */
916			make_cpu_key(&tmp_key, inode,
917				     le_key_k_offset(version,
918						     &(ih->ih_key)) +
919				     op_bytes_number(ih,
920						     inode->i_sb->s_blocksize),
921				     //pos_in_item * inode->i_sb->s_blocksize,
922				     TYPE_INDIRECT, 3);	// key type is unimportant
923
924			RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
925			       "green-805: invalid offset");
926			blocks_needed =
927			    1 +
928			    ((cpu_key_k_offset(&key) -
929			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
930			     s_blocksize_bits);
931
932			if (blocks_needed == 1) {
933				un = &unf_single;
934			} else {
935				un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);	// We need to avoid scheduling.
936				if (!un) {
937					un = &unf_single;
938					blocks_needed = 1;
939					max_to_insert = 0;
940				}
941			}
942			if (blocks_needed <= max_to_insert) {
943				/* we are going to add target block to the file. Use allocated
944				   block for that */
945				un[blocks_needed - 1] =
946				    cpu_to_le32(allocated_block_nr);
947				set_block_dev_mapped(bh_result,
948						     allocated_block_nr, inode);
949				set_buffer_new(bh_result);
950				done = 1;
951			} else {
952				/* paste hole to the indirect item */
953				/* If kmalloc failed, max_to_insert becomes zero and it means we
954				   only have space for one block */
955				blocks_needed =
956				    max_to_insert ? max_to_insert : 1;
957			}
958			retval =
959			    reiserfs_paste_into_item(th, &path, &tmp_key, inode,
960						     (char *)un,
961						     UNFM_P_SIZE *
962						     blocks_needed);
963
964			if (blocks_needed != 1)
965				kfree(un);
966
967			if (retval) {
968				reiserfs_free_block(th, inode,
969						    allocated_block_nr, 1);
970				goto failure;
971			}
972			if (!done) {
973				/* We need to mark new file size in case this function will be
974				   interrupted/aborted later on. And we may do this only for
975				   holes. */
976				inode->i_size +=
977				    inode->i_sb->s_blocksize * blocks_needed;
978			}
979		}
980
981		if (done == 1)
982			break;
983
984		/* this loop could log more blocks than we had originally asked
985		 ** for.  So, we have to allow the transaction to end if it is
986		 ** too big or too full.  Update the inode so things are
987		 ** consistent if we crash before the function returns
988		 **
989		 ** release the path so that anybody waiting on the path before
990		 ** ending their transaction will be able to continue.
991		 */
992		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
993			retval = restart_transaction(th, inode, &path);
994			if (retval)
995				goto failure;
996		}
997		/*
998		 * inserting indirect pointers for a hole can take a
999		 * long time.  reschedule if needed and also release the write
1000		 * lock for others.
1001		 */
1002		if (need_resched()) {
1003			reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1004			schedule();
1005			lock_depth = reiserfs_write_lock_once(inode->i_sb);
1006		}
1007
1008		retval = search_for_position_by_key(inode->i_sb, &key, &path);
1009		if (retval == IO_ERROR) {
1010			retval = -EIO;
1011			goto failure;
1012		}
1013		if (retval == POSITION_FOUND) {
1014			reiserfs_warning(inode->i_sb, "vs-825",
1015					 "%K should not be found", &key);
1016			retval = -EEXIST;
1017			if (allocated_block_nr)
1018				reiserfs_free_block(th, inode,
1019						    allocated_block_nr, 1);
1020			pathrelse(&path);
1021			goto failure;
1022		}
1023		bh = get_last_bh(&path);
1024		ih = get_ih(&path);
1025		item = get_item(&path);
1026		pos_in_item = path.pos_in_item;
1027	} while (1);
1028
1029	retval = 0;
1030
1031      failure:
1032	if (th && (!dangle || (retval && !th->t_trans_id))) {
1033		int err;
1034		if (th->t_trans_id)
1035			reiserfs_update_sd(th, inode);
1036		err = reiserfs_end_persistent_transaction(th);
1037		if (err)
1038			retval = err;
1039	}
1040
1041	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1042	reiserfs_check_path(&path);
1043	return retval;
1044}
1045
1046static int
1047reiserfs_readpages(struct file *file, struct address_space *mapping,
1048		   struct list_head *pages, unsigned nr_pages)
1049{
1050	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
1051}
1052
1053/* Compute real number of used bytes by file
1054 * Following three functions can go away when we'll have enough space in stat item
1055 */
1056static int real_space_diff(struct inode *inode, int sd_size)
1057{
1058	int bytes;
1059	loff_t blocksize = inode->i_sb->s_blocksize;
1060
1061	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
1062		return sd_size;
1063
1064	/* End of file is also in full block with indirect reference, so round
1065	 ** up to the next block.
1066	 **
1067	 ** there is just no way to know if the tail is actually packed
1068	 ** on the file, so we have to assume it isn't.  When we pack the
1069	 ** tail, we add 4 bytes to pretend there really is an unformatted
1070	 ** node pointer
1071	 */
1072	bytes =
1073	    ((inode->i_size +
1074	      (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
1075	    sd_size;
1076	return bytes;
1077}
1078
1079static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1080					int sd_size)
1081{
1082	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1083		return inode->i_size +
1084		    (loff_t) (real_space_diff(inode, sd_size));
1085	}
1086	return ((loff_t) real_space_diff(inode, sd_size)) +
1087	    (((loff_t) blocks) << 9);
1088}
1089
1090/* Compute number of blocks used by file in ReiserFS counting */
1091static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1092{
1093	loff_t bytes = inode_get_bytes(inode);
1094	loff_t real_space = real_space_diff(inode, sd_size);
1095
1096	/* keeps fsck and non-quota versions of reiserfs happy */
1097	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1098		bytes += (loff_t) 511;
1099	}
1100
1101	/* files from before the quota patch might i_blocks such that
1102	 ** bytes < real_space.  Deal with that here to prevent it from
1103	 ** going negative.
1104	 */
1105	if (bytes < real_space)
1106		return 0;
1107	return (bytes - real_space) >> 9;
1108}
1109
1110//
1111// BAD: new directories have stat data of new type and all other items
1112// of old type. Version stored in the inode says about body items, so
1113// in update_stat_data we can not rely on inode, but have to check
1114// item version directly
1115//
1116
1117// called by read_locked_inode
1118static void init_inode(struct inode *inode, struct treepath *path)
1119{
1120	struct buffer_head *bh;
1121	struct item_head *ih;
1122	__u32 rdev;
1123	//int version = ITEM_VERSION_1;
1124
1125	bh = PATH_PLAST_BUFFER(path);
1126	ih = PATH_PITEM_HEAD(path);
1127
1128	copy_key(INODE_PKEY(inode), &(ih->ih_key));
1129
1130	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1131	REISERFS_I(inode)->i_flags = 0;
1132	REISERFS_I(inode)->i_prealloc_block = 0;
1133	REISERFS_I(inode)->i_prealloc_count = 0;
1134	REISERFS_I(inode)->i_trans_id = 0;
1135	REISERFS_I(inode)->i_jl = NULL;
1136	mutex_init(&(REISERFS_I(inode)->i_mmap));
1137	reiserfs_init_xattr_rwsem(inode);
1138
1139	if (stat_data_v1(ih)) {
1140		struct stat_data_v1 *sd =
1141		    (struct stat_data_v1 *)B_I_PITEM(bh, ih);
1142		unsigned long blocks;
1143
1144		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1145		set_inode_sd_version(inode, STAT_DATA_V1);
1146		inode->i_mode = sd_v1_mode(sd);
1147		inode->i_nlink = sd_v1_nlink(sd);
1148		inode->i_uid = sd_v1_uid(sd);
1149		inode->i_gid = sd_v1_gid(sd);
1150		inode->i_size = sd_v1_size(sd);
1151		inode->i_atime.tv_sec = sd_v1_atime(sd);
1152		inode->i_mtime.tv_sec = sd_v1_mtime(sd);
1153		inode->i_ctime.tv_sec = sd_v1_ctime(sd);
1154		inode->i_atime.tv_nsec = 0;
1155		inode->i_ctime.tv_nsec = 0;
1156		inode->i_mtime.tv_nsec = 0;
1157
1158		inode->i_blocks = sd_v1_blocks(sd);
1159		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1160		blocks = (inode->i_size + 511) >> 9;
1161		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
1162		if (inode->i_blocks > blocks) {
1163			// there was a bug in <=3.5.23 when i_blocks could take negative
1164			// values. Starting from 3.5.17 this value could even be stored in
1165			// stat data. For such files we set i_blocks based on file
1166			// size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
1167			// only updated if file's inode will ever change
1168			inode->i_blocks = blocks;
1169		}
1170
1171		rdev = sd_v1_rdev(sd);
1172		REISERFS_I(inode)->i_first_direct_byte =
1173		    sd_v1_first_direct_byte(sd);
1174		/* an early bug in the quota code can give us an odd number for the
1175		 ** block count.  This is incorrect, fix it here.
1176		 */
1177		if (inode->i_blocks & 1) {
1178			inode->i_blocks++;
1179		}
1180		inode_set_bytes(inode,
1181				to_real_used_space(inode, inode->i_blocks,
1182						   SD_V1_SIZE));
1183		/* nopack is initially zero for v1 objects. For v2 objects,
1184		   nopack is initialised from sd_attrs */
1185		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
1186	} else {
1187		// new stat data found, but object may have old items
1188		// (directories and symlinks)
1189		struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
1190
1191		inode->i_mode = sd_v2_mode(sd);
1192		inode->i_nlink = sd_v2_nlink(sd);
1193		inode->i_uid = sd_v2_uid(sd);
1194		inode->i_size = sd_v2_size(sd);
1195		inode->i_gid = sd_v2_gid(sd);
1196		inode->i_mtime.tv_sec = sd_v2_mtime(sd);
1197		inode->i_atime.tv_sec = sd_v2_atime(sd);
1198		inode->i_ctime.tv_sec = sd_v2_ctime(sd);
1199		inode->i_ctime.tv_nsec = 0;
1200		inode->i_mtime.tv_nsec = 0;
1201		inode->i_atime.tv_nsec = 0;
1202		inode->i_blocks = sd_v2_blocks(sd);
1203		rdev = sd_v2_rdev(sd);
1204		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1205			inode->i_generation =
1206			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1207		else
1208			inode->i_generation = sd_v2_generation(sd);
1209
1210		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1211			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1212		else
1213			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1214		REISERFS_I(inode)->i_first_direct_byte = 0;
1215		set_inode_sd_version(inode, STAT_DATA_V2);
1216		inode_set_bytes(inode,
1217				to_real_used_space(inode, inode->i_blocks,
1218						   SD_V2_SIZE));
1219		/* read persistent inode attributes from sd and initalise
1220		   generic inode flags from them */
1221		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
1222		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
1223	}
1224
1225	pathrelse(path);
1226	if (S_ISREG(inode->i_mode)) {
1227		inode->i_op = &reiserfs_file_inode_operations;
1228		inode->i_fop = &reiserfs_file_operations;
1229		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1230	} else if (S_ISDIR(inode->i_mode)) {
1231		inode->i_op = &reiserfs_dir_inode_operations;
1232		inode->i_fop = &reiserfs_dir_operations;
1233	} else if (S_ISLNK(inode->i_mode)) {
1234		inode->i_op = &reiserfs_symlink_inode_operations;
1235		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1236	} else {
1237		inode->i_blocks = 0;
1238		inode->i_op = &reiserfs_special_inode_operations;
1239		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
1240	}
1241}
1242
1243// update new stat data with inode fields
1244static void inode2sd(void *sd, struct inode *inode, loff_t size)
1245{
1246	struct stat_data *sd_v2 = (struct stat_data *)sd;
1247	__u16 flags;
1248
1249	set_sd_v2_mode(sd_v2, inode->i_mode);
1250	set_sd_v2_nlink(sd_v2, inode->i_nlink);
1251	set_sd_v2_uid(sd_v2, inode->i_uid);
1252	set_sd_v2_size(sd_v2, size);
1253	set_sd_v2_gid(sd_v2, inode->i_gid);
1254	set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
1255	set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
1256	set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
1257	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1258	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1259		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1260	else
1261		set_sd_v2_generation(sd_v2, inode->i_generation);
1262	flags = REISERFS_I(inode)->i_attrs;
1263	i_attrs_to_sd_attrs(inode, &flags);
1264	set_sd_v2_attrs(sd_v2, flags);
1265}
1266
1267// used to copy inode's fields to old stat data
1268static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
1269{
1270	struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
1271
1272	set_sd_v1_mode(sd_v1, inode->i_mode);
1273	set_sd_v1_uid(sd_v1, inode->i_uid);
1274	set_sd_v1_gid(sd_v1, inode->i_gid);
1275	set_sd_v1_nlink(sd_v1, inode->i_nlink);
1276	set_sd_v1_size(sd_v1, size);
1277	set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
1278	set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
1279	set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
1280
1281	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1282		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1283	else
1284		set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1285
1286	// Sigh. i_first_direct_byte is back
1287	set_sd_v1_first_direct_byte(sd_v1,
1288				    REISERFS_I(inode)->i_first_direct_byte);
1289}
1290
1291/* NOTE, you must prepare the buffer head before sending it here,
1292** and then log it after the call
1293*/
1294static void update_stat_data(struct treepath *path, struct inode *inode,
1295			     loff_t size)
1296{
1297	struct buffer_head *bh;
1298	struct item_head *ih;
1299
1300	bh = PATH_PLAST_BUFFER(path);
1301	ih = PATH_PITEM_HEAD(path);
1302
1303	if (!is_statdata_le_ih(ih))
1304		reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
1305			       INODE_PKEY(inode), ih);
1306
1307	if (stat_data_v1(ih)) {
1308		// path points to old stat data
1309		inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
1310	} else {
1311		inode2sd(B_I_PITEM(bh, ih), inode, size);
1312	}
1313
1314	return;
1315}
1316
1317void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
1318			     struct inode *inode, loff_t size)
1319{
1320	struct cpu_key key;
1321	INITIALIZE_PATH(path);
1322	struct buffer_head *bh;
1323	int fs_gen;
1324	struct item_head *ih, tmp_ih;
1325	int retval;
1326
1327	BUG_ON(!th->t_trans_id);
1328
1329	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);	//key type is unimportant
1330
1331	for (;;) {
1332		int pos;
1333		/* look for the object's stat data */
1334		retval = search_item(inode->i_sb, &key, &path);
1335		if (retval == IO_ERROR) {
1336			reiserfs_error(inode->i_sb, "vs-13050",
1337				       "i/o failure occurred trying to "
1338				       "update %K stat data", &key);
1339			return;
1340		}
1341		if (retval == ITEM_NOT_FOUND) {
1342			pos = PATH_LAST_POSITION(&path);
1343			pathrelse(&path);
1344			if (inode->i_nlink == 0) {
1345				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
1346				return;
1347			}
1348			reiserfs_warning(inode->i_sb, "vs-13060",
1349					 "stat data of object %k (nlink == %d) "
1350					 "not found (pos %d)",
1351					 INODE_PKEY(inode), inode->i_nlink,
1352					 pos);
1353			reiserfs_check_path(&path);
1354			return;
1355		}
1356
1357		/* sigh, prepare_for_journal might schedule.  When it schedules the
1358		 ** FS might change.  We have to detect that, and loop back to the
1359		 ** search if the stat data item has moved
1360		 */
1361		bh = get_last_bh(&path);
1362		ih = get_ih(&path);
1363		copy_item_head(&tmp_ih, ih);
1364		fs_gen = get_generation(inode->i_sb);
1365		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
1366		if (fs_changed(fs_gen, inode->i_sb)
1367		    && item_moved(&tmp_ih, &path)) {
1368			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
1369			continue;	/* Stat_data item has been moved after scheduling. */
1370		}
1371		break;
1372	}
1373	update_stat_data(&path, inode, size);
1374	journal_mark_dirty(th, th->t_super, bh);
1375	pathrelse(&path);
1376	return;
1377}
1378
1379/* reiserfs_read_locked_inode is called to read the inode off disk, and it
1380** does a make_bad_inode when things go wrong.  But, we need to make sure
1381** and clear the key in the private portion of the inode, otherwise a
1382** corresponding iput might try to delete whatever object the inode last
1383** represented.
1384*/
1385static void reiserfs_make_bad_inode(struct inode *inode)
1386{
1387	memset(INODE_PKEY(inode), 0, KEY_SIZE);
1388	make_bad_inode(inode);
1389}
1390
1391//
1392// initially this function was derived from minix or ext2's analog and
1393// evolved as the prototype did
1394//
1395
1396int reiserfs_init_locked_inode(struct inode *inode, void *p)
1397{
1398	struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
1399	inode->i_ino = args->objectid;
1400	INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1401	return 0;
1402}
1403
1404/* looks for stat data in the tree, and fills up the fields of in-core
1405   inode stat data fields */
1406void reiserfs_read_locked_inode(struct inode *inode,
1407				struct reiserfs_iget_args *args)
1408{
1409	INITIALIZE_PATH(path_to_sd);
1410	struct cpu_key key;
1411	unsigned long dirino;
1412	int retval;
1413
1414	dirino = args->dirid;
1415
1416	/* set version 1, version 2 could be used too, because stat data
1417	   key is the same in both versions */
1418	key.version = KEY_FORMAT_3_5;
1419	key.on_disk_key.k_dir_id = dirino;
1420	key.on_disk_key.k_objectid = inode->i_ino;
1421	key.on_disk_key.k_offset = 0;
1422	key.on_disk_key.k_type = 0;
1423
1424	/* look for the object's stat data */
1425	retval = search_item(inode->i_sb, &key, &path_to_sd);
1426	if (retval == IO_ERROR) {
1427		reiserfs_error(inode->i_sb, "vs-13070",
1428			       "i/o failure occurred trying to find "
1429			       "stat data of %K", &key);
1430		reiserfs_make_bad_inode(inode);
1431		return;
1432	}
1433	if (retval != ITEM_FOUND) {
1434		/* a stale NFS handle can trigger this without it being an error */
1435		pathrelse(&path_to_sd);
1436		reiserfs_make_bad_inode(inode);
1437		inode->i_nlink = 0;
1438		return;
1439	}
1440
1441	init_inode(inode, &path_to_sd);
1442
1443	/* It is possible that knfsd is trying to access inode of a file
1444	   that is being removed from the disk by some other thread. As we
1445	   update sd on unlink all that is required is to check for nlink
1446	   here. This bug was first found by Sizif when debugging
1447	   SquidNG/Butterfly, forgotten, and found again after Philippe
1448	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1449
1450	   More logical fix would require changes in fs/inode.c:iput() to
1451	   remove inode from hash-table _after_ fs cleaned disk stuff up and
1452	   in iget() to return NULL if I_FREEING inode is found in
1453	   hash-table. */
1454	/* Currently there is one place where it's ok to meet inode with
1455	   nlink==0: processing of open-unlinked and half-truncated files
1456	   during mount (fs/reiserfs/super.c:finish_unfinished()). */
1457	if ((inode->i_nlink == 0) &&
1458	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
1459		reiserfs_warning(inode->i_sb, "vs-13075",
1460				 "dead inode read from disk %K. "
1461				 "This is likely to be race with knfsd. Ignore",
1462				 &key);
1463		reiserfs_make_bad_inode(inode);
1464	}
1465
1466	reiserfs_check_path(&path_to_sd);	/* init inode should be relsing */
1467
1468}
1469
1470/**
1471 * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1472 *
1473 * @inode:    inode from hash table to check
1474 * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1475 *
1476 * This function is called by iget5_locked() to distinguish reiserfs inodes
1477 * having the same inode numbers. Such inodes can only exist due to some
1478 * error condition. One of them should be bad. Inodes with identical
1479 * inode numbers (objectids) are distinguished by parent directory ids.
1480 *
1481 */
1482int reiserfs_find_actor(struct inode *inode, void *opaque)
1483{
1484	struct reiserfs_iget_args *args;
1485
1486	args = opaque;
1487	/* args is already in CPU order */
1488	return (inode->i_ino == args->objectid) &&
1489	    (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1490}
1491
1492struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1493{
1494	struct inode *inode;
1495	struct reiserfs_iget_args args;
1496
1497	args.objectid = key->on_disk_key.k_objectid;
1498	args.dirid = key->on_disk_key.k_dir_id;
1499	inode = iget5_locked(s, key->on_disk_key.k_objectid,
1500			     reiserfs_find_actor, reiserfs_init_locked_inode,
1501			     (void *)(&args));
1502	if (!inode)
1503		return ERR_PTR(-ENOMEM);
1504
1505	if (inode->i_state & I_NEW) {
1506		reiserfs_read_locked_inode(inode, &args);
1507		unlock_new_inode(inode);
1508	}
1509
1510	if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
1511		/* either due to i/o error or a stale NFS handle */
1512		iput(inode);
1513		inode = NULL;
1514	}
1515	return inode;
1516}
1517
1518static struct dentry *reiserfs_get_dentry(struct super_block *sb,
1519	u32 objectid, u32 dir_id, u32 generation)
1520
1521{
1522	struct cpu_key key;
1523	struct inode *inode;
1524
1525	key.on_disk_key.k_objectid = objectid;
1526	key.on_disk_key.k_dir_id = dir_id;
1527	reiserfs_write_lock(sb);
1528	inode = reiserfs_iget(sb, &key);
1529	if (inode && !IS_ERR(inode) && generation != 0 &&
1530	    generation != inode->i_generation) {
1531		iput(inode);
1532		inode = NULL;
1533	}
1534	reiserfs_write_unlock(sb);
1535
1536	return d_obtain_alias(inode);
1537}
1538
1539struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1540		int fh_len, int fh_type)
1541{
1542	/* fhtype happens to reflect the number of u32s encoded.
1543	 * due to a bug in earlier code, fhtype might indicate there
1544	 * are more u32s then actually fitted.
1545	 * so if fhtype seems to be more than len, reduce fhtype.
1546	 * Valid types are:
1547	 *   2 - objectid + dir_id - legacy support
1548	 *   3 - objectid + dir_id + generation
1549	 *   4 - objectid + dir_id + objectid and dirid of parent - legacy
1550	 *   5 - objectid + dir_id + generation + objectid and dirid of parent
1551	 *   6 - as above plus generation of directory
1552	 * 6 does not fit in NFSv2 handles
1553	 */
1554	if (fh_type > fh_len) {
1555		if (fh_type != 6 || fh_len != 5)
1556			reiserfs_warning(sb, "reiserfs-13077",
1557				"nfsd/reiserfs, fhtype=%d, len=%d - odd",
1558				fh_type, fh_len);
1559		fh_type = 5;
1560	}
1561
1562	return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
1563		(fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
1564}
1565
1566struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1567		int fh_len, int fh_type)
1568{
1569	if (fh_type < 4)
1570		return NULL;
1571
1572	return reiserfs_get_dentry(sb,
1573		(fh_type >= 5) ? fid->raw[3] : fid->raw[2],
1574		(fh_type >= 5) ? fid->raw[4] : fid->raw[3],
1575		(fh_type == 6) ? fid->raw[5] : 0);
1576}
1577
1578int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1579		       int need_parent)
1580{
1581	struct inode *inode = dentry->d_inode;
1582	int maxlen = *lenp;
1583
1584	if (maxlen < 3)
1585		return 255;
1586
1587	data[0] = inode->i_ino;
1588	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1589	data[2] = inode->i_generation;
1590	*lenp = 3;
1591	/* no room for directory info? return what we've stored so far */
1592	if (maxlen < 5 || !need_parent)
1593		return 3;
1594
1595	spin_lock(&dentry->d_lock);
1596	inode = dentry->d_parent->d_inode;
1597	data[3] = inode->i_ino;
1598	data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1599	*lenp = 5;
1600	if (maxlen >= 6) {
1601		data[5] = inode->i_generation;
1602		*lenp = 6;
1603	}
1604	spin_unlock(&dentry->d_lock);
1605	return *lenp;
1606}
1607
1608/* looks for stat data, then copies fields to it, marks the buffer
1609   containing stat data as dirty */
1610/* reiserfs inodes are never really dirty, since the dirty inode call
1611** always logs them.  This call allows the VFS inode marking routines
1612** to properly mark inodes for datasync and such, but only actually
1613** does something when called for a synchronous update.
1614*/
1615int reiserfs_write_inode(struct inode *inode, int do_sync)
1616{
1617	struct reiserfs_transaction_handle th;
1618	int jbegin_count = 1;
1619
1620	if (inode->i_sb->s_flags & MS_RDONLY)
1621		return -EROFS;
1622	/* memory pressure can sometimes initiate write_inode calls with sync == 1,
1623	 ** these cases are just when the system needs ram, not when the
1624	 ** inode needs to reach disk for safety, and they can safely be
1625	 ** ignored because the altered inode has already been logged.
1626	 */
1627	if (do_sync && !(current->flags & PF_MEMALLOC)) {
1628		reiserfs_write_lock(inode->i_sb);
1629		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1630			reiserfs_update_sd(&th, inode);
1631			journal_end_sync(&th, inode->i_sb, jbegin_count);
1632		}
1633		reiserfs_write_unlock(inode->i_sb);
1634	}
1635	return 0;
1636}
1637
1638/* stat data of new object is inserted already, this inserts the item
1639   containing "." and ".." entries */
1640static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
1641				  struct inode *inode,
1642				  struct item_head *ih, struct treepath *path,
1643				  struct inode *dir)
1644{
1645	struct super_block *sb = th->t_super;
1646	char empty_dir[EMPTY_DIR_SIZE];
1647	char *body = empty_dir;
1648	struct cpu_key key;
1649	int retval;
1650
1651	BUG_ON(!th->t_trans_id);
1652
1653	_make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
1654		      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
1655		      TYPE_DIRENTRY, 3 /*key length */ );
1656
1657	/* compose item head for new item. Directories consist of items of
1658	   old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
1659	   is done by reiserfs_new_inode */
1660	if (old_format_only(sb)) {
1661		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1662				  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1663
1664		make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
1665				       ih->ih_key.k_objectid,
1666				       INODE_PKEY(dir)->k_dir_id,
1667				       INODE_PKEY(dir)->k_objectid);
1668	} else {
1669		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1670				  TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1671
1672		make_empty_dir_item(body, ih->ih_key.k_dir_id,
1673				    ih->ih_key.k_objectid,
1674				    INODE_PKEY(dir)->k_dir_id,
1675				    INODE_PKEY(dir)->k_objectid);
1676	}
1677
1678	/* look for place in the tree for new item */
1679	retval = search_item(sb, &key, path);
1680	if (retval == IO_ERROR) {
1681		reiserfs_error(sb, "vs-13080",
1682			       "i/o failure occurred creating new directory");
1683		return -EIO;
1684	}
1685	if (retval == ITEM_FOUND) {
1686		pathrelse(path);
1687		reiserfs_warning(sb, "vs-13070",
1688				 "object with this key exists (%k)",
1689				 &(ih->ih_key));
1690		return -EEXIST;
1691	}
1692
1693	/* insert item, that is empty directory item */
1694	return reiserfs_insert_item(th, path, &key, ih, inode, body);
1695}
1696
1697/* stat data of object has been inserted, this inserts the item
1698   containing the body of symlink */
1699static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode of symlink */
1700				struct item_head *ih,
1701				struct treepath *path, const char *symname,
1702				int item_len)
1703{
1704	struct super_block *sb = th->t_super;
1705	struct cpu_key key;
1706	int retval;
1707
1708	BUG_ON(!th->t_trans_id);
1709
1710	_make_cpu_key(&key, KEY_FORMAT_3_5,
1711		      le32_to_cpu(ih->ih_key.k_dir_id),
1712		      le32_to_cpu(ih->ih_key.k_objectid),
1713		      1, TYPE_DIRECT, 3 /*key length */ );
1714
1715	make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
1716			  0 /*free_space */ );
1717
1718	/* look for place in the tree for new item */
1719	retval = search_item(sb, &key, path);
1720	if (retval == IO_ERROR) {
1721		reiserfs_error(sb, "vs-13080",
1722			       "i/o failure occurred creating new symlink");
1723		return -EIO;
1724	}
1725	if (retval == ITEM_FOUND) {
1726		pathrelse(path);
1727		reiserfs_warning(sb, "vs-13080",
1728				 "object with this key exists (%k)",
1729				 &(ih->ih_key));
1730		return -EEXIST;
1731	}
1732
1733	/* insert item, that is body of symlink */
1734	return reiserfs_insert_item(th, path, &key, ih, inode, symname);
1735}
1736
1737/* inserts the stat data into the tree, and then calls
1738   reiserfs_new_directory (to insert ".", ".." item if new object is
1739   directory) or reiserfs_new_symlink (to insert symlink body if new
1740   object is symlink) or nothing (if new object is regular file)
1741
1742   NOTE! uid and gid must already be set in the inode.  If we return
1743   non-zero due to an error, we have to drop the quota previously allocated
1744   for the fresh inode.  This can only be done outside a transaction, so
1745   if we return non-zero, we also end the transaction.  */
1746int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1747		       struct inode *dir, int mode, const char *symname,
1748		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
1749		          strlen (symname) for symlinks) */
1750		       loff_t i_size, struct dentry *dentry,
1751		       struct inode *inode,
1752		       struct reiserfs_security_handle *security)
1753{
1754	struct super_block *sb;
1755	struct reiserfs_iget_args args;
1756	INITIALIZE_PATH(path_to_key);
1757	struct cpu_key key;
1758	struct item_head ih;
1759	struct stat_data sd;
1760	int retval;
1761	int err;
1762
1763	BUG_ON(!th->t_trans_id);
1764
1765	if (vfs_dq_alloc_inode(inode)) {
1766		err = -EDQUOT;
1767		goto out_end_trans;
1768	}
1769	if (!dir->i_nlink) {
1770		err = -EPERM;
1771		goto out_bad_inode;
1772	}
1773
1774	sb = dir->i_sb;
1775
1776	/* item head of new item */
1777	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1778	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
1779	if (!ih.ih_key.k_objectid) {
1780		err = -ENOMEM;
1781		goto out_bad_inode;
1782	}
1783	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1784	if (old_format_only(sb))
1785		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1786				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1787	else
1788		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1789				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1790	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1791	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
1792	if (insert_inode_locked4(inode, args.objectid,
1793			     reiserfs_find_actor, &args) < 0) {
1794		err = -EINVAL;
1795		goto out_bad_inode;
1796	}
1797	if (old_format_only(sb))
1798		/* not a perfect generation count, as object ids can be reused, but
1799		 ** this is as good as reiserfs can do right now.
1800		 ** note that the private part of inode isn't filled in yet, we have
1801		 ** to use the directory.
1802		 */
1803		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
1804	else
1805#if defined( USE_INODE_GENERATION_COUNTER )
1806		inode->i_generation =
1807		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1808#else
1809		inode->i_generation = ++event;
1810#endif
1811
1812	/* fill stat data */
1813	inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
1814
1815	/* uid and gid must already be set by the caller for quota init */
1816
1817	/* symlink cannot be immutable or append only, right? */
1818	if (S_ISLNK(inode->i_mode))
1819		inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
1820
1821	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
1822	inode->i_size = i_size;
1823	inode->i_blocks = 0;
1824	inode->i_bytes = 0;
1825	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1826	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
1827
1828	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1829	REISERFS_I(inode)->i_flags = 0;
1830	REISERFS_I(inode)->i_prealloc_block = 0;
1831	REISERFS_I(inode)->i_prealloc_count = 0;
1832	REISERFS_I(inode)->i_trans_id = 0;
1833	REISERFS_I(inode)->i_jl = NULL;
1834	REISERFS_I(inode)->i_attrs =
1835	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1836	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1837	mutex_init(&(REISERFS_I(inode)->i_mmap));
1838	reiserfs_init_xattr_rwsem(inode);
1839
1840	/* key to search for correct place for new stat data */
1841	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
1842		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
1843		      TYPE_STAT_DATA, 3 /*key length */ );
1844
1845	/* find proper place for inserting of stat data */
1846	retval = search_item(sb, &key, &path_to_key);
1847	if (retval == IO_ERROR) {
1848		err = -EIO;
1849		goto out_bad_inode;
1850	}
1851	if (retval == ITEM_FOUND) {
1852		pathrelse(&path_to_key);
1853		err = -EEXIST;
1854		goto out_bad_inode;
1855	}
1856	if (old_format_only(sb)) {
1857		if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1858			pathrelse(&path_to_key);
1859			/* i_uid or i_gid is too big to be stored in stat data v3.5 */
1860			err = -EINVAL;
1861			goto out_bad_inode;
1862		}
1863		inode2sd_v1(&sd, inode, inode->i_size);
1864	} else {
1865		inode2sd(&sd, inode, inode->i_size);
1866	}
1867	// store in in-core inode the key of stat data and version all
1868	// object items will have (directory items will have old offset
1869	// format, other new objects will consist of new items)
1870	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
1871		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1872	else
1873		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1874	if (old_format_only(sb))
1875		set_inode_sd_version(inode, STAT_DATA_V1);
1876	else
1877		set_inode_sd_version(inode, STAT_DATA_V2);
1878
1879	/* insert the stat data into the tree */
1880#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1881	if (REISERFS_I(dir)->new_packing_locality)
1882		th->displace_new_blocks = 1;
1883#endif
1884	retval =
1885	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
1886				 (char *)(&sd));
1887	if (retval) {
1888		err = retval;
1889		reiserfs_check_path(&path_to_key);
1890		goto out_bad_inode;
1891	}
1892#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1893	if (!th->displace_new_blocks)
1894		REISERFS_I(dir)->new_packing_locality = 0;
1895#endif
1896	if (S_ISDIR(mode)) {
1897		/* insert item with "." and ".." */
1898		retval =
1899		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
1900	}
1901
1902	if (S_ISLNK(mode)) {
1903		/* insert body of symlink */
1904		if (!old_format_only(sb))
1905			i_size = ROUND_UP(i_size);
1906		retval =
1907		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
1908					 i_size);
1909	}
1910	if (retval) {
1911		err = retval;
1912		reiserfs_check_path(&path_to_key);
1913		journal_end(th, th->t_super, th->t_blocks_allocated);
1914		goto out_inserted_sd;
1915	}
1916
1917	if (reiserfs_posixacl(inode->i_sb)) {
1918		retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
1919		if (retval) {
1920			err = retval;
1921			reiserfs_check_path(&path_to_key);
1922			journal_end(th, th->t_super, th->t_blocks_allocated);
1923			goto out_inserted_sd;
1924		}
1925	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
1926		reiserfs_warning(inode->i_sb, "jdm-13090",
1927				 "ACLs aren't enabled in the fs, "
1928				 "but vfs thinks they are!");
1929	} else if (IS_PRIVATE(dir))
1930		inode->i_flags |= S_PRIVATE;
1931
1932	if (security->name) {
1933		retval = reiserfs_security_write(th, inode, security);
1934		if (retval) {
1935			err = retval;
1936			reiserfs_check_path(&path_to_key);
1937			retval = journal_end(th, th->t_super,
1938					     th->t_blocks_allocated);
1939			if (retval)
1940				err = retval;
1941			goto out_inserted_sd;
1942		}
1943	}
1944
1945	reiserfs_update_sd(th, inode);
1946	reiserfs_check_path(&path_to_key);
1947
1948	return 0;
1949
1950/* it looks like you can easily compress these two goto targets into
1951 * one.  Keeping it like this doesn't actually hurt anything, and they
1952 * are place holders for what the quota code actually needs.
1953 */
1954      out_bad_inode:
1955	/* Invalidate the object, nothing was inserted yet */
1956	INODE_PKEY(inode)->k_objectid = 0;
1957
1958	/* Quota change must be inside a transaction for journaling */
1959	vfs_dq_free_inode(inode);
1960
1961      out_end_trans:
1962	journal_end(th, th->t_super, th->t_blocks_allocated);
1963	/* Drop can be outside and it needs more credits so it's better to have it outside */
1964	vfs_dq_drop(inode);
1965	inode->i_flags |= S_NOQUOTA;
1966	make_bad_inode(inode);
1967
1968      out_inserted_sd:
1969	inode->i_nlink = 0;
1970	th->t_trans_id = 0;	/* so the caller can't use this handle later */
1971	unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
1972	iput(inode);
1973	return err;
1974}
1975
1976/*
1977** finds the tail page in the page cache,
1978** reads the last block in.
1979**
1980** On success, page_result is set to a locked, pinned page, and bh_result
1981** is set to an up to date buffer for the last block in the file.  returns 0.
1982**
1983** tail conversion is not done, so bh_result might not be valid for writing
1984** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
1985** trying to write the block.
1986**
1987** on failure, nonzero is returned, page_result and bh_result are untouched.
1988*/
1989static int grab_tail_page(struct inode *inode,
1990			  struct page **page_result,
1991			  struct buffer_head **bh_result)
1992{
1993
1994	/* we want the page with the last byte in the file,
1995	 ** not the page that will hold the next byte for appending
1996	 */
1997	unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
1998	unsigned long pos = 0;
1999	unsigned long start = 0;
2000	unsigned long blocksize = inode->i_sb->s_blocksize;
2001	unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1);
2002	struct buffer_head *bh;
2003	struct buffer_head *head;
2004	struct page *page;
2005	int error;
2006
2007	/* we know that we are only called with inode->i_size > 0.
2008	 ** we also know that a file tail can never be as big as a block
2009	 ** If i_size % blocksize == 0, our file is currently block aligned
2010	 ** and it won't need converting or zeroing after a truncate.
2011	 */
2012	if ((offset & (blocksize - 1)) == 0) {
2013		return -ENOENT;
2014	}
2015	page = grab_cache_page(inode->i_mapping, index);
2016	error = -ENOMEM;
2017	if (!page) {
2018		goto out;
2019	}
2020	/* start within the page of the last block in the file */
2021	start = (offset / blocksize) * blocksize;
2022
2023	error = block_prepare_write(page, start, offset,
2024				    reiserfs_get_block_create_0);
2025	if (error)
2026		goto unlock;
2027
2028	head = page_buffers(page);
2029	bh = head;
2030	do {
2031		if (pos >= start) {
2032			break;
2033		}
2034		bh = bh->b_this_page;
2035		pos += blocksize;
2036	} while (bh != head);
2037
2038	if (!buffer_uptodate(bh)) {
2039		/* note, this should never happen, prepare_write should
2040		 ** be taking care of this for us.  If the buffer isn't up to date,
2041		 ** I've screwed up the code to find the buffer, or the code to
2042		 ** call prepare_write
2043		 */
2044		reiserfs_error(inode->i_sb, "clm-6000",
2045			       "error reading block %lu", bh->b_blocknr);
2046		error = -EIO;
2047		goto unlock;
2048	}
2049	*bh_result = bh;
2050	*page_result = page;
2051
2052      out:
2053	return error;
2054
2055      unlock:
2056	unlock_page(page);
2057	page_cache_release(page);
2058	return error;
2059}
2060
2061/*
2062** vfs version of truncate file.  Must NOT be called with
2063** a transaction already started.
2064**
2065** some code taken from block_truncate_page
2066*/
2067int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2068{
2069	struct reiserfs_transaction_handle th;
2070	/* we want the offset for the first byte after the end of the file */
2071	unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
2072	unsigned blocksize = inode->i_sb->s_blocksize;
2073	unsigned length;
2074	struct page *page = NULL;
2075	int error;
2076	struct buffer_head *bh = NULL;
2077	int err2;
2078	int lock_depth;
2079
2080	lock_depth = reiserfs_write_lock_once(inode->i_sb);
2081
2082	if (inode->i_size > 0) {
2083		error = grab_tail_page(inode, &page, &bh);
2084		if (error) {
2085			// -ENOENT means we truncated past the end of the file,
2086			// and get_block_create_0 could not find a block to read in,
2087			// which is ok.
2088			if (error != -ENOENT)
2089				reiserfs_error(inode->i_sb, "clm-6001",
2090					       "grab_tail_page failed %d",
2091					       error);
2092			page = NULL;
2093			bh = NULL;
2094		}
2095	}
2096
2097	/* so, if page != NULL, we have a buffer head for the offset at
2098	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
2099	 ** then we have an unformatted node.  Otherwise, we have a direct item,
2100	 ** and no zeroing is required on disk.  We zero after the truncate,
2101	 ** because the truncate might pack the item anyway
2102	 ** (it will unmap bh if it packs).
2103	 */
2104	/* it is enough to reserve space in transaction for 2 balancings:
2105	   one for "save" link adding and another for the first
2106	   cut_from_item. 1 is for update_sd */
2107	error = journal_begin(&th, inode->i_sb,
2108			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
2109	if (error)
2110		goto out;
2111	reiserfs_update_inode_transaction(inode);
2112	if (update_timestamps)
2113		/* we are doing real truncate: if the system crashes before the last
2114		   transaction of truncating gets committed - on reboot the file
2115		   either appears truncated properly or not truncated at all */
2116		add_save_link(&th, inode, 1);
2117	err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
2118	error =
2119	    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
2120	if (error)
2121		goto out;
2122
2123	/* check reiserfs_do_truncate after ending the transaction */
2124	if (err2) {
2125		error = err2;
2126  		goto out;
2127	}
2128
2129	if (update_timestamps) {
2130		error = remove_save_link(inode, 1 /* truncate */);
2131		if (error)
2132			goto out;
2133	}
2134
2135	if (page) {
2136		length = offset & (blocksize - 1);
2137		/* if we are not on a block boundary */
2138		if (length) {
2139			length = blocksize - length;
2140			zero_user(page, offset, length);
2141			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
2142				mark_buffer_dirty(bh);
2143			}
2144		}
2145		unlock_page(page);
2146		page_cache_release(page);
2147	}
2148
2149	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2150
2151	return 0;
2152      out:
2153	if (page) {
2154		unlock_page(page);
2155		page_cache_release(page);
2156	}
2157
2158	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2159
2160	return error;
2161}
2162
/*
 * Work out where logical block 'block' of 'inode' lives and prepare
 * 'bh_result' (a buffer of the page being written) accordingly.
 *
 * The tree is searched for the position of the block:
 *  - indirect item with a non-zero block pointer: bh_result is mapped to
 *    that block via set_block_dev_mapped();
 *  - direct item: the page data is copied into the item inside a
 *    transaction, the tree buffer is logged with journal_mark_dirty(), and
 *    bh_result is mapped to block 0; afterwards its dirty bit is cleared;
 *  - position not found, or an indirect item whose pointer is 0 (a hole):
 *    reiserfs_get_block() is called with GET_BLOCK_CREATE to allocate the
 *    block, and the search is restarted if that did not yield a mapped,
 *    non-zero block.
 *
 * Takes and releases the reiserfs write lock itself and kmaps the page for
 * the duration of the work.
 *
 * Returns 0 on success, -EIO if bh_result is not up to date or an
 * unexpected item type is found, or the error from journal_begin(),
 * journal_end() or reiserfs_get_block().
 */
static int map_block_for_writepage(struct inode *inode,
				   struct buffer_head *bh_result,
				   unsigned long block)
{
	struct reiserfs_transaction_handle th;
	int fs_gen;
	struct item_head tmp_ih;
	struct item_head *ih;
	struct buffer_head *bh;
	__le32 *item;
	struct cpu_key key;
	INITIALIZE_PATH(path);
	int pos_in_item;
	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
	/* byte offset of the block plus one; the "- 1" below undoes this
	 * when the offset is used to index into the page */
	loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
	int retval;
	int use_get_block = 0;
	int bytes_copied = 0;
	int copy_size;
	int trans_running = 0;

	/* catch places below that try to log something without starting a trans */
	th.t_trans_id = 0;

	if (!buffer_uptodate(bh_result)) {
		return -EIO;
	}

	kmap(bh_result->b_page);
      start_over:
	reiserfs_write_lock(inode->i_sb);
	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);

      research:
	retval = search_for_position_by_key(inode->i_sb, &key, &path);
	if (retval != POSITION_FOUND) {
		/* nothing in the tree for this offset, let get_block create it */
		use_get_block = 1;
		goto out;
	}

	bh = get_last_bh(&path);
	ih = get_ih(&path);
	item = get_item(&path);
	pos_in_item = path.pos_in_item;

	/* we've found an unformatted node */
	if (indirect_item_found(retval, ih)) {
		if (bytes_copied > 0) {
			reiserfs_warning(inode->i_sb, "clm-6002",
					 "bytes_copied %d", bytes_copied);
		}
		if (!get_block_num(item, pos_in_item)) {
			/* crap, we are writing to a hole */
			use_get_block = 1;
			goto out;
		}
		set_block_dev_mapped(bh_result,
				     get_block_num(item, pos_in_item), inode);
	} else if (is_direct_le_ih(ih)) {
		char *p;
		/* p points at the start of this block's data within the page */
		p = page_address(bh_result->b_page);
		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
		copy_size = ih_item_len(ih) - pos_in_item;

		/* remember the generation and item head so a tree change
		 * during journal_begin/prepare can be detected below */
		fs_gen = get_generation(inode->i_sb);
		copy_item_head(&tmp_ih, ih);

		if (!trans_running) {
			/* vs-3050 is gone, no need to drop the path */
			retval = journal_begin(&th, inode->i_sb, jbegin_count);
			if (retval)
				goto out;
			reiserfs_update_inode_transaction(inode);
			trans_running = 1;
			if (fs_changed(fs_gen, inode->i_sb)
			    && item_moved(&tmp_ih, &path)) {
				reiserfs_restore_prepared_buffer(inode->i_sb,
								 bh);
				goto research;
			}
		}

		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);

		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
			goto research;
		}

		/* copy the page contents into the direct item and log it */
		memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
		       copy_size);

		journal_mark_dirty(&th, inode->i_sb, bh);
		bytes_copied += copy_size;
		/* block number 0 marks bh_result as backed by a direct item */
		set_block_dev_mapped(bh_result, 0, inode);

		/* are there still bytes left? */
		if (bytes_copied < bh_result->b_size &&
		    (byte_offset + bytes_copied) < inode->i_size) {
			set_cpu_key_k_offset(&key,
					     cpu_key_k_offset(&key) +
					     copy_size);
			goto research;
		}
	} else {
		reiserfs_warning(inode->i_sb, "clm-6003",
				 "bad item inode %lu", inode->i_ino);
		retval = -EIO;
		goto out;
	}
	retval = 0;

      out:
	pathrelse(&path);
	if (trans_running) {
		/* a journal_end failure replaces whatever retval held */
		int err = journal_end(&th, inode->i_sb, jbegin_count);
		if (err)
			retval = err;
		trans_running = 0;
	}
	reiserfs_write_unlock(inode->i_sb);

	/* this is where we fill in holes in the file. */
	if (use_get_block) {
		retval = reiserfs_get_block(inode, block, bh_result,
					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
					    | GET_BLOCK_NO_DANGLE);
		if (!retval) {
			if (!buffer_mapped(bh_result)
			    || bh_result->b_blocknr == 0) {
				/* get_block failed to find a mapped unformatted node. */
				use_get_block = 0;
				goto start_over;
			}
		}
	}
	kunmap(bh_result->b_page);

	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
		/* we've copied data from the page into the direct item, so the
		 * buffer in the page is now clean, mark it to reflect that.
		 */
		lock_buffer(bh_result);
		clear_buffer_dirty(bh_result);
		unlock_buffer(bh_result);
	}
	return retval;
}
2312
/*
 * mason@suse.com: updated in 2.5.54 to follow the same general io
 * start/recovery path as __block_write_full_page, along with special
 * code to handle reiserfs tails.
 *
 * Write out one locked page.  The work is done in these steps:
 *  1. zero the part of the page that lies beyond i_size;
 *  2. for every buffer that is dirty (or for all buffers when the page is
 *     PageChecked) and is unmapped or mapped to block 0, call
 *     map_block_for_writepage() to map it or copy it into a direct item;
 *  3. when the page is PageChecked, log every buffer mapped to a real block
 *     in a transaction instead of submitting it; otherwise lock the dirty
 *     buffers and mark them for async write;
 *  4. put the page under writeback, unlock it and submit the marked buffers.
 *
 * On any error the 'fail' path still submits the valid dirty buffers, sets
 * PageError and clears PageUptodate.
 *
 * Returns 0 on success (including the cases where nothing had to be
 * written) or a negative errno.  The page is unlocked on every return path.
 */
static int reiserfs_write_full_page(struct page *page,
				    struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	int error = 0;
	unsigned long block;
	sector_t last_block;
	struct buffer_head *head, *bh;
	int partial = 0;
	int nr = 0;
	int checked = PageChecked(page);
	struct reiserfs_transaction_handle th;
	struct super_block *s = inode->i_sb;
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	th.t_trans_id = 0;

	/* no logging allowed when nonblocking or from PF_MEMALLOC */
	/* NOTE(review): only PF_MEMALLOC is tested here, not wbc->nonblocking */
	if (checked && (current->flags & PF_MEMALLOC)) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}

	/* The page dirty bit is cleared before writepage is called, which
	 * means we have to tell create_empty_buffers to make dirty buffers
	 * The page really should be up to date at this point, so tossing
	 * in the BH_Uptodate is just a sanity check.
	 */
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, s->s_blocksize,
				     (1 << BH_Dirty) | (1 << BH_Uptodate));
	}
	head = page_buffers(page);

	/* last page in the file, zero out any contents past the
	 ** last byte in the file
	 */
	if (page->index >= end_index) {
		unsigned last_offset;

		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
		/* no file contents in this page */
		if (page->index >= end_index + 1 || !last_offset) {
			unlock_page(page);
			return 0;
		}
		zero_user_segment(page, last_offset, PAGE_CACHE_SIZE);
	}
	bh = head;
	/* first block of this page and last block of the file, in fs blocks */
	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
	/* first map all the buffers, logging any direct items we find */
	do {
		if (block > last_block) {
			/*
			 * This can happen when the block size is less than
			 * the page size.  The corresponding bytes in the page
			 * were zero filled above
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((checked || buffer_dirty(bh)) &&
		           (!buffer_mapped(bh) || (buffer_mapped(bh)
						       && bh->b_blocknr ==
						       0))) {
			/* not mapped yet, or it points to a direct item, search
			 * the btree for the mapping info, and log any direct
			 * items found
			 */
			if ((error = map_block_for_writepage(inode, bh, block))) {
				goto fail;
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * we start the transaction after map_block_for_writepage,
	 * because it can create holes in the file (an unbounded operation).
	 * starting it here, we can make a reliable estimate for how many
	 * blocks we're going to log
	 */
	if (checked) {
		ClearPageChecked(page);
		reiserfs_write_lock(s);
		error = journal_begin(&th, s, bh_per_page + 1);
		if (error) {
			reiserfs_write_unlock(s);
			goto fail;
		}
		reiserfs_update_inode_transaction(inode);
	}
	/* now go through and lock any dirty buffers on the page */
	/* every buffer gets a reference here; it is dropped in the submit
	 * loop below.  'continue' jumps to the loop condition, which also
	 * advances bh. */
	do {
		get_bh(bh);
		if (!buffer_mapped(bh))
			continue;
		if (buffer_mapped(bh) && bh->b_blocknr == 0)
			continue;

		if (checked) {
			/* data logging: the buffer goes into the transaction
			 * rather than being submitted from here */
			reiserfs_prepare_for_journal(s, bh, 1);
			journal_mark_dirty(&th, s, bh);
			continue;
		}
		/* from this point on, we know the buffer is mapped to a
		 * real block and not a direct item
		 */
		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
			lock_buffer(bh);
		} else {
			if (!trylock_buffer(bh)) {
				redirty_page_for_writepage(wbc, page);
				continue;
			}
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write(bh);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	if (checked) {
		error = journal_end(&th, s, bh_per_page + 1);
		reiserfs_write_unlock(s);
		if (error)
			goto fail;
	}
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);

	/*
	 * since any buffer might be the only dirty buffer on the page,
	 * the first submit_bh can bring the page out of writeback.
	 * be careful with the buffers.
	 */
	do {
		/* read the next pointer before put_bh drops our reference */
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(WRITE, bh);
			nr++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);

	error = 0;
      done:
	if (nr == 0) {
		/*
		 * if this page only had a direct item, it is very possible for
		 * no io to be required without there being an error.  Or,
		 * someone else could have locked them and sent them down the
		 * pipe without locking the page
		 */
		bh = head;
		do {
			if (!buffer_uptodate(bh)) {
				partial = 1;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (!partial)
			SetPageUptodate(page);
		/* nothing was submitted, so end writeback ourselves */
		end_page_writeback(page);
	}
	return error;

      fail:
	/* catches various errors, we need to make sure any valid dirty blocks
	 * get to the media.  The page is currently locked and not marked for
	 * writeback
	 */
	ClearPageUptodate(page);
	bh = head;
	do {
		get_bh(bh);
		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
			lock_buffer(bh);
			mark_buffer_async_write(bh);
		} else {
			/*
			 * clear any dirty bits that might have come from getting
			 * attached to a dirty page
			 */
			clear_buffer_dirty(bh);
		}
		bh = bh->b_this_page;
	} while (bh != head);
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);
	/* the loop above ended with bh == head, so this starts at the head */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(WRITE, bh);
			nr++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);
	/* 'error' still holds the failure code; 'done' returns it */
	goto done;
}
2528
/*
 * Read one page by handing it to block_read_full_page() with
 * reiserfs_get_block as the block mapping callback.  The file argument
 * is not used.  Returns whatever block_read_full_page() returns.
 */
static int reiserfs_readpage(struct file *f, struct page *page)
{
	return block_read_full_page(page, reiserfs_get_block);
}
2533
2534static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
2535{
2536	struct inode *inode = page->mapping->host;
2537	reiserfs_wait_on_write_block(inode->i_sb);
2538	return reiserfs_write_full_page(page, wbc);
2539}
2540
2541static int reiserfs_write_begin(struct file *file,
2542				struct address_space *mapping,
2543				loff_t pos, unsigned len, unsigned flags,
2544				struct page **pagep, void **fsdata)
2545{
2546	struct inode *inode;
2547	struct page *page;
2548	pgoff_t index;
2549	int ret;
2550	int old_ref = 0;
2551
2552 	inode = mapping->host;
2553	*fsdata = 0;
2554 	if (flags & AOP_FLAG_CONT_EXPAND &&
2555 	    (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
2556 		pos ++;
2557		*fsdata = (void *)(unsigned long)flags;
2558	}
2559
2560	index = pos >> PAGE_CACHE_SHIFT;
2561	page = grab_cache_page_write_begin(mapping, index, flags);
2562	if (!page)
2563		return -ENOMEM;
2564	*pagep = page;
2565
2566	reiserfs_wait_on_write_block(inode->i_sb);
2567	fix_tail_page_for_writing(page);
2568	if (reiserfs_transaction_running(inode->i_sb)) {
2569		struct reiserfs_transaction_handle *th;
2570		th = (struct reiserfs_transaction_handle *)current->
2571		    journal_info;
2572		BUG_ON(!th->t_refcount);
2573		BUG_ON(!th->t_trans_id);
2574		old_ref = th->t_refcount;
2575		th->t_refcount++;
2576	}
2577	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2578				reiserfs_get_block);
2579	if (ret && reiserfs_transaction_running(inode->i_sb)) {
2580		struct reiserfs_transaction_handle *th = current->journal_info;
2581		/* this gets a little ugly.  If reiserfs_get_block returned an
2582		 * error and left a transacstion running, we've got to close it,
2583		 * and we've got to free handle if it was a persistent transaction.
2584		 *
2585		 * But, if we had nested into an existing transaction, we need
2586		 * to just drop the ref count on the handle.
2587		 *
2588		 * If old_ref == 0, the transaction is from reiserfs_get_block,
2589		 * and it was a persistent trans.  Otherwise, it was nested above.
2590		 */
2591		if (th->t_refcount > old_ref) {
2592			if (old_ref)
2593				th->t_refcount--;
2594			else {
2595				int err;
2596				reiserfs_write_lock(inode->i_sb);
2597				err = reiserfs_end_persistent_transaction(th);
2598				reiserfs_write_unlock(inode->i_sb);
2599				if (err)
2600					ret = err;
2601			}
2602		}
2603	}
2604	if (ret) {
2605		unlock_page(page);
2606		page_cache_release(page);
2607	}
2608	return ret;
2609}
2610
2611int reiserfs_prepare_write(struct file *f, struct page *page,
2612			   unsigned from, unsigned to)
2613{
2614	struct inode *inode = page->mapping->host;
2615	int ret;
2616	int old_ref = 0;
2617
2618	reiserfs_write_unlock(inode->i_sb);
2619	reiserfs_wait_on_write_block(inode->i_sb);
2620	reiserfs_write_lock(inode->i_sb);
2621
2622	fix_tail_page_for_writing(page);
2623	if (reiserfs_transaction_running(inode->i_sb)) {
2624		struct reiserfs_transaction_handle *th;
2625		th = (struct reiserfs_transaction_handle *)current->
2626		    journal_info;
2627		BUG_ON(!th->t_refcount);
2628		BUG_ON(!th->t_trans_id);
2629		old_ref = th->t_refcount;
2630		th->t_refcount++;
2631	}
2632
2633	ret = block_prepare_write(page, from, to, reiserfs_get_block);
2634	if (ret && reiserfs_transaction_running(inode->i_sb)) {
2635		struct reiserfs_transaction_handle *th = current->journal_info;
2636		/* this gets a little ugly.  If reiserfs_get_block returned an
2637		 * error and left a transacstion running, we've got to close it,
2638		 * and we've got to free handle if it was a persistent transaction.
2639		 *
2640		 * But, if we had nested into an existing transaction, we need
2641		 * to just drop the ref count on the handle.
2642		 *
2643		 * If old_ref == 0, the transaction is from reiserfs_get_block,
2644		 * and it was a persistent trans.  Otherwise, it was nested above.
2645		 */
2646		if (th->t_refcount > old_ref) {
2647			if (old_ref)
2648				th->t_refcount--;
2649			else {
2650				int err;
2651				reiserfs_write_lock(inode->i_sb);
2652				err = reiserfs_end_persistent_transaction(th);
2653				reiserfs_write_unlock(inode->i_sb);
2654				if (err)
2655					ret = err;
2656			}
2657		}
2658	}
2659	return ret;
2660
2661}
2662
/*
 * Translate a block number of the address space 'as' by delegating to
 * generic_block_bmap() with reiserfs_bmap as the mapping callback, and
 * return its result.
 */
static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
{
	return generic_block_bmap(as, block, reiserfs_bmap);
}
2667
2668static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2669			      loff_t pos, unsigned len, unsigned copied,
2670			      struct page *page, void *fsdata)
2671{
2672	struct inode *inode = page->mapping->host;
2673	int ret = 0;
2674	int update_sd = 0;
2675	struct reiserfs_transaction_handle *th;
2676	unsigned start;
2677	int lock_depth = 0;
2678	bool locked = false;
2679
2680	if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
2681		pos ++;
2682
2683	reiserfs_wait_on_write_block(inode->i_sb);
2684	if (reiserfs_transaction_running(inode->i_sb))
2685		th = current->journal_info;
2686	else
2687		th = NULL;
2688
2689	start = pos & (PAGE_CACHE_SIZE - 1);
2690	if (unlikely(copied < len)) {
2691		if (!PageUptodate(page))
2692			copied = 0;
2693
2694		page_zero_new_buffers(page, start + copied, start + len);
2695	}
2696	flush_dcache_page(page);
2697
2698	reiserfs_commit_page(inode, page, start, start + copied);
2699
2700	/* generic_commit_write does this for us, but does not update the
2701	 ** transaction tracking stuff when the size changes.  So, we have
2702	 ** to do the i_size updates here.
2703	 */
2704	pos += copied;
2705
2706	if (pos > inode->i_size) {
2707		struct reiserfs_transaction_handle myth;
2708		lock_depth = reiserfs_write_lock_once(inode->i_sb);
2709		locked = true;
2710		/* If the file have grown beyond the border where it
2711		   can have a tail, unmark it as needing a tail
2712		   packing */
2713		if ((have_large_tails(inode->i_sb)
2714		     && inode->i_size > i_block_size(inode) * 4)
2715		    || (have_small_tails(inode->i_sb)
2716			&& inode->i_size > i_block_size(inode)))
2717			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2718
2719		ret = journal_begin(&myth, inode->i_sb, 1);
2720		if (ret)
2721			goto journal_error;
2722
2723		reiserfs_update_inode_transaction(inode);
2724		inode->i_size = pos;
2725		/*
2726		 * this will just nest into our transaction.  It's important
2727		 * to use mark_inode_dirty so the inode gets pushed around on the
2728		 * dirty lists, and so that O_SYNC works as expected
2729		 */
2730		mark_inode_dirty(inode);
2731		reiserfs_update_sd(&myth, inode);
2732		update_sd = 1;
2733		ret = journal_end(&myth, inode->i_sb, 1);
2734		if (ret)
2735			goto journal_error;
2736	}
2737	if (th) {
2738		if (!locked) {
2739			lock_depth = reiserfs_write_lock_once(inode->i_sb);
2740			locked = true;
2741		}
2742		if (!update_sd)
2743			mark_inode_dirty(inode);
2744		ret = reiserfs_end_persistent_transaction(th);
2745		if (ret)
2746			goto out;
2747	}
2748
2749      out:
2750	if (locked)
2751		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2752	unlock_page(page);
2753	page_cache_release(page);
2754	return ret == 0 ? copied : ret;
2755
2756      journal_error:
2757	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2758	locked = false;
2759	if (th) {
2760		if (!update_sd)
2761			reiserfs_update_sd(th, inode);
2762		ret = reiserfs_end_persistent_transaction(th);
2763	}
2764	goto out;
2765}
2766
2767int reiserfs_commit_write(struct file *f, struct page *page,
2768			  unsigned from, unsigned to)
2769{
2770	struct inode *inode = page->mapping->host;
2771	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
2772	int ret = 0;
2773	int update_sd = 0;
2774	struct reiserfs_transaction_handle *th = NULL;
2775
2776	reiserfs_write_unlock(inode->i_sb);
2777	reiserfs_wait_on_write_block(inode->i_sb);
2778	reiserfs_write_lock(inode->i_sb);
2779
2780	if (reiserfs_transaction_running(inode->i_sb)) {
2781		th = current->journal_info;
2782	}
2783	reiserfs_commit_page(inode, page, from, to);
2784
2785	/* generic_commit_write does this for us, but does not update the
2786	 ** transaction tracking stuff when the size changes.  So, we have
2787	 ** to do the i_size updates here.
2788	 */
2789	if (pos > inode->i_size) {
2790		struct reiserfs_transaction_handle myth;
2791		/* If the file have grown beyond the border where it
2792		   can have a tail, unmark it as needing a tail
2793		   packing */
2794		if ((have_large_tails(inode->i_sb)
2795		     && inode->i_size > i_block_size(inode) * 4)
2796		    || (have_small_tails(inode->i_sb)
2797			&& inode->i_size > i_block_size(inode)))
2798			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2799
2800		ret = journal_begin(&myth, inode->i_sb, 1);
2801		if (ret)
2802			goto journal_error;
2803
2804		reiserfs_update_inode_transaction(inode);
2805		inode->i_size = pos;
2806		/*
2807		 * this will just nest into our transaction.  It's important
2808		 * to use mark_inode_dirty so the inode gets pushed around on the
2809		 * dirty lists, and so that O_SYNC works as expected
2810		 */
2811		mark_inode_dirty(inode);
2812		reiserfs_update_sd(&myth, inode);
2813		update_sd = 1;
2814		ret = journal_end(&myth, inode->i_sb, 1);
2815		if (ret)
2816			goto journal_error;
2817	}
2818	if (th) {
2819		if (!update_sd)
2820			mark_inode_dirty(inode);
2821		ret = reiserfs_end_persistent_transaction(th);
2822		if (ret)
2823			goto out;
2824	}
2825
2826      out:
2827	return ret;
2828
2829      journal_error:
2830	if (th) {
2831		if (!update_sd)
2832			reiserfs_update_sd(th, inode);
2833		ret = reiserfs_end_persistent_transaction(th);
2834	}
2835
2836	return ret;
2837}
2838
2839void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
2840{
2841	if (reiserfs_attrs(inode->i_sb)) {
2842		if (sd_attrs & REISERFS_SYNC_FL)
2843			inode->i_flags |= S_SYNC;
2844		else
2845			inode->i_flags &= ~S_SYNC;
2846		if (sd_attrs & REISERFS_IMMUTABLE_FL)
2847			inode->i_flags |= S_IMMUTABLE;
2848		else
2849			inode->i_flags &= ~S_IMMUTABLE;
2850		if (sd_attrs & REISERFS_APPEND_FL)
2851			inode->i_flags |= S_APPEND;
2852		else
2853			inode->i_flags &= ~S_APPEND;
2854		if (sd_attrs & REISERFS_NOATIME_FL)
2855			inode->i_flags |= S_NOATIME;
2856		else
2857			inode->i_flags &= ~S_NOATIME;
2858		if (sd_attrs & REISERFS_NOTAIL_FL)
2859			REISERFS_I(inode)->i_flags |= i_nopack_mask;
2860		else
2861			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2862	}
2863}
2864
2865void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
2866{
2867	if (reiserfs_attrs(inode->i_sb)) {
2868		if (inode->i_flags & S_IMMUTABLE)
2869			*sd_attrs |= REISERFS_IMMUTABLE_FL;
2870		else
2871			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2872		if (inode->i_flags & S_SYNC)
2873			*sd_attrs |= REISERFS_SYNC_FL;
2874		else
2875			*sd_attrs &= ~REISERFS_SYNC_FL;
2876		if (inode->i_flags & S_NOATIME)
2877			*sd_attrs |= REISERFS_NOATIME_FL;
2878		else
2879			*sd_attrs &= ~REISERFS_NOATIME_FL;
2880		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
2881			*sd_attrs |= REISERFS_NOTAIL_FL;
2882		else
2883			*sd_attrs &= ~REISERFS_NOTAIL_FL;
2884	}
2885}
2886
2887/* decide if this buffer needs to stay around for data logging or ordered
2888** write purposes
2889*/
2890static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2891{
2892	int ret = 1;
2893	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2894
2895	lock_buffer(bh);
2896	spin_lock(&j->j_dirty_buffers_lock);
2897	if (!buffer_mapped(bh)) {
2898		goto free_jh;
2899	}
2900	/* the page is locked, and the only places that log a data buffer
2901	 * also lock the page.
2902	 */
2903	if (reiserfs_file_data_log(inode)) {
2904		/*
2905		 * very conservative, leave the buffer pinned if
2906		 * anyone might need it.
2907		 */
2908		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2909			ret = 0;
2910		}
2911	} else  if (buffer_dirty(bh)) {
2912		struct reiserfs_journal_list *jl;
2913		struct reiserfs_jh *jh = bh->b_private;
2914
2915		/* why is this safe?
2916		 * reiserfs_setattr updates i_size in the on disk
2917		 * stat data before allowing vmtruncate to be called.
2918		 *
2919		 * If buffer was put onto the ordered list for this
2920		 * transaction, we know for sure either this transaction
2921		 * or an older one already has updated i_size on disk,
2922		 * and this ordered data won't be referenced in the file
2923		 * if we crash.
2924		 *
2925		 * if the buffer was put onto the ordered list for an older
2926		 * transaction, we need to leave it around
2927		 */
2928		if (jh && (jl = jh->jl)
2929		    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2930			ret = 0;
2931	}
2932      free_jh:
2933	if (ret && bh->b_private) {
2934		reiserfs_free_jh(bh);
2935	}
2936	spin_unlock(&j->j_dirty_buffers_lock);
2937	unlock_buffer(bh);
2938	return ret;
2939}
2940
2941/* clm -- taken from fs/buffer.c:block_invalidate_page */
2942static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
2943{
2944	struct buffer_head *head, *bh, *next;
2945	struct inode *inode = page->mapping->host;
2946	unsigned int curr_off = 0;
2947	int ret = 1;
2948
2949	BUG_ON(!PageLocked(page));
2950
2951	if (offset == 0)
2952		ClearPageChecked(page);
2953
2954	if (!page_has_buffers(page))
2955		goto out;
2956
2957	head = page_buffers(page);
2958	bh = head;
2959	do {
2960		unsigned int next_off = curr_off + bh->b_size;
2961		next = bh->b_this_page;
2962
2963		/*
2964		 * is this block fully invalidated?
2965		 */
2966		if (offset <= curr_off) {
2967			if (invalidatepage_can_drop(inode, bh))
2968				reiserfs_unmap_buffer(bh);
2969			else
2970				ret = 0;
2971		}
2972		curr_off = next_off;
2973		bh = next;
2974	} while (bh != head);
2975
2976	/*
2977	 * We release buffers only if the entire page is being invalidated.
2978	 * The get_block cached value has been unconditionally invalidated,
2979	 * so real IO is not possible anymore.
2980	 */
2981	if (!offset && ret) {
2982		ret = try_to_release_page(page, 0);
2983		/* maybe should BUG_ON(!ret); - neilb */
2984	}
2985      out:
2986	return;
2987}
2988
2989static int reiserfs_set_page_dirty(struct page *page)
2990{
2991	struct inode *inode = page->mapping->host;
2992	if (reiserfs_file_data_log(inode)) {
2993		SetPageChecked(page);
2994		return __set_page_dirty_nobuffers(page);
2995	}
2996	return __set_page_dirty_buffers(page);
2997}
2998
2999/*
3000 * Returns 1 if the page's buffers were dropped.  The page is locked.
3001 *
3002 * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
3003 * in the buffers at page_buffers(page).
3004 *
3005 * even in -o notail mode, we can't be sure an old mount without -o notail
3006 * didn't create files with tails.
3007 */
3008static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
3009{
3010	struct inode *inode = page->mapping->host;
3011	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
3012	struct buffer_head *head;
3013	struct buffer_head *bh;
3014	int ret = 1;
3015
3016	WARN_ON(PageChecked(page));
3017	spin_lock(&j->j_dirty_buffers_lock);
3018	head = page_buffers(page);
3019	bh = head;
3020	do {
3021		if (bh->b_private) {
3022			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
3023				reiserfs_free_jh(bh);
3024			} else {
3025				ret = 0;
3026				break;
3027			}
3028		}
3029		bh = bh->b_this_page;
3030	} while (bh != head);
3031	if (ret)
3032		ret = try_to_free_buffers(page);
3033	spin_unlock(&j->j_dirty_buffers_lock);
3034	return ret;
3035}
3036
3037/* We thank Mingming Cao for helping us understand in great detail what
3038   to do in this section of the code. */
3039static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3040				  const struct iovec *iov, loff_t offset,
3041				  unsigned long nr_segs)
3042{
3043	struct file *file = iocb->ki_filp;
3044	struct inode *inode = file->f_mapping->host;
3045
3046	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3047				  offset, nr_segs,
3048				  reiserfs_get_blocks_direct_io, NULL);
3049}
3050
3051int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3052{
3053	struct inode *inode = dentry->d_inode;
3054	int error;
3055	unsigned int ia_valid;
3056
3057	/* must be turned off for recursive notify_change calls */
3058	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3059
3060	reiserfs_write_lock(inode->i_sb);
3061	if (attr->ia_valid & ATTR_SIZE) {
3062		/* version 2 items will be caught by the s_maxbytes check
3063		 ** done for us in vmtruncate
3064		 */
3065		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
3066		    attr->ia_size > MAX_NON_LFS) {
3067			error = -EFBIG;
3068			goto out;
3069		}
3070		/* fill in hole pointers in the expanding truncate case. */
3071		if (attr->ia_size > inode->i_size) {
3072			error = generic_cont_expand_simple(inode, attr->ia_size);
3073			if (REISERFS_I(inode)->i_prealloc_count > 0) {
3074				int err;
3075				struct reiserfs_transaction_handle th;
3076				/* we're changing at most 2 bitmaps, inode + super */
3077				err = journal_begin(&th, inode->i_sb, 4);
3078				if (!err) {
3079					reiserfs_discard_prealloc(&th, inode);
3080					err = journal_end(&th, inode->i_sb, 4);
3081				}
3082				if (err)
3083					error = err;
3084			}
3085			if (error)
3086				goto out;
3087			/*
3088			 * file size is changed, ctime and mtime are
3089			 * to be updated
3090			 */
3091			attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
3092		}
3093	}
3094
3095	if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
3096	     ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
3097	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
3098		/* stat data of format v3.5 has 16 bit uid and gid */
3099		error = -EINVAL;
3100		goto out;
3101	}
3102
3103	error = inode_change_ok(inode, attr);
3104	if (!error) {
3105		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3106		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3107			error = reiserfs_chown_xattrs(inode, attr);
3108
3109			if (!error) {
3110				struct reiserfs_transaction_handle th;
3111				int jbegin_count =
3112				    2 *
3113				    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
3114				     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
3115				    2;
3116
3117				/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
3118				error =
3119				    journal_begin(&th, inode->i_sb,
3120						  jbegin_count);
3121				if (error)
3122					goto out;
3123				error =
3124				    vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
3125				if (error) {
3126					journal_end(&th, inode->i_sb,
3127						    jbegin_count);
3128					goto out;
3129				}
3130				/* Update corresponding info in inode so that everything is in
3131				 * one transaction */
3132				if (attr->ia_valid & ATTR_UID)
3133					inode->i_uid = attr->ia_uid;
3134				if (attr->ia_valid & ATTR_GID)
3135					inode->i_gid = attr->ia_gid;
3136				mark_inode_dirty(inode);
3137				error =
3138				    journal_end(&th, inode->i_sb, jbegin_count);
3139			}
3140		}
3141		if (!error)
3142			error = inode_setattr(inode, attr);
3143	}
3144
3145	if (!error && reiserfs_posixacl(inode->i_sb)) {
3146		if (attr->ia_valid & ATTR_MODE)
3147			error = reiserfs_acl_chmod(inode);
3148	}
3149
3150      out:
3151	reiserfs_write_unlock(inode->i_sb);
3152	return error;
3153}
3154
3155const struct address_space_operations reiserfs_address_space_operations = {
3156	.writepage = reiserfs_writepage,
3157	.readpage = reiserfs_readpage,
3158	.readpages = reiserfs_readpages,
3159	.releasepage = reiserfs_releasepage,
3160	.invalidatepage = reiserfs_invalidatepage,
3161	.sync_page = block_sync_page,
3162	.write_begin = reiserfs_write_begin,
3163	.write_end = reiserfs_write_end,
3164	.bmap = reiserfs_aop_bmap,
3165	.direct_IO = reiserfs_direct_IO,
3166	.set_page_dirty = reiserfs_set_page_dirty,
3167};
3168