inode.c revision 0222e6571c332563a48d4cf5487b67feabe60b5e
/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <linux/exportfs.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/swap.h>

int reiserfs_commit_write(struct file *f, struct page *page,
			  unsigned from, unsigned to);
int reiserfs_prepare_write(struct file *f, struct page *page,
			   unsigned from, unsigned to);

void reiserfs_delete_inode(struct inode *inode)
{
	/* We need blocks for transaction + (user+group) quota update (possibly delete) */
	int jbegin_count =
	    JOURNAL_PER_BALANCE_CNT * 2 +
	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
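	/* Illustrative sizing, assuming JOURNAL_PER_BALANCE_CNT == 3 and
	 * quota support compiled out (REISERFS_QUOTA_INIT_BLOCKS(sb) == 0):
	 * jbegin_count comes to 3 * 2 + 0 == 6 reserved journal blocks.
	 */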
	struct reiserfs_transaction_handle th;
	int err;

	truncate_inode_pages(&inode->i_data, 0);

	reiserfs_write_lock(inode->i_sb);

	/* An objectid of 0 happens when we abort creating a new inode for
	   some reason, such as lack of space.. */
	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {	/* also handles bad_inode case */
		reiserfs_delete_xattrs(inode);

		if (journal_begin(&th, inode->i_sb, jbegin_count))
			goto out;
		reiserfs_update_inode_transaction(inode);

		reiserfs_discard_prealloc(&th, inode);

		err = reiserfs_delete_object(&th, inode);

		/* Do the quota update inside a transaction for journaled quotas. We
		 * must do it after delete_object so that the quota update goes into
		 * the same transaction as the stat data deletion */
		if (!err)
			DQUOT_FREE_INODE(inode);

		if (journal_end(&th, inode->i_sb, jbegin_count))
			goto out;

		/* check return value from reiserfs_delete_object after
		 * ending the transaction
		 */
		if (err)
			goto out;

		/* all items of the file are deleted, so we can remove the "save" link */
		remove_save_link(inode, 0 /* not truncate */ );	/* we can't do anything
								 * about an error here */
	} else {
		/* no object items are in the tree */
		;
	}
      out:
	clear_inode(inode);	/* note this must go after the journal_end to prevent deadlock */
	inode->i_blocks = 0;
	reiserfs_write_unlock(inode->i_sb);
}

static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
			  __u32 objectid, loff_t offset, int type, int length)
{
	key->version = version;

	key->on_disk_key.k_dir_id = dirid;
	key->on_disk_key.k_objectid = objectid;
	set_cpu_key_k_offset(key, offset);
	set_cpu_key_k_type(key, type);
	key->key_length = length;
}

/* take the base of inode_key (it always comes from the inode) (dirid,
   objectid) and the version from an inode; set the offset and type of key */
void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
		  int type, int length)
{
	_make_cpu_key(key, get_inode_item_key_version(inode),
		      le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
		      le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
		      length);
}
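
/* Worked example (illustrative, assuming a 4 KiB block size): key offsets
 * are 1-based byte positions in the file, so the key for the 'block'-th
 * logical block gets offset block * blocksize + 1; for block == 3 that is
 * 3 * 4096 + 1 == 12289, exactly as built by the make_cpu_key() call in
 * _get_block_create_0() below.
 */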

//
// when key is NULL, the short key (dir_id, objectid) is left unset
//
inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
			      int version,
			      loff_t offset, int type, int length,
			      int entry_count /*or ih_free_space */ )
{
	if (key) {
		ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
		ih->ih_key.k_objectid =
		    cpu_to_le32(key->on_disk_key.k_objectid);
	}
	put_ih_version(ih, version);
	set_le_ih_k_offset(ih, offset);
	set_le_ih_k_type(ih, type);
	put_ih_item_len(ih, length);
	/*    set_ih_free_space (ih, 0); */
	// for directory items this is the entry count; for direct and stat
	// data items it is 0xffff, and for indirect items it is 0
	put_ih_entry_count(ih, entry_count);
}

//
// FIXME: we might cache recently accessed indirect items

// Ugh.  Not too eager for that....
//  I cut the code until such time as I see a convincing argument (benchmark).
// I don't want a bloated inode struct..., and I don't like code complexity....

/* cutting the code is fine, since it really isn't in use yet and is easy
** to add back in.  But, Vladimir has a really good idea here.  Think
** about what happens for reading a file.  For each page,
** the VFS layer calls reiserfs_readpage, which searches the tree to find
** an indirect item.  This indirect item has X number of pointers, where
** X is a big number if we've done the block allocation right.  But,
** we only use one or two of these pointers during each call to readpage,
** needlessly re-searching again later on.
**
** The size of the cache could be dynamic based on the size of the file.
**
** I'd also like to see us cache the location of the stat data item, since
** we are needlessly re-searching for that frequently.
**
** --chris
*/

/* If this page has a file tail in it, and
** it was read in by get_block_create_0, the page data is valid,
** but the tail is still sitting in a direct item, and we can't write to
** it.  So, look through this page, and check all the mapped buffers
** to make sure they have valid block numbers.  Any that don't have valid
** block numbers need to be unmapped, so that block_prepare_write will
** correctly call reiserfs_get_block to convert the tail into an
** unformatted node
*/
static inline void fix_tail_page_for_writing(struct page *page)
{
	struct buffer_head *head, *next, *bh;

	if (page && page_has_buffers(page)) {
		head = page_buffers(page);
		bh = head;
		do {
			next = bh->b_this_page;
			if (buffer_mapped(bh) && bh->b_blocknr == 0) {
				reiserfs_unmap_buffer(bh);
			}
			bh = next;
		} while (bh != head);
	}
}

/* reiserfs_get_block does not need to allocate a block if one has already
   been allocated, or if a non-hole position has been found in the indirect
   item */
static inline int allocation_needed(int retval, b_blocknr_t allocated,
				    struct item_head *ih,
				    __le32 * item, int pos_in_item)
{
	if (allocated)
		return 0;
	if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
	    get_block_num(item, pos_in_item))
		return 0;
	return 1;
}

static inline int indirect_item_found(int retval, struct item_head *ih)
{
	return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
}

static inline void set_block_dev_mapped(struct buffer_head *bh,
					b_blocknr_t block, struct inode *inode)
{
	map_bh(bh, inode->i_sb, block);
}

//
// files which were created in the earlier (3.5) format cannot be longer
// than 2 GB
//
static int file_capable(struct inode *inode, sector_t block)
{
	if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||	// it is a new-format file
	    block < (1 << (31 - inode->i_sb->s_blocksize_bits)))	// old file, but 'block' is within 2 GB
		return 1;

	return 0;
}
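
/* Worked example (illustrative, assuming a 4 KiB block size, so
 * s_blocksize_bits == 12): for a KEY_FORMAT_3_5 file, blocks numbered
 * 1 << (31 - 12) == 524288 and up (byte offsets of 2 GB and beyond) fail
 * this check, and the callers return -EFBIG.
 */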

static int restart_transaction(struct reiserfs_transaction_handle *th,
			       struct inode *inode, struct treepath *path)
{
	struct super_block *s = th->t_super;
	int len = th->t_blocks_allocated;
	int err;

	BUG_ON(!th->t_trans_id);
	BUG_ON(!th->t_refcount);

	pathrelse(path);

	/* we cannot restart while nested */
	if (th->t_refcount > 1) {
		return 0;
	}
	reiserfs_update_sd(th, inode);
	err = journal_end(th, s, len);
	if (!err) {
		err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
		if (!err)
			reiserfs_update_inode_transaction(inode);
	}
	return err;
}

// Called by reiserfs_get_block when create == 0.  Returns the block number
// for the 'block'-th logical block of the file.  When it hits a direct item
// it either returns 0 (when called from bmap) or reads the direct item into
// a piece of the page (bh_result).

static int _get_block_create_0(struct inode *inode, sector_t block,
			       struct buffer_head *bh_result, int args)
{
	INITIALIZE_PATH(path);
	struct cpu_key key;
	struct buffer_head *bh;
	struct item_head *ih, tmp_ih;
	int fs_gen;
	b_blocknr_t blocknr;
	char *p = NULL;
	int chars;
	int ret;
	int result;
	int done = 0;
	unsigned long offset;

	// prepare the key to look for the 'block'-th block of the file
	make_cpu_key(&key, inode,
		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
		     3);

      research:
	result = search_for_position_by_key(inode->i_sb, &key, &path);
	if (result != POSITION_FOUND) {
		pathrelse(&path);
		if (p)
			kunmap(bh_result->b_page);
		if (result == IO_ERROR)
			return -EIO;
		// We do not return -ENOENT if there is a hole but the page is
		// uptodate, because it means that there is some mmapped data
		// associated with it that is yet to be written to disk.
		if ((args & GET_BLOCK_NO_HOLE)
		    && !PageUptodate(bh_result->b_page)) {
			return -ENOENT;
		}
		return 0;
	}
	//
	bh = get_last_bh(&path);
	ih = get_ih(&path);
	if (is_indirect_le_ih(ih)) {
		__le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);

		/* FIXME: here we could cache the indirect item, or part of it,
		   in the inode to avoid search_by_key in case of subsequent
		   access to the file */
		blocknr = get_block_num(ind_item, path.pos_in_item);
		ret = 0;
		if (blocknr) {
			map_bh(bh_result, inode->i_sb, blocknr);
			if (path.pos_in_item ==
			    ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
				set_buffer_boundary(bh_result);
			}
		} else
			// We do not return -ENOENT if there is a hole but the
			// page is uptodate, because it means that there is some
			// mmapped data associated with it that is yet to be
			// written to disk.
		if ((args & GET_BLOCK_NO_HOLE)
			    && !PageUptodate(bh_result->b_page)) {
			ret = -ENOENT;
		}

		pathrelse(&path);
		if (p)
			kunmap(bh_result->b_page);
		return ret;
	}
	// requested data are in direct item(s)
	if (!(args & GET_BLOCK_READ_DIRECT)) {
		// we are called by bmap. FIXME: we cannot map a block of the
		// file when it is stored in direct item(s)
		pathrelse(&path);
		if (p)
			kunmap(bh_result->b_page);
		return -ENOENT;
	}

	/* if we've got a direct item, and the buffer or page was uptodate,
	 ** we don't want to pull data off disk again.  skip to the
	 ** end, where we map the buffer and return
	 */
	if (buffer_uptodate(bh_result)) {
		goto finished;
	} else
		/*
		 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
		 ** pages without any buffers.  If the page is up to date, we don't want
		 ** to read old data off disk.  Set the up to date bit on the buffer instead
		 ** and jump to the end
		 */
	if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
		set_buffer_uptodate(bh_result);
		goto finished;
	}
	// read the file tail into part of the page
	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
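	/* Example (illustrative, assuming PAGE_CACHE_SIZE == 4096): key
	 * offsets are 1-based, so a key offset of 5001 (file byte 5000)
	 * gives (5001 - 1) & 4095 == 904 as the offset within the page.
	 */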
	fs_gen = get_generation(inode->i_sb);
	copy_item_head(&tmp_ih, ih);

	/* we only want to kmap if we are reading the tail into the page.
	 ** this is not the common case, so we don't kmap until we are
	 ** sure we need to.  But, this means the item might move if
	 ** kmap schedules
	 */
	if (!p) {
		p = (char *)kmap(bh_result->b_page);
		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			goto research;
		}
	}
	p += offset;
	memset(p, 0, inode->i_sb->s_blocksize);
	do {
		if (!is_direct_le_ih(ih)) {
			BUG();
		}
		/* make sure we don't read more bytes than actually exist in
		 ** the file.  This can happen in odd cases where i_size isn't
		 ** correct, and when direct item padding results in a few
		 ** extra bytes at the end of the direct item
		 */
		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
			break;
		if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
			chars =
			    inode->i_size - (le_ih_k_offset(ih) - 1) -
			    path.pos_in_item;
			done = 1;
		} else {
			chars = ih_item_len(ih) - path.pos_in_item;
		}
		memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);

		if (done)
			break;

		p += chars;

		if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
			// we're done if the direct item we read is not the last
			// item of the node.  FIXME: we could try to check the
			// right delimiting key to see whether the direct item
			// continues in the right neighbor, or rely on i_size
			break;

		// update the key to look for the next piece
		set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
		result = search_for_position_by_key(inode->i_sb, &key, &path);
		if (result != POSITION_FOUND)
			// i/o error most likely
			break;
		bh = get_last_bh(&path);
		ih = get_ih(&path);
	} while (1);

	flush_dcache_page(bh_result->b_page);
	kunmap(bh_result->b_page);

      finished:
	pathrelse(&path);

	if (result == IO_ERROR)
		return -EIO;

	/* this buffer has valid data, but isn't valid for io.  mapping it to
	 * block #0 tells the rest of reiserfs it just has a tail in it
	 */
	map_bh(bh_result, inode->i_sb, 0);
	set_buffer_uptodate(bh_result);
	return 0;
}

// this is called to create the file map. So, _get_block_create_0 will not
// read a direct item
static int reiserfs_bmap(struct inode *inode, sector_t block,
			 struct buffer_head *bh_result, int create)
{
	if (!file_capable(inode, block))
		return -EFBIG;

	reiserfs_write_lock(inode->i_sb);
	/* do not read the direct item */
	_get_block_create_0(inode, block, bh_result, 0);
	reiserfs_write_unlock(inode->i_sb);
	return 0;
}

/* special version of get_block that is only used by grab_tail_page right
** now.  It is sent to block_prepare_write, and when you try to get a
** block past the end of the file (or a block from a hole) it returns
** -ENOENT instead of a valid buffer.  block_prepare_write expects to
** be able to do i/o on the buffers returned, unless an error value
** is also returned.
**
** So, this allows block_prepare_write to be used for reading a single block
** in a page, where it does not produce a valid page for holes or past the
** end of the file.  This turns out to be exactly what we need for reading
** tails for conversion.
**
** The point of the wrapper is forcing a certain value for create, even
** though the VFS layer is calling this function with create==1.  If you
** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
** don't use this function.
*/
static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
				       struct buffer_head *bh_result,
				       int create)
{
	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
}

/* This is a special helper for reiserfs_get_block for when we are executing
   a direct_IO request. */
static int reiserfs_get_blocks_direct_io(struct inode *inode,
					 sector_t iblock,
					 struct buffer_head *bh_result,
					 int create)
{
	int ret;

	bh_result->b_page = NULL;

	/* We set the b_size before the reiserfs_get_block call since it is
	   referenced in convert_tail_for_hole() that may be called from
	   reiserfs_get_block() */
	bh_result->b_size = (1 << inode->i_blkbits);

	ret = reiserfs_get_block(inode, iblock, bh_result,
				 create | GET_BLOCK_NO_DANGLE);
	if (ret)
		goto out;

	/* don't allow direct io onto tail pages */
	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
		/* make sure future calls to the direct io funcs for this offset
		 ** in the file fail by unmapping the buffer
		 */
		clear_buffer_mapped(bh_result);
		ret = -EINVAL;
	}
	/* Possibly an unpacked tail. Flush the data before the pages have
	   disappeared */
	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
		int err;
		lock_kernel();
		err = reiserfs_commit_for_inode(inode);
		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
		unlock_kernel();
		if (err < 0)
			ret = err;
	}
      out:
	return ret;
}

/*
** helper function for when reiserfs_get_block is called for a hole
** but the file tail is still in a direct item
** bh_result is the buffer head for the hole
** tail_offset is the offset of the start of the tail in the file
**
** This calls prepare_write, which will start a new transaction.  You
** should not be in a transaction, or have any paths held, when you
** call this.
*/
static int convert_tail_for_hole(struct inode *inode,
				 struct buffer_head *bh_result,
				 loff_t tail_offset)
{
	unsigned long index;
	unsigned long tail_end;
	unsigned long tail_start;
	struct page *tail_page;
	struct page *hole_page = bh_result->b_page;
	int retval = 0;

	if ((tail_offset & (bh_result->b_size - 1)) != 1)
		return -EIO;

	/* always try to read until the end of the block */
	tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
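	/* Worked example (illustrative, assuming PAGE_CACHE_SIZE == 4096 and
	 * a 1 KiB block, so b_size == 1024): tail_offset == 6145 passes the
	 * check above (6145 & 1023 == 1) and yields tail_start == 6145 & 4095
	 * == 2049 and tail_end == (2049 | 1023) + 1 == 3072, i.e. the whole
	 * block holding the tail within its page.
	 */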

	index = tail_offset >> PAGE_CACHE_SHIFT;
	/* hole_page can be NULL in case of direct_io; we are sure
	   that we cannot get here if we write with O_DIRECT into a
	   tail page */
	if (!hole_page || index != hole_page->index) {
		tail_page = grab_cache_page(inode->i_mapping, index);
		retval = -ENOMEM;
		if (!tail_page) {
			goto out;
		}
	} else {
		tail_page = hole_page;
	}

	/* we don't have to make sure the conversion did not happen while
	 ** we were locking the page because anyone that could convert
	 ** must first take i_mutex.
	 **
	 ** We must fix the tail page for writing because it might have buffers
	 ** that are mapped, but have a block number of 0.  This indicates tail
	 ** data that has been read directly into the page, and block_prepare_write
	 ** won't trigger a get_block in this case.
	 */
	fix_tail_page_for_writing(tail_page);
	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
	if (retval)
		goto unlock;

	/* tail conversion might change the data in the page */
	flush_dcache_page(tail_page);

	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);

      unlock:
	if (tail_page != hole_page) {
		unlock_page(tail_page);
		page_cache_release(tail_page);
	}
      out:
	return retval;
}

static inline int _allocate_block(struct reiserfs_transaction_handle *th,
				  sector_t block,
				  struct inode *inode,
				  b_blocknr_t * allocated_block_nr,
				  struct treepath *path, int flags)
{
	BUG_ON(!th->t_trans_id);

#ifdef REISERFS_PREALLOCATE
	if (!(flags & GET_BLOCK_NO_IMUX)) {
		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
						  path, block);
	}
#endif
	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
					 block);
}

int reiserfs_get_block(struct inode *inode, sector_t block,
		       struct buffer_head *bh_result, int create)
{
	int repeat, retval = 0;
	b_blocknr_t allocated_block_nr = 0;	// b_blocknr_t is (unsigned) 32 bit int
	INITIALIZE_PATH(path);
	int pos_in_item;
	struct cpu_key key;
	struct buffer_head *bh, *unbh = NULL;
	struct item_head *ih, tmp_ih;
	__le32 *item;
	int done;
	int fs_gen;
	struct reiserfs_transaction_handle *th = NULL;
	/* space reserved in transaction batch:
	   . 3 balancings in direct->indirect conversion
	   . 1 block involved into reiserfs_update_sd()
	   XXX in the practically impossible worst case direct2indirect()
	   can incur (much) more than 3 balancings.
	   quota update for user, group */
	int jbegin_count =
	    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
	    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
	int version;
	int dangle = 1;
	loff_t new_offset =
	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;

	/* bad.... */
	reiserfs_write_lock(inode->i_sb);
	version = get_inode_item_key_version(inode);

	if (!file_capable(inode, block)) {
		reiserfs_write_unlock(inode->i_sb);
		return -EFBIG;
	}

	/* if !create, we aren't changing the FS, so we don't need to
	 ** log anything, so we don't need to start a transaction
	 */
	if (!(create & GET_BLOCK_CREATE)) {
		int ret;
		/* find the number of the 'block'-th logical block of the file */
		ret = _get_block_create_0(inode, block, bh_result,
					  create | GET_BLOCK_READ_DIRECT);
		reiserfs_write_unlock(inode->i_sb);
		return ret;
	}
	/*
	 * if we're already in a transaction, make sure to close
	 * any new transactions we start in this func
	 */
	if ((create & GET_BLOCK_NO_DANGLE) ||
	    reiserfs_transaction_running(inode->i_sb))
		dangle = 0;

	/* If the file is of such a size that it might have a tail, and tails
	 ** are enabled, we should mark it as possibly needing tail packing on
	 ** close
	 */
	if ((have_large_tails(inode->i_sb)
	     && inode->i_size < i_block_size(inode) * 4)
	    || (have_small_tails(inode->i_sb)
		&& inode->i_size < i_block_size(inode)))
		REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;

	/* set the key of the first byte in the 'block'-th block of the file */
	make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
	if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
	      start_trans:
		th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
		if (!th) {
			retval = -ENOMEM;
			goto failure;
		}
		reiserfs_update_inode_transaction(inode);
	}
      research:

	retval = search_for_position_by_key(inode->i_sb, &key, &path);
	if (retval == IO_ERROR) {
		retval = -EIO;
		goto failure;
	}

	bh = get_last_bh(&path);
	ih = get_ih(&path);
	item = get_item(&path);
	pos_in_item = path.pos_in_item;

	fs_gen = get_generation(inode->i_sb);
	copy_item_head(&tmp_ih, ih);

	if (allocation_needed
	    (retval, allocated_block_nr, ih, item, pos_in_item)) {
		/* we have to allocate a block for the unformatted node */
		if (!th) {
			pathrelse(&path);
			goto start_trans;
		}

		repeat =
		    _allocate_block(th, block, inode, &allocated_block_nr,
				    &path, create);

		if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
			/* restart the transaction to give the journal a chance to free
			 ** some blocks.  releases the path, so we have to go back to
			 ** research if we succeed on the second try
			 */
			SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
			retval = restart_transaction(th, inode, &path);
			if (retval)
				goto failure;
			repeat =
			    _allocate_block(th, block, inode,
					    &allocated_block_nr, NULL, create);

			if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
				goto research;
			}
			if (repeat == QUOTA_EXCEEDED)
				retval = -EDQUOT;
			else
				retval = -ENOSPC;
			goto failure;
		}

		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			goto research;
		}
	}

	if (indirect_item_found(retval, ih)) {
		b_blocknr_t unfm_ptr;
		/* the 'block'-th block is in the file already (there is a
		   corresponding cell in some indirect item). But it may be
		   a zero unformatted node pointer (hole) */
		unfm_ptr = get_block_num(item, pos_in_item);
		if (unfm_ptr == 0) {
			/* use the allocated block to plug the hole */
			reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
			if (fs_changed(fs_gen, inode->i_sb)
			    && item_moved(&tmp_ih, &path)) {
				reiserfs_restore_prepared_buffer(inode->i_sb,
								 bh);
				goto research;
			}
			set_buffer_new(bh_result);
			if (buffer_dirty(bh_result)
			    && reiserfs_data_ordered(inode->i_sb))
				reiserfs_add_ordered_list(inode, bh_result);
			put_block_num(item, pos_in_item, allocated_block_nr);
			unfm_ptr = allocated_block_nr;
			journal_mark_dirty(th, inode->i_sb, bh);
			reiserfs_update_sd(th, inode);
		}
		set_block_dev_mapped(bh_result, unfm_ptr, inode);
		pathrelse(&path);
		retval = 0;
		if (!dangle && th)
			retval = reiserfs_end_persistent_transaction(th);

		reiserfs_write_unlock(inode->i_sb);

		/* the item was found, so new blocks were not added to the file.
		 ** there is no need to make sure the inode is updated with this
		 ** transaction
		 */
		return retval;
	}

	if (!th) {
		pathrelse(&path);
		goto start_trans;
	}

	/* the desired position is not found, or is in a direct item. We have
	   to append the file with holes up to the 'block'-th block, converting
	   direct items to indirect ones if necessary */
	done = 0;
	do {
		if (is_statdata_le_ih(ih)) {
			__le32 unp = 0;
			struct cpu_key tmp_key;

			/* an indirect item has to be inserted */
			make_le_item_head(&tmp_ih, &key, version, 1,
					  TYPE_INDIRECT, UNFM_P_SIZE,
					  0 /* free_space */ );

			if (cpu_key_k_offset(&key) == 1) {
				/* we are going to add the 'block'-th block to the
				   file. Use the allocated block for that */
				unp = cpu_to_le32(allocated_block_nr);
				set_block_dev_mapped(bh_result,
						     allocated_block_nr, inode);
				set_buffer_new(bh_result);
				done = 1;
			}
			tmp_key = key;	// ;)
			set_cpu_key_k_offset(&tmp_key, 1);
			PATH_LAST_POSITION(&path)++;

			retval =
			    reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
						 inode, (char *)&unp);
			if (retval) {
				reiserfs_free_block(th, inode,
						    allocated_block_nr, 1);
				goto failure;	// retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
			}
			//mark_tail_converted (inode);
		} else if (is_direct_le_ih(ih)) {
			/* the direct item has to be converted */
			loff_t tail_offset;

			tail_offset =
			    ((le_ih_k_offset(ih) -
			      1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
			if (tail_offset == cpu_key_k_offset(&key)) {
				/* the direct item we just found fits into the block we
				   have to map. Convert it into an unformatted node: use
				   bh_result for the conversion */
				set_block_dev_mapped(bh_result,
						     allocated_block_nr, inode);
				unbh = bh_result;
				done = 1;
			} else {
				/* we have to pad the file tail stored in direct item(s)
				   up to the block size and convert it to an unformatted
				   node. FIXME: this should also get into the page cache */

				pathrelse(&path);
				/*
				 * ugly, but we can only end the transaction if
				 * we aren't nested
				 */
				BUG_ON(!th->t_refcount);
				if (th->t_refcount == 1) {
					retval =
					    reiserfs_end_persistent_transaction
					    (th);
					th = NULL;
					if (retval)
						goto failure;
				}

				retval =
				    convert_tail_for_hole(inode, bh_result,
							  tail_offset);
				if (retval) {
					if (retval != -ENOSPC)
						reiserfs_error(inode->i_sb,
							"clm-6004",
							"convert tail failed "
							"inode %lu, error %d",
							inode->i_ino,
							retval);
					if (allocated_block_nr) {
						/* the bitmap, the super, and the stat data == 3 */
						if (!th)
							th = reiserfs_persistent_transaction(inode->i_sb, 3);
						if (th)
							reiserfs_free_block(th,
									    inode,
									    allocated_block_nr,
									    1);
					}
					goto failure;
				}
				goto research;
			}
			retval =
			    direct2indirect(th, inode, &path, unbh,
					    tail_offset);
			if (retval) {
				reiserfs_unmap_buffer(unbh);
				reiserfs_free_block(th, inode,
						    allocated_block_nr, 1);
				goto failure;
			}
			/* it is important that the set_buffer_uptodate is done after
			 ** the direct2indirect.  The buffer might contain valid
			 ** data newer than the data on disk (read by readpage, changed,
			 ** and then sent here by writepage).  direct2indirect needs
			 ** to know if unbh was already up to date, so it can decide
			 ** if the data in unbh needs to be replaced with data from
			 ** the disk
			 */
			set_buffer_uptodate(unbh);

			/* unbh->b_page == NULL in case of a DIRECT_IO request; this
			   means the buffer will disappear shortly, so it should not
			   be added to the tail list
			 */
			if (unbh->b_page) {
				/* we've converted the tail, so we must
				 ** flush unbh before the transaction commits
				 */
				reiserfs_add_tail_list(inode, unbh);

				/* mark it dirty now to prevent commit_write from adding
				 ** this buffer to the inode's dirty buffer list
				 */
				/*
				 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
				 * It's still atomic, but it sets the page dirty too,
				 * which makes it eligible for writeback at any time by the
				 * VM (which was also the case with __mark_buffer_dirty())
				 */
				mark_buffer_dirty(unbh);
			}
		} else {
			/* append the indirect item with holes if needed; when
			   appending a pointer to the 'block'-th block, use the
			   block that is already allocated */
			struct cpu_key tmp_key;
			unp_t unf_single = 0;	// We use this in case we need to allocate
			// only one block, which is a fast path
			unp_t *un;
			__u64 max_to_insert =
			    MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
			    UNFM_P_SIZE;
			__u64 blocks_needed;

			RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
			       "vs-804: invalid position for append");
			/* the indirect item has to be appended; set up the key of that position */
			make_cpu_key(&tmp_key, inode,
				     le_key_k_offset(version,
						     &(ih->ih_key)) +
				     op_bytes_number(ih,
						     inode->i_sb->s_blocksize),
				     //pos_in_item * inode->i_sb->s_blocksize,
				     TYPE_INDIRECT, 3);	// key type is unimportant

			RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
			       "green-805: invalid offset");
			blocks_needed =
			    1 +
			    ((cpu_key_k_offset(&key) -
			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
			     s_blocksize_bits);
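			/* Worked example (illustrative, assuming a 4 KiB block
			 * size): if the existing indirect item covers bytes
			 * 1..16384 (tmp_key offset 16385, blocks 0-3) and we
			 * want block 10 (key offset 40961), then blocks_needed
			 * == 1 + ((40961 - 16385) >> 12) == 7: six hole
			 * pointers for blocks 4-9 plus one for the newly
			 * allocated block 10.
			 */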

			if (blocks_needed == 1) {
				un = &unf_single;
			} else {
				un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);	// We need to avoid scheduling.
				if (!un) {
					un = &unf_single;
					blocks_needed = 1;
					max_to_insert = 0;
				}
			}
			if (blocks_needed <= max_to_insert) {
				/* we are going to add the target block to the file.
				   Use the allocated block for that */
				un[blocks_needed - 1] =
				    cpu_to_le32(allocated_block_nr);
				set_block_dev_mapped(bh_result,
						     allocated_block_nr, inode);
				set_buffer_new(bh_result);
				done = 1;
			} else {
				/* paste a hole into the indirect item */
				/* If the kzalloc above failed, max_to_insert became
				   zero, which means we only have space for one block */
				blocks_needed =
				    max_to_insert ? max_to_insert : 1;
			}
			retval =
			    reiserfs_paste_into_item(th, &path, &tmp_key, inode,
						     (char *)un,
						     UNFM_P_SIZE *
						     blocks_needed);

			if (blocks_needed != 1)
				kfree(un);

			if (retval) {
				reiserfs_free_block(th, inode,
						    allocated_block_nr, 1);
				goto failure;
			}
			if (!done) {
				/* We need to mark the new file size in case this
				   function is interrupted/aborted later on. And we
				   may do this only for holes. */
				inode->i_size +=
				    inode->i_sb->s_blocksize * blocks_needed;
			}
		}

		if (done == 1)
			break;

		/* this loop could log more blocks than we had originally asked
		 ** for.  So, we have to allow the transaction to end if it is
		 ** too big or too full.  Update the inode so things are
		 ** consistent if we crash before the function returns
		 **
		 ** release the path so that anybody waiting on the path before
		 ** ending their transaction will be able to continue.
		 */
		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
			retval = restart_transaction(th, inode, &path);
			if (retval)
				goto failure;
		}
		/* inserting indirect pointers for a hole can take a
		 ** long time.  reschedule if needed
		 */
		cond_resched();

		retval = search_for_position_by_key(inode->i_sb, &key, &path);
		if (retval == IO_ERROR) {
			retval = -EIO;
			goto failure;
		}
		if (retval == POSITION_FOUND) {
			reiserfs_warning(inode->i_sb, "vs-825",
					 "%K should not be found", &key);
			retval = -EEXIST;
			if (allocated_block_nr)
				reiserfs_free_block(th, inode,
						    allocated_block_nr, 1);
			pathrelse(&path);
			goto failure;
		}
		bh = get_last_bh(&path);
		ih = get_ih(&path);
		item = get_item(&path);
		pos_in_item = path.pos_in_item;
	} while (1);

	retval = 0;

      failure:
	if (th && (!dangle || (retval && !th->t_trans_id))) {
		int err;
		if (th->t_trans_id)
			reiserfs_update_sd(th, inode);
		err = reiserfs_end_persistent_transaction(th);
		if (err)
			retval = err;
	}

	reiserfs_write_unlock(inode->i_sb);
	reiserfs_check_path(&path);
	return retval;
}

static int
reiserfs_readpages(struct file *file, struct address_space *mapping,
		   struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
}

/* Compute the real number of bytes used by the file.
 * The following three functions can go away when we have enough space in
 * the stat item
 */
static int real_space_diff(struct inode *inode, int sd_size)
{
	int bytes;
	loff_t blocksize = inode->i_sb->s_blocksize;

	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
		return sd_size;

	/* End of file is also in a full block with an indirect reference, so
	 ** round up to the next block.
	 **
	 ** there is just no way to know if the tail is actually packed
	 ** on the file, so we have to assume it isn't.  When we pack the
	 ** tail, we add 4 bytes to pretend there really is an unformatted
	 ** node pointer
	 */
	bytes =
	    ((inode->i_size +
	      (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
	    sd_size;
	return bytes;
}
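
/* Worked example (illustrative, assuming a 4 KiB block size and
 * UNFM_P_SIZE == 4, the size of an unformatted node pointer): for a
 * 10000-byte regular file, (10000 + 4095) >> 12 == 3 blocks, so
 * real_space_diff() returns 3 * 4 + sd_size bytes of per-file overhead.
 */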

static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
					int sd_size)
{
	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
		return inode->i_size +
		    (loff_t) (real_space_diff(inode, sd_size));
	}
	return ((loff_t) real_space_diff(inode, sd_size)) +
	    (((loff_t) blocks) << 9);
}

/* Compute the number of blocks used by the file in ReiserFS counting */
static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
{
	loff_t bytes = inode_get_bytes(inode);
	loff_t real_space = real_space_diff(inode, sd_size);

	/* keeps fsck and non-quota versions of reiserfs happy */
	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
		bytes += (loff_t) 511;
	}

	/* files from before the quota patch might have i_blocks such that
	 ** bytes < real_space.  Deal with that here to prevent it from
	 ** going negative.
	 */
	if (bytes < real_space)
		return 0;
	return (bytes - real_space) >> 9;
}

//
// BAD: new directories have stat data of the new type and all other items
// of the old type. The version stored in the inode describes the body items,
// so in update_stat_data we cannot rely on the inode, but have to check
// the item version directly
//

// called by read_locked_inode
static void init_inode(struct inode *inode, struct treepath *path)
{
	struct buffer_head *bh;
	struct item_head *ih;
	__u32 rdev;
	//int version = ITEM_VERSION_1;

	bh = PATH_PLAST_BUFFER(path);
	ih = PATH_PITEM_HEAD(path);

	copy_key(INODE_PKEY(inode), &(ih->ih_key));

	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
	REISERFS_I(inode)->i_flags = 0;
	REISERFS_I(inode)->i_prealloc_block = 0;
	REISERFS_I(inode)->i_prealloc_count = 0;
	REISERFS_I(inode)->i_trans_id = 0;
	REISERFS_I(inode)->i_jl = NULL;
	mutex_init(&(REISERFS_I(inode)->i_mmap));
	reiserfs_init_acl_access(inode);
	reiserfs_init_acl_default(inode);
	reiserfs_init_xattr_rwsem(inode);

	if (stat_data_v1(ih)) {
		struct stat_data_v1 *sd =
		    (struct stat_data_v1 *)B_I_PITEM(bh, ih);
		unsigned long blocks;

		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
		set_inode_sd_version(inode, STAT_DATA_V1);
		inode->i_mode = sd_v1_mode(sd);
		inode->i_nlink = sd_v1_nlink(sd);
		inode->i_uid = sd_v1_uid(sd);
		inode->i_gid = sd_v1_gid(sd);
		inode->i_size = sd_v1_size(sd);
		inode->i_atime.tv_sec = sd_v1_atime(sd);
		inode->i_mtime.tv_sec = sd_v1_mtime(sd);
		inode->i_ctime.tv_sec = sd_v1_ctime(sd);
		inode->i_atime.tv_nsec = 0;
		inode->i_ctime.tv_nsec = 0;
		inode->i_mtime.tv_nsec = 0;

		inode->i_blocks = sd_v1_blocks(sd);
		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
		blocks = (inode->i_size + 511) >> 9;
		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
		if (inode->i_blocks > blocks) {
			// there was a bug in <=3.5.23 where i_blocks could take
			// negative values. Starting from 3.5.17 this value could
			// even be stored in the stat data. For such files we set
			// i_blocks based on the file size. Two notes: this can be
			// wrong for sparse files, and the on-disk value will only
			// be updated if the file's inode ever changes
			inode->i_blocks = blocks;
		}

		rdev = sd_v1_rdev(sd);
		REISERFS_I(inode)->i_first_direct_byte =
		    sd_v1_first_direct_byte(sd);
		/* an early bug in the quota code can give us an odd number for the
		 ** block count.  This is incorrect; fix it here.
		 */
		if (inode->i_blocks & 1) {
			inode->i_blocks++;
		}
		inode_set_bytes(inode,
				to_real_used_space(inode, inode->i_blocks,
						   SD_V1_SIZE));
		/* nopack is initially zero for v1 objects. For v2 objects,
		   nopack is initialised from sd_attrs */
		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
	} else {
		// new stat data found, but the object may have old items
		// (directories and symlinks)
		struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);

		inode->i_mode = sd_v2_mode(sd);
		inode->i_nlink = sd_v2_nlink(sd);
		inode->i_uid = sd_v2_uid(sd);
		inode->i_size = sd_v2_size(sd);
		inode->i_gid = sd_v2_gid(sd);
		inode->i_mtime.tv_sec = sd_v2_mtime(sd);
		inode->i_atime.tv_sec = sd_v2_atime(sd);
		inode->i_ctime.tv_sec = sd_v2_ctime(sd);
		inode->i_ctime.tv_nsec = 0;
		inode->i_mtime.tv_nsec = 0;
		inode->i_atime.tv_nsec = 0;
		inode->i_blocks = sd_v2_blocks(sd);
		rdev = sd_v2_rdev(sd);
		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
			inode->i_generation =
			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
		else
			inode->i_generation = sd_v2_generation(sd);

		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
		else
			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
		REISERFS_I(inode)->i_first_direct_byte = 0;
		set_inode_sd_version(inode, STAT_DATA_V2);
		inode_set_bytes(inode,
				to_real_used_space(inode, inode->i_blocks,
						   SD_V2_SIZE));
		/* read persistent inode attributes from the sd and initialise
		   generic inode flags from them */
		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
	}

	pathrelse(path);
	if (S_ISREG(inode->i_mode)) {
		inode->i_op = &reiserfs_file_inode_operations;
		inode->i_fop = &reiserfs_file_operations;
		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
	} else if (S_ISDIR(inode->i_mode)) {
		inode->i_op = &reiserfs_dir_inode_operations;
		inode->i_fop = &reiserfs_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		inode->i_op = &reiserfs_symlink_inode_operations;
		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
	} else {
		inode->i_blocks = 0;
		inode->i_op = &reiserfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
	}
}

// update new stat data with inode fields
static void inode2sd(void *sd, struct inode *inode, loff_t size)
{
	struct stat_data *sd_v2 = (struct stat_data *)sd;
	__u16 flags;

	set_sd_v2_mode(sd_v2, inode->i_mode);
	set_sd_v2_nlink(sd_v2, inode->i_nlink);
	set_sd_v2_uid(sd_v2, inode->i_uid);
	set_sd_v2_size(sd_v2, size);
	set_sd_v2_gid(sd_v2, inode->i_gid);
	set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
	set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
	set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
	else
		set_sd_v2_generation(sd_v2, inode->i_generation);
	flags = REISERFS_I(inode)->i_attrs;
	i_attrs_to_sd_attrs(inode, &flags);
	set_sd_v2_attrs(sd_v2, flags);
}

// used to copy the inode's fields to the old stat data
static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
{
	struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;

	set_sd_v1_mode(sd_v1, inode->i_mode);
	set_sd_v1_uid(sd_v1, inode->i_uid);
	set_sd_v1_gid(sd_v1, inode->i_gid);
	set_sd_v1_nlink(sd_v1, inode->i_nlink);
	set_sd_v1_size(sd_v1, size);
	set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
	set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
	set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);

	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
	else
		set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));

	// Sigh. i_first_direct_byte is back
	set_sd_v1_first_direct_byte(sd_v1,
				    REISERFS_I(inode)->i_first_direct_byte);
}

/* NOTE, you must prepare the buffer head before sending it here,
** and then log it after the call
*/
static void update_stat_data(struct treepath *path, struct inode *inode,
			     loff_t size)
{
	struct buffer_head *bh;
	struct item_head *ih;

	bh = PATH_PLAST_BUFFER(path);
	ih = PATH_PITEM_HEAD(path);

	if (!is_statdata_le_ih(ih))
		reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
			       INODE_PKEY(inode), ih);

	if (stat_data_v1(ih)) {
		// path points to old stat data
		inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
	} else {
		inode2sd(B_I_PITEM(bh, ih), inode, size);
	}

	return;
}

void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
			     struct inode *inode, loff_t size)
{
	struct cpu_key key;
	INITIALIZE_PATH(path);
	struct buffer_head *bh;
	int fs_gen;
	struct item_head *ih, tmp_ih;
	int retval;

	BUG_ON(!th->t_trans_id);

	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);	// key type is unimportant

	for (;;) {
		int pos;
		/* look for the object's stat data */
		retval = search_item(inode->i_sb, &key, &path);
		if (retval == IO_ERROR) {
			reiserfs_error(inode->i_sb, "vs-13050",
				       "i/o failure occurred trying to "
				       "update %K stat data", &key);
			return;
		}
		if (retval == ITEM_NOT_FOUND) {
			pos = PATH_LAST_POSITION(&path);
			pathrelse(&path);
			if (inode->i_nlink == 0) {
				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
				return;
			}
			reiserfs_warning(inode->i_sb, "vs-13060",
					 "stat data of object %k (nlink == %d) "
					 "not found (pos %d)",
					 INODE_PKEY(inode), inode->i_nlink,
					 pos);
			reiserfs_check_path(&path);
			return;
		}

		/* sigh, prepare_for_journal might schedule.  When it schedules the
		 ** FS might change.  We have to detect that, and loop back to the
		 ** search if the stat data item has moved
		 */
		bh = get_last_bh(&path);
		ih = get_ih(&path);
		copy_item_head(&tmp_ih, ih);
		fs_gen = get_generation(inode->i_sb);
		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
			continue;	/* Stat_data item has been moved after scheduling. */
		}
		break;
	}
	update_stat_data(&path, inode, size);
	journal_mark_dirty(th, th->t_super, bh);
	pathrelse(&path);
	return;
}

/* reiserfs_read_locked_inode is called to read the inode off disk, and it
** does a make_bad_inode when things go wrong.  But, we need to make sure
** and clear the key in the private portion of the inode, otherwise a
** corresponding iput might try to delete whatever object the inode last
** represented.
*/
static void reiserfs_make_bad_inode(struct inode *inode)
{
	memset(INODE_PKEY(inode), 0, KEY_SIZE);
	make_bad_inode(inode);
}

//
// initially this function was derived from minix or ext2's analog and
// evolved as the prototype did
//

int reiserfs_init_locked_inode(struct inode *inode, void *p)
{
	struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
	inode->i_ino = args->objectid;
	INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
	return 0;
}

/* looks for the stat data in the tree, and fills the fields of the in-core
   inode from the stat data found */
void reiserfs_read_locked_inode(struct inode *inode,
				struct reiserfs_iget_args *args)
{
	INITIALIZE_PATH(path_to_sd);
	struct cpu_key key;
	unsigned long dirino;
	int retval;

	dirino = args->dirid;

	/* set version 1; version 2 could be used too, because the stat data
	   key is the same in both versions */
	key.version = KEY_FORMAT_3_5;
	key.on_disk_key.k_dir_id = dirino;
	key.on_disk_key.k_objectid = inode->i_ino;
	key.on_disk_key.k_offset = 0;
	key.on_disk_key.k_type = 0;

	/* look for the object's stat data */
	retval = search_item(inode->i_sb, &key, &path_to_sd);
	if (retval == IO_ERROR) {
		reiserfs_error(inode->i_sb, "vs-13070",
			       "i/o failure occurred trying to find "
			       "stat data of %K", &key);
		reiserfs_make_bad_inode(inode);
		return;
	}
	if (retval != ITEM_FOUND) {
		/* a stale NFS handle can trigger this without it being an error */
		pathrelse(&path_to_sd);
		reiserfs_make_bad_inode(inode);
		inode->i_nlink = 0;
		return;
	}

	init_inode(inode, &path_to_sd);

	/* It is possible that knfsd is trying to access the inode of a file
	   that is being removed from the disk by some other thread. As we
	   update the sd on unlink, all that is required is to check for nlink
	   here. This bug was first found by Sizif when debugging
	   SquidNG/Butterfly, forgotten, and found again after Philippe
	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it.

	   A more logical fix would require changes in fs/inode.c:iput() to
	   remove the inode from the hash-table _after_ the fs has cleaned
	   disk stuff up and in iget() to return NULL if an I_FREEING inode
	   is found in the hash-table. */
	/* Currently there is one place where it's ok to meet an inode with
	   nlink==0: processing of open-unlinked and half-truncated files
	   during mount (fs/reiserfs/super.c:finish_unfinished()). */
	if ((inode->i_nlink == 0) &&
	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
		reiserfs_warning(inode->i_sb, "vs-13075",
				 "dead inode read from disk %K. "
				 "This is likely to be a race with knfsd. Ignore",
				 &key);
		reiserfs_make_bad_inode(inode);
	}

	reiserfs_check_path(&path_to_sd);	/* init_inode should be releasing it */

}

/**
 * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
 *
 * @inode:    inode from hash table to check
 * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
 *
 * This function is called by iget5_locked() to distinguish reiserfs inodes
 * having the same inode numbers. Such inodes can only exist due to some
 * error condition. One of them should be bad. Inodes with identical
 * inode numbers (objectids) are distinguished by parent directory ids.
 *
 */
int reiserfs_find_actor(struct inode *inode, void *opaque)
{
	struct reiserfs_iget_args *args;

	args = opaque;
	/* args is already in CPU order */
	return (inode->i_ino == args->objectid) &&
	    (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
}

struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
{
	struct inode *inode;
	struct reiserfs_iget_args args;

	args.objectid = key->on_disk_key.k_objectid;
	args.dirid = key->on_disk_key.k_dir_id;
	inode = iget5_locked(s, key->on_disk_key.k_objectid,
			     reiserfs_find_actor, reiserfs_init_locked_inode,
			     (void *)(&args));
	if (!inode)
		return ERR_PTR(-ENOMEM);

	if (inode->i_state & I_NEW) {
		reiserfs_read_locked_inode(inode, &args);
		unlock_new_inode(inode);
	}

	if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
		/* either due to i/o error or a stale NFS handle */
		iput(inode);
		inode = NULL;
	}
	return inode;
}

static struct dentry *reiserfs_get_dentry(struct super_block *sb,
	u32 objectid, u32 dir_id, u32 generation)
{
	struct cpu_key key;
	struct inode *inode;

	key.on_disk_key.k_objectid = objectid;
	key.on_disk_key.k_dir_id = dir_id;
	reiserfs_write_lock(sb);
	inode = reiserfs_iget(sb, &key);
	if (inode && !IS_ERR(inode) && generation != 0 &&
	    generation != inode->i_generation) {
		iput(inode);
		inode = NULL;
	}
	reiserfs_write_unlock(sb);

	return d_obtain_alias(inode);
}

struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
		int fh_len, int fh_type)
{
	/* fhtype happens to reflect the number of u32s encoded.
	 * due to a bug in earlier code, fhtype might indicate there
	 * are more u32s than actually fit.
	 * so if fhtype seems to be more than len, reduce fhtype.
	 * Valid types are:
	 *   2 - objectid + dir_id - legacy support
	 *   3 - objectid + dir_id + generation
	 *   4 - objectid + dir_id + objectid and dirid of parent - legacy
	 *   5 - objectid + dir_id + generation + objectid and dirid of parent
	 *   6 - as above plus generation of directory
	 * 6 does not fit in NFSv2 handles
	 */
	if (fh_type > fh_len) {
		if (fh_type != 6 || fh_len != 5)
			reiserfs_warning(sb, "reiserfs-13077",
				"nfsd/reiserfs, fhtype=%d, len=%d - odd",
				fh_type, fh_len);
		fh_type = 5;
	}

	return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
		(fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
}

struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
		int fh_len, int fh_type)
{
	if (fh_type < 4)
		return NULL;

	return reiserfs_get_dentry(sb,
		(fh_type >= 5) ? fid->raw[3] : fid->raw[2],
		(fh_type >= 5) ? fid->raw[4] : fid->raw[3],
		(fh_type == 6) ? fid->raw[5] : 0);
}

int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
		       int need_parent)
{
	struct inode *inode = dentry->d_inode;
	int maxlen = *lenp;

	if (maxlen < 3)
		return 255;

	data[0] = inode->i_ino;
	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
	data[2] = inode->i_generation;
	*lenp = 3;
	/* no room for directory info? return what we've stored so far */
	if (maxlen < 5 || !need_parent)
		return 3;

	spin_lock(&dentry->d_lock);
	inode = dentry->d_parent->d_inode;
	data[3] = inode->i_ino;
	data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
	*lenp = 5;
	if (maxlen >= 6) {
		data[5] = inode->i_generation;
		*lenp = 6;
	}
	spin_unlock(&dentry->d_lock);
	return *lenp;
}
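
/* Illustrative layout of a full 6-u32 handle produced above:
 *   data[0] objectid, data[1] dir_id, data[2] generation,
 *   data[3] parent objectid, data[4] parent dir_id, data[5] parent generation
 * which matches the fh_type table in reiserfs_fh_to_dentry().
 */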
1606
1607/* looks for stat data, then copies fields to it, marks the buffer
1608   containing stat data as dirty */
1609/* reiserfs inodes are never really dirty, since the dirty inode call
1610** always logs them.  This call allows the VFS inode marking routines
1611** to properly mark inodes for datasync and such, but only actually
1612** does something when called for a synchronous update.
1613*/
1614int reiserfs_write_inode(struct inode *inode, int do_sync)
1615{
1616	struct reiserfs_transaction_handle th;
1617	int jbegin_count = 1;
1618
1619	if (inode->i_sb->s_flags & MS_RDONLY)
1620		return -EROFS;
1621	/* memory pressure can sometimes initiate write_inode calls with sync == 1,
1622	 ** these cases are just when the system needs ram, not when the
1623	 ** inode needs to reach disk for safety, and they can safely be
1624	 ** ignored because the altered inode has already been logged.
1625	 */
1626	if (do_sync && !(current->flags & PF_MEMALLOC)) {
1627		reiserfs_write_lock(inode->i_sb);
1628		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1629			reiserfs_update_sd(&th, inode);
1630			journal_end_sync(&th, inode->i_sb, jbegin_count);
1631		}
1632		reiserfs_write_unlock(inode->i_sb);
1633	}
1634	return 0;
1635}
1636
1637/* stat data of new object is inserted already, this inserts the item
1638   containing "." and ".." entries */
1639static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
1640				  struct inode *inode,
1641				  struct item_head *ih, struct treepath *path,
1642				  struct inode *dir)
1643{
1644	struct super_block *sb = th->t_super;
1645	char empty_dir[EMPTY_DIR_SIZE];
1646	char *body = empty_dir;
1647	struct cpu_key key;
1648	int retval;
1649
1650	BUG_ON(!th->t_trans_id);
1651
1652	_make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
1653		      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
1654		      TYPE_DIRENTRY, 3 /*key length */ );
1655
1656	/* compose item head for new item. Directories consist of items of
1657	   old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
1658	   is done by reiserfs_new_inode */
1659	if (old_format_only(sb)) {
1660		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1661				  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1662
1663		make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
1664				       ih->ih_key.k_objectid,
1665				       INODE_PKEY(dir)->k_dir_id,
1666				       INODE_PKEY(dir)->k_objectid);
1667	} else {
1668		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1669				  TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1670
1671		make_empty_dir_item(body, ih->ih_key.k_dir_id,
1672				    ih->ih_key.k_objectid,
1673				    INODE_PKEY(dir)->k_dir_id,
1674				    INODE_PKEY(dir)->k_objectid);
1675	}
1676
1677	/* look for place in the tree for new item */
1678	retval = search_item(sb, &key, path);
1679	if (retval == IO_ERROR) {
1680		reiserfs_error(sb, "vs-13080",
1681			       "i/o failure occurred creating new directory");
1682		return -EIO;
1683	}
1684	if (retval == ITEM_FOUND) {
1685		pathrelse(path);
1686		reiserfs_warning(sb, "vs-13070",
1687				 "object with this key exists (%k)",
1688				 &(ih->ih_key));
1689		return -EEXIST;
1690	}
1691
1692	/* insert item, that is empty directory item */
1693	return reiserfs_insert_item(th, path, &key, ih, inode, body);
1694}
1695
1696/* stat data of object has been inserted, this inserts the item
1697   containing the body of symlink */
1698static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode of symlink */
1699				struct item_head *ih,
1700				struct treepath *path, const char *symname,
1701				int item_len)
1702{
1703	struct super_block *sb = th->t_super;
1704	struct cpu_key key;
1705	int retval;
1706
1707	BUG_ON(!th->t_trans_id);
1708
1709	_make_cpu_key(&key, KEY_FORMAT_3_5,
1710		      le32_to_cpu(ih->ih_key.k_dir_id),
1711		      le32_to_cpu(ih->ih_key.k_objectid),
1712		      1, TYPE_DIRECT, 3 /*key length */ );
1713
1714	make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
1715			  0 /*free_space */ );
1716
1717	/* look for place in the tree for new item */
1718	retval = search_item(sb, &key, path);
1719	if (retval == IO_ERROR) {
1720		reiserfs_error(sb, "vs-13080",
1721			       "i/o failure occurred creating new symlink");
1722		return -EIO;
1723	}
1724	if (retval == ITEM_FOUND) {
1725		pathrelse(path);
1726		reiserfs_warning(sb, "vs-13080",
1727				 "object with this key exists (%k)",
1728				 &(ih->ih_key));
1729		return -EEXIST;
1730	}
1731
1732	/* insert item, that is body of symlink */
1733	return reiserfs_insert_item(th, path, &key, ih, inode, symname);
1734}
1735
1736/* inserts the stat data into the tree, and then calls
1737   reiserfs_new_directory (to insert ".", ".." item if new object is
1738   directory) or reiserfs_new_symlink (to insert symlink body if new
1739   object is symlink) or nothing (if new object is regular file)
1740
1741   NOTE! uid and gid must already be set in the inode.  If we return
1742   non-zero due to an error, we have to drop the quota previously allocated
1743   for the fresh inode.  This can only be done outside a transaction, so
1744   if we return non-zero, we also end the transaction.  */
1745int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1746		       struct inode *dir, int mode, const char *symname,
1747		       /* 0 for regular, EMPTY_DIR_SIZE for dirs,
1748		          strlen (symname) for symlinks */
1749		       loff_t i_size, struct dentry *dentry,
1750		       struct inode *inode,
1751		       struct reiserfs_security_handle *security)
1752{
1753	struct super_block *sb;
1754	struct reiserfs_iget_args args;
1755	INITIALIZE_PATH(path_to_key);
1756	struct cpu_key key;
1757	struct item_head ih;
1758	struct stat_data sd;
1759	int retval;
1760	int err;
1761
1762	BUG_ON(!th->t_trans_id);
1763
1764	if (DQUOT_ALLOC_INODE(inode)) {
1765		err = -EDQUOT;
1766		goto out_end_trans;
1767	}
1768	if (!dir->i_nlink) {
1769		err = -EPERM;
1770		goto out_bad_inode;
1771	}
1772
1773	sb = dir->i_sb;
1774
1775	/* item head of new item */
1776	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1777	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
1778	if (!ih.ih_key.k_objectid) {
1779		err = -ENOMEM;
1780		goto out_bad_inode;
1781	}
1782	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1783	if (old_format_only(sb))
1784		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1785				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1786	else
1787		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1788				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1789	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1790	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
1791	if (insert_inode_locked4(inode, args.objectid,
1792			     reiserfs_find_actor, &args) < 0) {
1793		err = -EINVAL;
1794		goto out_bad_inode;
1795	}
1796	if (old_format_only(sb))
1797		/* not a perfect generation count, as object ids can be reused, but
1798		 ** this is as good as reiserfs can do right now.
1799		 ** note that the private part of inode isn't filled in yet, we have
1800		 ** to use the directory.
1801		 */
1802		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
1803	else
1804#if defined( USE_INODE_GENERATION_COUNTER )
1805		inode->i_generation =
1806		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1807#else
1808		inode->i_generation = ++event;
1809#endif
1810
1811	/* fill stat data */
1812	inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
1813
1814	/* uid and gid must already be set by the caller for quota init */
1815
1816	/* symlink cannot be immutable or append only, right? */
1817	if (S_ISLNK(inode->i_mode))
1818		inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
1819
1820	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
1821	inode->i_size = i_size;
1822	inode->i_blocks = 0;
1823	inode->i_bytes = 0;
1824	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1825	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
1826
1827	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1828	REISERFS_I(inode)->i_flags = 0;
1829	REISERFS_I(inode)->i_prealloc_block = 0;
1830	REISERFS_I(inode)->i_prealloc_count = 0;
1831	REISERFS_I(inode)->i_trans_id = 0;
1832	REISERFS_I(inode)->i_jl = NULL;
1833	REISERFS_I(inode)->i_attrs =
1834	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1835	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1836	mutex_init(&(REISERFS_I(inode)->i_mmap));
1837	reiserfs_init_acl_access(inode);
1838	reiserfs_init_acl_default(inode);
1839	reiserfs_init_xattr_rwsem(inode);
1840
1841	/* key to search for correct place for new stat data */
1842	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
1843		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
1844		      TYPE_STAT_DATA, 3 /*key length */ );
1845
1846	/* find proper place for inserting of stat data */
1847	retval = search_item(sb, &key, &path_to_key);
1848	if (retval == IO_ERROR) {
1849		err = -EIO;
1850		goto out_bad_inode;
1851	}
1852	if (retval == ITEM_FOUND) {
1853		pathrelse(&path_to_key);
1854		err = -EEXIST;
1855		goto out_bad_inode;
1856	}
1857	if (old_format_only(sb)) {
1858		if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1859			pathrelse(&path_to_key);
1860			/* i_uid or i_gid is too big to be stored in stat data v3.5 */
1861			err = -EINVAL;
1862			goto out_bad_inode;
1863		}
1864		inode2sd_v1(&sd, inode, inode->i_size);
1865	} else {
1866		inode2sd(&sd, inode, inode->i_size);
1867	}
1868	// store in the in-core inode the key of the stat data and the version
1869	// all object items will have (directory items keep the old offset
1870	// format, other new objects will consist of new-format items)
1871	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
1872		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1873	else
1874		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1875	if (old_format_only(sb))
1876		set_inode_sd_version(inode, STAT_DATA_V1);
1877	else
1878		set_inode_sd_version(inode, STAT_DATA_V2);
1879
1880	/* insert the stat data into the tree */
1881#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1882	if (REISERFS_I(dir)->new_packing_locality)
1883		th->displace_new_blocks = 1;
1884#endif
1885	retval =
1886	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
1887				 (char *)(&sd));
1888	if (retval) {
1889		err = retval;
1890		reiserfs_check_path(&path_to_key);
1891		goto out_bad_inode;
1892	}
1893#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1894	if (!th->displace_new_blocks)
1895		REISERFS_I(dir)->new_packing_locality = 0;
1896#endif
1897	if (S_ISDIR(mode)) {
1898		/* insert item with "." and ".." */
1899		retval =
1900		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
1901	}
1902
1903	if (S_ISLNK(mode)) {
1904		/* insert body of symlink */
1905		if (!old_format_only(sb))
1906			i_size = ROUND_UP(i_size);
1907		retval =
1908		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
1909					 i_size);
1910	}
1911	if (retval) {
1912		err = retval;
1913		reiserfs_check_path(&path_to_key);
1914		journal_end(th, th->t_super, th->t_blocks_allocated);
1915		goto out_inserted_sd;
1916	}
1917
1918	if (reiserfs_posixacl(inode->i_sb)) {
1919		retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
1920		if (retval) {
1921			err = retval;
1922			reiserfs_check_path(&path_to_key);
1923			journal_end(th, th->t_super, th->t_blocks_allocated);
1924			goto out_inserted_sd;
1925		}
1926	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
1927		reiserfs_warning(inode->i_sb, "jdm-13090",
1928				 "ACLs aren't enabled in the fs, "
1929				 "but vfs thinks they are!");
1930	} else if (IS_PRIVATE(dir))
1931		inode->i_flags |= S_PRIVATE;
1932
1933	if (security->name) {
1934		retval = reiserfs_security_write(th, inode, security);
1935		if (retval) {
1936			err = retval;
1937			reiserfs_check_path(&path_to_key);
1938			retval = journal_end(th, th->t_super,
1939					     th->t_blocks_allocated);
1940			if (retval)
1941				err = retval;
1942			goto out_inserted_sd;
1943		}
1944	}
1945
1946	reiserfs_update_sd(th, inode);
1947	reiserfs_check_path(&path_to_key);
1948
1949	return 0;
1950
1951/* it looks like these two goto targets could easily be compressed into
1952 * one.  Keeping them separate doesn't actually hurt anything, and they
1953 * are placeholders for what the quota code actually needs.
1954 */
1955      out_bad_inode:
1956	/* Invalidate the object, nothing was inserted yet */
1957	INODE_PKEY(inode)->k_objectid = 0;
1958
1959	/* Quota change must be inside a transaction for journaling */
1960	DQUOT_FREE_INODE(inode);
1961
1962      out_end_trans:
1963	journal_end(th, th->t_super, th->t_blocks_allocated);
1964	/* quota drop can happen outside a transaction, and it needs more credits, so it's better done out here */
1965	DQUOT_DROP(inode);
1966	inode->i_flags |= S_NOQUOTA;
1967	make_bad_inode(inode);
1968
1969      out_inserted_sd:
1970	inode->i_nlink = 0;
1971	th->t_trans_id = 0;	/* so the caller can't use this handle later */
1972	unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
1973	iput(inode);
1974	return err;
1975}
1976
1977/*
1978** finds the tail page in the page cache,
1979** reads the last block in.
1980**
1981** On success, page_result is set to a locked, pinned page, and bh_result
1982** is set to an up-to-date buffer for the last block in the file.  Returns 0.
1983**
1984** tail conversion is not done, so bh_result might not be valid for writing;
1985** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
1986** trying to write the block.
1987**
1988** on failure, nonzero is returned, page_result and bh_result are untouched.
1989*/
1990static int grab_tail_page(struct inode *p_s_inode,
1991			  struct page **page_result,
1992			  struct buffer_head **bh_result)
1993{
1994
1995	/* we want the page with the last byte in the file,
1996	 ** not the page that will hold the next byte for appending
1997	 */
1998	unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT;
1999	unsigned long pos = 0;
2000	unsigned long start = 0;
2001	unsigned long blocksize = p_s_inode->i_sb->s_blocksize;
2002	unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1);
2003	struct buffer_head *bh;
2004	struct buffer_head *head;
2005	struct page *page;
2006	int error;
2007
2008	/* we know that we are only called with inode->i_size > 0.
2009	 ** we also know that a file tail can never be as big as a block.
2010	 ** If i_size % blocksize == 0, our file is currently block aligned
2011	 ** and it won't need converting or zeroing after a truncate.
2012	 */
2013	if ((offset & (blocksize - 1)) == 0) {
2014		return -ENOENT;
2015	}
2016	page = grab_cache_page(p_s_inode->i_mapping, index);
2017	error = -ENOMEM;
2018	if (!page) {
2019		goto out;
2020	}
2021	/* start within the page of the last block in the file */
2022	start = (offset / blocksize) * blocksize;
2023
2024	error = block_prepare_write(page, start, offset,
2025				    reiserfs_get_block_create_0);
2026	if (error)
2027		goto unlock;
2028
2029	head = page_buffers(page);
2030	bh = head;
2031	do {
2032		if (pos >= start) {
2033			break;
2034		}
2035		bh = bh->b_this_page;
2036		pos += blocksize;
2037	} while (bh != head);
2038
2039	if (!buffer_uptodate(bh)) {
2040		/* note, this should never happen, prepare_write should
2041		 ** be taking care of this for us.  If the buffer isn't up to date,
2042		 ** I've screwed up the code to find the buffer, or the code to
2043		 ** call prepare_write
2044		 */
2045		reiserfs_error(p_s_inode->i_sb, "clm-6000",
2046			       "error reading block %lu", bh->b_blocknr);
2047		error = -EIO;
2048		goto unlock;
2049	}
2050	*bh_result = bh;
2051	*page_result = page;
2052
2053      out:
2054	return error;
2055
2056      unlock:
2057	unlock_page(page);
2058	page_cache_release(page);
2059	return error;
2060}
2061
2062/*
2063** vfs version of truncate file.  Must NOT be called with
2064** a transaction already started.
2065**
2066** some code taken from block_truncate_page
2067*/
2068int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
2069{
2070	struct reiserfs_transaction_handle th;
2071	/* we want the offset for the first byte after the end of the file */
2072	unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1);
2073	unsigned blocksize = p_s_inode->i_sb->s_blocksize;
2074	unsigned length;
2075	struct page *page = NULL;
2076	int error;
2077	struct buffer_head *bh = NULL;
2078	int err2;
2079
2080	reiserfs_write_lock(p_s_inode->i_sb);
2081
2082	if (p_s_inode->i_size > 0) {
2083		if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
2084			// -ENOENT means we truncated past the end of the file,
2085			// and get_block_create_0 could not find a block to read in,
2086			// which is ok.
2087			if (error != -ENOENT)
2088				reiserfs_error(p_s_inode->i_sb, "clm-6001",
2089					       "grab_tail_page failed %d",
2090					       error);
2091			page = NULL;
2092			bh = NULL;
2093		}
2094	}
2095
2096	/* so, if page != NULL, we have a buffer head for the offset at
2097	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
2098	 ** then we have an unformatted node.  Otherwise, we have a direct item,
2099	 ** and no zeroing is required on disk.  We zero after the truncate,
2100	 ** because the truncate might pack the item anyway
2101	 ** (it will unmap bh if it packs).
2102	 */
2103	/* it is enough to reserve space in transaction for 2 balancings:
2104	   one for "save" link adding and another for the first
2105	   cut_from_item; the extra 1 is for update_sd */
2106	error = journal_begin(&th, p_s_inode->i_sb,
2107			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
2108	if (error)
2109		goto out;
2110	reiserfs_update_inode_transaction(p_s_inode);
2111	if (update_timestamps)
2112		/* we are doing real truncate: if the system crashes before the last
2113		   transaction of truncating gets committed - on reboot the file
2114		   either appears truncated properly or not truncated at all */
2115		add_save_link(&th, p_s_inode, 1);
2116	err2 = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps);
2117	error =
2118	    journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
2119	if (error)
2120		goto out;
2121
2122	/* check reiserfs_do_truncate after ending the transaction */
2123	if (err2) {
2124		error = err2;
2125		goto out;
2126	}
2127
2128	if (update_timestamps) {
2129		error = remove_save_link(p_s_inode, 1 /* truncate */ );
2130		if (error)
2131			goto out;
2132	}
2133
2134	if (page) {
2135		length = offset & (blocksize - 1);
2136		/* if we are not on a block boundary */
2137		if (length) {
2138			length = blocksize - length;
2139			zero_user(page, offset, length);
2140			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
2141				mark_buffer_dirty(bh);
2142			}
2143		}
2144		unlock_page(page);
2145		page_cache_release(page);
2146	}
2147
2148	reiserfs_write_unlock(p_s_inode->i_sb);
2149	return 0;
2150      out:
2151	if (page) {
2152		unlock_page(page);
2153		page_cache_release(page);
2154	}
2155	reiserfs_write_unlock(p_s_inode->i_sb);
2156	return error;
2157}
2158
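/*
** helper for reiserfs_write_full_page: map the buffer at logical
** position 'block' so it can be written out.  For an unformatted
** (indirect) node the device mapping is filled in directly.  For a
** direct item (file tail) the page contents are copied back into the
** item under a transaction, and the buffer is left mapped with
** b_blocknr == 0 so the caller knows no block io is needed.  Writes
** into holes fall back to reiserfs_get_block with GET_BLOCK_CREATE.
*/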
2159static int map_block_for_writepage(struct inode *inode,
2160				   struct buffer_head *bh_result,
2161				   unsigned long block)
2162{
2163	struct reiserfs_transaction_handle th;
2164	int fs_gen;
2165	struct item_head tmp_ih;
2166	struct item_head *ih;
2167	struct buffer_head *bh;
2168	__le32 *item;
2169	struct cpu_key key;
2170	INITIALIZE_PATH(path);
2171	int pos_in_item;
2172	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
2173	loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
2174	int retval;
2175	int use_get_block = 0;
2176	int bytes_copied = 0;
2177	int copy_size;
2178	int trans_running = 0;
2179
2180	/* catch places below that try to log something without starting a trans */
2181	th.t_trans_id = 0;
2182
2183	if (!buffer_uptodate(bh_result)) {
2184		return -EIO;
2185	}
2186
2187	kmap(bh_result->b_page);
2188      start_over:
2189	reiserfs_write_lock(inode->i_sb);
2190	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
2191
2192      research:
2193	retval = search_for_position_by_key(inode->i_sb, &key, &path);
2194	if (retval != POSITION_FOUND) {
2195		use_get_block = 1;
2196		goto out;
2197	}
2198
2199	bh = get_last_bh(&path);
2200	ih = get_ih(&path);
2201	item = get_item(&path);
2202	pos_in_item = path.pos_in_item;
2203
2204	/* we've found an unformatted node */
2205	if (indirect_item_found(retval, ih)) {
2206		if (bytes_copied > 0) {
2207			reiserfs_warning(inode->i_sb, "clm-6002",
2208					 "bytes_copied %d", bytes_copied);
2209		}
2210		if (!get_block_num(item, pos_in_item)) {
2211			/* crap, we are writing to a hole */
2212			use_get_block = 1;
2213			goto out;
2214		}
2215		set_block_dev_mapped(bh_result,
2216				     get_block_num(item, pos_in_item), inode);
2217	} else if (is_direct_le_ih(ih)) {
2218		char *p;
2219		p = page_address(bh_result->b_page);
2220		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
2221		copy_size = ih_item_len(ih) - pos_in_item;
2222
2223		fs_gen = get_generation(inode->i_sb);
2224		copy_item_head(&tmp_ih, ih);
2225
2226		if (!trans_running) {
2227			/* vs-3050 is gone, no need to drop the path */
2228			retval = journal_begin(&th, inode->i_sb, jbegin_count);
2229			if (retval)
2230				goto out;
2231			reiserfs_update_inode_transaction(inode);
2232			trans_running = 1;
2233			if (fs_changed(fs_gen, inode->i_sb)
2234			    && item_moved(&tmp_ih, &path)) {
2235				reiserfs_restore_prepared_buffer(inode->i_sb,
2236								 bh);
2237				goto research;
2238			}
2239		}
2240
2241		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
2242
2243		if (fs_changed(fs_gen, inode->i_sb)
2244		    && item_moved(&tmp_ih, &path)) {
2245			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
2246			goto research;
2247		}
2248
2249		memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
2250		       copy_size);
2251
2252		journal_mark_dirty(&th, inode->i_sb, bh);
2253		bytes_copied += copy_size;
2254		set_block_dev_mapped(bh_result, 0, inode);
2255
2256		/* are there still bytes left? */
2257		if (bytes_copied < bh_result->b_size &&
2258		    (byte_offset + bytes_copied) < inode->i_size) {
2259			set_cpu_key_k_offset(&key,
2260					     cpu_key_k_offset(&key) +
2261					     copy_size);
2262			goto research;
2263		}
2264	} else {
2265		reiserfs_warning(inode->i_sb, "clm-6003",
2266				 "bad item inode %lu", inode->i_ino);
2267		retval = -EIO;
2268		goto out;
2269	}
2270	retval = 0;
2271
2272      out:
2273	pathrelse(&path);
2274	if (trans_running) {
2275		int err = journal_end(&th, inode->i_sb, jbegin_count);
2276		if (err)
2277			retval = err;
2278		trans_running = 0;
2279	}
2280	reiserfs_write_unlock(inode->i_sb);
2281
2282	/* this is where we fill in holes in the file. */
2283	if (use_get_block) {
2284		retval = reiserfs_get_block(inode, block, bh_result,
2285					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
2286					    | GET_BLOCK_NO_DANGLE);
2287		if (!retval) {
2288			if (!buffer_mapped(bh_result)
2289			    || bh_result->b_blocknr == 0) {
2290				/* get_block failed to find a mapped unformatted node. */
2291				use_get_block = 0;
2292				goto start_over;
2293			}
2294		}
2295	}
2296	kunmap(bh_result->b_page);
2297
2298	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
2299		/* we've copied data from the page into the direct item, so the
2300		 * buffer in the page is now clean, mark it to reflect that.
2301		 */
2302		lock_buffer(bh_result);
2303		clear_buffer_dirty(bh_result);
2304		unlock_buffer(bh_result);
2305	}
2306	return retval;
2307}
2308
2309/*
2310 * mason@suse.com: updated in 2.5.54 to follow the same general io
2311 * start/recovery path as __block_write_full_page, along with special
2312 * code to handle reiserfs tails.
2313 */
2314static int reiserfs_write_full_page(struct page *page,
2315				    struct writeback_control *wbc)
2316{
2317	struct inode *inode = page->mapping->host;
2318	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2319	int error = 0;
2320	unsigned long block;
2321	sector_t last_block;
2322	struct buffer_head *head, *bh;
2323	int partial = 0;
2324	int nr = 0;
2325	int checked = PageChecked(page);
2326	struct reiserfs_transaction_handle th;
2327	struct super_block *s = inode->i_sb;
2328	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
2329	th.t_trans_id = 0;
2330
2331	/* no logging allowed when nonblocking or from PF_MEMALLOC */
2332	if (checked && (current->flags & PF_MEMALLOC)) {
2333		redirty_page_for_writepage(wbc, page);
2334		unlock_page(page);
2335		return 0;
2336	}
2337
2338	/* The page dirty bit is cleared before writepage is called, which
2339	 * means we have to tell create_empty_buffers to make dirty buffers.
2340	 * The page really should be up to date at this point, so tossing
2341	 * in the BH_Uptodate is just a sanity check.
2342	 */
2343	if (!page_has_buffers(page)) {
2344		create_empty_buffers(page, s->s_blocksize,
2345				     (1 << BH_Dirty) | (1 << BH_Uptodate));
2346	}
2347	head = page_buffers(page);
2348
2349	/* last page in the file, zero out any contents past the
2350	 ** last byte in the file
2351	 */
2352	if (page->index >= end_index) {
2353		unsigned last_offset;
2354
2355		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
2356		/* no file contents in this page */
2357		if (page->index >= end_index + 1 || !last_offset) {
2358			unlock_page(page);
2359			return 0;
2360		}
2361		zero_user_segment(page, last_offset, PAGE_CACHE_SIZE);
2362	}
2363	bh = head;
2364	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
2365	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
2366	/* first map all the buffers, logging any direct items we find */
2367	do {
2368		if (block > last_block) {
2369			/*
2370			 * This can happen when the block size is less than
2371			 * the page size.  The corresponding bytes in the page
2372			 * were zero filled above
2373			 */
2374			clear_buffer_dirty(bh);
2375			set_buffer_uptodate(bh);
2376		} else if ((checked || buffer_dirty(bh)) &&
2377		           (!buffer_mapped(bh) || bh->b_blocknr == 0)) {
2380			/* not mapped yet, or it points to a direct item, search
2381			 * the btree for the mapping info, and log any direct
2382			 * items found
2383			 */
2384			if ((error = map_block_for_writepage(inode, bh, block))) {
2385				goto fail;
2386			}
2387		}
2388		bh = bh->b_this_page;
2389		block++;
2390	} while (bh != head);
2391
2392	/*
2393	 * we start the transaction after map_block_for_writepage,
2394	 * because it can create holes in the file (an unbounded operation).
2395	 * starting it here, we can make a reliable estimate for how many
2396	 * blocks we're going to log
2397	 */
2398	if (checked) {
2399		ClearPageChecked(page);
2400		reiserfs_write_lock(s);
2401		error = journal_begin(&th, s, bh_per_page + 1);
2402		if (error) {
2403			reiserfs_write_unlock(s);
2404			goto fail;
2405		}
2406		reiserfs_update_inode_transaction(inode);
2407	}
2408	/* now go through and lock any dirty buffers on the page */
2409	do {
2410		get_bh(bh);
2411		if (!buffer_mapped(bh))
2412			continue;
2413		if (bh->b_blocknr == 0)
2414			continue;
2415
2416		if (checked) {
2417			reiserfs_prepare_for_journal(s, bh, 1);
2418			journal_mark_dirty(&th, s, bh);
2419			continue;
2420		}
2421		/* from this point on, we know the buffer is mapped to a
2422		 * real block and not a direct item
2423		 */
2424		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
2425			lock_buffer(bh);
2426		} else {
2427			if (!trylock_buffer(bh)) {
2428				redirty_page_for_writepage(wbc, page);
2429				continue;
2430			}
2431		}
2432		if (test_clear_buffer_dirty(bh)) {
2433			mark_buffer_async_write(bh);
2434		} else {
2435			unlock_buffer(bh);
2436		}
2437	} while ((bh = bh->b_this_page) != head);
2438
2439	if (checked) {
2440		error = journal_end(&th, s, bh_per_page + 1);
2441		reiserfs_write_unlock(s);
2442		if (error)
2443			goto fail;
2444	}
2445	BUG_ON(PageWriteback(page));
2446	set_page_writeback(page);
2447	unlock_page(page);
2448
2449	/*
2450	 * since any buffer might be the only dirty buffer on the page,
2451	 * the first submit_bh can bring the page out of writeback.
2452	 * be careful with the buffers.
2453	 */
2454	do {
2455		struct buffer_head *next = bh->b_this_page;
2456		if (buffer_async_write(bh)) {
2457			submit_bh(WRITE, bh);
2458			nr++;
2459		}
2460		put_bh(bh);
2461		bh = next;
2462	} while (bh != head);
2463
2464	error = 0;
2465      done:
2466	if (nr == 0) {
2467		/*
2468		 * if this page only had a direct item, it is very possible for
2469		 * no io to be required without there being an error.  Or,
2470		 * someone else could have locked the buffers and sent them down
2471		 * the pipe without locking the page.
2472		 */
2473		bh = head;
2474		do {
2475			if (!buffer_uptodate(bh)) {
2476				partial = 1;
2477				break;
2478			}
2479			bh = bh->b_this_page;
2480		} while (bh != head);
2481		if (!partial)
2482			SetPageUptodate(page);
2483		end_page_writeback(page);
2484	}
2485	return error;
2486
2487      fail:
2488	/* catches various errors, we need to make sure any valid dirty blocks
2489	 * get to the media.  The page is currently locked and not marked for
2490	 * writeback
2491	 */
2492	ClearPageUptodate(page);
2493	bh = head;
2494	do {
2495		get_bh(bh);
2496		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
2497			lock_buffer(bh);
2498			mark_buffer_async_write(bh);
2499		} else {
2500			/*
2501			 * clear any dirty bits that might have come from getting
2502			 * attached to a dirty page
2503			 */
2504			clear_buffer_dirty(bh);
2505		}
2506		bh = bh->b_this_page;
2507	} while (bh != head);
2508	SetPageError(page);
2509	BUG_ON(PageWriteback(page));
2510	set_page_writeback(page);
2511	unlock_page(page);
2512	do {
2513		struct buffer_head *next = bh->b_this_page;
2514		if (buffer_async_write(bh)) {
2515			clear_buffer_dirty(bh);
2516			submit_bh(WRITE, bh);
2517			nr++;
2518		}
2519		put_bh(bh);
2520		bh = next;
2521	} while (bh != head);
2522	goto done;
2523}
2524
2525static int reiserfs_readpage(struct file *f, struct page *page)
2526{
2527	return block_read_full_page(page, reiserfs_get_block);
2528}
2529
2530static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
2531{
2532	struct inode *inode = page->mapping->host;
2533	reiserfs_wait_on_write_block(inode->i_sb);
2534	return reiserfs_write_full_page(page, wbc);
2535}
2536
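/*
** reiserfs version of block_write_begin.  If a transaction is already
** running (current->journal_info holds a reiserfs handle), take an
** extra reference on it so that reiserfs_get_block nests into it
** instead of starting its own.  On error, any transaction that
** reiserfs_get_block left behind either has its refcount dropped or,
** if it was a fresh persistent handle, is ended here.
*/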
2537static int reiserfs_write_begin(struct file *file,
2538				struct address_space *mapping,
2539				loff_t pos, unsigned len, unsigned flags,
2540				struct page **pagep, void **fsdata)
2541{
2542	struct inode *inode;
2543	struct page *page;
2544	pgoff_t index;
2545	int ret;
2546	int old_ref = 0;
2547
2548	inode = mapping->host;
2549	*fsdata = NULL;
2550	if (flags & AOP_FLAG_CONT_EXPAND &&
2551	    (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
2552		pos++;
2553		*fsdata = (void *)(unsigned long)flags;
2554	}
2555
2556	index = pos >> PAGE_CACHE_SHIFT;
2557	page = grab_cache_page_write_begin(mapping, index, flags);
2558	if (!page)
2559		return -ENOMEM;
2560	*pagep = page;
2561
2562	reiserfs_wait_on_write_block(inode->i_sb);
2563	fix_tail_page_for_writing(page);
2564	if (reiserfs_transaction_running(inode->i_sb)) {
2565		struct reiserfs_transaction_handle *th;
2566		th = (struct reiserfs_transaction_handle *)current->
2567		    journal_info;
2568		BUG_ON(!th->t_refcount);
2569		BUG_ON(!th->t_trans_id);
2570		old_ref = th->t_refcount;
2571		th->t_refcount++;
2572	}
2573	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2574				reiserfs_get_block);
2575	if (ret && reiserfs_transaction_running(inode->i_sb)) {
2576		struct reiserfs_transaction_handle *th = current->journal_info;
2577		/* this gets a little ugly.  If reiserfs_get_block returned an
2578		 * error and left a transaction running, we've got to close it,
2579		 * and we've got to free the handle if it was a persistent transaction.
2580		 *
2581		 * But, if we had nested into an existing transaction, we need
2582		 * to just drop the ref count on the handle.
2583		 *
2584		 * If old_ref == 0, the transaction is from reiserfs_get_block,
2585		 * and it was a persistent trans.  Otherwise, it was nested above.
2586		 */
2587		if (th->t_refcount > old_ref) {
2588			if (old_ref)
2589				th->t_refcount--;
2590			else {
2591				int err;
2592				reiserfs_write_lock(inode->i_sb);
2593				err = reiserfs_end_persistent_transaction(th);
2594				reiserfs_write_unlock(inode->i_sb);
2595				if (err)
2596					ret = err;
2597			}
2598		}
2599	}
2600	if (ret) {
2601		unlock_page(page);
2602		page_cache_release(page);
2603	}
2604	return ret;
2605}
2606
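/* prepare_write half of the older prepare/commit pair.  Not wired into
** the address_space operations (those use write_begin/write_end above);
** kept for in-kernel callers such as the xattr code.
*/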
2607int reiserfs_prepare_write(struct file *f, struct page *page,
2608			   unsigned from, unsigned to)
2609{
2610	struct inode *inode = page->mapping->host;
2611	int ret;
2612	int old_ref = 0;
2613
2614	reiserfs_wait_on_write_block(inode->i_sb);
2615	fix_tail_page_for_writing(page);
2616	if (reiserfs_transaction_running(inode->i_sb)) {
2617		struct reiserfs_transaction_handle *th;
2618		th = (struct reiserfs_transaction_handle *)current->
2619		    journal_info;
2620		BUG_ON(!th->t_refcount);
2621		BUG_ON(!th->t_trans_id);
2622		old_ref = th->t_refcount;
2623		th->t_refcount++;
2624	}
2625
2626	ret = block_prepare_write(page, from, to, reiserfs_get_block);
2627	if (ret && reiserfs_transaction_running(inode->i_sb)) {
2628		struct reiserfs_transaction_handle *th = current->journal_info;
2629		/* this gets a little ugly.  If reiserfs_get_block returned an
2630		 * error and left a transaction running, we've got to close it,
2631		 * and we've got to free the handle if it was a persistent transaction.
2632		 *
2633		 * But, if we had nested into an existing transaction, we need
2634		 * to just drop the ref count on the handle.
2635		 *
2636		 * If old_ref == 0, the transaction is from reiserfs_get_block,
2637		 * and it was a persistent trans.  Otherwise, it was nested above.
2638		 */
2639		if (th->t_refcount > old_ref) {
2640			if (old_ref)
2641				th->t_refcount--;
2642			else {
2643				int err;
2644				reiserfs_write_lock(inode->i_sb);
2645				err = reiserfs_end_persistent_transaction(th);
2646				reiserfs_write_unlock(inode->i_sb);
2647				if (err)
2648					ret = err;
2649			}
2650		}
2651	}
2652	return ret;
2654}
2655
2656static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
2657{
2658	return generic_block_bmap(as, block, reiserfs_bmap);
2659}
2660
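/*
** write_end counterpart to reiserfs_write_begin: commit the copied
** range into the page's buffers and, if the write extended the file,
** update i_size inside a small transaction so that the size change is
** journaled.  Any persistent transaction handle left running by
** reiserfs_get_block is ended here as well.
*/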
2661static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2662			      loff_t pos, unsigned len, unsigned copied,
2663			      struct page *page, void *fsdata)
2664{
2665	struct inode *inode = page->mapping->host;
2666	int ret = 0;
2667	int update_sd = 0;
2668	struct reiserfs_transaction_handle *th;
2669	unsigned start;
2670
2671	if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
2672		pos++;
2673
2674	reiserfs_wait_on_write_block(inode->i_sb);
2675	if (reiserfs_transaction_running(inode->i_sb))
2676		th = current->journal_info;
2677	else
2678		th = NULL;
2679
2680	start = pos & (PAGE_CACHE_SIZE - 1);
2681	if (unlikely(copied < len)) {
2682		if (!PageUptodate(page))
2683			copied = 0;
2684
2685		page_zero_new_buffers(page, start + copied, start + len);
2686	}
2687	flush_dcache_page(page);
2688
2689	reiserfs_commit_page(inode, page, start, start + copied);
2690
2691	/* generic_commit_write does this for us, but does not update the
2692	 ** transaction tracking stuff when the size changes.  So, we have
2693	 ** to do the i_size updates here.
2694	 */
2695	pos += copied;
2696	if (pos > inode->i_size) {
2697		struct reiserfs_transaction_handle myth;
2698		reiserfs_write_lock(inode->i_sb);
2699		/* If the file has grown beyond the boundary where it
2700		   can have a tail, unmark it as needing tail
2701		   packing */
2702		if ((have_large_tails(inode->i_sb)
2703		     && inode->i_size > i_block_size(inode) * 4)
2704		    || (have_small_tails(inode->i_sb)
2705			&& inode->i_size > i_block_size(inode)))
2706			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2707
2708		ret = journal_begin(&myth, inode->i_sb, 1);
2709		if (ret) {
2710			reiserfs_write_unlock(inode->i_sb);
2711			goto journal_error;
2712		}
2713		reiserfs_update_inode_transaction(inode);
2714		inode->i_size = pos;
2715		/*
2716		 * this will just nest into our transaction.  It's important
2717		 * to use mark_inode_dirty so the inode gets pushed around on the
2718		 * dirty lists, and so that O_SYNC works as expected
2719		 */
2720		mark_inode_dirty(inode);
2721		reiserfs_update_sd(&myth, inode);
2722		update_sd = 1;
2723		ret = journal_end(&myth, inode->i_sb, 1);
2724		reiserfs_write_unlock(inode->i_sb);
2725		if (ret)
2726			goto journal_error;
2727	}
2728	if (th) {
2729		reiserfs_write_lock(inode->i_sb);
2730		if (!update_sd)
2731			mark_inode_dirty(inode);
2732		ret = reiserfs_end_persistent_transaction(th);
2733		reiserfs_write_unlock(inode->i_sb);
2734		if (ret)
2735			goto out;
2736	}
2737
2738      out:
2739	unlock_page(page);
2740	page_cache_release(page);
2741	return ret == 0 ? copied : ret;
2742
2743      journal_error:
2744	if (th) {
2745		reiserfs_write_lock(inode->i_sb);
2746		if (!update_sd)
2747			reiserfs_update_sd(th, inode);
2748		ret = reiserfs_end_persistent_transaction(th);
2749		reiserfs_write_unlock(inode->i_sb);
2750	}
2751
2752	goto out;
2753}
2754
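/* commit_write half of the older prepare/commit pair; mirrors
** reiserfs_write_end above, but takes from/to page offsets instead of
** a (pos, len, copied) triple.
*/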
2755int reiserfs_commit_write(struct file *f, struct page *page,
2756			  unsigned from, unsigned to)
2757{
2758	struct inode *inode = page->mapping->host;
2759	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
2760	int ret = 0;
2761	int update_sd = 0;
2762	struct reiserfs_transaction_handle *th = NULL;
2763
2764	reiserfs_wait_on_write_block(inode->i_sb);
2765	if (reiserfs_transaction_running(inode->i_sb)) {
2766		th = current->journal_info;
2767	}
2768	reiserfs_commit_page(inode, page, from, to);
2769
2770	/* generic_commit_write does this for us, but does not update the
2771	 ** transaction tracking stuff when the size changes.  So, we have
2772	 ** to do the i_size updates here.
2773	 */
2774	if (pos > inode->i_size) {
2775		struct reiserfs_transaction_handle myth;
2776		reiserfs_write_lock(inode->i_sb);
2777		/* If the file has grown beyond the boundary where it
2778		   can have a tail, unmark it as needing tail
2779		   packing */
2780		if ((have_large_tails(inode->i_sb)
2781		     && inode->i_size > i_block_size(inode) * 4)
2782		    || (have_small_tails(inode->i_sb)
2783			&& inode->i_size > i_block_size(inode)))
2784			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2785
2786		ret = journal_begin(&myth, inode->i_sb, 1);
2787		if (ret) {
2788			reiserfs_write_unlock(inode->i_sb);
2789			goto journal_error;
2790		}
2791		reiserfs_update_inode_transaction(inode);
2792		inode->i_size = pos;
2793		/*
2794		 * this will just nest into our transaction.  It's important
2795		 * to use mark_inode_dirty so the inode gets pushed around on the
2796		 * dirty lists, and so that O_SYNC works as expected
2797		 */
2798		mark_inode_dirty(inode);
2799		reiserfs_update_sd(&myth, inode);
2800		update_sd = 1;
2801		ret = journal_end(&myth, inode->i_sb, 1);
2802		reiserfs_write_unlock(inode->i_sb);
2803		if (ret)
2804			goto journal_error;
2805	}
2806	if (th) {
2807		reiserfs_write_lock(inode->i_sb);
2808		if (!update_sd)
2809			mark_inode_dirty(inode);
2810		ret = reiserfs_end_persistent_transaction(th);
2811		reiserfs_write_unlock(inode->i_sb);
2812		if (ret)
2813			goto out;
2814	}
2815
2816      out:
2817	return ret;
2818
2819      journal_error:
2820	if (th) {
2821		reiserfs_write_lock(inode->i_sb);
2822		if (!update_sd)
2823			reiserfs_update_sd(th, inode);
2824		ret = reiserfs_end_persistent_transaction(th);
2825		reiserfs_write_unlock(inode->i_sb);
2826	}
2827
2828	return ret;
2829}
2830
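/* propagate on-disk REISERFS_*_FL attribute bits into the matching VFS
** i_flags bits (and the notail flag into the reiserfs-private
** i_nopack_mask).  i_attrs_to_sd_attrs() below is the inverse mapping.
*/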
2831void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
2832{
2833	if (reiserfs_attrs(inode->i_sb)) {
2834		if (sd_attrs & REISERFS_SYNC_FL)
2835			inode->i_flags |= S_SYNC;
2836		else
2837			inode->i_flags &= ~S_SYNC;
2838		if (sd_attrs & REISERFS_IMMUTABLE_FL)
2839			inode->i_flags |= S_IMMUTABLE;
2840		else
2841			inode->i_flags &= ~S_IMMUTABLE;
2842		if (sd_attrs & REISERFS_APPEND_FL)
2843			inode->i_flags |= S_APPEND;
2844		else
2845			inode->i_flags &= ~S_APPEND;
2846		if (sd_attrs & REISERFS_NOATIME_FL)
2847			inode->i_flags |= S_NOATIME;
2848		else
2849			inode->i_flags &= ~S_NOATIME;
2850		if (sd_attrs & REISERFS_NOTAIL_FL)
2851			REISERFS_I(inode)->i_flags |= i_nopack_mask;
2852		else
2853			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2854	}
2855}
2856
2857void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
2858{
2859	if (reiserfs_attrs(inode->i_sb)) {
2860		if (inode->i_flags & S_IMMUTABLE)
2861			*sd_attrs |= REISERFS_IMMUTABLE_FL;
2862		else
2863			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2864		if (inode->i_flags & S_SYNC)
2865			*sd_attrs |= REISERFS_SYNC_FL;
2866		else
2867			*sd_attrs &= ~REISERFS_SYNC_FL;
2868		if (inode->i_flags & S_NOATIME)
2869			*sd_attrs |= REISERFS_NOATIME_FL;
2870		else
2871			*sd_attrs &= ~REISERFS_NOATIME_FL;
2872		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
2873			*sd_attrs |= REISERFS_NOTAIL_FL;
2874		else
2875			*sd_attrs &= ~REISERFS_NOTAIL_FL;
2876	}
2877}
2878
2879/* decide if this buffer needs to stay around for data logging or ordered
2880** write purposes
2881*/
2882static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2883{
2884	int ret = 1;
2885	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2886
2887	lock_buffer(bh);
2888	spin_lock(&j->j_dirty_buffers_lock);
2889	if (!buffer_mapped(bh)) {
2890		goto free_jh;
2891	}
2892	/* the page is locked, and the only places that log a data buffer
2893	 * also lock the page.
2894	 */
2895	if (reiserfs_file_data_log(inode)) {
2896		/*
2897		 * very conservative, leave the buffer pinned if
2898		 * anyone might need it.
2899		 */
2900		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2901			ret = 0;
2902		}
2903	} else  if (buffer_dirty(bh)) {
2904		struct reiserfs_journal_list *jl;
2905		struct reiserfs_jh *jh = bh->b_private;
2906
2907		/* why is this safe?
2908		 * reiserfs_setattr updates i_size in the on disk
2909		 * stat data before allowing vmtruncate to be called.
2910		 *
2911		 * If buffer was put onto the ordered list for this
2912		 * transaction, we know for sure either this transaction
2913		 * or an older one already has updated i_size on disk,
2914		 * and this ordered data won't be referenced in the file
2915		 * if we crash.
2916		 *
2917		 * if the buffer was put onto the ordered list for an older
2918		 * transaction, we need to leave it around
2919		 */
2920		if (jh && (jl = jh->jl)
2921		    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2922			ret = 0;
2923	}
2924      free_jh:
2925	if (ret && bh->b_private) {
2926		reiserfs_free_jh(bh);
2927	}
2928	spin_unlock(&j->j_dirty_buffers_lock);
2929	unlock_buffer(bh);
2930	return ret;
2931}
2932
2933/* clm -- taken from fs/buffer.c:block_invalidate_page */
2934static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
2935{
2936	struct buffer_head *head, *bh, *next;
2937	struct inode *inode = page->mapping->host;
2938	unsigned int curr_off = 0;
2939	int ret = 1;
2940
2941	BUG_ON(!PageLocked(page));
2942
2943	if (offset == 0)
2944		ClearPageChecked(page);
2945
2946	if (!page_has_buffers(page))
2947		goto out;
2948
2949	head = page_buffers(page);
2950	bh = head;
2951	do {
2952		unsigned int next_off = curr_off + bh->b_size;
2953		next = bh->b_this_page;
2954
2955		/*
2956		 * is this block fully invalidated?
2957		 */
2958		if (offset <= curr_off) {
2959			if (invalidatepage_can_drop(inode, bh))
2960				reiserfs_unmap_buffer(bh);
2961			else
2962				ret = 0;
2963		}
2964		curr_off = next_off;
2965		bh = next;
2966	} while (bh != head);
2967
2968	/*
2969	 * We release buffers only if the entire page is being invalidated.
2970	 * The get_block cached value has been unconditionally invalidated,
2971	 * so real IO is not possible anymore.
2972	 */
2973	if (!offset && ret) {
2974		ret = try_to_release_page(page, 0);
2975		/* maybe should BUG_ON(!ret); - neilb */
2976	}
2977      out:
2978	return;
2979}
2980
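/* with data journaling enabled on the file, tag the page Checked so that
** reiserfs_write_full_page will log its buffers instead of just writing
** them; otherwise fall back to the normal buffer-based dirtying.
*/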
2981static int reiserfs_set_page_dirty(struct page *page)
2982{
2983	struct inode *inode = page->mapping->host;
2984	if (reiserfs_file_data_log(inode)) {
2985		SetPageChecked(page);
2986		return __set_page_dirty_nobuffers(page);
2987	}
2988	return __set_page_dirty_buffers(page);
2989}
2990
2991/*
2992 * Returns 1 if the page's buffers were dropped.  The page is locked.
2993 *
2994 * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
2995 * in the buffers at page_buffers(page).
2996 *
2997 * even in -o notail mode, we can't be sure an old mount without -o notail
2998 * didn't create files with tails.
2999 */
3000static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
3001{
3002	struct inode *inode = page->mapping->host;
3003	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
3004	struct buffer_head *head;
3005	struct buffer_head *bh;
3006	int ret = 1;
3007
3008	WARN_ON(PageChecked(page));
3009	spin_lock(&j->j_dirty_buffers_lock);
3010	head = page_buffers(page);
3011	bh = head;
3012	do {
3013		if (bh->b_private) {
3014			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
3015				reiserfs_free_jh(bh);
3016			} else {
3017				ret = 0;
3018				break;
3019			}
3020		}
3021		bh = bh->b_this_page;
3022	} while (bh != head);
3023	if (ret)
3024		ret = try_to_free_buffers(page);
3025	spin_unlock(&j->j_dirty_buffers_lock);
3026	return ret;
3027}
3028
3029/* We thank Mingming Cao for helping us understand in great detail what
3030   to do in this section of the code. */
3031static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3032				  const struct iovec *iov, loff_t offset,
3033				  unsigned long nr_segs)
3034{
3035	struct file *file = iocb->ki_filp;
3036	struct inode *inode = file->f_mapping->host;
3037
3038	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3039				  offset, nr_segs,
3040				  reiserfs_get_blocks_direct_io, NULL);
3041}
3042
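/*
** .setattr handler.  Beyond the generic inode_change_ok checks this
** enforces the old v3.5 stat data limits (MAX_NON_LFS file size,
** 16 bit uid/gid), fills in hole pointers for an expanding truncate,
** performs the quota transfer for chown/chgrp inside a transaction,
** and updates the posix ACLs after a chmod.
*/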
3043int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3044{
3045	struct inode *inode = dentry->d_inode;
3046	int error;
3047	unsigned int ia_valid;
3048
3049	/* must be turned off for recursive notify_change calls */
3050	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3051
3052	reiserfs_write_lock(inode->i_sb);
3053	if (attr->ia_valid & ATTR_SIZE) {
3054		/* version 2 items will be caught by the s_maxbytes check
3055		 ** done for us in vmtruncate
3056		 */
3057		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
3058		    attr->ia_size > MAX_NON_LFS) {
3059			error = -EFBIG;
3060			goto out;
3061		}
3062		/* fill in hole pointers in the expanding truncate case. */
3063		if (attr->ia_size > inode->i_size) {
3064			error = generic_cont_expand_simple(inode, attr->ia_size);
3065			if (REISERFS_I(inode)->i_prealloc_count > 0) {
3066				int err;
3067				struct reiserfs_transaction_handle th;
3068				/* we're changing at most 2 bitmaps, inode + super */
3069				err = journal_begin(&th, inode->i_sb, 4);
3070				if (!err) {
3071					reiserfs_discard_prealloc(&th, inode);
3072					err = journal_end(&th, inode->i_sb, 4);
3073				}
3074				if (err)
3075					error = err;
3076			}
3077			if (error)
3078				goto out;
3079			/*
3080			 * file size is changed, ctime and mtime are
3081			 * to be updated
3082			 */
3083			attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
3084		}
3085	}
3086
3087	if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
3088	     ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
3089	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
3090		/* stat data of format v3.5 has 16 bit uid and gid */
3091		error = -EINVAL;
3092		goto out;
3093	}
3094
3095	error = inode_change_ok(inode, attr);
3096	if (!error) {
3097		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3098		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3099			error = reiserfs_chown_xattrs(inode, attr);
3100
3101			if (!error) {
3102				struct reiserfs_transaction_handle th;
3103				int jbegin_count =
3104				    2 *
3105				    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
3106				     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
3107				    2;
3108
3109				/* (user+group)*(old+new) quota structures, plus the super block and inode writes */
3110				error =
3111				    journal_begin(&th, inode->i_sb,
3112						  jbegin_count);
3113				if (error)
3114					goto out;
3115				error =
3116				    DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
3117				if (error) {
3118					journal_end(&th, inode->i_sb,
3119						    jbegin_count);
3120					goto out;
3121				}
3122				/* Update corresponding info in inode so that everything is in
3123				 * one transaction */
3124				if (attr->ia_valid & ATTR_UID)
3125					inode->i_uid = attr->ia_uid;
3126				if (attr->ia_valid & ATTR_GID)
3127					inode->i_gid = attr->ia_gid;
3128				mark_inode_dirty(inode);
3129				error =
3130				    journal_end(&th, inode->i_sb, jbegin_count);
3131			}
3132		}
3133		if (!error)
3134			error = inode_setattr(inode, attr);
3135	}
3136
3137	if (!error && reiserfs_posixacl(inode->i_sb)) {
3138		if (attr->ia_valid & ATTR_MODE)
3139			error = reiserfs_acl_chmod(inode);
3140	}
3141
3142      out:
3143	reiserfs_write_unlock(inode->i_sb);
3144	return error;
3145}
3146
3147const struct address_space_operations reiserfs_address_space_operations = {
3148	.writepage = reiserfs_writepage,
3149	.readpage = reiserfs_readpage,
3150	.readpages = reiserfs_readpages,
3151	.releasepage = reiserfs_releasepage,
3152	.invalidatepage = reiserfs_invalidatepage,
3153	.sync_page = block_sync_page,
3154	.write_begin = reiserfs_write_begin,
3155	.write_end = reiserfs_write_end,
3156	.bmap = reiserfs_aop_bmap,
3157	.direct_IO = reiserfs_direct_IO,
3158	.set_page_dirty = reiserfs_set_page_dirty,
3159};
3160