/* file.c, revision 3e8962be915bacc1d70e4849a075041838d60a3f */
/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */


#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>
/*
** We pack the tails of files on file close, not at the time they are written.
** This implies an unnecessary copy of the tail and an unnecessary indirect item
** insertion/balancing, for files that are written in one write.
** It avoids unnecessary tail packings (balances) for files that are written in
** multiple writes and are small enough to have tails.
**
** file_release is called by the VFS layer when the file is closed.  If
** this is the last open file descriptor, and the file is
** small enough to have a tail, and the tail is currently in an
** unformatted node, the tail is converted back into a direct item.
**
** We use reiserfs_truncate_file to pack the tail, since it already has
** all the conditions coded.
*/
static int reiserfs_file_release (struct inode * inode, struct file * filp)
{

    struct reiserfs_transaction_handle th ;
    int err;
    int jbegin_failure = 0;

    if (!S_ISREG (inode->i_mode))
	BUG ();

    /* fast out for when nothing needs to be done */
    if ((atomic_read(&inode->i_count) > 1 ||
	!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
         !tail_has_to_be_packed(inode))       &&
	REISERFS_I(inode)->i_prealloc_count <= 0) {
	return 0;
    }

    reiserfs_write_lock(inode->i_sb);
    down (&inode->i_sem);
    /* freeing preallocation only involves relogging blocks that
     * are already in the current transaction.  preallocation gets
     * freed at the end of each transaction, so it is impossible for
     * us to log any additional blocks (including quota blocks)
     */
    err = journal_begin(&th, inode->i_sb, 1);
    if (err) {
	/* uh oh, we can't allow the inode to go away while there
	 * are still preallocation blocks pending.  Try to join the
	 * aborted transaction
	 */
	jbegin_failure = err;
	err = journal_join_abort(&th, inode->i_sb, 1);

	if (err) {
	    /* hmpf, our choices here aren't good.  We can pin the inode
	     * which will disallow unmount from ever happening, we can
	     * do nothing, which will corrupt random memory on unmount,
	     * or we can forcibly remove the file from the preallocation
	     * list, which will leak blocks on disk.  Let's pin the inode
	     * and let the admin know what is going on.
	     */
	    igrab(inode);
	    reiserfs_warning(inode->i_sb, "pinning inode %lu because the "
	                     "preallocation can't be freed", inode->i_ino);
	    goto out;
	}
    }
    reiserfs_update_inode_transaction(inode) ;

#ifdef REISERFS_PREALLOCATE
    reiserfs_discard_prealloc (&th, inode);
#endif
    err = journal_end(&th, inode->i_sb, 1);

    /* copy back the error code from journal_begin */
    if (!err)
        err = jbegin_failure;

    if (!err && atomic_read(&inode->i_count) <= 1 &&
	(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
        tail_has_to_be_packed (inode)) {
	/* if the regular file is released by its last holder and it has
	   been appended (we append by unformatted node only) or its direct
	   item(s) had to be converted, then it may have to be
	   indirect2direct converted */
	err = reiserfs_truncate_file(inode, 0) ;
    }
out:
    up (&inode->i_sem);
    reiserfs_write_unlock(inode->i_sb);
    return err;
}

static void reiserfs_vfs_truncate_file(struct inode *inode) {
    reiserfs_truncate_file(inode, 1) ;
}

/* Sync a reiserfs file. */

/*
 * FIXME: sync_mapping_buffers() never has anything to sync.  Can
 * be removed...
 */

static int reiserfs_sync_file(
			      struct file   * p_s_filp,
			      struct dentry * p_s_dentry,
			      int datasync
			      ) {
  struct inode * p_s_inode = p_s_dentry->d_inode;
  int n_err;
  int barrier_done;

  if (!S_ISREG(p_s_inode->i_mode))
      BUG ();
  n_err = sync_mapping_buffers(p_s_inode->i_mapping) ;
  reiserfs_write_lock(p_s_inode->i_sb);
  barrier_done = reiserfs_commit_for_inode(p_s_inode);
  reiserfs_write_unlock(p_s_inode->i_sb);
  if (barrier_done != 1)
      blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
  if (barrier_done < 0)
    return barrier_done;
  return ( n_err < 0 ) ? -EIO : 0;
}

/* I really do not want to play with memory shortage right now, so
   to simplify the code, we are not going to write more than this many pages at
   a time. This still should considerably improve performance compared to the
   4k-at-a-time case. This is 32 pages of 4k size. */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
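/* For example, with 4k pages the macro above evaluates to 128k/4k = 32
   pages per pass, matching the comment; with 8k pages it would be 16. */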

/* Allocates blocks for a file to fulfil a write request.
   Maps all unmapped but prepared pages from the list.
   Updates metadata with newly allocated block numbers as needed */
static int reiserfs_allocate_blocks_for_region(
				struct reiserfs_transaction_handle *th,
				struct inode *inode, /* Inode we work with */
				loff_t pos, /* Writing position */
				int num_pages, /* number of pages the write is
						  going to touch */
				int write_bytes, /* amount of bytes to write */
				struct page **prepared_pages, /* array of
							         prepared pages
							       */
				int blocks_to_allocate /* Amount of blocks we
							  need to allocate to
							  fit the data into file
							 */
				)
{
    struct cpu_key key; // cpu key of item that we are going to deal with
    struct item_head *ih; // pointer to item head that we are going to deal with
    struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
    __le32 * item; // pointer to item we are going to deal with
    INITIALIZE_PATH(path); // path to item, that we are going to deal with.
    b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored.
    reiserfs_blocknr_hint_t hint; // hint structure for block allocator.
    int res; // return value of various functions that we call (may hold negative error codes).
    int curr_block; // current block used to keep track of unmapped blocks.
    int i; // loop counter
    int itempos; // position in item
    unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in
						       // first page
    unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */
    __u64 hole_size ; // amount of blocks for a file hole, if it needs to be created.
    int modifying_this_item = 0; // Flag for items traversal code to keep track
				 // of the fact that we already prepared
				 // current block for journal
    int will_prealloc = 0;
    RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?");

    /* only preallocate if this is a small write */
    if (REISERFS_I(inode)->i_prealloc_count ||
       (!(write_bytes & (inode->i_sb->s_blocksize -1)) &&
        blocks_to_allocate <
        REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
        will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;

    allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
    					sizeof(b_blocknr_t), GFP_NOFS);
    if (!allocated_blocks)
	return -ENOMEM;

    /* First we compose a key to point at the writing position; we want to do
       that outside of any locking region. */
    make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/);
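    /* Note: reiserfs keys address the file body with 1-based byte offsets
       (offset 1 is the first byte of the file), hence the pos+1 above. */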

    /* If we came here, it means we absolutely need to open a transaction,
       since we need to allocate some blocks */
    reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
    res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); // Wish I knew if this number is enough
    if (res)
        goto error_exit;
    reiserfs_update_inode_transaction(inode) ;

    /* Look for the in-tree position of our write, need path for block allocator */
    res = search_for_position_by_key(inode->i_sb, &key, &path);
    if ( res == IO_ERROR ) {
	res = -EIO;
	goto error_exit;
    }

    /* Allocate blocks */
    /* First fill in the "hint" structure for the block allocator */
    hint.th = th; // transaction handle.
    hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
    hint.inode = inode; // Inode is needed by block allocator too.
    hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
    hint.key = key.on_disk_key; // on disk key of file.
    hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already (i_blocks counts 512-byte sectors; the shift converts sectors to fs blocks).
    hint.formatted_node = 0; // We are allocating blocks for an unformatted node.
    hint.preallocate = will_prealloc;

    /* Call block allocator to allocate blocks */
    res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
    if ( res != CARRY_ON ) {
	if ( res == NO_DISK_SPACE ) {
	    /* We flush the transaction in case of no space. This way some
	       blocks might become free */
	    SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
	    res = restart_transaction(th, inode, &path);
            if (res)
                goto error_exit;

	    /* We might have scheduled, so search again */
	    res = search_for_position_by_key(inode->i_sb, &key, &path);
	    if ( res == IO_ERROR ) {
		res = -EIO;
		goto error_exit;
	    }

	    /* update changed info for hint structure. */
	    res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
	    if ( res != CARRY_ON ) {
		res = -ENOSPC;
		pathrelse(&path);
		goto error_exit;
	    }
	} else {
	    res = -ENOSPC;
	    pathrelse(&path);
	    goto error_exit;
	}
    }

#ifdef __BIG_ENDIAN
        // Too bad, I have not found any way to convert a whole region from
        // cpu format to little-endian format in one call
    {
        int i;
        for ( i = 0; i < blocks_to_allocate ; i++)
            allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]);
    }
#endif
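    /* At this point the array of allocated block numbers is in little-endian
       (on-disk) format, so below it can be pasted directly into indirect
       items as their bodies without per-element conversion. */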

    /* The block allocation may well have scheduled and the tree might have
       changed, so let's search the tree again */
    /* find where in the tree our write should go */
    res = search_for_position_by_key(inode->i_sb, &key, &path);
    if ( res == IO_ERROR ) {
	res = -EIO;
	goto error_exit_free_blocks;
    }

    bh = get_last_bh( &path ); // Get a bufferhead for last element in path.
    ih = get_ih( &path );      // Get a pointer to last item head in path.
    item = get_item( &path );  // Get a pointer to last item in path

    /* Let's see what we have found */
    if ( res != POSITION_FOUND ) { /* position not found, this means that we
				      might need to append the file with holes
				      first */
	// Since we are writing past the file's end, we need to find out if
	// there is a hole that needs to be inserted before our writing
	// position, and how many blocks it is going to cover (we need to
	//  populate pointers to file blocks representing the hole with zeros)

	{
	    int item_offset = 1;
	    /*
	     * if ih is stat data, its offset is 0 and we don't want to
	     * add 1 to pos in the hole_size calculation
	     */
	    if (is_statdata_le_ih(ih))
	        item_offset = 0;
	    hole_size = (pos + item_offset -
	            (le_key_k_offset( get_inode_item_key_version(inode),
		    &(ih->ih_key)) +
		    op_bytes_number(ih, inode->i_sb->s_blocksize))) >>
		    inode->i_sb->s_blocksize_bits;
	}
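	/* hole_size is now the number of blocks between the end of the
	   file's last item and our write position.  E.g. with 4k blocks, a
	   file whose body covers (1-based) bytes 1..4096 and a write at
	   pos 20480 gives (20480 + 1 - 4097) >> 12 = 4 hole blocks. */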

	if ( hole_size > 0 ) {
	    int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time.
	    /* area filled with zeroes, to supply as list of zero blocknumbers
	       We allocate it outside of loop just in case loop would spin for
	       several iterations. */
	    char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway.
	    if ( !zeros ) {
		res = -ENOMEM;
		goto error_exit_free_blocks;
	    }
	    memset ( zeros, 0, to_paste*UNFM_P_SIZE);
	    do {
		to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE );
		if ( is_indirect_le_ih(ih) ) {
		    /* Ok, there is an existing indirect item already. Need to append it */
		    /* Calculate position past inserted item */
		    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
		    res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste);
		    if ( res ) {
			kfree(zeros);
			goto error_exit_free_blocks;
		    }
		} else if ( is_statdata_le_ih(ih) ) {
		    /* No existing item, create it */
		    /* item head for new item */
		    struct item_head ins_ih;

		    /* create a key for our new item */
		    make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3);

		    /* Create new item head for our new item */
		    make_le_item_head (&ins_ih, &key, key.version, 1,
				       TYPE_INDIRECT, to_paste*UNFM_P_SIZE,
				       0 /* free space */);

		    /* Find where such item should live in the tree */
		    res = search_item (inode->i_sb, &key, &path);
		    if ( res != ITEM_NOT_FOUND ) {
			/* item should not exist, otherwise we have error */
			if ( res != -ENOSPC ) {
			    reiserfs_warning (inode->i_sb,
				"green-9008: search_by_key (%K) returned %d",
					      &key, res);
			}
			res = -EIO;
		        kfree(zeros);
			goto error_exit_free_blocks;
		    }
		    res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros);
		} else {
		    reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
		}
		if ( res ) {
		    kfree(zeros);
		    goto error_exit_free_blocks;
		}
		/* Now we want to check if the transaction is too full, and if
		   it is we restart it. This will also free the path. */
		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
		    res = restart_transaction(th, inode, &path);
		    if (res) {
			pathrelse (&path);
			kfree(zeros);
			goto error_exit;
		    }
		}

		/* Well, need to recalculate path and stuff */
		set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
		res = search_for_position_by_key(inode->i_sb, &key, &path);
		if ( res == IO_ERROR ) {
		    res = -EIO;
		    kfree(zeros);
		    goto error_exit_free_blocks;
		}
		bh=get_last_bh(&path);
		ih=get_ih(&path);
		item = get_item(&path);
		hole_size -= to_paste;
	    } while ( hole_size );
	    kfree(zeros);
	}
    }

    // Go through existing indirect items first
    // replace all zeroes with blocknumbers from list
    // Note that if no corresponding item was found by the previous search,
    // it means there is no existing in-tree representation for the file area
    // we are going to overwrite, so there is nothing to scan through for holes.
    for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) {
retry:

	if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) {
	    /* We ran out of data in this indirect item, let's look for another
	       one. */
	    /* First, if we are already modifying the current item, log it */
	    if ( modifying_this_item ) {
		journal_mark_dirty (th, inode->i_sb, bh);
		modifying_this_item = 0;
	    }
	    /* Then set the key to look for a new indirect item (offset of the
	       old item is added to the old item's length) */
	    set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize));
	    /* Search for the position of the new key in the tree. */
	    res = search_for_position_by_key(inode->i_sb, &key, &path);
	    if ( res == IO_ERROR) {
		res = -EIO;
		goto error_exit_free_blocks;
	    }
	    bh=get_last_bh(&path);
	    ih=get_ih(&path);
	    item = get_item(&path);
	    itempos = path.pos_in_item;
	    continue; // loop to check all kinds of conditions and so on.
	}
	/* Ok, we have the correct position in the item now, so let's see if it
	   represents a file hole (block number is zero) and fill it if needed */
	if ( !item[itempos] ) {
	    /* Ok, a hole. Now we need to check if we already prepared this
	       block to be journaled */
	    while ( !modifying_this_item ) { // loop until we succeed
		/* Well, this item is not journaled yet, so we must prepare
		   it for journal first, before we can change it */
		struct item_head tmp_ih; // We copy the item head of the found
					 // item here to detect if the fs
					 // changed under us while we were
					 // preparing for the journal.
		int fs_gen; // We store the fs generation here to find out if
			    // someone changes the fs under our feet

		copy_item_head (&tmp_ih, ih); // Remember itemhead
		fs_gen = get_generation (inode->i_sb); // remember fs generation
		reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare the buffer within which the indirect item is stored for changing.
		if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
		    // Sigh, fs was changed under us, we need to look for new
		    // location of item we are working with

		    /* unmark the prepared area as journaled and search for its
		       new position */
		    reiserfs_restore_prepared_buffer(inode->i_sb, bh);
		    res = search_for_position_by_key(inode->i_sb, &key, &path);
		    if ( res == IO_ERROR) {
			res = -EIO;
			goto error_exit_free_blocks;
		    }
		    bh=get_last_bh(&path);
		    ih=get_ih(&path);
		    item = get_item(&path);
		    itempos = path.pos_in_item;
		    goto retry;
		}
		modifying_this_item = 1;
	    }
	    item[itempos] = allocated_blocks[curr_block]; // Assign new block
	    curr_block++;
	}
	itempos++;
    }

    if ( modifying_this_item ) { // We need to log the last-accessed block, if
				 // it was modified but not logged yet.
	journal_mark_dirty (th, inode->i_sb, bh);
    }

    if ( curr_block < blocks_to_allocate ) {
	// Oh well, we need to append to an indirect item, or to create an
	// indirect item if there wasn't any
	if ( is_indirect_le_ih(ih) ) {
	    // Existing indirect item - append. First calculate key for append
	    // position. We do not need to recalculate path as it should
	    // already point to correct place.
	    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
	    res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
	    if ( res ) {
		goto error_exit_free_blocks;
	    }
	} else if (is_statdata_le_ih(ih) ) {
	    // Last found item was statdata. That means we need to create an indirect item.
	    struct item_head ins_ih; /* itemhead for new item */

	    /* create a key for our new item */
	    make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one,
							    // because that's
							    // where the first
							    // indirect item
							    // begins
	    /* Create new item head for our new item */
	    make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT,
			       (blocks_to_allocate-curr_block)*UNFM_P_SIZE,
			       0 /* free space */);
	    /* Find where such item should live in the tree */
	    res = search_item (inode->i_sb, &key, &path);
	    if ( res != ITEM_NOT_FOUND ) {
		/* Well, if we have found such an item already, or some error
		   occurred, we need to warn the user and return an error */
		if ( res != -ENOSPC ) {
		    reiserfs_warning (inode->i_sb,
				      "green-9009: search_by_key (%K) "
				      "returned %d", &key, res);
		}
		res = -EIO;
		goto error_exit_free_blocks;
	    }
	    /* Insert item into the tree with the data as its body */
	    res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block));
	} else {
	    reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
	}
    }

    // the caller is responsible for closing the transaction;
    // unless we return an error, they are also responsible for logging
    // the inode.
    //
    pathrelse(&path);
    /*
     * clean up preallocation from previous writes
     * if this is a partial block write
     */
    if (write_bytes & (inode->i_sb->s_blocksize -1))
        reiserfs_discard_prealloc(th, inode);
    reiserfs_write_unlock(inode->i_sb);

    // go through all the pages/buffers and map the buffers to newly allocated
    // blocks (so that system knows where to write these pages later).
    curr_block = 0;
    for ( i = 0; i < num_pages ; i++ ) {
	struct page *page=prepared_pages[i]; //current page
	struct buffer_head *head; // first buffer for a page
	int block_start, block_end; // in-page offsets for buffers.

	if (!page_has_buffers(page))
	    reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???");
	head = page_buffers(page);

	/* For each buffer in page */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {
	    if (!bh)
		reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?");
	    block_end = block_start+inode->i_sb->s_blocksize;
	    if (i == 0 && block_end <= from )
		/* if this buffer is before requested data to map, skip it */
		continue;
	    if (i == num_pages - 1 && block_start >= to)
		/* If this buffer is after requested data to map, abort
		   processing of current page */
		break;

	    if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
		map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
		curr_block++;
		set_buffer_new(bh);
	    }
	}
    }

    RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird");

    kfree(allocated_blocks);
    return 0;

// Need to deal with transaction here.
error_exit_free_blocks:
    pathrelse(&path);
    // free blocks
    for( i = 0; i < blocks_to_allocate; i++ )
	reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1);

error_exit:
    if (th->t_trans_id) {
        int err;
        // update any changes we made to blk count
        reiserfs_update_sd(th, inode);
        err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS);
        if (err)
            res = err;
    }
    reiserfs_write_unlock(inode->i_sb);
    kfree(allocated_blocks);

    return res;
}

/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
static void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */
			      size_t num_pages /* amount of pages */) {
    int i; // loop counter

    for (i=0; i < num_pages ; i++) {
	struct page *page = prepared_pages[i];

	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
    }
}

/* This function will copy data from userspace to the specified pages within
   the supplied byte range */
static int reiserfs_copy_from_user_to_file_region(
				loff_t pos, /* In-file position */
				int num_pages, /* Number of pages affected */
				int write_bytes, /* Amount of bytes to write */
				struct page **prepared_pages, /* pointer to
								 array of
								 prepared pages
								*/
				const char __user *buf /* Pointer to user-supplied
						   data*/
				)
{
    long page_fault=0; // status of copy_from_user.
    int i; // loop counter.
    int offset; // offset in page

    for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
	size_t count = min_t(size_t,PAGE_CACHE_SIZE-offset,write_bytes); // How many bytes to write to this page
	struct page *page=prepared_pages[i]; // Current page we process.

	fault_in_pages_readable( buf, count);

	/* Copy data from userspace to the current page */
	kmap(page);
	page_fault = __copy_from_user(page_address(page)+offset, buf, count); // Copy the data.
	/* Flush processor's dcache for this page */
	flush_dcache_page(page);
	kunmap(page);
	buf+=count;
	write_bytes-=count;

	if (page_fault)
	    break; // Was there a fault? abort.
    }

    return page_fault?-EFAULT:0;
}

/* taken from fs/buffer.c:__block_commit_write */
int reiserfs_commit_page(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
    unsigned block_start, block_end;
    int partial = 0;
    unsigned blocksize;
    struct buffer_head *bh, *head;
    unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
    int new;
    int logit = reiserfs_file_data_log(inode);
    struct super_block *s = inode->i_sb;
    int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
    struct reiserfs_transaction_handle th;
    int ret = 0;

    th.t_trans_id = 0;
    blocksize = 1 << inode->i_blkbits;

    if (logit) {
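	/* reserve one journal credit per buffer on the page, plus one
	 * more (presumably for the inode update) */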
	reiserfs_write_lock(s);
	ret = journal_begin(&th, s, bh_per_page + 1);
	if (ret)
	    goto drop_write_lock;
	reiserfs_update_inode_transaction(inode);
    }
    for(bh = head = page_buffers(page), block_start = 0;
        bh != head || !block_start;
	block_start=block_end, bh = bh->b_this_page)
    {

	new = buffer_new(bh);
	clear_buffer_new(bh);
	block_end = block_start + blocksize;
	if (block_end <= from || block_start >= to) {
	    if (!buffer_uptodate(bh))
		    partial = 1;
	} else {
	    set_buffer_uptodate(bh);
	    if (logit) {
		reiserfs_prepare_for_journal(s, bh, 1);
		journal_mark_dirty(&th, s, bh);
	    } else if (!buffer_dirty(bh)) {
		mark_buffer_dirty(bh);
		/* do data=ordered on any page past the end
		 * of file and any buffer marked BH_New.
		 */
		if (reiserfs_data_ordered(inode->i_sb) &&
		    (new || page->index >= i_size_index)) {
		    reiserfs_add_ordered_list(inode, bh);
	        }
	    }
	}
    }
    if (logit) {
	ret = journal_end(&th, s, bh_per_page + 1);
drop_write_lock:
	reiserfs_write_unlock(s);
    }
    /*
     * If this is a partial write which happened to make all buffers
     * uptodate then we can optimize away a bogus readpage() for
     * the next read(). Here we 'discover' whether the page went
     * uptodate as a result of this (potentially partial) write.
     */
    if (!partial)
	SetPageUptodate(page);
    return ret;
}


/* Submit pages for write. This was separated from the actual file copying
   because we might want to allocate block numbers in between.
   This function assumes that the caller will adjust the file size to the
   correct value. */
static int reiserfs_submit_file_region_for_write(
				struct reiserfs_transaction_handle *th,
				struct inode *inode,
				loff_t pos, /* Writing position offset */
				size_t num_pages, /* Number of pages to write */
				size_t write_bytes, /* number of bytes to write */
				struct page **prepared_pages /* list of pages */
				)
{
    int status; // return status of block_commit_write.
    int retval = 0; // Return value we are going to return.
    int i; // loop counter
    int offset; // Writing offset in page.
    int orig_write_bytes = write_bytes;
    int sd_update = 0;

    for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
	int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How many bytes to write to this page
	struct page *page=prepared_pages[i]; // Current page we process.

	status = reiserfs_commit_page(inode, page, offset, offset+count);
	if ( status )
	    retval = status; // To not overcomplicate matters, we submit all
			     // the pages even if there was an error; we only
			     // remember the error status to report it on
			     // exit.
	write_bytes-=count;
    }
    /* now that we've gotten all the ordered buffers marked dirty,
     * we can safely update i_size and close any running transaction
     */
    if ( pos + orig_write_bytes > inode->i_size) {
	inode->i_size = pos + orig_write_bytes; // Set new size
	/* If the file has grown so much that tail packing is no
	 * longer possible, reset the "need to pack" flag */
	if ( (have_large_tails (inode->i_sb) &&
	      inode->i_size > i_block_size (inode)*4) ||
	     (have_small_tails (inode->i_sb) &&
	     inode->i_size > i_block_size(inode)) )
	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
        else if ( (have_large_tails (inode->i_sb) &&
	          inode->i_size < i_block_size (inode)*4) ||
	          (have_small_tails (inode->i_sb) &&
		  inode->i_size < i_block_size(inode)) )
	    REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
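	/* Concretely: with 4k blocks the "large tails" policy keeps the
	   pack flag only for files under 16k, and the "small tails"
	   policy only for files under 4k. */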

	if (th->t_trans_id) {
	    reiserfs_write_lock(inode->i_sb);
	    reiserfs_update_sd(th, inode); // And update on-disk metadata
	    reiserfs_write_unlock(inode->i_sb);
	} else
	    inode->i_sb->s_op->dirty_inode(inode);

        sd_update = 1;
    }
    if (th->t_trans_id) {
	reiserfs_write_lock(inode->i_sb);
	if (!sd_update)
	    reiserfs_update_sd(th, inode);
	status = journal_end(th, th->t_super, th->t_blocks_allocated);
        if (status)
            retval = status;
	reiserfs_write_unlock(inode->i_sb);
    }
    th->t_trans_id = 0;

    /*
     * we have to unlock the pages after updating i_size, otherwise
     * we race with writepage
     */
    for ( i = 0; i < num_pages ; i++) {
	struct page *page=prepared_pages[i];
	unlock_page(page);
	mark_page_accessed(page);
	page_cache_release(page);
    }
    return retval;
}

/* Check whether the passed write region is going to touch the file's tail
   (if one is present), and if it is, convert the tail to an unformatted node */
static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */
					 loff_t pos, /* Writing position */
					 int write_bytes /* amount of bytes to write */
				        )
{
    INITIALIZE_PATH(path); // needed for search_for_position
    struct cpu_key key; // Key that would represent last touched writing byte.
    struct item_head *ih; // item header of found block;
    int res; // Return value of various functions we call.
    int cont_expand_offset; // We will put offset for generic_cont_expand here
			    // This can be int just because tails are created
			    // only for small files.

    /* this embodies a dependency on a particular tail policy */
    if ( inode->i_size >= inode->i_sb->s_blocksize*4 ) {
	/* such big files do not have tails, so we won't bother
	   to look for a tail, simply return */
	return 0;
    }

    reiserfs_write_lock(inode->i_sb);
    /* find the item containing the last byte to be written, or if
     * writing past the end of the file then the last item of the
     * file (and then we check its type). */
    make_cpu_key (&key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/);
    res = search_for_position_by_key(inode->i_sb, &key, &path);
    if ( res == IO_ERROR ) {
        reiserfs_write_unlock(inode->i_sb);
	return -EIO;
    }
    ih = get_ih(&path);
    res = 0;
    if ( is_direct_le_ih(ih) ) {
	/* Ok, the closest item is the file tail (tails are stored in "direct"
	 * items), so we need to unpack it. */
	/* To not overcomplicate matters, we just call generic_cont_expand,
	   which will in turn call other code and finally boil down to
	   reiserfs_get_block(), which does the necessary conversion. */
	cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key));
	pathrelse(&path);
	res = generic_cont_expand( inode, cont_expand_offset);
    } else
	pathrelse(&path);

    reiserfs_write_unlock(inode->i_sb);
    return res;
}

/* This function locks pages starting from @pos for @inode.
   @num_pages pages are locked and stored in the
   @prepared_pages array. Buffers are also allocated for these pages.
   The first and last pages of the region are read in if they are only
   partially overwritten. If the last page did not exist before the write
   (file hole or file append), it is zeroed instead.
   Returns the number of unallocated blocks that should be allocated to cover
   the new file data. */
static int reiserfs_prepare_file_region_for_write(
				struct inode *inode /* Inode of the file */,
				loff_t pos, /* position in the file */
				size_t num_pages, /* number of pages to
					          prepare */
				size_t write_bytes, /* Amount of bytes to be
						    overwritten from
						    @pos */
				struct page **prepared_pages /* pointer to array
							       where to store
							       prepared pages */
					   )
{
    int res=0; // Return values of different functions we call.
    unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages.
    int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page
    int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
					 /* offset of last modified byte in last
				            page */
    struct address_space *mapping = inode->i_mapping; // Pages are mapped here.
    int i; // Simple counter
    int blocks = 0; /* Return value (blocks that should be allocated) */
    struct buffer_head *bh, *head; // Current bufferhead and first bufferhead
				   // of a page.
    unsigned block_start, block_end; // Starting and ending offsets of current
				     // buffer in the page.
    struct buffer_head *wait[2], **wait_bh=wait; // Buffers for the page, if
						 // the page appeared to be not
						 // up to date. Note how we have
						 // at most 2 buffers: this is
						 // because we may at most
						 // partially overwrite two
						 // buffers for one page, one at
						 // the beginning of the write
						 // area and one at the end.
						 // Everything in the middle
						 // gets overwritten totally.

    struct cpu_key key; // cpu key of item that we are going to deal with
    struct item_head *ih = NULL; // pointer to item head that we are going to deal with
    struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with
    INITIALIZE_PATH(path); // path to item, that we are going to deal with.
    __le32 * item=NULL; // pointer to item we are going to deal with
    int item_pos=-1; /* Position in indirect item */


    if ( num_pages < 1 ) {
	reiserfs_warning (inode->i_sb,
			  "green-9001: reiserfs_prepare_file_region_for_write "
			  "called with zero number of pages to process");
	return -EFAULT;
    }

    /* We have 2 loops over the pages. In the first loop we grab and lock the
       pages, so that nobody touches them until we release them. Then
       we start to deal with mapping buffers to blocks. */
    for ( i = 0; i < num_pages; i++) {
	prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page
	if ( !prepared_pages[i]) {
	    res = -ENOMEM;
	    goto failed_page_grabbing;
	}
	if (!page_has_buffers(prepared_pages[i]))
	    create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0);
    }

    /* Let's count the number of blocks for the case where all the blocks
       overwritten are new (we will subtract already allocated blocks later) */
    if ( num_pages > 2 )
	/* These pages are fully overwritten, so all their blocks are
	   counted as needing to be allocated */
	blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);

    /* count blocks needed for first page (possibly partially written) */
    blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) +
	   !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */
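    /* E.g. on a 4k-blocksize, 4k-page fs with from == 100 this adds
       (4096 - 100) >> 12 == 0 full blocks plus 1 for the partial one. */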

    /* Now we account for the last page. If the last page == the first page
       (we overwrite only one page), we subtract all the blocks past the
       last writing position in a page out of the already calculated number
       of blocks */
    blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) -
	   ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
	   /* Note how we do not round up here, since partial blocks still
		   should be allocated */

    /* Now if all the write area lies past the file end, there is no point in
       mapping blocks, since there are none, so we just zero out the remaining
       parts of the first and last pages in the write area (if needed) */
    if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) {
	if ( from != 0 ) {/* First page needs to be partially zeroed */
	    char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
	    memset(kaddr, 0, from);
	    kunmap_atomic( kaddr, KM_USER0);
	}
	if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */
	    char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
	    memset(kaddr+to, 0, PAGE_CACHE_SIZE - to);
	    kunmap_atomic( kaddr, KM_USER0);
	}

	/* Since all blocks are new - use already calculated value */
	return blocks;
    }

    /* Well, since we write somewhere into the middle of a file, there is a
       possibility we are writing over some already allocated blocks, so
       let's map these blocks and subtract the number of such blocks from the
       number we need to allocate (calculated above) */
    /* Mask the write position to start on a blocksize boundary; we do it
       outside of the loop for performance reasons */
    pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
    /* Set cpu key to the starting position in the file (on the left block boundary) */
    make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/);
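    /* The key now addresses the first byte (1-based) of the first block of
       the region; the loop below advances it one blocksize at a time as it
       walks the buffers. */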

    reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key()
    for ( i = 0; i < num_pages ; i++ ) {

	head = page_buffers(prepared_pages[i]);
	/* For each buffer in the page */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {
		if (!bh)
		    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
		/* Find where this buffer ends */
		block_end = block_start+inode->i_sb->s_blocksize;
		if (i == 0 && block_end <= from )
		    /* if this buffer is before requested data to map, skip it*/
		    continue;

		if (i == num_pages - 1 && block_start >= to) {
		    /* If this buffer is after requested data to map, abort
		       processing of current page */
		    break;
		}

		if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) {
		    /* This is an optimisation for the case where the buffer is
		       mapped and has a block number assigned. If a significant
		       number of such buffers are present, we may avoid some
		       amount of search_by_key calls.
		       It would probably be possible to move parts of this code
		       out of BKL, but I am afraid that would overcomplicate
		       the code without any noticeable benefit.
		    */
		    item_pos++;
		    /* Update the key */
		    set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
		    blocks--; // Decrease the amount of blocks that need to be
			      // allocated
		    continue; // Go to the next buffer
		}

		if ( !itembuf || /* if first iteration */
		     item_pos >= ih_item_len(ih)/UNFM_P_SIZE)
					     { /* or if we progressed past the
						  current unformatted_item */
			/* Try to find next item */
			res = search_for_position_by_key(inode->i_sb, &key, &path);
			/* Abort if no more items */
			if ( res != POSITION_FOUND ) {
			    /* make sure later loops don't use this item */
			    itembuf = NULL;
			    item = NULL;
			    break;
			}

			/* Update information about current indirect item */
			itembuf = get_last_bh( &path );
			ih = get_ih( &path );
			item = get_item( &path );
			item_pos = path.pos_in_item;

			RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected");
		}

		/* See if there is some block associated with the file
		   at that position; if so, map the buffer to this block */
		if ( get_block_num(item,item_pos) ) {
		    map_bh(bh, inode->i_sb, get_block_num(item,item_pos));
		    blocks--; // Decrease the amount of blocks that need to be
			      // allocated
		}
		item_pos++;
		/* Update the key */
		set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
	}
    }
    pathrelse(&path); // Free the path
    reiserfs_write_unlock(inode->i_sb);

    /* Now zero out unmapped buffers for the first and last pages of the
       write area, or issue read requests if a page is mapped. */
    /* First page, see if it is not uptodate */
    if ( !PageUptodate(prepared_pages[0]) ) {
	head = page_buffers(prepared_pages[0]);

	/* For each buffer in page */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {

	    if (!bh)
		reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
	    /* Find where this buffer ends */
	    block_end = block_start+inode->i_sb->s_blocksize;
	    if ( block_end <= from )
		/* if this buffer is before requested data to map, skip it*/
		continue;
	    if ( block_start < from ) { /* Aha, our partial buffer */
		if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
					      issue a READ request for it to
					      not lose data */
		    ll_rw_block(READ, 1, &bh);
		    *wait_bh++=bh;
		} else { /* Not mapped, zero it */
		    char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
		    memset(kaddr+block_start, 0, from-block_start);
		    kunmap_atomic( kaddr, KM_USER0);
		    set_buffer_uptodate(bh);
		}
	    }
	}
    }

    /* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
    if ( !PageUptodate(prepared_pages[num_pages-1]) ||
	((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) {
	head = page_buffers(prepared_pages[num_pages-1]);

	/* for each buffer in page */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {

	    if (!bh)
		reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
	    /* Find where this buffer ends */
	    block_end = block_start+inode->i_sb->s_blocksize;
	    if ( block_start >= to )
		/* if this buffer is after requested data to map, skip it*/
		break;
	    if ( block_end > to ) { /* Aha, our partial buffer */
		if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
					      issue a READ request for it to
					      not lose data */
		    ll_rw_block(READ, 1, &bh);
		    *wait_bh++=bh;
		} else { /* Not mapped, zero it */
		    char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
		    memset(kaddr+to, 0, block_end-to);
		    kunmap_atomic( kaddr, KM_USER0);
		    set_buffer_uptodate(bh);
		}
	    }
	}
    }

    /* Wait for read requests we made to happen, if necessary */
    while(wait_bh > wait) {
	wait_on_buffer(*--wait_bh);
	if (!buffer_uptodate(*wait_bh)) {
	    res = -EIO;
	    goto failed_read;
	}
    }

    return blocks;
failed_page_grabbing:
    num_pages = i;
failed_read:
    reiserfs_unprepare_pages(prepared_pages, num_pages);
    return res;
}

/* Write @count bytes at position @ppos in a file indicated by @file
   from the buffer @buf.

   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
   written for (ext2/3).  This is for several reasons:

   * It has no understanding of any filesystem specific optimizations.

   * It enters the filesystem repeatedly for each page that is written.

   * It depends on the reiserfs_get_block() function, which as implemented by reiserfs performs a costly search_by_key
     operation for each page it is supplied with. By contrast, reiserfs_file_write() feeds as much as possible at a time
     to reiserfs, which allows for fewer tree traversals.

   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.

   * Asking the block allocation code for blocks one at a time is slightly less efficient.

   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
   use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
   things right finally.

   Future Features: providing search_by_key with hints.

*/
static ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */
                             const char __user *buf, /*  pointer to user supplied data
(in userspace) */
                             size_t count, /* amount of bytes to write */
                             loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to
                                           * new current position before returning. */ )
{
    size_t already_written = 0; // Number of bytes already written to the file.
    loff_t pos; // Current position in the file.
    ssize_t res; // return value of various functions that we call.
    int err = 0;
    struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
				/* To simplify coding at this time, we store
				   locked pages in array for now */
    struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
    struct reiserfs_transaction_handle th;
    th.t_trans_id = 0;

    if ( file->f_flags & O_DIRECT) { // Direct IO needs special treatment
	ssize_t result, after_file_end = 0;
	if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
	    /* If we are appending to the file, we need to put this savelink
	       in here. If we crash while doing direct io, finish_unfinished
	       will cut the garbage from the file end. */
	    reiserfs_write_lock(inode->i_sb);
	    err = journal_begin(&th, inode->i_sb,  JOURNAL_PER_BALANCE_CNT );
            if (err) {
		reiserfs_write_unlock (inode->i_sb);
		return err;
	    }
	    reiserfs_update_inode_transaction(inode);
	    add_save_link (&th, inode, 1 /* Truncate */);
	    after_file_end = 1;
	    err = journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
            reiserfs_write_unlock(inode->i_sb);
	    if (err)
		return err;
	}
	result = generic_file_write(file, buf, count, ppos);

	if ( after_file_end ) { /* Now update i_size and remove the savelink */
	    struct reiserfs_transaction_handle th;
	    reiserfs_write_lock(inode->i_sb);
	    err = journal_begin(&th, inode->i_sb, 1);
            if (err) {
                reiserfs_write_unlock (inode->i_sb);
                return err;
            }
	    reiserfs_update_inode_transaction(inode);
	    reiserfs_update_sd(&th, inode);
	    err = journal_end(&th, inode->i_sb, 1);
            if (err) {
                reiserfs_write_unlock (inode->i_sb);
                return err;
            }
	    err = remove_save_link (inode, 1/* truncate */);
	    reiserfs_write_unlock(inode->i_sb);
            if (err)
                return err;
	}

	return result;
    }

    if ( unlikely((ssize_t) count < 0 ))
        return -EINVAL;

    if (unlikely(!access_ok(VERIFY_READ, buf, count)))
        return -EFAULT;

    down(&inode->i_sem); // locks the entire file for just us

    pos = *ppos;

    /* Check if we can write to the specified region of the file, that the
       file is not overly big, and this kind of stuff. Adjust pos and
       count, if needed */
    res = generic_write_checks(file, &pos, &count, 0);
    if (res)
	goto out;

    if ( count == 0 )
	goto out;

    res = remove_suid(file->f_dentry);
    if (res)
	goto out;

    inode_update_time(inode, 1); /* Both mtime and ctime */

    // Ok, we are done with all the checks.

    // Now we should start real work

    /* If we are going to write past the file's packed tail or if we are going
       to overwrite part of the tail, we need that tail to be converted into
       an unformatted node */
    res = reiserfs_check_for_tail_and_convert( inode, pos, count);
    if (res)
	goto out;

    while ( count > 0) {
	/* This is the main loop in which we run until some error occurs
	   or until we write all of the data. */
	size_t num_pages;/* amount of pages we are going to write this iteration */
	size_t write_bytes; /* amount of bytes to write during this iteration */
	ssize_t blocks_to_allocate; /* how many blocks we need to allocate for
				       this iteration (signed, so the error
				       check below works) */

        /*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos*/
	num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial
							  pages */
		    ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT);
						/* convert size to amount of
						   pages */
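	/* E.g. with 4k pages, pos = 5000 and count = 10000 touch bytes
	   5000..14999, i.e. pages 1..3: (15000 & 4095) != 0 contributes 1
	   page, and (10000 + 904) >> 12 contributes the other 2. */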
	reiserfs_write_lock(inode->i_sb);
	if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
		|| num_pages > reiserfs_can_fit_pages(inode->i_sb) ) {
	    /* If we were asked to write more data than we want to, or if there
	       is not that much space, then we shorten the amount of data to
	       write for this iteration. */
	    num_pages = min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb));
	    /* Also we should not forget to set the size in bytes accordingly */
	    write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
			    (pos & (PAGE_CACHE_SIZE-1));
					 /* If the position is not on the
					    start of the page, we need
					    to subtract the offset
					    within the page */
	} else
	    write_bytes = count;

	/* reserve the blocks to be allocated later, so that later on
	   we still have the space to write the blocks to */
	reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
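	/* This claims the worst case (every block of every page new); the
	   surplus is released further down once
	   reiserfs_prepare_file_region_for_write() has counted how many
	   blocks really need allocating. */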
	reiserfs_write_unlock(inode->i_sb);

	if ( !num_pages ) { /* If we do not have enough space even for a single page... */
	    if ( pos > inode->i_size+inode->i_sb->s_blocksize-(pos & (inode->i_sb->s_blocksize-1))) {
		res = -ENOSPC;
		break; // In case we are writing past the end of the last file block, break.
	    }
	    // Otherwise we are possibly overwriting the file, so
	    // let's set write size to be equal or less than blocksize.
	    // This way we get it correctly for file holes.
	    // But overwriting files on absolutely full volumes would not
	    // be very efficient. Well, people are not supposed to fill
	    // 100% of disk space anyway.
	    write_bytes = min_t(size_t, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1)));
	    num_pages = 1;
	    // No blocks were claimed before, so do it now.
	    reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits));
	}

	/* Prepare for writing into the region, read in all the
	   partially overwritten pages, if needed. And lock the pages,
	   so that nobody else can access these until we are done.
	   We get the number of actual blocks needed as a result. */
	blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages);
	if ( blocks_to_allocate < 0 ) {
	    res = blocks_to_allocate;
	    reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
	    break;
	}

	/* First we correct our estimate of how many blocks we need */
	reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate );

	if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
	    /* Fill in all the possible holes and append the file if needed */
	    res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
	}

	/* well, we have allocated the blocks, so it is time to free
	   the reservation we made earlier. */
	reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate);
	if ( res ) {
	    reiserfs_unprepare_pages(prepared_pages, num_pages);
	    break;
	}

/* NOTE that allocating blocks and filling blocks can be done in reverse order
   and probably we would do that just to get rid of garbage in files after a
   crash */

	/* Copy data from user-supplied buffer to file's pages */
	res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf);
	if ( res ) {
	    reiserfs_unprepare_pages(prepared_pages, num_pages);
	    break;
	}

	/* Send the pages to disk and unlock them. */
	res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
	                                            write_bytes,prepared_pages);
	if ( res )
	    break;

	already_written += write_bytes;
	buf += write_bytes;
	*ppos = pos += write_bytes;
	count -= write_bytes;
	balance_dirty_pages_ratelimited(inode->i_mapping);
    }

    /* this is only true on error */
    if (th.t_trans_id) {
        reiserfs_write_lock(inode->i_sb);
        err = journal_end(&th, th.t_super, th.t_blocks_allocated);
        reiserfs_write_unlock(inode->i_sb);
        if (err) {
            res = err;
            goto out;
        }
    }

    if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
	res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);

    up(&inode->i_sem);
    reiserfs_async_progress_wait(inode->i_sb);
    return (already_written != 0)?already_written:res;

out:
    up(&inode->i_sem); // unlock the file on exit.
    return res;
}

static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf,
			       size_t count, loff_t pos)
{
    return generic_file_aio_write(iocb, buf, count, pos);
}



struct file_operations reiserfs_file_operations = {
    .read	= generic_file_read,
    .write	= reiserfs_file_write,
    .ioctl	= reiserfs_ioctl,
    .mmap	= generic_file_mmap,
    .release	= reiserfs_file_release,
    .fsync	= reiserfs_sync_file,
    .sendfile	= generic_file_sendfile,
    .aio_read   = generic_file_aio_read,
    .aio_write  = reiserfs_aio_write,
};


struct  inode_operations reiserfs_file_inode_operations = {
    .truncate	= reiserfs_vfs_truncate_file,
    .setattr    = reiserfs_setattr,
    .setxattr   = reiserfs_setxattr,
    .getxattr   = reiserfs_getxattr,
    .listxattr  = reiserfs_listxattr,
    .removexattr = reiserfs_removexattr,
    .permission = reiserfs_permission,
};