/* file.c, revision fa385bef256077f3b820b241e8f3755ef3905b74 */
/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>

/*
** We pack the tails of files on file close, not at the time they are written.
** This implies an unnecessary copy of the tail and an unnecessary indirect item
** insertion/balancing, for files that are written in one write.
** It avoids unnecessary tail packings (balances) for files that are written in
** multiple writes and are small enough to have tails.
**
** file_release is called by the VFS layer when the file is closed.  If
** this is the last open file descriptor, and the file is
** small enough to have a tail, and the tail is currently in an
** unformatted node, the tail is converted back into a direct item.
**
** We use reiserfs_truncate_file to pack the tail, since it already has
** all the conditions coded.
*/
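/*
** A sketch of the intended effect (made-up numbers, assuming a 4k
** blocksize): a file that grows to ~600 bytes through many small appends
** keeps its tail as an unformatted node while it is open, and only the
** last close pays for a single indirect2direct conversion, instead of
** one tail pack per write.
*/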
static int reiserfs_file_release(struct inode *inode, struct file *filp)
{

	struct reiserfs_transaction_handle th;
	int err;
	int jbegin_failure = 0;

	if (!S_ISREG(inode->i_mode))
		BUG();

	/* fast out for when nothing needs to be done */
	if ((atomic_read(&inode->i_count) > 1 ||
	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
	     !tail_has_to_be_packed(inode)) &&
	    REISERFS_I(inode)->i_prealloc_count <= 0) {
		return 0;
	}

	reiserfs_write_lock(inode->i_sb);
	mutex_lock(&inode->i_mutex);
	/* freeing preallocation only involves relogging blocks that
	 * are already in the current transaction.  preallocation gets
	 * freed at the end of each transaction, so it is impossible for
	 * us to log any additional blocks (including quota blocks)
	 */
	err = journal_begin(&th, inode->i_sb, 1);
	if (err) {
		/* uh oh, we can't allow the inode to go away while there
		 * are still preallocation blocks pending.  Try to join the
		 * aborted transaction
		 */
		jbegin_failure = err;
		err = journal_join_abort(&th, inode->i_sb, 1);

		if (err) {
			/* hmpf, our choices here aren't good.  We can pin the inode
			 * which will disallow unmount from ever happening, we can
			 * do nothing, which will corrupt random memory on unmount,
			 * or we can forcibly remove the file from the preallocation
			 * list, which will leak blocks on disk.  Let's pin the inode
			 * and let the admin know what is going on.
			 */
			igrab(inode);
			reiserfs_warning(inode->i_sb,
					 "pinning inode %lu because the "
					 "preallocation can't be freed",
					 inode->i_ino);
			goto out;
		}
	}
	reiserfs_update_inode_transaction(inode);

#ifdef REISERFS_PREALLOCATE
	reiserfs_discard_prealloc(&th, inode);
#endif
	err = journal_end(&th, inode->i_sb, 1);

	/* copy back the error code from journal_begin */
	if (!err)
		err = jbegin_failure;

	if (!err && atomic_read(&inode->i_count) <= 1 &&
	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
	    tail_has_to_be_packed(inode)) {
		/* if regular file is released by last holder and it has been
		   appended (we append by unformatted node only) or its direct
		   item(s) had to be converted, then it may have to be
		   indirect2direct converted */
		err = reiserfs_truncate_file(inode, 0);
	}
      out:
	mutex_unlock(&inode->i_mutex);
	reiserfs_write_unlock(inode->i_sb);
	return err;
}

static void reiserfs_vfs_truncate_file(struct inode *inode)
{
	reiserfs_truncate_file(inode, 1);
}

/* Sync a reiserfs file. */

/*
 * FIXME: sync_mapping_buffers() never has anything to sync.  Can
 * be removed...
 */

static int reiserfs_sync_file(struct file *p_s_filp,
			      struct dentry *p_s_dentry, int datasync)
{
	struct inode *p_s_inode = p_s_dentry->d_inode;
	int n_err;
	int barrier_done;

	if (!S_ISREG(p_s_inode->i_mode))
		BUG();
	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
	reiserfs_write_lock(p_s_inode->i_sb);
	barrier_done = reiserfs_commit_for_inode(p_s_inode);
	reiserfs_write_unlock(p_s_inode->i_sb);
	if (barrier_done != 1)
		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
	if (barrier_done < 0)
		return barrier_done;
	return (n_err < 0) ? -EIO : 0;
}

/* I really do not want to play with memory shortage right now, so
   to simplify the code, we are not going to write more than this many pages at
   a time. This still should considerably improve performance compared to the
   4k-at-a-time case. This is 32 pages of 4k size. */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
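/*
 * A quick sanity check of the arithmetic above (a sketch, assuming the
 * common PAGE_CACHE_SIZE of 4096; other page sizes scale the same way):
 *
 *	(128 * 1024) / 4096 == 32 pages per batch
 *
 * so one pass of the write loop below touches at most 128k of file data.
 * The parentheses keep the macro safe if it is ever used inside a larger
 * expression.
 */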

/* Allocates blocks for a file to fulfil write request.
   Maps all unmapped but prepared pages from the list.
   Updates metadata with newly allocated blocknumbers as needed */
static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th,
					       struct inode *inode,	/* Inode we work with */
					       loff_t pos,	/* Writing position */
					       int num_pages,	/* number of pages the write
								   is going to touch */
					       int write_bytes,	/* amount of bytes to write */
					       struct page **prepared_pages,	/* array of
										   prepared pages
										 */
					       int blocks_to_allocate	/* Amount of blocks we
									   need to allocate to
									   fit the data into
									   the file
									 */
    )
{
	struct cpu_key key;	// cpu key of item that we are going to deal with
	struct item_head *ih;	// pointer to item head that we are going to deal with
	struct buffer_head *bh;	// Buffer head that contains items that we are going to deal with
	__le32 *item;		// pointer to item we are going to deal with
	INITIALIZE_PATH(path);	// path to the item that we are going to deal with.
	b_blocknr_t *allocated_blocks;	// Pointer to a place where allocated blocknumbers would be stored.
	reiserfs_blocknr_hint_t hint;	// hint structure for block allocator.
	int res;		// return value of various functions that we call
				// (int, not size_t: it must be able to hold negative error codes).
	int curr_block;		// current block used to keep track of unmapped blocks.
	int i;			// loop counter
	int itempos;		// position in item
	unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));	// writing position in first page
	unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;	/* last modified byte offset in last page */
	__u64 hole_size;	// amount of blocks for a file hole, if it needs to be created.
	int modifying_this_item = 0;	// Flag for the item traversal code: tracks whether
					// we already prepared the current block for the
					// journal
	int will_prealloc = 0;
	RFALSE(!blocks_to_allocate,
	       "green-9004: tried to allocate zero blocks?");

	/* only preallocate if this is a small write */
	if (REISERFS_I(inode)->i_prealloc_count ||
	    (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
	     blocks_to_allocate <
	     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
		will_prealloc =
		    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;

	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
				   sizeof(b_blocknr_t), GFP_NOFS);
	if (!allocated_blocks)
		return -ENOMEM;

	/* First we compose a key to point at the writing position; we want to do
	   that outside of any locking region. */
	make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );
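	/*
	 * The "+ 1" above reflects that reiserfs keys address file bodies
	 * with 1-based byte offsets, so the byte at file position 0 lives at
	 * key offset 1.  Illustration (a sketch, 4k blocksize): a write at
	 * pos == 8192 is looked up at key offset 8193, the first byte of the
	 * third file block.
	 */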

	/* If we came here, it means we absolutely need to open a transaction,
	   since we need to allocate some blocks */
	reiserfs_write_lock(inode->i_sb);	// Journaling stuff, and we need that.
	res = journal_begin(th, inode->i_sb,
			    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
			    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));	// Wish I knew if this number is enough
	if (res)
		goto error_exit;
	reiserfs_update_inode_transaction(inode);

	/* Look for the in-tree position of our write, need path for block allocator */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit;
	}

	/* Allocate blocks */
	/* First fill in "hint" structure for block allocator */
	hint.th = th;		// transaction handle.
	hint.path = &path;	// Path, so that block allocator can determine packing locality or whatever it needs to determine.
	hint.inode = inode;	// Inode is needed by block allocator too.
	hint.search_start = 0;	// We have no hint on where to search free blocks for block allocator.
	hint.key = key.on_disk_key;	// on disk key of file.
	hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);	// Number of disk blocks this file occupies already.
	hint.formatted_node = 0;	// We are allocating blocks for unformatted node.
	hint.preallocate = will_prealloc;

	/* Call block allocator to allocate blocks */
	res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
					 blocks_to_allocate, blocks_to_allocate);
	if (res != CARRY_ON) {
		if (res == NO_DISK_SPACE) {
			/* We flush the transaction in case of no space. This way some
			   blocks might become free */
			SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
			res = restart_transaction(th, inode, &path);
			if (res)
				goto error_exit;

			/* We might have scheduled, so search again */
			res = search_for_position_by_key(inode->i_sb, &key, &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit;
			}

			/* update changed info for hint structure. */
			res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
							 blocks_to_allocate,
							 blocks_to_allocate);
			if (res != CARRY_ON) {
				res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
				pathrelse(&path);
				goto error_exit;
			}
		} else {
			res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
			pathrelse(&path);
			goto error_exit;
		}
	}
#ifdef __BIG_ENDIAN
	// Too bad, I have not found any way to convert a given region from
	// cpu format to little endian format
	{
		int i;
		for (i = 0; i < blocks_to_allocate; i++)
			allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
	}
#endif

	/* Block allocation might have scheduled and the tree might have changed,
	   let's search the tree again */
	/* find where in the tree our write should go */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit_free_blocks;
	}

	bh = get_last_bh(&path);	// Get a bufferhead for last element in path.
	ih = get_ih(&path);	// Get a pointer to last item head in path.
	item = get_item(&path);	// Get a pointer to last item in path

	/* Let's see what we have found */
	if (res != POSITION_FOUND) {	/* position not found, this means that we
					   might need to append file with holes
					   first */
		// Since we are writing past the file's end, we need to find out if
		// there is a hole that needs to be inserted before our writing
		// position, and how many blocks it is going to cover (we need to
		// populate pointers to file blocks representing the hole with zeros)

		{
			int item_offset = 1;
			/*
			 * if ih is stat data, its offset is 0 and we don't want to
			 * add 1 to pos in the hole_size calculation
			 */
			if (is_statdata_le_ih(ih))
				item_offset = 0;
			hole_size = (pos + item_offset -
				     (le_key_k_offset
				      (get_inode_item_key_version(inode),
				       &(ih->ih_key)) +
				      op_bytes_number(ih, inode->i_sb->s_blocksize)))
			    >> inode->i_sb->s_blocksize_bits;
		}
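		/*
		 * Worked example (a sketch with made-up numbers, 4k
		 * blocksize): suppose the last item is an indirect item
		 * covering bytes 1..4096 (key offset 1, op_bytes_number()
		 * == 4096) and we are writing at pos == 16384.  Then
		 *
		 *	hole_size = (16384 + 1 - (1 + 4096)) >> 12 == 3
		 *
		 * i.e. three blocks of zero pointers must be inserted before
		 * the write position.
		 */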

		if (hole_size > 0) {
			int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);	// How much data to insert first time.
			/* area filled with zeroes, to supply as list of zero blocknumbers.
			   We allocate it outside of the loop in case the loop spins for
			   several iterations. */
			char *zeros = kmalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
			if (!zeros) {
				res = -ENOMEM;
				goto error_exit_free_blocks;
			}
			memset(zeros, 0, to_paste * UNFM_P_SIZE);
			do {
				to_paste = min_t(__u64, hole_size,
						 MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
						 UNFM_P_SIZE);
				if (is_indirect_le_ih(ih)) {
					/* Ok, there is an existing indirect item already.
					   Need to append it */
					/* Calculate position past inserted item */
					make_cpu_key(&key, inode,
						     le_key_k_offset
						     (get_inode_item_key_version(inode),
						      &(ih->ih_key)) +
						     op_bytes_number(ih,
								     inode->i_sb->s_blocksize),
						     TYPE_INDIRECT, 3);
					res = reiserfs_paste_into_item(th, &path, &key,
								       inode, (char *)zeros,
								       UNFM_P_SIZE * to_paste);
					if (res) {
						kfree(zeros);
						goto error_exit_free_blocks;
					}
				} else if (is_statdata_le_ih(ih)) {
					/* No existing item, create it */
					/* item head for new item */
					struct item_head ins_ih;

					/* create a key for our new item */
					make_cpu_key(&key, inode, 1,
						     TYPE_INDIRECT, 3);

					/* Create new item head for our new item */
					make_le_item_head(&ins_ih, &key,
							  key.version, 1,
							  TYPE_INDIRECT,
							  to_paste * UNFM_P_SIZE,
							  0 /* free space */ );

					/* Find where such item should live in the tree */
					res = search_item(inode->i_sb, &key, &path);
					if (res != ITEM_NOT_FOUND) {
						/* item should not exist, otherwise we have error */
						if (res != -ENOSPC) {
							reiserfs_warning(inode->i_sb,
									 "green-9008: search_by_key (%K) returned %d",
									 &key, res);
						}
						res = -EIO;
						kfree(zeros);
						goto error_exit_free_blocks;
					}
					res = reiserfs_insert_item(th, &path,
								   &key, &ins_ih,
								   inode, (char *)zeros);
				} else {
					reiserfs_panic(inode->i_sb,
						       "green-9011: Unexpected key type %K\n",
						       &key);
				}
				if (res) {
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				/* Now we want to check if the transaction is too full, and
				   if it is we restart it. This will also free the path. */
				if (journal_transaction_should_end
				    (th, th->t_blocks_allocated)) {
					res = restart_transaction(th, inode, &path);
					if (res) {
						pathrelse(&path);
						kfree(zeros);
						goto error_exit;
					}
				}

				/* Well, need to recalculate path and stuff */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     (to_paste << inode->i_blkbits));
				res = search_for_position_by_key(inode->i_sb, &key, &path);
				if (res == IO_ERROR) {
					res = -EIO;
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				bh = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				hole_size -= to_paste;
			} while (hole_size);
			kfree(zeros);
		}
	}
	// Go through existing indirect items first
	// replace all zeroes with blocknumbers from list
	// Note that if no corresponding item was found by the previous search,
	// it means there is no existing in-tree representation for the file area
	// we are going to overwrite, so there is nothing to scan through for holes.
	for (curr_block = 0, itempos = path.pos_in_item;
	     curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
	      retry:

		if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
			/* We ran out of data in this indirect item, let's look for
			   another one. */
			/* First, if we are already modifying the current item, log it */
			if (modifying_this_item) {
				journal_mark_dirty(th, inode->i_sb, bh);
				modifying_this_item = 0;
			}
			/* Then set the key to look for a new indirect item (offset of
			   the old item plus the old item length) */
			set_cpu_key_k_offset(&key,
					     le_key_k_offset
					     (get_inode_item_key_version(inode),
					      &(ih->ih_key)) +
					     op_bytes_number(ih,
							     inode->i_sb->s_blocksize));
			/* Search for the position of the new key in the tree. */
			res = search_for_position_by_key(inode->i_sb, &key, &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit_free_blocks;
			}
			bh = get_last_bh(&path);
			ih = get_ih(&path);
			item = get_item(&path);
			itempos = path.pos_in_item;
			continue;	// loop to check all kinds of conditions and so on.
		}
		/* Ok, we have the correct position in the item now, so let's see if
		   it represents a file hole (blocknumber is zero) and fill it if
		   needed */
		if (!item[itempos]) {
			/* Ok, a hole. Now we need to check if we already prepared this
			   block to be journaled */
			while (!modifying_this_item) {	// loop until we succeed
				/* Well, this item is not journaled yet, so we must prepare
				   it for journal first, before we can change it */
				struct item_head tmp_ih;	// We copy the item head of the found item
				// here to detect if fs changed under
				// us while we were preparing for
				// journal.
				int fs_gen;	// We store fs generation here to find if someone
				// changes fs under our feet

				copy_item_head(&tmp_ih, ih);	// Remember itemhead
				fs_gen = get_generation(inode->i_sb);	// remember fs generation
				reiserfs_prepare_for_journal(inode->i_sb, bh, 1);	// Prepare a buffer within which indirect item is stored for changing.
				if (fs_changed(fs_gen, inode->i_sb)
				    && item_moved(&tmp_ih, &path)) {
					// Sigh, fs was changed under us, we need to look for new
					// location of item we are working with

					/* unmark prepared area as journaled and search for its
					   new position */
					reiserfs_restore_prepared_buffer(inode->i_sb, bh);
					res = search_for_position_by_key(inode->i_sb,
									 &key, &path);
					if (res == IO_ERROR) {
						res = -EIO;
						goto error_exit_free_blocks;
					}
					bh = get_last_bh(&path);
					ih = get_ih(&path);
					item = get_item(&path);
					itempos = path.pos_in_item;
					goto retry;
				}
				modifying_this_item = 1;
			}
			item[itempos] = allocated_blocks[curr_block];	// Assign new block
			curr_block++;
		}
		itempos++;
	}

	if (modifying_this_item) {	// We need to log the last-accessed block, if it
		// was modified, but not logged yet.
		journal_mark_dirty(th, inode->i_sb, bh);
	}

	if (curr_block < blocks_to_allocate) {
		// Oh well, we need to append to an indirect item, or to create one
		// if there wasn't any
		if (is_indirect_le_ih(ih)) {
			// Existing indirect item - append. First calculate key for append
			// position. We do not need to recalculate path as it should
			// already point to correct place.
			make_cpu_key(&key, inode,
				     le_key_k_offset(get_inode_item_key_version(inode),
						     &(ih->ih_key)) +
				     op_bytes_number(ih, inode->i_sb->s_blocksize),
				     TYPE_INDIRECT, 3);
			res = reiserfs_paste_into_item(th, &path, &key, inode,
						       (char *)(allocated_blocks + curr_block),
						       UNFM_P_SIZE *
						       (blocks_to_allocate - curr_block));
			if (res) {
				goto error_exit_free_blocks;
			}
		} else if (is_statdata_le_ih(ih)) {
			// Last found item was statdata. That means we need to create an
			// indirect item.
			struct item_head ins_ih;	/* itemhead for new item */

			/* create a key for our new item */
			make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3);	// Position one,
			// because that's where the first indirect item begins
			/* Create new item head for our new item */
			make_le_item_head(&ins_ih, &key, key.version, 1,
					  TYPE_INDIRECT,
					  (blocks_to_allocate - curr_block) * UNFM_P_SIZE,
					  0 /* free space */ );
			/* Find where such item should live in the tree */
			res = search_item(inode->i_sb, &key, &path);
			if (res != ITEM_NOT_FOUND) {
				/* Well, if we have found such an item already, or some
				   error occurred, we need to warn the user and return an
				   error */
				if (res != -ENOSPC) {
					reiserfs_warning(inode->i_sb,
							 "green-9009: search_by_key (%K) "
							 "returned %d", &key, res);
				}
				res = -EIO;
				goto error_exit_free_blocks;
			}
			/* Insert item into the tree with the data as its body */
			res = reiserfs_insert_item(th, &path, &key, &ins_ih,
						   inode,
						   (char *)(allocated_blocks + curr_block));
		} else {
			reiserfs_panic(inode->i_sb,
				       "green-9010: unexpected item type for key %K\n",
				       &key);
		}
	}
	// The caller is responsible for closing the transaction
	// unless we return an error.  They are also responsible for logging
	// the inode.
	//
	pathrelse(&path);
	/*
	 * cleanup preallocation from previous writes
	 * if this is a partial block write
	 */
	if (write_bytes & (inode->i_sb->s_blocksize - 1))
		reiserfs_discard_prealloc(th, inode);
	reiserfs_write_unlock(inode->i_sb);

	// go through all the pages/buffers and map the buffers to newly allocated
	// blocks (so that the system knows where to write these pages later).
	curr_block = 0;
	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];	// current page
		struct buffer_head *head = page_buffers(page);	// first buffer for a page
		int block_start, block_end;	// in-page offsets for buffers.

		if (!page_buffers(page))
			reiserfs_panic(inode->i_sb,
				       "green-9005: No buffers for prepared page???");

		/* For each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9006: Allocated but absent buffer for a page?");
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;
			if (i == num_pages - 1 && block_start >= to)
				/* If this buffer is after requested data to map, abort
				   processing of current page */
				break;

			if (!buffer_mapped(bh)) {	// Ok, unmapped buffer, need to map it
				map_bh(bh, inode->i_sb,
				       le32_to_cpu(allocated_blocks[curr_block]));
				curr_block++;
				set_buffer_new(bh);
			}
		}
	}

	RFALSE(curr_block > blocks_to_allocate,
	       "green-9007: Used too many blocks? weird");

	kfree(allocated_blocks);
	return 0;

// Need to deal with transaction here.
      error_exit_free_blocks:
	pathrelse(&path);
	// free blocks
	for (i = 0; i < blocks_to_allocate; i++)
		reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
				    1);

      error_exit:
	if (th->t_trans_id) {
		int err;
		// update any changes we made to blk count
		mark_inode_dirty(inode);
		err = journal_end(th, inode->i_sb,
				  JOURNAL_PER_BALANCE_CNT * 3 + 1 +
				  2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
		if (err)
			res = err;
	}
	reiserfs_write_unlock(inode->i_sb);
	kfree(allocated_blocks);

	return res;
}

/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
static void reiserfs_unprepare_pages(struct page **prepared_pages,	/* list of locked pages */
				     size_t num_pages /* amount of pages */ )
{
	int i;			// loop counter

	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];

		try_to_free_buffers(page);
		unlock_page(page);
		page_cache_release(page);
	}
}

/* This function will copy data from userspace to specified pages within
   supplied byte range */
static int reiserfs_copy_from_user_to_file_region(loff_t pos,	/* In-file position */
						  int num_pages,	/* Number of pages affected */
						  int write_bytes,	/* Amount of bytes to write */
						  struct page **prepared_pages,	/* pointer to
										   array of
										   prepared pages
										 */
						  const char __user * buf	/* Pointer to user-supplied
										   data */
    )
{
	long page_fault = 0;	// status of copy_from_user.
	int i;			// loop counter.
	int offset;		// offset in page

	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
	     i++, offset = 0) {
		size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
		struct page *page = prepared_pages[i];	// Current page we process.

		fault_in_pages_readable(buf, count);

		/* Copy data from userspace to the current page */
		kmap(page);
		page_fault = __copy_from_user(page_address(page) + offset, buf, count);	// Copy the data.
		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
		kunmap(page);
		buf += count;
		write_bytes -= count;

		if (page_fault)
			break;	// Was there a fault? abort.
	}

	return page_fault ? -EFAULT : 0;
}

/* taken from fs/buffer.c:__block_commit_write */
int reiserfs_commit_page(struct inode *inode, struct page *page,
			 unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;
	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
	int new;
	int logit = reiserfs_file_data_log(inode);
	struct super_block *s = inode->i_sb;
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	struct reiserfs_transaction_handle th;
	int ret = 0;

	th.t_trans_id = 0;
	blocksize = 1 << inode->i_blkbits;

	if (logit) {
		reiserfs_write_lock(s);
		ret = journal_begin(&th, s, bh_per_page + 1);
		if (ret)
			goto drop_write_lock;
		reiserfs_update_inode_transaction(inode);
	}
	for (bh = head = page_buffers(page), block_start = 0;
	     bh != head || !block_start;
	     block_start = block_end, bh = bh->b_this_page) {

		new = buffer_new(bh);
		clear_buffer_new(bh);
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			if (logit) {
				reiserfs_prepare_for_journal(s, bh, 1);
				journal_mark_dirty(&th, s, bh);
			} else if (!buffer_dirty(bh)) {
				mark_buffer_dirty(bh);
				/* do data=ordered on any page past the end
				 * of file and any buffer marked BH_New.
				 */
				if (reiserfs_data_ordered(inode->i_sb) &&
				    (new || page->index >= i_size_index)) {
					reiserfs_add_ordered_list(inode, bh);
				}
			}
		}
	}
	if (logit) {
		ret = journal_end(&th, s, bh_per_page + 1);
	      drop_write_lock:
		reiserfs_write_unlock(s);
	}
	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return ret;
}

/* Submit pages for write. This was separated from actual file copying
   because we might want to allocate block numbers in-between.
   This function assumes that the caller will adjust the file size to the
   correct value. */
static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th,
						 struct inode *inode,
						 loff_t pos,	/* Writing position offset */
						 size_t num_pages,	/* Number of pages to write */
						 size_t write_bytes,	/* number of bytes to write */
						 struct page **prepared_pages	/* list of pages */
    )
{
	int status;		// return status of block_commit_write.
	int retval = 0;		// Return value we are going to return.
	int i;			// loop counter
	int offset;		// Writing offset in page.
	int orig_write_bytes = write_bytes;
	int sd_update = 0;

	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
	     i++, offset = 0) {
		int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
		struct page *page = prepared_pages[i];	// Current page we process.

		status = reiserfs_commit_page(inode, page, offset, offset + count);
		if (status)
			retval = status;	// To not overcomplicate matters we are going to
		// submit all the pages even if there was an error.
		// We only remember the error status to report it on
		// exit.
		write_bytes -= count;
	}
	/* now that we've gotten all the ordered buffers marked dirty,
	 * we can safely update i_size and close any running transaction
	 */
	if (pos + orig_write_bytes > inode->i_size) {
		inode->i_size = pos + orig_write_bytes;	// Set new size
		/* If the file has grown so much that tail packing is no
		 * longer possible, reset the "need to pack" flag */
		if ((have_large_tails(inode->i_sb) &&
		     inode->i_size > i_block_size(inode) * 4) ||
		    (have_small_tails(inode->i_sb) &&
		     inode->i_size > i_block_size(inode)))
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
		else if ((have_large_tails(inode->i_sb) &&
			  inode->i_size < i_block_size(inode) * 4) ||
			 (have_small_tails(inode->i_sb) &&
			  inode->i_size < i_block_size(inode)))
			REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;

		if (th->t_trans_id) {
			reiserfs_write_lock(inode->i_sb);
			// this sets the proper flags for O_SYNC to trigger a commit
			mark_inode_dirty(inode);
			reiserfs_write_unlock(inode->i_sb);
		} else
			mark_inode_dirty(inode);

		sd_update = 1;
	}
	if (th->t_trans_id) {
		reiserfs_write_lock(inode->i_sb);
		if (!sd_update)
			mark_inode_dirty(inode);
		status = journal_end(th, th->t_super, th->t_blocks_allocated);
		if (status)
			retval = status;
		reiserfs_write_unlock(inode->i_sb);
	}
	th->t_trans_id = 0;

	/*
	 * we have to unlock the pages after updating i_size, otherwise
	 * we race with writepage
	 */
	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];
		unlock_page(page);
		mark_page_accessed(page);
		page_cache_release(page);
	}
	return retval;
}

/* Look if the passed writing region is going to touch the file's tail
   (if it is present). And if it is, convert the tail to an unformatted node */
static int reiserfs_check_for_tail_and_convert(struct inode *inode,	/* inode to deal with */
					       loff_t pos,	/* Writing position */
					       int write_bytes	/* amount of bytes to write */
    )
{
	INITIALIZE_PATH(path);	// needed for search_for_position
	struct cpu_key key;	// Key that would represent last touched writing byte.
	struct item_head *ih;	// item header of found block;
	int res;		// Return value of various functions we call.
	int cont_expand_offset;	// We will put offset for generic_cont_expand here
	// This can be int just because tails are created
	// only for small files.

/* this embodies a dependency on a particular tail policy */
	if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
		/* such big files do not have tails, so we won't bother ourselves
		   to look for tails, simply return */
		return 0;
	}
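	/* (With the common 4k blocksize this cutoff is 16384 bytes; it is the
	   same four-block have_large_tails() threshold that the
	   i_pack_on_close logic in reiserfs_submit_file_region_for_write()
	   above keys off.) */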

	reiserfs_write_lock(inode->i_sb);
	/* find the item containing the last byte to be written, or if
	 * writing past the end of the file then the last item of the
	 * file (and then we check its type). */
	make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
		     3 /*key length */ );
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		reiserfs_write_unlock(inode->i_sb);
		return -EIO;
	}
	ih = get_ih(&path);
	res = 0;
	if (is_direct_le_ih(ih)) {
		/* Ok, closest item is file tail (tails are stored in "direct"
		 * items), so we need to unpack it. */
		/* To not overcomplicate matters, we just call generic_cont_expand
		   which will in turn call other stuff and finally will boil down to
		   reiserfs_get_block() that would do the necessary conversion. */
		cont_expand_offset =
		    le_key_k_offset(get_inode_item_key_version(inode),
				    &(ih->ih_key));
		pathrelse(&path);
		res = generic_cont_expand(inode, cont_expand_offset);
	} else
		pathrelse(&path);

	reiserfs_write_unlock(inode->i_sb);
	return res;
}

/* This function locks pages starting from @pos for @inode.
   @num_pages pages are locked and stored in the
   @prepared_pages array. Also buffers are allocated for these pages.
   The first and last pages of the region are read in if they are only
   partially overwritten. If the last page did not exist before the write
   (file hole or file append), it is zeroed instead.
   Returns the number of unallocated blocks that should be allocated to cover
   new file data. */
static int reiserfs_prepare_file_region_for_write(struct inode *inode
						  /* Inode of the file */ ,
						  loff_t pos,	/* position in the file */
						  size_t num_pages,	/* number of pages to
									   prepare */
						  size_t write_bytes,	/* Amount of bytes to be
									   overwritten from
									   @pos */
						  struct page **prepared_pages	/* pointer to array
										   where to store
										   prepared pages */
    )
{
	int res = 0;		// Return values of different functions we call.
	unsigned long index = pos >> PAGE_CACHE_SHIFT;	// Offset in file in pages.
	int from = (pos & (PAGE_CACHE_SIZE - 1));	// Writing offset in first page
	int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
	/* offset of last modified byte in last
	   page */
	struct address_space *mapping = inode->i_mapping;	// Pages are mapped here.
	int i;			// Simple counter
	int blocks = 0;		/* Return value (blocks that should be allocated) */
	struct buffer_head *bh, *head;	// Current bufferhead and first bufferhead
	// of a page.
	unsigned block_start, block_end;	// Starting and ending offsets of current
	// buffer in the page.
	struct buffer_head *wait[2], **wait_bh = wait;	// Buffers for the page, if the
	// page appeared to be not up
	// to date. Note how we have
	// at most 2 buffers: this is
	// because we can at most
	// partially overwrite two
	// buffers for one page, one at
	// the beginning of the write area
	// and one at the end.
	// Everything in the middle gets
	// overwritten totally.

	struct cpu_key key;	// cpu key of item that we are going to deal with
	struct item_head *ih = NULL;	// pointer to item head that we are going to deal with
	struct buffer_head *itembuf = NULL;	// Buffer head that contains items that we are going to deal with
	INITIALIZE_PATH(path);	// path to the item that we are going to deal with.
	__le32 *item = NULL;	// pointer to item we are going to deal with
	int item_pos = -1;	/* Position in indirect item */

	if (num_pages < 1) {
		reiserfs_warning(inode->i_sb,
				 "green-9001: reiserfs_prepare_file_region_for_write "
				 "called with zero number of pages to process");
		return -EFAULT;
	}

	/* We have 2 loops for pages. In the first loop we grab and lock the pages,
	   so that nobody would touch these until we release the pages. Then
	   we'd start to deal with mapping buffers to blocks. */
	for (i = 0; i < num_pages; i++) {
		prepared_pages[i] = grab_cache_page(mapping, index + i);	// locks the page
		if (!prepared_pages[i]) {
			res = -ENOMEM;
			goto failed_page_grabbing;
		}
		if (!page_has_buffers(prepared_pages[i]))
			create_empty_buffers(prepared_pages[i],
					     inode->i_sb->s_blocksize, 0);
	}

	/* Let's count the number of blocks for the case where all the blocks
	   overwritten are new (we will subtract already allocated blocks later) */
	if (num_pages > 2)
		/* These are fully overwritten pages, so all the blocks in
		   these pages are counted as needed to be allocated */
		blocks =
		    (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	/* count blocks needed for first page (possibly partially written) */
	blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));	/* roundup */

	/* Now we account for the last page. If the last page == first page (we
	   overwrite only one page), we subtract all the blocks past the
	   last writing position in a page out of the already calculated number
	   of blocks */
	blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
	    ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
	/* Note how we do not round up here, since partial blocks still
	   should be allocated */
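	/* Worked example of the three additions above (a sketch with made-up
	   numbers: 4k pages, 1k blocks, pos == 5000, write_bytes == 10000, so
	   from == 904, to == 2712 and num_pages == 3):
	     middle pages:  (3 - 2) << 2                     == 4 blocks
	     first page:    ((4096 - 904) >> 10) + 1         == 4 blocks (rounded up)
	     last page:     (1 << 2) - ((4096 - 2712) >> 10) == 3 blocks
	   giving 11 blocks in total before the already-mapped ones are
	   subtracted below. */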

	/* Now if all the write area lies past the file end, there is no point in
	   mapping blocks, since there are none, so we just zero out the remaining
	   parts of the first and last pages in the write area (if needed) */
	if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
		if (from != 0) {	/* First page needs to be partially zeroed */
			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
			memset(kaddr, 0, from);
			kunmap_atomic(kaddr, KM_USER0);
		}
		if (to != PAGE_CACHE_SIZE) {	/* Last page needs to be partially zeroed */
			char *kaddr =
			    kmap_atomic(prepared_pages[num_pages - 1],
					KM_USER0);
			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
			kunmap_atomic(kaddr, KM_USER0);
		}

		/* Since all blocks are new - use already calculated value */
		return blocks;
	}

	/* Well, since we write somewhere into the middle of a file, there is a
	   possibility we are writing over some already allocated blocks, so
	   let's map these blocks and subtract the number of such blocks out of
	   the blocks we need to allocate (calculated above) */
	/* Mask write position to start on blocksize, we do it out of the
	   loop for performance reasons */
	pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
	/* Set cpu key to the starting position in a file (on left block boundary) */
	make_cpu_key(&key, inode,
		     1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
		     TYPE_ANY, 3 /*key length */ );

	reiserfs_write_lock(inode->i_sb);	// We need that for at least search_by_key()
	for (i = 0; i < num_pages; i++) {

		head = page_buffers(prepared_pages[i]);
		/* For each buffer in the page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;

			if (i == num_pages - 1 && block_start >= to) {
				/* If this buffer is after requested data to map, abort
				   processing of current page */
				break;
			}

			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
				/* This is an optimisation for the case where the buffer is
				   mapped and has a blocknumber assigned. If a significant
				   number of such buffers are present, we may avoid some
				   search_by_key calls.
				   Probably it would be possible to move parts of this code
				   out of BKL, but I'm afraid that would overcomplicate the
				   code without any noticeable benefit.
				 */
				item_pos++;
				/* Update the key */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     inode->i_sb->s_blocksize);
				blocks--;	// Decrease the number of blocks that need to be
				// allocated
				continue;	// Go to the next buffer
			}

			if (!itembuf ||	/* if first iteration */
			    item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {	/* or if we progressed past the
									   current unformatted_item */
				/* Try to find next item */
				res = search_for_position_by_key(inode->i_sb,
								 &key, &path);
				/* Abort if no more items */
				if (res != POSITION_FOUND) {
					/* make sure later loops don't use this item */
					itembuf = NULL;
					item = NULL;
					break;
				}

				/* Update information about current indirect item */
				itembuf = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				item_pos = path.pos_in_item;

				RFALSE(!is_indirect_le_ih(ih),
				       "green-9003: indirect item expected");
			}

			/* See if there is some block associated with the file
			   at that position, map the buffer to this block */
			if (get_block_num(item, item_pos)) {
				map_bh(bh, inode->i_sb,
				       get_block_num(item, item_pos));
				blocks--;	// Decrease the number of blocks that need to be
				// allocated
			}
			item_pos++;
			/* Update the key */
			set_cpu_key_k_offset(&key,
					     cpu_key_k_offset(&key) +
					     inode->i_sb->s_blocksize);
		}
	}
	pathrelse(&path);	// Free the path
	reiserfs_write_unlock(inode->i_sb);

	/* Now zero out unmapped buffers for the first and last pages of the
	   write area, or issue read requests if the page is mapped. */
	/* First page, see if it is not uptodate */
	if (!PageUptodate(prepared_pages[0])) {
		head = page_buffers(prepared_pages[0]);

		/* For each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;
			if (block_start < from) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue a READ request for it
								   to not lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero it */
					char *kaddr =
					    kmap_atomic(prepared_pages[0],
							KM_USER0);
					memset(kaddr + block_start, 0,
					       from - block_start);
					kunmap_atomic(kaddr, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
	if (!PageUptodate(prepared_pages[num_pages - 1]) ||
	    ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
	    (inode->i_size >> PAGE_CACHE_SHIFT)) {
		head = page_buffers(prepared_pages[num_pages - 1]);

		/* for each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_start >= to)
				/* if this buffer is after requested data to map, skip it */
				break;
			if (block_end > to) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue a READ request for it
								   to not lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero it */
					char *kaddr =
					    kmap_atomic(prepared_pages
							[num_pages - 1],
							KM_USER0);
					memset(kaddr + to, 0, block_end - to);
					kunmap_atomic(kaddr, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Wait for read requests we made to happen, if necessary */
	while (wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh)) {
			res = -EIO;
			goto failed_read;
		}
	}

	return blocks;
      failed_page_grabbing:
	num_pages = i;
      failed_read:
	reiserfs_unprepare_pages(prepared_pages, num_pages);
	return res;
}

/* Write @count bytes at position @ppos in a file indicated by @file
   from the buffer @buf.

   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
   written for (ext2/3).  This is for several reasons:

   * It has no understanding of any filesystem specific optimizations.

   * It enters the filesystem repeatedly for each page that is written.

   * It depends on the reiserfs_get_block() function which, as implemented by reiserfs, performs a costly search_by_key
     operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
     to reiserfs, which allows for fewer tree traversals.

   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.

   * Asking the block allocation code for blocks one at a time is slightly less efficient.

   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
   use it, but we were in a hurry to make the code freeze, and so it couldn't be revised then.  This new code should
   make things right finally.

   Future Features: providing search_by_key with hints.

*/
static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
				   const char __user * buf,	/* pointer to user supplied data
								   (in userspace) */
				   size_t count,	/* amount of bytes to write */
				   loff_t * ppos	/* pointer to position in file that we start writing at. Should be updated to
							 * new current position before returning. */
				   )
{
	size_t already_written = 0;	// Number of bytes already written to the file.
	loff_t pos;		// Current position in the file.
	ssize_t res;		// return value of various functions that we call.
	int err = 0;
	struct inode *inode = file->f_dentry->d_inode;	// Inode of the file that we are writing to.
	/* To simplify coding at this time, we store
	   locked pages in array for now */
	struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
	struct reiserfs_transaction_handle th;
	th.t_trans_id = 0;

	/* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
	 * lying around (most of the disk, in fact). Despite the filesystem
	 * now being a v3.6 format, the old items still can't support large
	 * file sizes. Catch this case here, as the rest of the VFS layer is
	 * oblivious to the different limitations between old and new items.
	 * reiserfs_setattr catches this for truncates. This chunk is lifted
	 * from generic_write_checks. */
	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
	    *ppos + count > MAX_NON_LFS) {
		if (*ppos >= MAX_NON_LFS) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		if (count > MAX_NON_LFS - (unsigned long)*ppos)
			count = MAX_NON_LFS - (unsigned long)*ppos;
	}

	if (file->f_flags & O_DIRECT) {	// Direct IO needs treatment
		ssize_t result, after_file_end = 0;
		if ((*ppos + count >= inode->i_size)
		    || (file->f_flags & O_APPEND)) {
			/* If we are appending a file, we need to put this savelink in here.
			   If we crash while doing direct io, finish_unfinished will
			   cut the garbage from the file end. */
			reiserfs_write_lock(inode->i_sb);
			err = journal_begin(&th, inode->i_sb,
					    JOURNAL_PER_BALANCE_CNT);
			if (err) {
				reiserfs_write_unlock(inode->i_sb);
				return err;
			}
			reiserfs_update_inode_transaction(inode);
			add_save_link(&th, inode, 1 /* Truncate */ );
			after_file_end = 1;
			err = journal_end(&th, inode->i_sb,
					  JOURNAL_PER_BALANCE_CNT);
			reiserfs_write_unlock(inode->i_sb);
			if (err)
				return err;
		}
		result = generic_file_write(file, buf, count, ppos);

		if (after_file_end) {	/* Now update i_size and remove the savelink */
			struct reiserfs_transaction_handle th;
			reiserfs_write_lock(inode->i_sb);
			err = journal_begin(&th, inode->i_sb, 1);
			if (err) {
				reiserfs_write_unlock(inode->i_sb);
				return err;
			}
			reiserfs_update_inode_transaction(inode);
			mark_inode_dirty(inode);
			err = journal_end(&th, inode->i_sb, 1);
			if (err) {
				reiserfs_write_unlock(inode->i_sb);
				return err;
			}
			err = remove_save_link(inode, 1 /* truncate */ );
			reiserfs_write_unlock(inode->i_sb);
			if (err)
				return err;
		}

		return result;
	}

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	mutex_lock(&inode->i_mutex);	// locks the entire file for just us

	pos = *ppos;

	/* Check if we can write to the specified region of the file, that the
	   file is not overly big, and this kind of stuff. Adjust pos and
	   count, if needed */
	res = generic_write_checks(file, &pos, &count, 0);
	if (res)
		goto out;

	if (count == 0)
		goto out;

	res = remove_suid(file->f_dentry);
	if (res)
		goto out;

	file_update_time(file);

	// Ok, we are done with all the checks.

	// Now we should start the real work

	/* If we are going to write past the file's packed tail or if we are going
	   to overwrite part of the tail, we need that tail to be converted into
	   an unformatted node */
	res = reiserfs_check_for_tail_and_convert(inode, pos, count);
	if (res)
		goto out;

	while (count > 0) {
		/* This is the main loop in which we run until some error occurs
		   or until we write all of the data. */
		size_t num_pages;	/* number of pages we are going to write this iteration */
		size_t write_bytes;	/* number of bytes to write during this iteration */
		ssize_t blocks_to_allocate;	/* how many blocks we need to allocate for this
						   iteration (signed, so an error return from
						   reiserfs_prepare_file_region_for_write() is
						   visible below) */

		/* (pos & (PAGE_CACHE_SIZE-1)) is an idiom for the offset of pos
		   into a page */
		num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) +	/* round up partial
									   pages */
		    ((count +
		      (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
		/* convert size to number of
		   pages */
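		/* A worked instance of the formula above (a sketch with
		   made-up numbers, 4k pages): pos == 5000, count == 10000
		   ends the write at byte 14999, so

			num_pages = !!(15000 & 4095) + ((10000 + 904) >> 12)
				  = 1 + 2 = 3

		   pages (bytes 4096..16383 of the file). */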
		reiserfs_write_lock(inode->i_sb);
		if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
		    || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
			/* If we were asked to write more data than we want to, or if
			   there is not that much space, then we shorten the amount of
			   data to write for this iteration. */
			num_pages = min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
					  reiserfs_can_fit_pages(inode->i_sb));
			/* Also we should not forget to set size in bytes accordingly */
			write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
			    (pos & (PAGE_CACHE_SIZE - 1));
			/* If the position is not on the
			   start of the page, we need
			   to subtract the offset
			   within page */
		} else
			write_bytes = count;

		/* reserve the blocks to be allocated later, so that later on
		   we still have the space to write the blocks to */
		reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
						      num_pages <<
						      (PAGE_CACHE_SHIFT -
						       inode->i_blkbits));
		reiserfs_write_unlock(inode->i_sb);

		if (!num_pages) {	/* If we do not have enough space even for a single page... */
			if (pos > inode->i_size + inode->i_sb->s_blocksize -
			    (pos & (inode->i_sb->s_blocksize - 1))) {
				res = -ENOSPC;
				break;	// In case we are writing past the end of the last file block, break.
			}
			// Otherwise we are possibly overwriting the file, so
			// let's set write size to be equal or less than blocksize.
			// This way we get it correctly for file holes.
			// But overwriting files on absolutely full volumes would not
			// be very efficient. Well, people are not supposed to fill
			// 100% of disk space anyway.
			write_bytes = min_t(size_t, count,
					    inode->i_sb->s_blocksize -
					    (pos & (inode->i_sb->s_blocksize - 1)));
			num_pages = 1;
			// No blocks were claimed before, so do it now.
			reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
							      1 <<
							      (PAGE_CACHE_SHIFT -
							       inode->i_blkbits));
		}

		/* Prepare for writing into the region, read in all the
		   partially overwritten pages, if needed. And lock the pages,
		   so that nobody else can access these until we are done.
		   We get the number of actual blocks needed as a result. */
		blocks_to_allocate =
		    reiserfs_prepare_file_region_for_write(inode, pos,
							   num_pages,
							   write_bytes,
							   prepared_pages);
		if (blocks_to_allocate < 0) {
			res = blocks_to_allocate;
			reiserfs_release_claimed_blocks(inode->i_sb,
							num_pages <<
							(PAGE_CACHE_SHIFT -
							 inode->i_blkbits));
			break;
		}

		/* First we correct our estimate of how many blocks we need */
		reiserfs_release_claimed_blocks(inode->i_sb,
						(num_pages <<
						 (PAGE_CACHE_SHIFT -
						  inode->i_sb->s_blocksize_bits)) -
						blocks_to_allocate);

		if (blocks_to_allocate > 0) {	/* We only allocate blocks if we need to */
			/* Fill in all the possible holes and append the file if needed */
			res =
			    reiserfs_allocate_blocks_for_region(&th, inode, pos,
								num_pages,
								write_bytes,
								prepared_pages,
								blocks_to_allocate);
		}

		/* well, we have allocated the blocks, so it is time to free
		   the reservation we made earlier. */
		reiserfs_release_claimed_blocks(inode->i_sb,
						blocks_to_allocate);
		if (res) {
			reiserfs_unprepare_pages(prepared_pages, num_pages);
			break;
		}

/* NOTE that allocating blocks and filling blocks can be done in reverse order
   and probably we would do that just to get rid of garbage in files after a
   crash */

		/* Copy data from user-supplied buffer to file's pages */
		res =
		    reiserfs_copy_from_user_to_file_region(pos, num_pages,
							   write_bytes,
							   prepared_pages, buf);
		if (res) {
			reiserfs_unprepare_pages(prepared_pages, num_pages);
			break;
		}

		/* Send the pages to disk and unlock them. */
		res =
		    reiserfs_submit_file_region_for_write(&th, inode, pos,
							  num_pages,
							  write_bytes,
							  prepared_pages);
		if (res)
			break;

		already_written += write_bytes;
		buf += write_bytes;
		*ppos = pos += write_bytes;
		count -= write_bytes;
		balance_dirty_pages_ratelimited(inode->i_mapping);
	}

	/* this is only true on error */
	if (th.t_trans_id) {
		reiserfs_write_lock(inode->i_sb);
		err = journal_end(&th, th.t_super, th.t_blocks_allocated);
		reiserfs_write_unlock(inode->i_sb);
		if (err) {
			res = err;
			goto out;
		}
	}

	if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
		res = generic_osync_inode(inode, file->f_mapping,
					  OSYNC_METADATA | OSYNC_DATA);

	mutex_unlock(&inode->i_mutex);
	reiserfs_async_progress_wait(inode->i_sb);
	return (already_written != 0) ? already_written : res;

      out:
	mutex_unlock(&inode->i_mutex);	// unlock the file on exit.
	return res;
}

static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
				  size_t count, loff_t pos)
{
	return generic_file_aio_write(iocb, buf, count, pos);
}

struct file_operations reiserfs_file_operations = {
	.read = generic_file_read,
	.write = reiserfs_file_write,
	.ioctl = reiserfs_ioctl,
	.mmap = generic_file_mmap,
	.release = reiserfs_file_release,
	.fsync = reiserfs_sync_file,
	.sendfile = generic_file_sendfile,
	.aio_read = generic_file_aio_read,
	.aio_write = reiserfs_aio_write,
};

struct inode_operations reiserfs_file_inode_operations = {
	.truncate = reiserfs_vfs_truncate_file,
	.setattr = reiserfs_setattr,
	.setxattr = reiserfs_setxattr,
	.getxattr = reiserfs_getxattr,
	.listxattr = reiserfs_listxattr,
	.removexattr = reiserfs_removexattr,
	.permission = reiserfs_permission,
};
1590