file.c revision 0ad74ffa90fb20b4132ae6e67e473f24621c6af2
/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>

/*
** We pack the tails of files on file close, not at the time they are written.
** This implies an unnecessary copy of the tail and an unnecessary indirect item
** insertion/balancing, for files that are written in one write.
** It avoids unnecessary tail packings (balances) for files that are written in
** multiple writes and are small enough to have tails.
**
** file_release is called by the VFS layer when the file is closed.  If
** this is the last open file descriptor, and the file is
** small enough to have a tail, and the tail is currently in an
** unformatted node, the tail is converted back into a direct item.
**
** We use reiserfs_truncate_file to pack the tail, since it already has
** all the conditions coded.
*/
static int reiserfs_file_release(struct inode *inode, struct file *filp)
{

	struct reiserfs_transaction_handle th;
	int err;
	int jbegin_failure = 0;

	if (!S_ISREG(inode->i_mode))
		BUG();

	/* fast out for when nothing needs to be done */
	if ((atomic_read(&inode->i_count) > 1 ||
	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
	     !tail_has_to_be_packed(inode)) &&
	    REISERFS_I(inode)->i_prealloc_count <= 0) {
		return 0;
	}

	reiserfs_write_lock(inode->i_sb);
	down(&inode->i_sem);
	/* freeing preallocation only involves relogging blocks that
	 * are already in the current transaction.  preallocation gets
	 * freed at the end of each transaction, so it is impossible for
	 * us to log any additional blocks (including quota blocks)
	 */
	err = journal_begin(&th, inode->i_sb, 1);
	if (err) {
		/* uh oh, we can't allow the inode to go away while there
		 * are still preallocated blocks pending.  Try to join the
		 * aborted transaction
		 */
		jbegin_failure = err;
		err = journal_join_abort(&th, inode->i_sb, 1);

		if (err) {
			/* hmpf, our choices here aren't good.  We can pin the inode
			 * which will disallow unmount from ever happening, we can
			 * do nothing, which will corrupt random memory on unmount,
			 * or we can forcibly remove the file from the preallocation
			 * list, which will leak blocks on disk.  Let's pin the inode
			 * and let the admin know what is going on.
			 */
			igrab(inode);
			reiserfs_warning(inode->i_sb,
					 "pinning inode %lu because the "
					 "preallocation can't be freed",
					 inode->i_ino);
			goto out;
		}
	}
	reiserfs_update_inode_transaction(inode);

#ifdef REISERFS_PREALLOCATE
	reiserfs_discard_prealloc(&th, inode);
#endif
	err = journal_end(&th, inode->i_sb, 1);

	/* copy back the error code from journal_begin */
	if (!err)
		err = jbegin_failure;

	if (!err && atomic_read(&inode->i_count) <= 1 &&
	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
	    tail_has_to_be_packed(inode)) {
		/* if regular file is released by last holder and it has been
		   appended (we append by unformatted node only) or its direct
		   item(s) had to be converted, then it may have to be
		   indirect2direct converted */
		err = reiserfs_truncate_file(inode, 0);
	}
      out:
	up(&inode->i_sem);
	reiserfs_write_unlock(inode->i_sb);
	return err;
}

static void reiserfs_vfs_truncate_file(struct inode *inode)
{
	reiserfs_truncate_file(inode, 1);
}

/* Sync a reiserfs file. */

/*
 * FIXME: sync_mapping_buffers() never has anything to sync.  Can
 * be removed...
 */

static int reiserfs_sync_file(struct file *p_s_filp,
			      struct dentry *p_s_dentry, int datasync)
{
	struct inode *p_s_inode = p_s_dentry->d_inode;
	int n_err;
	int barrier_done;

	if (!S_ISREG(p_s_inode->i_mode))
		BUG();
	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
	reiserfs_write_lock(p_s_inode->i_sb);
	barrier_done = reiserfs_commit_for_inode(p_s_inode);
	reiserfs_write_unlock(p_s_inode->i_sb);
	if (barrier_done != 1)
		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
	if (barrier_done < 0)
		return barrier_done;
	return (n_err < 0) ? -EIO : 0;
}

/* I really do not want to play with memory shortage right now, so
   to simplify the code, we are not going to write more than this many pages at
   a time. This still should considerably improve performance compared to the
   4k-at-a-time case. This is 32 pages of 4k size. */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
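/*
 * Illustrative arithmetic, assuming the common 4k PAGE_CACHE_SIZE:
 * (128 * 1024) / 4096 == 32, so one iteration of the write loop below
 * touches at most 32 pages (128k of file data); with 64k pages the same
 * macro yields 2 pages per iteration.
 */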

/* Allocates blocks for a file to fulfil write request.
   Maps all unmapped but prepared pages from the list.
   Updates metadata with newly allocated blocknumbers as needed */
static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode we work with */
					       loff_t pos,	/* Writing position */
					       int num_pages,	/* number of pages write going
								   to touch */
					       int write_bytes,	/* amount of bytes to write */
					       struct page **prepared_pages,	/* array of
										   prepared pages
										 */
					       int blocks_to_allocate	/* Amount of blocks we
									   need to allocate to
									   fit the data into file
									 */
    )
{
	struct cpu_key key;	// cpu key of item that we are going to deal with
	struct item_head *ih;	// pointer to item head that we are going to deal with
	struct buffer_head *bh;	// Buffer head that contains items that we are going to deal with
	__le32 *item;		// pointer to item we are going to deal with
	INITIALIZE_PATH(path);	// path to item that we are going to deal with.
	b_blocknr_t *allocated_blocks;	// Pointer to a place where allocated blocknumbers would be stored.
	reiserfs_blocknr_hint_t hint;	// hint structure for block allocator.
	int res;		// return value of various functions that we call.
	int curr_block;		// current block used to keep track of unmapped blocks.
	int i;			// loop counter
	int itempos;		// position in item
	unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));	// writing position in
	// first page
	unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;	/* last modified byte offset in last page */
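	/*
	 * Worked example (illustrative, assuming 4k pages): for pos == 5000
	 * and write_bytes == 3000, from == 5000 & 4095 == 904, and
	 * to == ((5000 + 3000 - 1) & 4095) + 1 == 3904, i.e. the write
	 * covers bytes 904..4095 of the first page and bytes 0..3903 of the
	 * last one.
	 */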
	__u64 hole_size;	// amount of blocks for a file hole, if it needs to be created.
	int modifying_this_item = 0;	// Flag for items traversal code to keep track
	// of the fact that we already prepared
	// current block for journal
	int will_prealloc = 0;
	RFALSE(!blocks_to_allocate,
	       "green-9004: tried to allocate zero blocks?");

	/* only preallocate if this is a small write */
	if (REISERFS_I(inode)->i_prealloc_count ||
	    (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
	     blocks_to_allocate <
	     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
		will_prealloc =
		    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;

	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
				   sizeof(b_blocknr_t), GFP_NOFS);
	if (!allocated_blocks)
		return -ENOMEM;

	/* First we compose a key to point at the writing position, we want to do
	   that outside of any locking region. */
	make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );
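	/* Note (illustrative): reiserfs item keys use 1-based byte offsets,
	   so the first byte of the file body is offset 1 in the key; hence
	   "pos + 1" above. */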

	/* If we came here, it means we absolutely need to open a transaction,
	   since we need to allocate some blocks */
	reiserfs_write_lock(inode->i_sb);	// Journaling stuff, and we need that.
	res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));	// Wish I knew if this number is enough
	if (res)
		goto error_exit;
	reiserfs_update_inode_transaction(inode);

	/* Look for the in-tree position of our write, need path for block allocator */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit;
	}

	/* Allocate blocks */
	/* First fill in "hint" structure for block allocator */
	hint.th = th;		// transaction handle.
	hint.path = &path;	// Path, so that block allocator can determine packing locality or whatever it needs to determine.
	hint.inode = inode;	// Inode is needed by block allocator too.
	hint.search_start = 0;	// We have no hint on where to search free blocks for block allocator.
	hint.key = key.on_disk_key;	// on disk key of file.
	hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);	// Number of disk blocks this file occupies already.
	hint.formatted_node = 0;	// We are allocating blocks for unformatted node.
	hint.preallocate = will_prealloc;
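	/*
	 * Illustrative: i_blocks counts 512-byte sectors, so with a 4k
	 * blocksize (s_blocksize_bits == 12) the shift by 12 - 9 == 3
	 * converts sectors to filesystem blocks, e.g. a file occupying 80
	 * sectors yields hint.block == 10.
	 */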

	/* Call block allocator to allocate blocks */
	res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
					 blocks_to_allocate, blocks_to_allocate);
	if (res != CARRY_ON) {
		if (res == NO_DISK_SPACE) {
			/* We flush the transaction in case of no space. This way some
			   blocks might become free */
			SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
			res = restart_transaction(th, inode, &path);
			if (res)
				goto error_exit;

			/* We might have scheduled, so search again */
			res = search_for_position_by_key(inode->i_sb, &key,
							 &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit;
			}

			/* update changed info for hint structure. */
			res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
							 blocks_to_allocate,
							 blocks_to_allocate);
			if (res != CARRY_ON) {
				res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
				pathrelse(&path);
				goto error_exit;
			}
		} else {
			res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
			pathrelse(&path);
			goto error_exit;
		}
	}
#ifdef __BIG_ENDIAN
	// Too bad, I have not found any way to convert a given region from
	// cpu format to little endian format
	{
		int i;
		for (i = 0; i < blocks_to_allocate; i++)
			allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
	}
#endif

	/* Block allocation might well have scheduled and the tree might have
	   changed, let's search the tree again */
	/* find where in the tree our write should go */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit_free_blocks;
	}

	bh = get_last_bh(&path);	// Get a bufferhead for last element in path.
	ih = get_ih(&path);	// Get a pointer to last item head in path.
	item = get_item(&path);	// Get a pointer to last item in path

	/* Let's see what we have found */
	if (res != POSITION_FOUND) {	/* position not found, this means that we
					   might need to append file with holes
					   first */
		// Since we are writing past the file's end, we need to find out if
		// there is a hole that needs to be inserted before our writing
		// position, and how many blocks it is going to cover (we need to
		// populate pointers to file blocks representing the hole with zeros)

		{
			int item_offset = 1;
			/*
			 * if ih is stat data, its offset is 0 and we don't want to
			 * add 1 to pos in the hole_size calculation
			 */
			if (is_statdata_le_ih(ih))
				item_offset = 0;
			hole_size = (pos + item_offset -
				     (le_key_k_offset
				      (get_inode_item_key_version(inode),
				       &(ih->ih_key)) +
				      op_bytes_number(ih,
						      inode->i_sb->s_blocksize)))
			    >> inode->i_sb->s_blocksize_bits;
		}
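		/*
		 * Worked example (illustrative): if the file's last indirect
		 * item covers bytes 1..8192 (two 4k blocks) and we write at
		 * pos == 20480, then hole_size == (20480 + 1 - (1 + 8192))
		 * >> 12 == 3 blocks of zero pointers to insert first.
		 */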
		if (hole_size > 0) {
			int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);	// How much data to insert first time.
			/* area filled with zeroes, to supply as list of zero blocknumbers
			   We allocate it outside of loop just in case loop would spin for
			   several iterations. */
			char *zeros = kmalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
			if (!zeros) {
				res = -ENOMEM;
				goto error_exit_free_blocks;
			}
			memset(zeros, 0, to_paste * UNFM_P_SIZE);
			do {
				to_paste = min_t(__u64, hole_size,
						 MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
						 UNFM_P_SIZE);
				if (is_indirect_le_ih(ih)) {
					/* Ok, there is existing indirect item already. Need to append it */
					/* Calculate position past inserted item */
					make_cpu_key(&key, inode,
						     le_key_k_offset
						     (get_inode_item_key_version(inode),
						      &(ih->ih_key)) +
						     op_bytes_number(ih,
								     inode->i_sb->s_blocksize),
						     TYPE_INDIRECT, 3);
					res = reiserfs_paste_into_item(th, &path,
								       &key, inode,
								       (char *)zeros,
								       UNFM_P_SIZE *
								       to_paste);
					if (res) {
						kfree(zeros);
						goto error_exit_free_blocks;
					}
				} else if (is_statdata_le_ih(ih)) {
					/* No existing item, create it */
					/* item head for new item */
					struct item_head ins_ih;

					/* create a key for our new item */
					make_cpu_key(&key, inode, 1,
						     TYPE_INDIRECT, 3);

					/* Create new item head for our new item */
					make_le_item_head(&ins_ih, &key,
							  key.version, 1,
							  TYPE_INDIRECT,
							  to_paste * UNFM_P_SIZE,
							  0 /* free space */ );

					/* Find where such item should live in the tree */
					res = search_item(inode->i_sb, &key,
							  &path);
					if (res != ITEM_NOT_FOUND) {
						/* item should not exist, otherwise we have error */
						if (res != -ENOSPC) {
							reiserfs_warning(inode->i_sb,
									 "green-9008: search_by_key (%K) returned %d",
									 &key, res);
						}
						res = -EIO;
						kfree(zeros);
						goto error_exit_free_blocks;
					}
					res = reiserfs_insert_item(th, &path,
								   &key, &ins_ih,
								   inode,
								   (char *)zeros);
				} else {
					reiserfs_panic(inode->i_sb,
						       "green-9011: Unexpected key type %K\n",
						       &key);
				}
				if (res) {
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				/* Now we want to check if transaction is too full, and if it is
				   we restart it. This will also free the path. */
				if (journal_transaction_should_end
				    (th, th->t_blocks_allocated)) {
					res = restart_transaction(th, inode,
								  &path);
					if (res) {
						pathrelse(&path);
						kfree(zeros);
						goto error_exit;
					}
				}

				/* Well, need to recalculate path and stuff */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     (to_paste << inode->i_blkbits));
				res = search_for_position_by_key(inode->i_sb,
								 &key, &path);
				if (res == IO_ERROR) {
					res = -EIO;
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				bh = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				hole_size -= to_paste;
			} while (hole_size);
			kfree(zeros);
		}
	}
	// Go through existing indirect items first
	// replace all zeroes with blocknumbers from list
	// Note that if no corresponding item was found by the previous search,
	// it means there is no existing in-tree representation for the file area
	// we are going to overwrite, so there is nothing to scan through for holes.
	for (curr_block = 0, itempos = path.pos_in_item;
	     curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
	      retry:

		if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
			/* We ran out of data in this indirect item, let's look for another
			   one. */
			/* First, if we are already modifying the current item, log it */
			if (modifying_this_item) {
				journal_mark_dirty(th, inode->i_sb, bh);
				modifying_this_item = 0;
			}
			/* Then set the key to look for a new indirect item (offset of the
			   old item is added to the old item length) */
			set_cpu_key_k_offset(&key,
					     le_key_k_offset
					     (get_inode_item_key_version(inode),
					      &(ih->ih_key)) +
					     op_bytes_number(ih,
							     inode->i_sb->s_blocksize));
			/* Search for the position of the new key in the tree. */
			res = search_for_position_by_key(inode->i_sb, &key,
							 &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit_free_blocks;
			}
			bh = get_last_bh(&path);
			ih = get_ih(&path);
			item = get_item(&path);
			itempos = path.pos_in_item;
			continue;	// loop to check all kinds of conditions and so on.
		}
		/* Ok, we have correct position in item now, so let's see if it is
		   representing a file hole (blocknumber is zero) and fill it if needed */
		if (!item[itempos]) {
			/* Ok, a hole. Now we need to check if we already prepared this
			   block to be journaled */
			while (!modifying_this_item) {	// loop until we succeed
				/* Well, this item is not journaled yet, so we must prepare
				   it for journal first, before we can change it */
				struct item_head tmp_ih;	// We copy item head of found item
				// here to detect if fs changed under
				// us while we were preparing for
				// journal.
				int fs_gen;	// We store fs generation here to find if someone
				// changes fs under our feet

				copy_item_head(&tmp_ih, ih);	// Remember itemhead
				fs_gen = get_generation(inode->i_sb);	// remember fs generation
				reiserfs_prepare_for_journal(inode->i_sb, bh, 1);	// Prepare a buffer within which indirect item is stored for changing.
				if (fs_changed(fs_gen, inode->i_sb)
				    && item_moved(&tmp_ih, &path)) {
					// Sigh, fs was changed under us, we need to look for new
					// location of item we are working with

					/* unmark prepared area as journaled and search for its
					   new position */
					reiserfs_restore_prepared_buffer(inode->i_sb,
									 bh);
					res = search_for_position_by_key(inode->i_sb,
									 &key,
									 &path);
					if (res == IO_ERROR) {
						res = -EIO;
						goto error_exit_free_blocks;
					}
					bh = get_last_bh(&path);
					ih = get_ih(&path);
					item = get_item(&path);
					itempos = path.pos_in_item;
					goto retry;
				}
				modifying_this_item = 1;
			}
			item[itempos] = allocated_blocks[curr_block];	// Assign new block
			curr_block++;
		}
		itempos++;
	}

	if (modifying_this_item) {	// We need to log last-accessed block, if it
		// was modified, but not logged yet.
		journal_mark_dirty(th, inode->i_sb, bh);
	}

	if (curr_block < blocks_to_allocate) {
		// Oh well, we need to append to an indirect item, or to create one
		// if there weren't any
		if (is_indirect_le_ih(ih)) {
			// Existing indirect item - append. First calculate key for append
			// position. We do not need to recalculate path as it should
			// already point to correct place.
			make_cpu_key(&key, inode,
				     le_key_k_offset(get_inode_item_key_version
						     (inode),
						     &(ih->ih_key)) +
				     op_bytes_number(ih,
						     inode->i_sb->s_blocksize),
				     TYPE_INDIRECT, 3);
			res = reiserfs_paste_into_item(th, &path, &key, inode,
						       (char *)(allocated_blocks +
								curr_block),
						       UNFM_P_SIZE *
						       (blocks_to_allocate -
							curr_block));
			if (res) {
				goto error_exit_free_blocks;
			}
		} else if (is_statdata_le_ih(ih)) {
			// Last found item was statdata. That means we need to create indirect item.
			struct item_head ins_ih;	/* itemhead for new item */

			/* create a key for our new item */
			make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3);	// Position one,
			// because that's
			// where the first
			// indirect item
			// begins
			/* Create new item head for our new item */
			make_le_item_head(&ins_ih, &key, key.version, 1,
					  TYPE_INDIRECT,
					  (blocks_to_allocate -
					   curr_block) * UNFM_P_SIZE,
					  0 /* free space */ );
			/* Find where such item should live in the tree */
			res = search_item(inode->i_sb, &key, &path);
			if (res != ITEM_NOT_FOUND) {
				/* Well, if we have found such an item already, or some error
				   occurred, we need to warn the user and return an error */
				if (res != -ENOSPC) {
					reiserfs_warning(inode->i_sb,
							 "green-9009: search_by_key (%K) "
							 "returned %d", &key,
							 res);
				}
				res = -EIO;
				goto error_exit_free_blocks;
			}
			/* Insert item into the tree with the data as its body */
			res = reiserfs_insert_item(th, &path, &key, &ins_ih,
						   inode,
						   (char *)(allocated_blocks +
							    curr_block));
		} else {
			reiserfs_panic(inode->i_sb,
				       "green-9010: unexpected item type for key %K\n",
				       &key);
		}
	}
	// the caller is responsible for closing the transaction
	// unless we return an error, they are also responsible for logging
	// the inode.
	//
	pathrelse(&path);
	/*
	 * cleanup preallocation from previous writes
	 * if this is a partial block write
	 */
	if (write_bytes & (inode->i_sb->s_blocksize - 1))
		reiserfs_discard_prealloc(th, inode);
	reiserfs_write_unlock(inode->i_sb);

	// go through all the pages/buffers and map the buffers to newly allocated
	// blocks (so that the system knows where to write these pages later).
	curr_block = 0;
	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];	//current page
		struct buffer_head *head = page_buffers(page);	// first buffer for a page
		int block_start, block_end;	// in-page offsets for buffers.

		if (!page_buffers(page))
			reiserfs_panic(inode->i_sb,
				       "green-9005: No buffers for prepared page???");

		/* For each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9006: Allocated but absent buffer for a page?");
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;
			if (i == num_pages - 1 && block_start >= to)
				/* If this buffer is after requested data to map, abort
				   processing of current page */
				break;

			if (!buffer_mapped(bh)) {	// Ok, unmapped buffer, need to map it
				map_bh(bh, inode->i_sb,
				       le32_to_cpu(allocated_blocks
						   [curr_block]));
				curr_block++;
				set_buffer_new(bh);
			}
		}
	}

	RFALSE(curr_block > blocks_to_allocate,
	       "green-9007: Used too many blocks? weird");

	kfree(allocated_blocks);
	return 0;

// Need to deal with transaction here.
      error_exit_free_blocks:
	pathrelse(&path);
	// free blocks
	for (i = 0; i < blocks_to_allocate; i++)
		reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
				    1);

      error_exit:
	if (th->t_trans_id) {
		int err;
		// update any changes we made to blk count
		mark_inode_dirty(inode);
		err = journal_end(th, inode->i_sb,
				  JOURNAL_PER_BALANCE_CNT * 3 + 1 +
				  2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
		if (err)
			res = err;
	}
	reiserfs_write_unlock(inode->i_sb);
	kfree(allocated_blocks);

	return res;
}

/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
static void reiserfs_unprepare_pages(struct page **prepared_pages,	/* list of locked pages */
				     size_t num_pages /* amount of pages */ )
{
	int i;			// loop counter

	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];

		try_to_free_buffers(page);
		unlock_page(page);
		page_cache_release(page);
	}
}

/* This function will copy data from userspace to specified pages within
   supplied byte range */
static int reiserfs_copy_from_user_to_file_region(loff_t pos,	/* In-file position */
						  int num_pages,	/* Number of pages affected */
						  int write_bytes,	/* Amount of bytes to write */
						  struct page **prepared_pages,	/* pointer to
										   array of
										   prepared pages
										 */
						  const char __user * buf	/* Pointer to user-supplied
										   data */
    )
{
	long page_fault = 0;	// status of copy_from_user.
	int i;			// loop counter.
	int offset;		// offset in page

	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
	     i++, offset = 0) {
		size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
		struct page *page = prepared_pages[i];	// Current page we process.

		fault_in_pages_readable(buf, count);
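		/* Illustrative note: the destination pages are locked at this
		 * point, so a fault on @buf inside __copy_from_user() below
		 * (e.g. if @buf is an mmap of this very file region) could
		 * deadlock; pre-faulting the user pages here shrinks that
		 * window. */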

		/* Copy data from userspace to the current page */
		kmap(page);
		page_fault = __copy_from_user(page_address(page) + offset, buf, count);	// Copy the data.
		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
		kunmap(page);
		buf += count;
		write_bytes -= count;

		if (page_fault)
			break;	// Was there a fault? abort.
	}

	return page_fault ? -EFAULT : 0;
}

/* taken from fs/buffer.c:__block_commit_write */
int reiserfs_commit_page(struct inode *inode, struct page *page,
			 unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;
	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
	int new;
	int logit = reiserfs_file_data_log(inode);
	struct super_block *s = inode->i_sb;
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	struct reiserfs_transaction_handle th;
	int ret = 0;

	th.t_trans_id = 0;
	blocksize = 1 << inode->i_blkbits;

	if (logit) {
		reiserfs_write_lock(s);
		ret = journal_begin(&th, s, bh_per_page + 1);
		if (ret)
			goto drop_write_lock;
		reiserfs_update_inode_transaction(inode);
	}
	for (bh = head = page_buffers(page), block_start = 0;
	     bh != head || !block_start;
	     block_start = block_end, bh = bh->b_this_page) {

		new = buffer_new(bh);
		clear_buffer_new(bh);
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			if (logit) {
				reiserfs_prepare_for_journal(s, bh, 1);
				journal_mark_dirty(&th, s, bh);
			} else if (!buffer_dirty(bh)) {
				mark_buffer_dirty(bh);
				/* do data=ordered on any page past the end
				 * of file and any buffer marked BH_New.
				 */
				if (reiserfs_data_ordered(inode->i_sb) &&
				    (new || page->index >= i_size_index)) {
					reiserfs_add_ordered_list(inode, bh);
				}
			}
		}
	}
	if (logit) {
		ret = journal_end(&th, s, bh_per_page + 1);
	      drop_write_lock:
		reiserfs_write_unlock(s);
	}
	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return ret;
}

/* Submit pages for write. This was separated from the actual file copying
   because we might want to allocate block numbers in-between.
   This function assumes that the caller will adjust the file size to the
   correct value. */
static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos,	/* Writing position offset */
						 size_t num_pages,	/* Number of pages to write */
						 size_t write_bytes,	/* number of bytes to write */
						 struct page **prepared_pages	/* list of pages */
    )
{
	int status;		// return status of block_commit_write.
	int retval = 0;		// Return value we are going to return.
	int i;			// loop counter
	int offset;		// Writing offset in page.
	int orig_write_bytes = write_bytes;
	int sd_update = 0;

	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
	     i++, offset = 0) {
		int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
		struct page *page = prepared_pages[i];	// Current page we process.

		status =
		    reiserfs_commit_page(inode, page, offset, offset + count);
		if (status)
			retval = status;	// To not overcomplicate matters we are going to
		// submit all the pages even if there was an error.
		// We only remember the error status to report it on
		// exit.
		write_bytes -= count;
	}
	/* now that we've gotten all the ordered buffers marked dirty,
	 * we can safely update i_size and close any running transaction
	 */
	if (pos + orig_write_bytes > inode->i_size) {
		inode->i_size = pos + orig_write_bytes;	// Set new size
		/* If the file has grown so much that tail packing is no
		 * longer possible, reset the "need to pack" flag */
		if ((have_large_tails(inode->i_sb) &&
		     inode->i_size > i_block_size(inode) * 4) ||
		    (have_small_tails(inode->i_sb) &&
		     inode->i_size > i_block_size(inode)))
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
		else if ((have_large_tails(inode->i_sb) &&
			  inode->i_size < i_block_size(inode) * 4) ||
			 (have_small_tails(inode->i_sb) &&
			  inode->i_size < i_block_size(inode)))
			REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
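		/*
		 * Illustrative: with 4k blocks and the default large-tail
		 * policy the cutoff is 16k; a file grown past 16k has its
		 * pack-on-close flag cleared, while one still below 16k has
		 * it set so release() will repack the tail.
		 */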

		if (th->t_trans_id) {
			reiserfs_write_lock(inode->i_sb);
			// this sets the proper flags for O_SYNC to trigger a commit
			mark_inode_dirty(inode);
			reiserfs_write_unlock(inode->i_sb);
		} else
			mark_inode_dirty(inode);

		sd_update = 1;
	}
	if (th->t_trans_id) {
		reiserfs_write_lock(inode->i_sb);
		if (!sd_update)
			mark_inode_dirty(inode);
		status = journal_end(th, th->t_super, th->t_blocks_allocated);
		if (status)
			retval = status;
		reiserfs_write_unlock(inode->i_sb);
	}
	th->t_trans_id = 0;

	/*
	 * we have to unlock the pages after updating i_size, otherwise
	 * we race with writepage
	 */
	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];
		unlock_page(page);
		mark_page_accessed(page);
		page_cache_release(page);
	}
	return retval;
}

/* Check whether the passed write region is going to touch the file's tail
   (if one is present), and if it is, convert the tail to an unformatted node */
static int reiserfs_check_for_tail_and_convert(struct inode *inode,	/* inode to deal with */
					       loff_t pos,	/* Writing position */
					       int write_bytes	/* amount of bytes to write */
    )
{
	INITIALIZE_PATH(path);	// needed for search_for_position
	struct cpu_key key;	// Key that would represent last touched writing byte.
	struct item_head *ih;	// item header of found block;
	int res;		// Return value of various functions we call.
	int cont_expand_offset;	// We will put offset for generic_cont_expand here
	// This can be int just because tails are created
	// only for small files.

/* this embodies a dependency on a particular tail policy */
	if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
		/* such big files do not have tails, so we won't bother
		   looking for one, simply return */
		return 0;
	}
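	/*
	 * Illustrative: with a 4k blocksize only files smaller than
	 * 4 * 4096 == 16384 bytes can carry a packed tail, so a write to
	 * anything larger has already returned by this point, without a
	 * tree lookup.
	 */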

	reiserfs_write_lock(inode->i_sb);
	/* find the item containing the last byte to be written, or if
	 * writing past the end of the file then the last item of the
	 * file (and then we check its type). */
	make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
		     3 /*key length */ );
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		reiserfs_write_unlock(inode->i_sb);
		return -EIO;
	}
	ih = get_ih(&path);
	res = 0;
	if (is_direct_le_ih(ih)) {
		/* Ok, closest item is file tail (tails are stored in "direct"
		 * items), so we need to unpack it. */
		/* To not overcomplicate matters, we just call generic_cont_expand
		   which will in turn call other stuff and finally will boil down to
		   reiserfs_get_block() that would do necessary conversion. */
		cont_expand_offset =
		    le_key_k_offset(get_inode_item_key_version(inode),
				    &(ih->ih_key));
		pathrelse(&path);
		res = generic_cont_expand(inode, cont_expand_offset);
	} else
		pathrelse(&path);

	reiserfs_write_unlock(inode->i_sb);
	return res;
}

/* This function locks pages starting from @pos for @inode.
   @num_pages pages are locked and stored in the
   @prepared_pages array. Buffers are also allocated for these pages.
   The first and last pages of the region are read in if they are only
   partially overwritten. If the last page did not exist before the write
   (file hole or file append), it is zeroed instead.
   Returns the number of unallocated blocks that should be allocated to cover
   the new file data. */
static int reiserfs_prepare_file_region_for_write(struct inode *inode
						  /* Inode of the file */ ,
						  loff_t pos,	/* position in the file */
						  size_t num_pages,	/* number of pages to
									   prepare */
						  size_t write_bytes,	/* Amount of bytes to be
									   overwritten from
									   @pos */
						  struct page **prepared_pages	/* pointer to array
										   where to store
										   prepared pages */
    )
{
	int res = 0;		// Return values of different functions we call.
	unsigned long index = pos >> PAGE_CACHE_SHIFT;	// Offset in file in pages.
	int from = (pos & (PAGE_CACHE_SIZE - 1));	// Writing offset in first page
	int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
	/* offset of last modified byte in last
	   page */
	struct address_space *mapping = inode->i_mapping;	// Pages are mapped here.
	int i;			// Simple counter
	int blocks = 0;		/* Return value (blocks that should be allocated) */
	struct buffer_head *bh, *head;	// Current bufferhead and first bufferhead
	// of a page.
	unsigned block_start, block_end;	// Starting and ending offsets of current
	// buffer in the page.
	struct buffer_head *wait[2], **wait_bh = wait;	// Buffers for page, if
	// the page appeared to be not
	// up to date. Note how we have
	// at most 2 buffers, this is
	// because we may at most
	// partially overwrite two
	// buffers for one page: one at
	// the beginning of the write area
	// and one at the end.
	// Everything in the middle gets
	// overwritten totally.

	struct cpu_key key;	// cpu key of item that we are going to deal with
	struct item_head *ih = NULL;	// pointer to item head that we are going to deal with
	struct buffer_head *itembuf = NULL;	// Buffer head that contains items that we are going to deal with
	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
	__le32 *item = NULL;	// pointer to item we are going to deal with
	int item_pos = -1;	/* Position in indirect item */

	if (num_pages < 1) {
		reiserfs_warning(inode->i_sb,
				 "green-9001: reiserfs_prepare_file_region_for_write "
				 "called with zero number of pages to process");
		return -EFAULT;
	}

	/* We have 2 loops for pages. In the first loop we grab and lock the pages, so
	   that nobody would touch these until we release the pages. Then
	   we'd start to deal with mapping buffers to blocks. */
	for (i = 0; i < num_pages; i++) {
		prepared_pages[i] = grab_cache_page(mapping, index + i);	// locks the page
		if (!prepared_pages[i]) {
			res = -ENOMEM;
			goto failed_page_grabbing;
		}
		if (!page_has_buffers(prepared_pages[i]))
			create_empty_buffers(prepared_pages[i],
					     inode->i_sb->s_blocksize, 0);
	}

	/* Let's count the amount of blocks for the case where all the blocks
	   overwritten are new (we will subtract already allocated blocks later) */
	if (num_pages > 2)
		/* These are fully overwritten pages, so all the blocks in
		   these pages are counted as needing to be allocated */
		blocks =
		    (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	/* count blocks needed for first page (possibly partially written) */
	blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));	/* roundup */

	/* Now we account for the last page. If the last page == first page (we
	   overwrite only one page), we subtract all the blocks past the
	   last writing position in a page out of the already calculated number
	   of blocks */
	blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
	    ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
	/* Note how we do not round up here, since partial blocks still
	   should be allocated */
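	/*
	 * Worked example (illustrative, 4k blocks == 4k pages, one block per
	 * page): num_pages == 3, from == 904, to == 3904 gives
	 * (3 - 2) << 0 == 1 block for the middle page, plus
	 * ((4096 - 904) >> 12) + !!904 == 1 for the partial first page, plus
	 * (1 << 0) - ((4096 - 3904) >> 12) == 1 for the last page: 3 blocks
	 * in total, before already-mapped blocks are subtracted below.
	 */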

	/* Now if all the write area lies past the file end, there is no point in
	   mapping blocks, since there are none, so we just zero out the remaining
	   parts of the first and last pages in the write area (if needed) */
	if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
		if (from != 0) {	/* First page needs to be partially zeroed */
			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
			memset(kaddr, 0, from);
			kunmap_atomic(kaddr, KM_USER0);
		}
		if (to != PAGE_CACHE_SIZE) {	/* Last page needs to be partially zeroed */
			char *kaddr =
			    kmap_atomic(prepared_pages[num_pages - 1],
					KM_USER0);
			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
			kunmap_atomic(kaddr, KM_USER0);
		}

		/* Since all blocks are new - use already calculated value */
		return blocks;
	}

	/* Well, since we write somewhere into the middle of a file, there is a
	   possibility we are writing over some already allocated blocks, so
	   let's map these blocks and subtract the number of such blocks from the
	   blocks we need to allocate (calculated above) */
	/* Mask write position to start on blocksize, we do it out of the
	   loop for performance reasons */
	pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
	/* Set cpu key to the starting position in a file (on left block boundary) */
	make_cpu_key(&key, inode,
		     1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
		     TYPE_ANY, 3 /*key length */ );

	reiserfs_write_lock(inode->i_sb);	// We need that for at least search_by_key()
	for (i = 0; i < num_pages; i++) {

		head = page_buffers(prepared_pages[i]);
		/* For each buffer in the page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;

			if (i == num_pages - 1 && block_start >= to) {
				/* If this buffer is after requested data to map, abort
				   processing of current page */
				break;
			}

			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
				/* This is an optimisation for the case where the buffer is
				   mapped and has a blocknumber assigned. If a significant
				   number of such buffers is present, we may avoid some
				   amount of search_by_key calls.
				   Probably it would be possible to move parts of this code
				   out of BKL, but I'm afraid that would overcomplicate the
				   code without any noticeable benefit.
				 */
				item_pos++;
				/* Update the key */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     inode->i_sb->s_blocksize);
				blocks--;	// Decrease the amount of blocks that need to be
				// allocated
				continue;	// Go to the next buffer
			}

			if (!itembuf ||	/* if first iteration */
			    item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {	/* or if we progressed past the
									   current unformatted_item */
				/* Try to find next item */
				res = search_for_position_by_key(inode->i_sb,
								 &key, &path);
				/* Abort if no more items */
				if (res != POSITION_FOUND) {
					/* make sure later loops don't use this item */
					itembuf = NULL;
					item = NULL;
					break;
				}

				/* Update information about current indirect item */
				itembuf = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				item_pos = path.pos_in_item;

				RFALSE(!is_indirect_le_ih(ih),
				       "green-9003: indirect item expected");
			}

			/* See if there is some block associated with the file
			   at that position, map the buffer to this block */
			if (get_block_num(item, item_pos)) {
				map_bh(bh, inode->i_sb,
				       get_block_num(item, item_pos));
				blocks--;	// Decrease the amount of blocks that need to be
				// allocated
			}
			item_pos++;
			/* Update the key */
			set_cpu_key_k_offset(&key,
					     cpu_key_k_offset(&key) +
					     inode->i_sb->s_blocksize);
		}
	}
	pathrelse(&path);	// Free the path
	reiserfs_write_unlock(inode->i_sb);

	/* Now zero out unmapped buffers for the first and last pages of the
	   write area, or issue read requests if the page is mapped. */
	/* First page, see if it is not uptodate */
	if (!PageUptodate(prepared_pages[0])) {
		head = page_buffers(prepared_pages[0]);

		/* For each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;
			if (block_start < from) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue a READ request for it
								   to not lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero it */
					char *kaddr =
					    kmap_atomic(prepared_pages[0],
							KM_USER0);
					memset(kaddr + block_start, 0,
					       from - block_start);
					kunmap_atomic(kaddr, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
	if (!PageUptodate(prepared_pages[num_pages - 1]) ||
	    ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
	    (inode->i_size >> PAGE_CACHE_SHIFT)) {
		head = page_buffers(prepared_pages[num_pages - 1]);

		/* for each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_start >= to)
				/* if this buffer is after requested data to map, skip it */
				break;
			if (block_end > to) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue a READ request for it
								   to not lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero it */
					char *kaddr =
					    kmap_atomic(prepared_pages
							[num_pages - 1],
							KM_USER0);
					memset(kaddr + to, 0, block_end - to);
					kunmap_atomic(kaddr, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Wait for read requests we made to happen, if necessary */
	while (wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh)) {
			res = -EIO;
			goto failed_read;
		}
	}

	return blocks;
      failed_page_grabbing:
	num_pages = i;
      failed_read:
	reiserfs_unprepare_pages(prepared_pages, num_pages);
	return res;
}

/* Write @count bytes at position @ppos in a file indicated by @file
   from the buffer @buf.

   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
   written for (ext2/3).  This is for several reasons:

   * It has no understanding of any filesystem specific optimizations.

   * It enters the filesystem repeatedly for each page that is written.

   * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
     operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
     to reiserfs which allows for fewer tree traversals.

   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.

   * Asking the block allocation code for blocks one at a time is slightly less efficient.

   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
   use it, but we were in a hurry to make the code freeze, and so it couldn't be revised then.  This new code should make
   things right finally.

   Future Features: providing search_by_key with hints.

*/
static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
				   const char __user * buf,	/* pointer to user supplied data
								   (in userspace) */
				   size_t count,	/* amount of bytes to write */
				   loff_t * ppos	/* pointer to position in file that we start writing at. Should be updated to
							 * new current position before returning. */
				   )
{
	size_t already_written = 0;	// Number of bytes already written to the file.
	loff_t pos;		// Current position in the file.
	ssize_t res;		// return value of various functions that we call.
	int err = 0;
	struct inode *inode = file->f_dentry->d_inode;	// Inode of the file that we are writing to.
	/* To simplify coding at this time, we store
	   locked pages in array for now */
	struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
	struct reiserfs_transaction_handle th;
	th.t_trans_id = 0;

	if (file->f_flags & O_DIRECT) {	// Direct IO needs special treatment
		ssize_t result, after_file_end = 0;
		if ((*ppos + count >= inode->i_size)
		    || (file->f_flags & O_APPEND)) {
			/* If we are appending a file, we need to put this savelink in here.
			   If we crash while doing direct io, finish_unfinished will
			   cut the garbage from the file end. */
			reiserfs_write_lock(inode->i_sb);
			err =
			    journal_begin(&th, inode->i_sb,
					  JOURNAL_PER_BALANCE_CNT);
			if (err) {
				reiserfs_write_unlock(inode->i_sb);
				return err;
			}
			reiserfs_update_inode_transaction(inode);
			add_save_link(&th, inode, 1 /* Truncate */ );
			after_file_end = 1;
			err =
			    journal_end(&th, inode->i_sb,
					JOURNAL_PER_BALANCE_CNT);
			reiserfs_write_unlock(inode->i_sb);
			if (err)
				return err;
		}
		result = generic_file_write(file, buf, count, ppos);

		if (after_file_end) {	/* Now update i_size and remove the savelink */
			struct reiserfs_transaction_handle th;
			reiserfs_write_lock(inode->i_sb);
			err = journal_begin(&th, inode->i_sb, 1);
			if (err) {
				reiserfs_write_unlock(inode->i_sb);
				return err;
			}
			reiserfs_update_inode_transaction(inode);
			mark_inode_dirty(inode);
			err = journal_end(&th, inode->i_sb, 1);
			if (err) {
				reiserfs_write_unlock(inode->i_sb);
				return err;
			}
			err = remove_save_link(inode, 1 /* truncate */ );
			reiserfs_write_unlock(inode->i_sb);
			if (err)
				return err;
		}

		return result;
	}

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	down(&inode->i_sem);	// locks the entire file for just us

	pos = *ppos;

	/* Check if we can write to specified region of file, file
	   is not overly big and this kind of stuff. Adjust pos and
	   count, if needed */
	res = generic_write_checks(file, &pos, &count, 0);
	if (res)
		goto out;

	if (count == 0)
		goto out;

	res = remove_suid(file->f_dentry);
	if (res)
		goto out;

	inode_update_time(inode, 1);	/* Both mtime and ctime */

	// Ok, we are done with all the checks.

	// Now we should start real work

	/* If we are going to write past the file's packed tail or if we are going
	   to overwrite part of the tail, we need that tail to be converted into
	   unformatted node */
	res = reiserfs_check_for_tail_and_convert(inode, pos, count);
	if (res)
		goto out;

	while (count > 0) {
		/* This is the main loop in which we run until some error occurs
		   or until we write all of the data. */
		size_t num_pages;	/* amount of pages we are going to write this iteration */
		size_t write_bytes;	/* amount of bytes to write during this iteration */
		ssize_t blocks_to_allocate;	/* how many blocks we need to allocate for this iteration */

		/*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos */
		num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) +	/* round up partial
									   pages */
		    ((count +
		      (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
		/* convert size to amount of
		   pages */
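		/*
		 * Illustrative: with 4k pages, pos == 5000 and count == 3000
		 * give !!(8000 & 4095) + ((3000 + 904) >> 12) == 1 + 0 == 1
		 * page, while pos == 4096 and count == 8192 give
		 * 0 + (8192 >> 12) == 2 pages.
		 */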
		reiserfs_write_lock(inode->i_sb);
		if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
		    || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
			/* If we were asked to write more data than we want to, or if there
			   is not that much space, then we shorten the amount of data to
			   write for this iteration. */
			num_pages =
			    min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
				  reiserfs_can_fit_pages(inode->i_sb));
			/* Also we should not forget to set size in bytes accordingly */
			write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
			    (pos & (PAGE_CACHE_SIZE - 1));
			/* If position is not on the
			   start of the page, we need
			   to subtract the offset
			   within page */
		} else
			write_bytes = count;

		/* reserve the blocks to be allocated later, so that later on
		   we still have the space to write the blocks to */
		reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
						      num_pages <<
						      (PAGE_CACHE_SHIFT -
						       inode->i_blkbits));
		reiserfs_write_unlock(inode->i_sb);

		if (!num_pages) {	/* If we do not have enough space even for a single page... */
			if (pos >
			    inode->i_size + inode->i_sb->s_blocksize -
			    (pos & (inode->i_sb->s_blocksize - 1))) {
				res = -ENOSPC;
				break;	// In case we are writing past the end of the last file block, break.
			}
			// Otherwise we are possibly overwriting the file, so
			// let's set write size to be equal or less than blocksize.
			// This way we get it correctly for file holes.
			// But overwriting files on absolutely full volumes would not
			// be very efficient. Well, people are not supposed to fill
			// 100% of disk space anyway.
			write_bytes =
			    min_t(size_t, count,
				  inode->i_sb->s_blocksize -
				  (pos & (inode->i_sb->s_blocksize - 1)));
			num_pages = 1;
			// No blocks were claimed before, so do it now.
			reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
							      1 <<
							      (PAGE_CACHE_SHIFT
							       -
							       inode->i_blkbits));
		}

		/* Prepare for writing into the region, read in all the
		   partially overwritten pages, if needed. And lock the pages,
		   so that nobody else can access these until we are done.
		   We get number of actual blocks needed as a result. */
		blocks_to_allocate =
		    reiserfs_prepare_file_region_for_write(inode, pos,
							   num_pages,
							   write_bytes,
							   prepared_pages);
		if (blocks_to_allocate < 0) {
			res = blocks_to_allocate;
			reiserfs_release_claimed_blocks(inode->i_sb,
							num_pages <<
							(PAGE_CACHE_SHIFT -
							 inode->i_blkbits));
			break;
		}

		/* First we correct our estimate of how many blocks we need */
		reiserfs_release_claimed_blocks(inode->i_sb,
						(num_pages <<
						 (PAGE_CACHE_SHIFT -
						  inode->i_sb->
						  s_blocksize_bits)) -
						blocks_to_allocate);

		if (blocks_to_allocate > 0) {	/*We only allocate blocks if we need to */
			/* Fill in all the possible holes and append the file if needed */
			res =
			    reiserfs_allocate_blocks_for_region(&th, inode, pos,
								num_pages,
								write_bytes,
								prepared_pages,
								blocks_to_allocate);
		}

		/* well, we have allocated the blocks, so it is time to free
		   the reservation we made earlier. */
		reiserfs_release_claimed_blocks(inode->i_sb,
						blocks_to_allocate);
		if (res) {
			reiserfs_unprepare_pages(prepared_pages, num_pages);
			break;
		}

/* NOTE that allocating blocks and filling blocks can be done in reverse order
   and probably we would do that just to get rid of garbage in files after a
   crash */

		/* Copy data from user-supplied buffer to file's pages */
		res =
		    reiserfs_copy_from_user_to_file_region(pos, num_pages,
							   write_bytes,
							   prepared_pages, buf);
		if (res) {
			reiserfs_unprepare_pages(prepared_pages, num_pages);
			break;
		}

		/* Send the pages to disk and unlock them. */
		res =
		    reiserfs_submit_file_region_for_write(&th, inode, pos,
							  num_pages,
							  write_bytes,
							  prepared_pages);
		if (res)
			break;

		already_written += write_bytes;
		buf += write_bytes;
		*ppos = pos += write_bytes;
		count -= write_bytes;
		balance_dirty_pages_ratelimited(inode->i_mapping);
	}

	/* this is only true on error */
	if (th.t_trans_id) {
		reiserfs_write_lock(inode->i_sb);
		err = journal_end(&th, th.t_super, th.t_blocks_allocated);
		reiserfs_write_unlock(inode->i_sb);
		if (err) {
			res = err;
			goto out;
		}
	}

	if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
		res =
		    generic_osync_inode(inode, file->f_mapping,
					OSYNC_METADATA | OSYNC_DATA);

	up(&inode->i_sem);
	reiserfs_async_progress_wait(inode->i_sb);
	return (already_written != 0) ? already_written : res;

      out:
	up(&inode->i_sem);	// unlock the file on exit.
	return res;
}

static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
				  size_t count, loff_t pos)
{
	return generic_file_aio_write(iocb, buf, count, pos);
}

struct file_operations reiserfs_file_operations = {
	.read = generic_file_read,
	.write = reiserfs_file_write,
	.ioctl = reiserfs_ioctl,
	.mmap = generic_file_mmap,
	.release = reiserfs_file_release,
	.fsync = reiserfs_sync_file,
	.sendfile = generic_file_sendfile,
	.aio_read = generic_file_aio_read,
	.aio_write = reiserfs_aio_write,
};

struct inode_operations reiserfs_file_inode_operations = {
	.truncate = reiserfs_vfs_truncate_file,
	.setattr = reiserfs_setattr,
	.setxattr = reiserfs_setxattr,
	.getxattr = reiserfs_getxattr,
	.listxattr = reiserfs_listxattr,
	.removexattr = reiserfs_removexattr,
	.permission = reiserfs_permission,
};

1571