extents.c revision 74d3487fc8aa58cec16dff7239dea1ca59bdab0e
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/ext4_jbd2.h>
36#include <linux/jbd2.h>
37#include <linux/highuid.h>
38#include <linux/pagemap.h>
39#include <linux/quotaops.h>
40#include <linux/string.h>
41#include <linux/slab.h>
42#include <linux/falloc.h>
43#include <linux/ext4_fs_extents.h>
44#include <asm/uaccess.h>
45
46
47/*
48 * ext_pblock:
49 * combine low and high parts of physical block number into ext4_fsblk_t
50 */
51static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
52{
53	ext4_fsblk_t block;
54
55	block = le32_to_cpu(ex->ee_start_lo);
56	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
57	return block;
58}
59
60/*
61 * idx_pblock:
62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63 */
64ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65{
66	ext4_fsblk_t block;
67
68	block = le32_to_cpu(ix->ei_leaf_lo);
69	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
70	return block;
71}
72
73/*
74 * ext4_ext_store_pblock:
75 * stores a large physical block number into an extent struct,
76 * breaking it into parts
77 */
78void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79{
80	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
82}
83
84/*
85 * ext4_idx_store_pblock:
86 * stores a large physical block number into an index struct,
87 * breaking it into parts
88 */
89static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
90{
91	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
92	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93}
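
/*
 * Worked example of the hi/lo split used above (illustrative values):
 * for a 48-bit physical block number pb = 0x123456789,
 * ext4_ext_store_pblock() stores
 *
 *	ee_start_lo = cpu_to_le32(0x23456789);	(low 32 bits)
 *	ee_start_hi = cpu_to_le16(0x0001);	(bits 32..47)
 *
 * and ext_pblock() reassembles it as
 *
 *	0x23456789 | (((ext4_fsblk_t)0x0001 << 31) << 1) == 0x123456789
 *
 * i.e. the two-step shift moves the hi part up by a total of 32 bits.
 */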
94
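/*
 * ext4_ext_journal_restart:
 * make sure @handle has at least @needed credits, extending or
 * restarting the transaction if necessary; a failed restart is
 * propagated back to the caller as an ERR_PTR.
 */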
95static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
96{
97	int err;
98
99	if (handle->h_buffer_credits > needed)
100		return handle;
101	if (!ext4_journal_extend(handle, needed))
102		return handle;
103	err = ext4_journal_restart(handle, needed);
104
105	return err ? ERR_PTR(err) : handle;
106}
107
108/*
109 * could return:
110 *  - EROFS
111 *  - ENOMEM
112 */
113static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
114				struct ext4_ext_path *path)
115{
116	if (path->p_bh) {
117		/* path points to block */
118		return ext4_journal_get_write_access(handle, path->p_bh);
119	}
120	/* path points to leaf/index in inode body */
121	/* we use in-core data, no need to protect them */
122	return 0;
123}
124
125/*
126 * could return:
127 *  - EROFS
128 *  - ENOMEM
129 *  - EIO
130 */
131static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
132				struct ext4_ext_path *path)
133{
134	int err;
135	if (path->p_bh) {
136		/* path points to block */
137		err = ext4_journal_dirty_metadata(handle, path->p_bh);
138	} else {
139		/* path points to leaf/index in inode body */
140		err = ext4_mark_inode_dirty(handle, inode);
141	}
142	return err;
143}
144
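/*
 * ext4_ext_find_goal:
 * pick a preferred physical block for an allocation at logical @block,
 * based on the nearest extent or index block in @path or, failing
 * that, on the inode's block group plus a per-process colour offset.
 */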
145static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
146			      struct ext4_ext_path *path,
147			      ext4_lblk_t block)
148{
149	struct ext4_inode_info *ei = EXT4_I(inode);
150	ext4_fsblk_t bg_start;
151	ext4_fsblk_t last_block;
152	ext4_grpblk_t colour;
153	int depth;
154
155	if (path) {
156		struct ext4_extent *ex;
157		depth = path->p_depth;
158
159		/* try to predict block placement */
160		ex = path[depth].p_ext;
161		if (ex)
162			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
163
164		/* it looks like index is empty;
165		 * try to find starting block from index itself */
166		if (path[depth].p_bh)
167			return path[depth].p_bh->b_blocknr;
168	}
169
170	/* OK. use inode's group */
171	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
172		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
173	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
174
175	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
176		colour = (current->pid % 16) *
177			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
178	else
179		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
180	return bg_start + colour + block;
181}
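
/*
 * Worked example of the heuristic above, assuming a 4 KiB block size
 * (32768 blocks per group) and no usable extent or index in the path:
 * a process with current->pid % 16 == 5 gets
 *
 *	colour = 5 * (32768 / 16) = 10240
 *
 * so the goal becomes bg_start + 10240 + block, which spreads
 * concurrent writers across sixteen slices of the inode's block group.
 */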
182
183static ext4_fsblk_t
184ext4_ext_new_block(handle_t *handle, struct inode *inode,
185			struct ext4_ext_path *path,
186			struct ext4_extent *ex, int *err)
187{
188	ext4_fsblk_t goal, newblock;
189
190	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
191	newblock = ext4_new_block(handle, inode, goal, err);
192	return newblock;
193}
194
195static int ext4_ext_space_block(struct inode *inode)
196{
197	int size;
198
199	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
200			/ sizeof(struct ext4_extent);
201#ifdef AGGRESSIVE_TEST
202	if (size > 6)
203		size = 6;
204#endif
205	return size;
206}
207
208static int ext4_ext_space_block_idx(struct inode *inode)
209{
210	int size;
211
212	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
213			/ sizeof(struct ext4_extent_idx);
214#ifdef AGGRESSIVE_TEST
215	if (size > 5)
216		size = 5;
217#endif
218	return size;
219}
220
221static int ext4_ext_space_root(struct inode *inode)
222{
223	int size;
224
225	size = sizeof(EXT4_I(inode)->i_data);
226	size -= sizeof(struct ext4_extent_header);
227	size /= sizeof(struct ext4_extent);
228#ifdef AGGRESSIVE_TEST
229	if (size > 3)
230		size = 3;
231#endif
232	return size;
233}
234
235static int ext4_ext_space_root_idx(struct inode *inode)
236{
237	int size;
238
239	size = sizeof(EXT4_I(inode)->i_data);
240	size -= sizeof(struct ext4_extent_header);
241	size /= sizeof(struct ext4_extent_idx);
242#ifdef AGGRESSIVE_TEST
243	if (size > 4)
244		size = 4;
245#endif
246	return size;
247}
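
/*
 * For reference, the numbers these helpers produce (ignoring the
 * AGGRESSIVE_TEST clamps): struct ext4_extent, struct ext4_extent_idx
 * and the extent header are all 12 bytes, and the in-inode i_data[]
 * area is 60 bytes, so
 *
 *	root:        (60 - 12) / 12   = 4 extents or indexes
 *	4 KiB block: (4096 - 12) / 12 = 340 extents or indexes
 *
 * The "340" that appears in the credit estimate further down comes
 * from exactly this calculation.
 */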
248
249static int
250ext4_ext_max_entries(struct inode *inode, int depth)
251{
252	int max;
253
254	if (depth == ext_depth(inode)) {
255		if (depth == 0)
256			max = ext4_ext_space_root(inode);
257		else
258			max = ext4_ext_space_root_idx(inode);
259	} else {
260		if (depth == 0)
261			max = ext4_ext_space_block(inode);
262		else
263			max = ext4_ext_space_block_idx(inode);
264	}
265
266	return max;
267}
268
269static int __ext4_ext_check_header(const char *function, struct inode *inode,
270					struct ext4_extent_header *eh,
271					int depth)
272{
273	const char *error_msg;
274	int max = 0;
275
276	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
277		error_msg = "invalid magic";
278		goto corrupted;
279	}
280	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
281		error_msg = "unexpected eh_depth";
282		goto corrupted;
283	}
284	if (unlikely(eh->eh_max == 0)) {
285		error_msg = "invalid eh_max";
286		goto corrupted;
287	}
288	max = ext4_ext_max_entries(inode, depth);
289	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
290		error_msg = "too large eh_max";
291		goto corrupted;
292	}
293	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
294		error_msg = "invalid eh_entries";
295		goto corrupted;
296	}
297	return 0;
298
299corrupted:
300	ext4_error(inode->i_sb, function,
301			"bad header in inode #%lu: %s - magic %x, "
302			"entries %u, max %u(%u), depth %u(%u)",
303			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
304			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
305			max, le16_to_cpu(eh->eh_depth), depth);
306
307	return -EIO;
308}
309
310#define ext4_ext_check_header(inode, eh, depth)	\
311	__ext4_ext_check_header(__FUNCTION__, inode, eh, depth)
312
313#ifdef EXT_DEBUG
314static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
315{
316	int k, l = path->p_depth;
317
318	ext_debug("path:");
319	for (k = 0; k <= l; k++, path++) {
320		if (path->p_idx) {
321		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
322			    idx_pblock(path->p_idx));
323		} else if (path->p_ext) {
324			ext_debug("  %d:%d:%llu ",
325				  le32_to_cpu(path->p_ext->ee_block),
326				  ext4_ext_get_actual_len(path->p_ext),
327				  ext_pblock(path->p_ext));
328		} else
329			ext_debug("  []");
330	}
331	ext_debug("\n");
332}
333
334static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
335{
336	int depth = ext_depth(inode);
337	struct ext4_extent_header *eh;
338	struct ext4_extent *ex;
339	int i;
340
341	if (!path)
342		return;
343
344	eh = path[depth].p_hdr;
345	ex = EXT_FIRST_EXTENT(eh);
346
347	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
348		ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
349			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
350	}
351	ext_debug("\n");
352}
353#else
354#define ext4_ext_show_path(inode,path)
355#define ext4_ext_show_leaf(inode,path)
356#endif
357
358void ext4_ext_drop_refs(struct ext4_ext_path *path)
359{
360	int depth = path->p_depth;
361	int i;
362
363	for (i = 0; i <= depth; i++, path++)
364		if (path->p_bh) {
365			brelse(path->p_bh);
366			path->p_bh = NULL;
367		}
368}
369
370/*
371 * ext4_ext_binsearch_idx:
372 * binary search for the closest index of the given block
373 * the header must be checked before calling this
374 */
375static void
376ext4_ext_binsearch_idx(struct inode *inode,
377			struct ext4_ext_path *path, ext4_lblk_t block)
378{
379	struct ext4_extent_header *eh = path->p_hdr;
380	struct ext4_extent_idx *r, *l, *m;
381
382
383	ext_debug("binsearch for %u(idx):  ", block);
384
385	l = EXT_FIRST_INDEX(eh) + 1;
386	r = EXT_LAST_INDEX(eh);
387	while (l <= r) {
388		m = l + (r - l) / 2;
389		if (block < le32_to_cpu(m->ei_block))
390			r = m - 1;
391		else
392			l = m + 1;
393		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
394				m, le32_to_cpu(m->ei_block),
395				r, le32_to_cpu(r->ei_block));
396	}
397
398	path->p_idx = l - 1;
399	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
400		  idx_pblock(path->p_idx));
401
402#ifdef CHECK_BINSEARCH
403	{
404		struct ext4_extent_idx *chix, *ix;
405		int k;
406
407		chix = ix = EXT_FIRST_INDEX(eh);
408		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
409		  if (k != 0 &&
410		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
411				printk("k=%d, ix=0x%p, first=0x%p\n", k,
412					ix, EXT_FIRST_INDEX(eh));
413				printk("%u <= %u\n",
414				       le32_to_cpu(ix->ei_block),
415				       le32_to_cpu(ix[-1].ei_block));
416			}
417			BUG_ON(k && le32_to_cpu(ix->ei_block)
418					   <= le32_to_cpu(ix[-1].ei_block));
419			if (block < le32_to_cpu(ix->ei_block))
420				break;
421			chix = ix;
422		}
423		BUG_ON(chix != path->p_idx);
424	}
425#endif
426
427}
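
/*
 * Worked example of the index search above: with four index entries
 * whose ei_block values are 0, 100, 200 and 300, a lookup for block
 * 150 starts with l at the second entry and r at the last one, probes
 * 200 (too big, so r moves left), then 100 (small enough, so l moves
 * right), and stops with l one past the 100 entry.  p_idx = l - 1
 * therefore points at ei_block == 100, the closest index covering
 * block 150.
 */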
428
429/*
430 * ext4_ext_binsearch:
431 * binary search for closest extent of the given block
432 * the header must be checked before calling this
433 */
434static void
435ext4_ext_binsearch(struct inode *inode,
436		struct ext4_ext_path *path, ext4_lblk_t block)
437{
438	struct ext4_extent_header *eh = path->p_hdr;
439	struct ext4_extent *r, *l, *m;
440
441	if (eh->eh_entries == 0) {
442		/*
443		 * this leaf is empty:
444		 * we get such a leaf in split/add case
445		 */
446		return;
447	}
448
449	ext_debug("binsearch for %u:  ", block);
450
451	l = EXT_FIRST_EXTENT(eh) + 1;
452	r = EXT_LAST_EXTENT(eh);
453
454	while (l <= r) {
455		m = l + (r - l) / 2;
456		if (block < le32_to_cpu(m->ee_block))
457			r = m - 1;
458		else
459			l = m + 1;
460		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
461				m, le32_to_cpu(m->ee_block),
462				r, le32_to_cpu(r->ee_block));
463	}
464
465	path->p_ext = l - 1;
466	ext_debug("  -> %d:%llu:%d ",
467			le32_to_cpu(path->p_ext->ee_block),
468			ext_pblock(path->p_ext),
469			ext4_ext_get_actual_len(path->p_ext));
470
471#ifdef CHECK_BINSEARCH
472	{
473		struct ext4_extent *chex, *ex;
474		int k;
475
476		chex = ex = EXT_FIRST_EXTENT(eh);
477		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
478			BUG_ON(k && le32_to_cpu(ex->ee_block)
479					  <= le32_to_cpu(ex[-1].ee_block));
480			if (block < le32_to_cpu(ex->ee_block))
481				break;
482			chex = ex;
483		}
484		BUG_ON(chex != path->p_ext);
485	}
486#endif
487
488}
489
490int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
491{
492	struct ext4_extent_header *eh;
493
494	eh = ext_inode_hdr(inode);
495	eh->eh_depth = 0;
496	eh->eh_entries = 0;
497	eh->eh_magic = EXT4_EXT_MAGIC;
498	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
499	ext4_mark_inode_dirty(handle, inode);
500	ext4_ext_invalidate_cache(inode);
501	return 0;
502}
503
504struct ext4_ext_path *
505ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
506					struct ext4_ext_path *path)
507{
508	struct ext4_extent_header *eh;
509	struct buffer_head *bh;
510	short int depth, i, ppos = 0, alloc = 0;
511
512	eh = ext_inode_hdr(inode);
513	depth = ext_depth(inode);
514	if (ext4_ext_check_header(inode, eh, depth))
515		return ERR_PTR(-EIO);
516
517
518	/* account possible depth increase */
519	if (!path) {
520		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
521				GFP_NOFS);
522		if (!path)
523			return ERR_PTR(-ENOMEM);
524		alloc = 1;
525	}
526	path[0].p_hdr = eh;
527
528	i = depth;
529	/* walk through the tree */
530	while (i) {
531		ext_debug("depth %d: num %d, max %d\n",
532			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
533
534		ext4_ext_binsearch_idx(inode, path + ppos, block);
535		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
536		path[ppos].p_depth = i;
537		path[ppos].p_ext = NULL;
538
539		bh = sb_bread(inode->i_sb, path[ppos].p_block);
540		if (!bh)
541			goto err;
542
543		eh = ext_block_hdr(bh);
544		ppos++;
545		BUG_ON(ppos > depth);
546		path[ppos].p_bh = bh;
547		path[ppos].p_hdr = eh;
548		i--;
549
550		if (ext4_ext_check_header(inode, eh, i))
551			goto err;
552	}
553
554	path[ppos].p_depth = i;
555	path[ppos].p_hdr = eh;
556	path[ppos].p_ext = NULL;
557	path[ppos].p_idx = NULL;
558
559	/* find extent */
560	ext4_ext_binsearch(inode, path + ppos, block);
561
562	ext4_ext_show_path(inode, path);
563
564	return path;
565
566err:
567	ext4_ext_drop_refs(path);
568	if (alloc)
569		kfree(path);
570	return ERR_PTR(-EIO);
571}
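
/*
 * Typical caller pattern for the lookup above (see, for example,
 * ext4_ext_create_new_leaf() below): the returned path carries buffer
 * references and, when it was allocated here, must be freed by the
 * caller:
 *
 *	path = ext4_ext_find_extent(inode, block, NULL);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);
 *	... use path[0..depth] ...
 *	ext4_ext_drop_refs(path);
 *	kfree(path);
 */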
572
573/*
574 * ext4_ext_insert_index:
575 * insert new index [@logical;@ptr] into the block at @curp;
576 * check where to insert: before @curp or after @curp
577 */
578static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
579				struct ext4_ext_path *curp,
580				int logical, ext4_fsblk_t ptr)
581{
582	struct ext4_extent_idx *ix;
583	int len, err;
584
585	err = ext4_ext_get_access(handle, inode, curp);
586	if (err)
587		return err;
588
589	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
590	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
591	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
592		/* insert after */
593		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
594			len = (len - 1) * sizeof(struct ext4_extent_idx);
595			len = len < 0 ? 0 : len;
596			ext_debug("insert new index %d after: %llu. "
597					"move %d from 0x%p to 0x%p\n",
598					logical, ptr, len,
599					(curp->p_idx + 1), (curp->p_idx + 2));
600			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
601		}
602		ix = curp->p_idx + 1;
603	} else {
604		/* insert before */
605		len = len * sizeof(struct ext4_extent_idx);
606		len = len < 0 ? 0 : len;
607		ext_debug("insert new index %d before: %llu. "
608				"move %d from 0x%p to 0x%p\n",
609				logical, ptr, len,
610				curp->p_idx, (curp->p_idx + 1));
611		memmove(curp->p_idx + 1, curp->p_idx, len);
612		ix = curp->p_idx;
613	}
614
615	ix->ei_block = cpu_to_le32(logical);
616	ext4_idx_store_pblock(ix, ptr);
617	curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1);
618
619	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
620			     > le16_to_cpu(curp->p_hdr->eh_max));
621	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
622
623	err = ext4_ext_dirty(handle, inode, curp);
624	ext4_std_error(inode->i_sb, err);
625
626	return err;
627}
628
629/*
630 * ext4_ext_split:
631 * inserts new subtree into the path, using free index entry
632 * at depth @at:
633 * - allocates all needed blocks (new leaf and all intermediate index blocks)
634 * - makes decision where to split
635 * - moves remaining extents and index entries (right to the split point)
636 *   into the newly allocated blocks
637 * - initializes subtree
638 */
639static int ext4_ext_split(handle_t *handle, struct inode *inode,
640				struct ext4_ext_path *path,
641				struct ext4_extent *newext, int at)
642{
643	struct buffer_head *bh = NULL;
644	int depth = ext_depth(inode);
645	struct ext4_extent_header *neh;
646	struct ext4_extent_idx *fidx;
647	struct ext4_extent *ex;
648	int i = at, k, m, a;
649	ext4_fsblk_t newblock, oldblock;
650	__le32 border;
651	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
652	int err = 0;
653
654	/* make decision: where to split? */
655	/* FIXME: now decision is simplest: at current extent */
656
657	/* if current leaf will be split, then we should use
658	 * border from split point */
659	BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
660	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
661		border = path[depth].p_ext[1].ee_block;
662		ext_debug("leaf will be split."
663				" next leaf starts at %d\n",
664				  le32_to_cpu(border));
665	} else {
666		border = newext->ee_block;
667		ext_debug("leaf will be added."
668				" next leaf starts at %d\n",
669				le32_to_cpu(border));
670	}
671
672	/*
673	 * If an error occurs, we stop processing and mark the
674	 * filesystem read-only.  The index won't be inserted and
675	 * the tree will stay consistent.  The next mount will
676	 * repair the buffers too.
677	 */
678
679	/*
680	 * Get an array to track all allocated blocks.
681	 * We need it to handle errors and to free the blocks
682	 * on failure.
683	 */
684	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
685	if (!ablocks)
686		return -ENOMEM;
687
688	/* allocate all needed blocks */
689	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
690	for (a = 0; a < depth - at; a++) {
691		newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
692		if (newblock == 0)
693			goto cleanup;
694		ablocks[a] = newblock;
695	}
696
697	/* initialize new leaf */
698	newblock = ablocks[--a];
699	BUG_ON(newblock == 0);
700	bh = sb_getblk(inode->i_sb, newblock);
701	if (!bh) {
702		err = -EIO;
703		goto cleanup;
704	}
705	lock_buffer(bh);
706
707	err = ext4_journal_get_create_access(handle, bh);
708	if (err)
709		goto cleanup;
710
711	neh = ext_block_hdr(bh);
712	neh->eh_entries = 0;
713	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
714	neh->eh_magic = EXT4_EXT_MAGIC;
715	neh->eh_depth = 0;
716	ex = EXT_FIRST_EXTENT(neh);
717
718	/* move remainder of path[depth] to the new leaf */
719	BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
720	/* start copy from next extent */
721	/* TODO: we could do it by single memmove */
722	m = 0;
723	path[depth].p_ext++;
724	while (path[depth].p_ext <=
725			EXT_MAX_EXTENT(path[depth].p_hdr)) {
726		ext_debug("move %d:%llu:%d in new leaf %llu\n",
727				le32_to_cpu(path[depth].p_ext->ee_block),
728				ext_pblock(path[depth].p_ext),
729				ext4_ext_get_actual_len(path[depth].p_ext),
730				newblock);
731		/*memmove(ex++, path[depth].p_ext++,
732				sizeof(struct ext4_extent));
733		neh->eh_entries++;*/
734		path[depth].p_ext++;
735		m++;
736	}
737	if (m) {
738		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
739		neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m);
740	}
741
742	set_buffer_uptodate(bh);
743	unlock_buffer(bh);
744
745	err = ext4_journal_dirty_metadata(handle, bh);
746	if (err)
747		goto cleanup;
748	brelse(bh);
749	bh = NULL;
750
751	/* correct old leaf */
752	if (m) {
753		err = ext4_ext_get_access(handle, inode, path + depth);
754		if (err)
755			goto cleanup;
756		path[depth].p_hdr->eh_entries =
757		     cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m);
758		err = ext4_ext_dirty(handle, inode, path + depth);
759		if (err)
760			goto cleanup;
761
762	}
763
764	/* create intermediate indexes */
765	k = depth - at - 1;
766	BUG_ON(k < 0);
767	if (k)
768		ext_debug("create %d intermediate indices\n", k);
769	/* insert new index into current index block */
770	/* current depth stored in i var */
771	i = depth - 1;
772	while (k--) {
773		oldblock = newblock;
774		newblock = ablocks[--a];
775		bh = sb_getblk(inode->i_sb, newblock);
776		if (!bh) {
777			err = -EIO;
778			goto cleanup;
779		}
780		lock_buffer(bh);
781
782		err = ext4_journal_get_create_access(handle, bh);
783		if (err)
784			goto cleanup;
785
786		neh = ext_block_hdr(bh);
787		neh->eh_entries = cpu_to_le16(1);
788		neh->eh_magic = EXT4_EXT_MAGIC;
789		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
790		neh->eh_depth = cpu_to_le16(depth - i);
791		fidx = EXT_FIRST_INDEX(neh);
792		fidx->ei_block = border;
793		ext4_idx_store_pblock(fidx, oldblock);
794
795		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
796				i, newblock, le32_to_cpu(border), oldblock);
797		/* copy indexes */
798		m = 0;
799		path[i].p_idx++;
800
801		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
802				EXT_MAX_INDEX(path[i].p_hdr));
803		BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
804				EXT_LAST_INDEX(path[i].p_hdr));
805		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
806			ext_debug("%d: move %d:%llu in new index %llu\n", i,
807					le32_to_cpu(path[i].p_idx->ei_block),
808					idx_pblock(path[i].p_idx),
809					newblock);
810			/*memmove(++fidx, path[i].p_idx++,
811					sizeof(struct ext4_extent_idx));
812			neh->eh_entries++;
813			BUG_ON(neh->eh_entries > neh->eh_max);*/
814			path[i].p_idx++;
815			m++;
816		}
817		if (m) {
818			memmove(++fidx, path[i].p_idx - m,
819				sizeof(struct ext4_extent_idx) * m);
820			neh->eh_entries =
821				cpu_to_le16(le16_to_cpu(neh->eh_entries) + m);
822		}
823		set_buffer_uptodate(bh);
824		unlock_buffer(bh);
825
826		err = ext4_journal_dirty_metadata(handle, bh);
827		if (err)
828			goto cleanup;
829		brelse(bh);
830		bh = NULL;
831
832		/* correct old index */
833		if (m) {
834			err = ext4_ext_get_access(handle, inode, path + i);
835			if (err)
836				goto cleanup;
837			path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m);
838			err = ext4_ext_dirty(handle, inode, path + i);
839			if (err)
840				goto cleanup;
841		}
842
843		i--;
844	}
845
846	/* insert new index */
847	err = ext4_ext_insert_index(handle, inode, path + at,
848				    le32_to_cpu(border), newblock);
849
850cleanup:
851	if (bh) {
852		if (buffer_locked(bh))
853			unlock_buffer(bh);
854		brelse(bh);
855	}
856
857	if (err) {
858		/* free all allocated blocks in error case */
859		for (i = 0; i < depth; i++) {
860			if (!ablocks[i])
861				continue;
862			ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
863		}
864	}
865	kfree(ablocks);
866
867	return err;
868}
869
870/*
871 * ext4_ext_grow_indepth:
872 * implements tree growing procedure:
873 * - allocates new block
874 * - moves top-level data (index block or leaf) into the new block
875 * - initializes new top-level, creating index that points to the
876 *   just created block
877 */
878static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
879					struct ext4_ext_path *path,
880					struct ext4_extent *newext)
881{
882	struct ext4_ext_path *curp = path;
883	struct ext4_extent_header *neh;
884	struct ext4_extent_idx *fidx;
885	struct buffer_head *bh;
886	ext4_fsblk_t newblock;
887	int err = 0;
888
889	newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
890	if (newblock == 0)
891		return err;
892
893	bh = sb_getblk(inode->i_sb, newblock);
894	if (!bh) {
895		err = -EIO;
896		ext4_std_error(inode->i_sb, err);
897		return err;
898	}
899	lock_buffer(bh);
900
901	err = ext4_journal_get_create_access(handle, bh);
902	if (err) {
903		unlock_buffer(bh);
904		goto out;
905	}
906
907	/* move top-level index/leaf into new block */
908	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
909
910	/* set size of new block */
911	neh = ext_block_hdr(bh);
912	/* old root could have indexes or leaves
913	 * so calculate eh_max the right way */
914	if (ext_depth(inode))
915		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
916	else
917		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
918	neh->eh_magic = EXT4_EXT_MAGIC;
919	set_buffer_uptodate(bh);
920	unlock_buffer(bh);
921
922	err = ext4_journal_dirty_metadata(handle, bh);
923	if (err)
924		goto out;
925
926	/* create index in new top-level index: num,max,pointer */
927	err = ext4_ext_get_access(handle, inode, curp);
928	if (err)
929		goto out;
930
931	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
932	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
933	curp->p_hdr->eh_entries = cpu_to_le16(1);
934	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
935
936	if (path[0].p_hdr->eh_depth)
937		curp->p_idx->ei_block =
938			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
939	else
940		curp->p_idx->ei_block =
941			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
942	ext4_idx_store_pblock(curp->p_idx, newblock);
943
944	neh = ext_inode_hdr(inode);
945	fidx = EXT_FIRST_INDEX(neh);
946	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
947		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
948		  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
949
950	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
951	err = ext4_ext_dirty(handle, inode, curp);
952out:
953	brelse(bh);
954
955	return err;
956}
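
/*
 * Example of what the growth above does to a minimal tree: an inode
 * whose root leaf is full (depth 0, usually eh_max == 4) gets a new
 * block holding those extents, while the root is rewritten to contain
 * a single index entry pointing at that block and eh_depth is bumped
 * from 0 to 1.  Every later growth adds one more index level the same
 * way.
 */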
957
958/*
959 * ext4_ext_create_new_leaf:
960 * finds empty index and adds new leaf.
961 * if no free index is found, then it requests in-depth growing.
962 */
963static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
964					struct ext4_ext_path *path,
965					struct ext4_extent *newext)
966{
967	struct ext4_ext_path *curp;
968	int depth, i, err = 0;
969
970repeat:
971	i = depth = ext_depth(inode);
972
973	/* walk up the tree and look for a free index entry */
974	curp = path + depth;
975	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
976		i--;
977		curp--;
978	}
979
980	/* we use already allocated block for index block,
981	 * so subsequent data blocks should be contiguous */
982	if (EXT_HAS_FREE_INDEX(curp)) {
983		/* if we found index with free entry, then use that
984		 * entry: create all needed subtree and add new leaf */
985		err = ext4_ext_split(handle, inode, path, newext, i);
986
987		/* refill path */
988		ext4_ext_drop_refs(path);
989		path = ext4_ext_find_extent(inode,
990				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
991				    path);
992		if (IS_ERR(path))
993			err = PTR_ERR(path);
994	} else {
995		/* tree is full, time to grow in depth */
996		err = ext4_ext_grow_indepth(handle, inode, path, newext);
997		if (err)
998			goto out;
999
1000		/* refill path */
1001		ext4_ext_drop_refs(path);
1002		path = ext4_ext_find_extent(inode,
1003				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1004				    path);
1005		if (IS_ERR(path)) {
1006			err = PTR_ERR(path);
1007			goto out;
1008		}
1009
1010		/*
1011		 * only first (depth 0 -> 1) produces free space;
1012		 * in all other cases we have to split the grown tree
1013		 */
1014		depth = ext_depth(inode);
1015		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1016			/* now we need to split */
1017			goto repeat;
1018		}
1019	}
1020
1021out:
1022	return err;
1023}
1024
1025/*
1026 * search the closest allocated block to the left of *logical
1027 * and return it at @logical, with its physical address at @phys;
1028 * if *logical is the smallest allocated block, the function
1029 * returns 0 at @phys
1030 * return value contains 0 (success) or error code
1031 */
1032int
1033ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1034			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1035{
1036	struct ext4_extent_idx *ix;
1037	struct ext4_extent *ex;
1038	int depth, ee_len;
1039
1040	BUG_ON(path == NULL);
1041	depth = path->p_depth;
1042	*phys = 0;
1043
1044	if (depth == 0 && path->p_ext == NULL)
1045		return 0;
1046
1047	/* usually the extent in the path covers blocks smaller
1048	 * than *logical, but it can be that the extent is the
1049	 * first one in the file */
1050
1051	ex = path[depth].p_ext;
1052	ee_len = ext4_ext_get_actual_len(ex);
1053	if (*logical < le32_to_cpu(ex->ee_block)) {
1054		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1055		while (--depth >= 0) {
1056			ix = path[depth].p_idx;
1057			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1058		}
1059		return 0;
1060	}
1061
1062	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1063
1064	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1065	*phys = ext_pblock(ex) + ee_len - 1;
1066	return 0;
1067}
1068
1069/*
1070 * search the closest allocated block to the right of *logical
1071 * and return it at @logical, with its physical address at @phys;
1072 * if there is no allocated block to the right, the function
1073 * returns 0 at @phys
1074 * return value contains 0 (success) or error code
1075 */
1076int
1077ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1078			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1079{
1080	struct buffer_head *bh = NULL;
1081	struct ext4_extent_header *eh;
1082	struct ext4_extent_idx *ix;
1083	struct ext4_extent *ex;
1084	ext4_fsblk_t block;
1085	int depth, ee_len;
1086
1087	BUG_ON(path == NULL);
1088	depth = path->p_depth;
1089	*phys = 0;
1090
1091	if (depth == 0 && path->p_ext == NULL)
1092		return 0;
1093
1094	/* usually the extent in the path covers blocks smaller
1095	 * than *logical, but it can be that the extent is the
1096	 * first one in the file */
1097
1098	ex = path[depth].p_ext;
1099	ee_len = ext4_ext_get_actual_len(ex);
1100	if (*logical < le32_to_cpu(ex->ee_block)) {
1101		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1102		while (--depth >= 0) {
1103			ix = path[depth].p_idx;
1104			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1105		}
1106		*logical = le32_to_cpu(ex->ee_block);
1107		*phys = ext_pblock(ex);
1108		return 0;
1109	}
1110
1111	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1112
1113	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1114		/* next allocated block in this leaf */
1115		ex++;
1116		*logical = le32_to_cpu(ex->ee_block);
1117		*phys = ext_pblock(ex);
1118		return 0;
1119	}
1120
1121	/* go up and search for index to the right */
1122	while (--depth >= 0) {
1123		ix = path[depth].p_idx;
1124		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1125			break;
1126	}
1127
1128	if (depth < 0) {
1129		/* we've gone up to the root and
1130		 * found no index to the right */
1131		return 0;
1132	}
1133
1134	/* we've found index to the right, let's
1135	 * follow it and find the closest allocated
1136	 * block to the right */
1137	ix++;
1138	block = idx_pblock(ix);
1139	while (++depth < path->p_depth) {
1140		bh = sb_bread(inode->i_sb, block);
1141		if (bh == NULL)
1142			return -EIO;
1143		eh = ext_block_hdr(bh);
1144		if (ext4_ext_check_header(inode, eh, depth)) {
1145			put_bh(bh);
1146			return -EIO;
1147		}
1148		ix = EXT_FIRST_INDEX(eh);
1149		block = idx_pblock(ix);
1150		put_bh(bh);
1151	}
1152
1153	bh = sb_bread(inode->i_sb, block);
1154	if (bh == NULL)
1155		return -EIO;
1156	eh = ext_block_hdr(bh);
1157	if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
1158		put_bh(bh);
1159		return -EIO;
1160	}
1161	ex = EXT_FIRST_EXTENT(eh);
1162	*logical = le32_to_cpu(ex->ee_block);
1163	*phys = ext_pblock(ex);
1164	put_bh(bh);
1165	return 0;
1166
1167}
1168
1169/*
1170 * ext4_ext_next_allocated_block:
1171 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1172 * NOTE: it considers block number from index entry as
1173 * allocated block. Thus, index entries have to be consistent
1174 * with leaves.
1175 */
1176static ext4_lblk_t
1177ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1178{
1179	int depth;
1180
1181	BUG_ON(path == NULL);
1182	depth = path->p_depth;
1183
1184	if (depth == 0 && path->p_ext == NULL)
1185		return EXT_MAX_BLOCK;
1186
1187	while (depth >= 0) {
1188		if (depth == path->p_depth) {
1189			/* leaf */
1190			if (path[depth].p_ext !=
1191					EXT_LAST_EXTENT(path[depth].p_hdr))
1192			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1193		} else {
1194			/* index */
1195			if (path[depth].p_idx !=
1196					EXT_LAST_INDEX(path[depth].p_hdr))
1197			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1198		}
1199		depth--;
1200	}
1201
1202	return EXT_MAX_BLOCK;
1203}
1204
1205/*
1206 * ext4_ext_next_leaf_block:
1207 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1208 */
1209static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1210					struct ext4_ext_path *path)
1211{
1212	int depth;
1213
1214	BUG_ON(path == NULL);
1215	depth = path->p_depth;
1216
1217	/* a zero-depth tree has no leaf blocks at all */
1218	if (depth == 0)
1219		return EXT_MAX_BLOCK;
1220
1221	/* go to index block */
1222	depth--;
1223
1224	while (depth >= 0) {
1225		if (path[depth].p_idx !=
1226				EXT_LAST_INDEX(path[depth].p_hdr))
1227			return (ext4_lblk_t)
1228				le32_to_cpu(path[depth].p_idx[1].ei_block);
1229		depth--;
1230	}
1231
1232	return EXT_MAX_BLOCK;
1233}
1234
1235/*
1236 * ext4_ext_correct_indexes:
1237 * if leaf gets modified and modified extent is first in the leaf,
1238 * then we have to correct all indexes above.
1239 * TODO: do we need to correct tree in all cases?
1240 */
1241static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1242				struct ext4_ext_path *path)
1243{
1244	struct ext4_extent_header *eh;
1245	int depth = ext_depth(inode);
1246	struct ext4_extent *ex;
1247	__le32 border;
1248	int k, err = 0;
1249
1250	eh = path[depth].p_hdr;
1251	ex = path[depth].p_ext;
1252	BUG_ON(ex == NULL);
1253	BUG_ON(eh == NULL);
1254
1255	if (depth == 0) {
1256		/* there is no tree at all */
1257		return 0;
1258	}
1259
1260	if (ex != EXT_FIRST_EXTENT(eh)) {
1261		/* we correct tree if first leaf got modified only */
1262		return 0;
1263	}
1264
1265	/*
1266	 * TODO: we need correction if border is smaller than current one
1267	 */
1268	k = depth - 1;
1269	border = path[depth].p_ext->ee_block;
1270	err = ext4_ext_get_access(handle, inode, path + k);
1271	if (err)
1272		return err;
1273	path[k].p_idx->ei_block = border;
1274	err = ext4_ext_dirty(handle, inode, path + k);
1275	if (err)
1276		return err;
1277
1278	while (k--) {
1279		/* change all left-side indexes */
1280		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1281			break;
1282		err = ext4_ext_get_access(handle, inode, path + k);
1283		if (err)
1284			break;
1285		path[k].p_idx->ei_block = border;
1286		err = ext4_ext_dirty(handle, inode, path + k);
1287		if (err)
1288			break;
1289	}
1290
1291	return err;
1292}
1293
1294static int
1295ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1296				struct ext4_extent *ex2)
1297{
1298	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1299
1300	/*
1301	 * Make sure that either both extents are uninitialized, or
1302	 * both are _not_.
1303	 */
1304	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1305		return 0;
1306
1307	if (ext4_ext_is_uninitialized(ex1))
1308		max_len = EXT_UNINIT_MAX_LEN;
1309	else
1310		max_len = EXT_INIT_MAX_LEN;
1311
1312	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1313	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1314
1315	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1316			le32_to_cpu(ex2->ee_block))
1317		return 0;
1318
1319	/*
1320	 * To allow future support for preallocated extents to be added
1321	 * as an RO_COMPAT feature, refuse to merge two extents if
1322	 * this can result in the top bit of ee_len being set.
1323	 */
1324	if (ext1_ee_len + ext2_ee_len > max_len)
1325		return 0;
1326#ifdef AGGRESSIVE_TEST
1327	if (ext1_ee_len >= 4)
1328		return 0;
1329#endif
1330
1331	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1332		return 1;
1333	return 0;
1334}
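
/*
 * Example: an initialized extent [100, len 8] at physical block 5000
 * merges with [108, len 4] at physical block 5008, since both share
 * the same initialization state, are logically and physically
 * contiguous, and the combined length of 12 stays below
 * EXT_INIT_MAX_LEN (32768).  The top bit of ee_len flags uninitialized
 * extents, which is why those are capped one block lower.
 */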
1335
1336/*
1337 * This function tries to merge the "ex" extent to the next extent in the tree.
1338 * It always tries to merge towards right. If you want to merge towards
1339 * left, pass "ex - 1" as argument instead of "ex".
1340 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1341 * 1 if they got merged.
1342 */
1343int ext4_ext_try_to_merge(struct inode *inode,
1344			  struct ext4_ext_path *path,
1345			  struct ext4_extent *ex)
1346{
1347	struct ext4_extent_header *eh;
1348	unsigned int depth, len;
1349	int merge_done = 0;
1350	int uninitialized = 0;
1351
1352	depth = ext_depth(inode);
1353	BUG_ON(path[depth].p_hdr == NULL);
1354	eh = path[depth].p_hdr;
1355
1356	while (ex < EXT_LAST_EXTENT(eh)) {
1357		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1358			break;
1359		/* merge with next extent! */
1360		if (ext4_ext_is_uninitialized(ex))
1361			uninitialized = 1;
1362		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1363				+ ext4_ext_get_actual_len(ex + 1));
1364		if (uninitialized)
1365			ext4_ext_mark_uninitialized(ex);
1366
1367		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1368			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1369				* sizeof(struct ext4_extent);
1370			memmove(ex + 1, ex + 2, len);
1371		}
1372		eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1);
1373		merge_done = 1;
1374		WARN_ON(eh->eh_entries == 0);
1375		if (!eh->eh_entries)
1376			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
1377			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
1378	}
1379
1380	return merge_done;
1381}
1382
1383/*
1384 * check if a portion of the "newext" extent overlaps with an
1385 * existing extent.
1386 *
1387 * If there is an overlap discovered, it updates the length of the newext
1388 * such that there will be no overlap, and then returns 1.
1389 * If there is no overlap found, it returns 0.
1390 */
1391unsigned int ext4_ext_check_overlap(struct inode *inode,
1392				    struct ext4_extent *newext,
1393				    struct ext4_ext_path *path)
1394{
1395	ext4_lblk_t b1, b2;
1396	unsigned int depth, len1;
1397	unsigned int ret = 0;
1398
1399	b1 = le32_to_cpu(newext->ee_block);
1400	len1 = ext4_ext_get_actual_len(newext);
1401	depth = ext_depth(inode);
1402	if (!path[depth].p_ext)
1403		goto out;
1404	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1405
1406	/*
1407	 * get the next allocated block if the extent in the path
1408	 * is before the requested block(s)
1409	 */
1410	if (b2 < b1) {
1411		b2 = ext4_ext_next_allocated_block(path);
1412		if (b2 == EXT_MAX_BLOCK)
1413			goto out;
1414	}
1415
1416	/* check for wrap through zero on extent logical start block*/
1417	if (b1 + len1 < b1) {
1418		len1 = EXT_MAX_BLOCK - b1;
1419		newext->ee_len = cpu_to_le16(len1);
1420		ret = 1;
1421	}
1422
1423	/* check for overlap */
1424	if (b1 + len1 > b2) {
1425		newext->ee_len = cpu_to_le16(b2 - b1);
1426		ret = 1;
1427	}
1428out:
1429	return ret;
1430}
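
/*
 * Example of the trimming above: if newext starts at logical block 100
 * with length 50 and the next allocated block found from the path is
 * 120, then b1 + len1 (150) runs past b2 (120), so ee_len is cut down
 * to 20 blocks and the function returns 1; the insertion that follows
 * can then no longer collide with the existing extent.
 */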
1431
1432/*
1433 * ext4_ext_insert_extent:
1434 * tries to merge the requested extent into the existing extent or
1435 * inserts requested extent as new one into the tree,
1436 * creating new leaf in the no-space case.
1437 */
1438int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1439				struct ext4_ext_path *path,
1440				struct ext4_extent *newext)
1441{
1442	struct ext4_extent_header * eh;
1443	struct ext4_extent *ex, *fex;
1444	struct ext4_extent *nearex; /* nearest extent */
1445	struct ext4_ext_path *npath = NULL;
1446	int depth, len, err;
1447	ext4_lblk_t next;
1448	unsigned uninitialized = 0;
1449
1450	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
1451	depth = ext_depth(inode);
1452	ex = path[depth].p_ext;
1453	BUG_ON(path[depth].p_hdr == NULL);
1454
1455	/* try to insert block into found extent and return */
1456	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1457		ext_debug("append %d block to %d:%d (from %llu)\n",
1458				ext4_ext_get_actual_len(newext),
1459				le32_to_cpu(ex->ee_block),
1460				ext4_ext_get_actual_len(ex), ext_pblock(ex));
1461		err = ext4_ext_get_access(handle, inode, path + depth);
1462		if (err)
1463			return err;
1464
1465		/*
1466		 * ext4_can_extents_be_merged should have checked that either
1467		 * both extents are uninitialized, or both aren't. Thus we
1468		 * need to check only one of them here.
1469		 */
1470		if (ext4_ext_is_uninitialized(ex))
1471			uninitialized = 1;
1472		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1473					+ ext4_ext_get_actual_len(newext));
1474		if (uninitialized)
1475			ext4_ext_mark_uninitialized(ex);
1476		eh = path[depth].p_hdr;
1477		nearex = ex;
1478		goto merge;
1479	}
1480
1481repeat:
1482	depth = ext_depth(inode);
1483	eh = path[depth].p_hdr;
1484	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1485		goto has_space;
1486
1487	/* probably next leaf has space for us? */
1488	fex = EXT_LAST_EXTENT(eh);
1489	next = ext4_ext_next_leaf_block(inode, path);
1490	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1491	    && next != EXT_MAX_BLOCK) {
1492		ext_debug("next leaf block - %d\n", next);
1493		BUG_ON(npath != NULL);
1494		npath = ext4_ext_find_extent(inode, next, NULL);
1495		if (IS_ERR(npath))
1496			return PTR_ERR(npath);
1497		BUG_ON(npath->p_depth != path->p_depth);
1498		eh = npath[depth].p_hdr;
1499		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1500			ext_debug("next leaf isn't full(%d)\n",
1501				  le16_to_cpu(eh->eh_entries));
1502			path = npath;
1503			goto repeat;
1504		}
1505		ext_debug("next leaf has no free space(%d,%d)\n",
1506			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1507	}
1508
1509	/*
1510	 * There is no free space in the found leaf.
1511	 * We're gonna add a new leaf in the tree.
1512	 */
1513	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1514	if (err)
1515		goto cleanup;
1516	depth = ext_depth(inode);
1517	eh = path[depth].p_hdr;
1518
1519has_space:
1520	nearex = path[depth].p_ext;
1521
1522	err = ext4_ext_get_access(handle, inode, path + depth);
1523	if (err)
1524		goto cleanup;
1525
1526	if (!nearex) {
1527		/* there is no extent in this leaf, create first one */
1528		ext_debug("first extent in the leaf: %d:%llu:%d\n",
1529				le32_to_cpu(newext->ee_block),
1530				ext_pblock(newext),
1531				ext4_ext_get_actual_len(newext));
1532		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1533	} else if (le32_to_cpu(newext->ee_block)
1534			   > le32_to_cpu(nearex->ee_block)) {
1535/*		BUG_ON(newext->ee_block == nearex->ee_block); */
1536		if (nearex != EXT_LAST_EXTENT(eh)) {
1537			len = EXT_MAX_EXTENT(eh) - nearex;
1538			len = (len - 1) * sizeof(struct ext4_extent);
1539			len = len < 0 ? 0 : len;
1540			ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
1541					"move %d from 0x%p to 0x%p\n",
1542					le32_to_cpu(newext->ee_block),
1543					ext_pblock(newext),
1544					ext4_ext_get_actual_len(newext),
1545					nearex, len, nearex + 1, nearex + 2);
1546			memmove(nearex + 2, nearex + 1, len);
1547		}
1548		path[depth].p_ext = nearex + 1;
1549	} else {
1550		BUG_ON(newext->ee_block == nearex->ee_block);
1551		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1552		len = len < 0 ? 0 : len;
1553		ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
1554				"move %d from 0x%p to 0x%p\n",
1555				le32_to_cpu(newext->ee_block),
1556				ext_pblock(newext),
1557				ext4_ext_get_actual_len(newext),
1558				nearex, len, nearex + 1, nearex + 2);
1559		memmove(nearex + 1, nearex, len);
1560		path[depth].p_ext = nearex;
1561	}
1562
1563	eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1);
1564	nearex = path[depth].p_ext;
1565	nearex->ee_block = newext->ee_block;
1566	ext4_ext_store_pblock(nearex, ext_pblock(newext));
1567	nearex->ee_len = newext->ee_len;
1568
1569merge:
1570	/* try to merge extents to the right */
1571	ext4_ext_try_to_merge(inode, path, nearex);
1572
1573	/* try to merge extents to the left */
1574
1575	/* time to correct all indexes above */
1576	err = ext4_ext_correct_indexes(handle, inode, path);
1577	if (err)
1578		goto cleanup;
1579
1580	err = ext4_ext_dirty(handle, inode, path + depth);
1581
1582cleanup:
1583	if (npath) {
1584		ext4_ext_drop_refs(npath);
1585		kfree(npath);
1586	}
1587	ext4_ext_tree_changed(inode);
1588	ext4_ext_invalidate_cache(inode);
1589	return err;
1590}
1591
1592static void
1593ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1594			__u32 len, ext4_fsblk_t start, int type)
1595{
1596	struct ext4_ext_cache *cex;
1597	BUG_ON(len == 0);
1598	cex = &EXT4_I(inode)->i_cached_extent;
1599	cex->ec_type = type;
1600	cex->ec_block = block;
1601	cex->ec_len = len;
1602	cex->ec_start = start;
1603}
1604
1605/*
1606 * ext4_ext_put_gap_in_cache:
1607 * calculate boundaries of the gap that the requested block fits into
1608 * and cache this gap
1609 */
1610static void
1611ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1612				ext4_lblk_t block)
1613{
1614	int depth = ext_depth(inode);
1615	unsigned long len;
1616	ext4_lblk_t lblock;
1617	struct ext4_extent *ex;
1618
1619	ex = path[depth].p_ext;
1620	if (ex == NULL) {
1621		/* there is no extent yet, so gap is [0;-] */
1622		lblock = 0;
1623		len = EXT_MAX_BLOCK;
1624		ext_debug("cache gap(whole file):");
1625	} else if (block < le32_to_cpu(ex->ee_block)) {
1626		lblock = block;
1627		len = le32_to_cpu(ex->ee_block) - block;
1628		ext_debug("cache gap(before): %u [%u:%u]",
1629				block,
1630				le32_to_cpu(ex->ee_block),
1631				 ext4_ext_get_actual_len(ex));
1632	} else if (block >= le32_to_cpu(ex->ee_block)
1633			+ ext4_ext_get_actual_len(ex)) {
1634		ext4_lblk_t next;
1635		lblock = le32_to_cpu(ex->ee_block)
1636			+ ext4_ext_get_actual_len(ex);
1637
1638		next = ext4_ext_next_allocated_block(path);
1639		ext_debug("cache gap(after): [%u:%u] %u",
1640				le32_to_cpu(ex->ee_block),
1641				ext4_ext_get_actual_len(ex),
1642				block);
1643		BUG_ON(next == lblock);
1644		len = next - lblock;
1645	} else {
1646		lblock = len = 0;
1647		BUG();
1648	}
1649
1650	ext_debug(" -> %u:%lu\n", lblock, len);
1651	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1652}
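
/*
 * Example of gap caching: with extents covering logical blocks 0..9
 * and 50..59, a lookup of block 20 lands on the 0..9 extent, takes the
 * third branch above (block is past ee_block + len), and caches the
 * gap starting at lblock 10 with length next - lblock = 50 - 10 = 40.
 * Later lookups of blocks 10..49 can then be answered from the cache
 * without walking the tree.
 */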
1653
1654static int
1655ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1656			struct ext4_extent *ex)
1657{
1658	struct ext4_ext_cache *cex;
1659
1660	cex = &EXT4_I(inode)->i_cached_extent;
1661
1662	/* has cache valid data? */
1663	if (cex->ec_type == EXT4_EXT_CACHE_NO)
1664		return EXT4_EXT_CACHE_NO;
1665
1666	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1667			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1668	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1669		ex->ee_block = cpu_to_le32(cex->ec_block);
1670		ext4_ext_store_pblock(ex, cex->ec_start);
1671		ex->ee_len = cpu_to_le16(cex->ec_len);
1672		ext_debug("%u cached by %u:%u:%llu\n",
1673				block,
1674				cex->ec_block, cex->ec_len, cex->ec_start);
1675		return cex->ec_type;
1676	}
1677
1678	/* not in cache */
1679	return EXT4_EXT_CACHE_NO;
1680}
1681
1682/*
1683 * ext4_ext_rm_idx:
1684 * removes index from the index block.
1685 * It's used in truncate case only, thus all requests are for
1686 * last index in the block only.
1687 */
1688static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1689			struct ext4_ext_path *path)
1690{
1691	struct buffer_head *bh;
1692	int err;
1693	ext4_fsblk_t leaf;
1694
1695	/* free index block */
1696	path--;
1697	leaf = idx_pblock(path->p_idx);
1698	BUG_ON(path->p_hdr->eh_entries == 0);
1699	err = ext4_ext_get_access(handle, inode, path);
1700	if (err)
1701		return err;
1702	path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
1703	err = ext4_ext_dirty(handle, inode, path);
1704	if (err)
1705		return err;
1706	ext_debug("index is empty, remove it, free block %llu\n", leaf);
1707	bh = sb_find_get_block(inode->i_sb, leaf);
1708	ext4_forget(handle, 1, inode, bh, leaf);
1709	ext4_free_blocks(handle, inode, leaf, 1, 1);
1710	return err;
1711}
1712
1713/*
1714 * ext4_ext_calc_credits_for_insert:
1715 * This routine returns max. credits that the extent tree can consume.
1716 * It should be OK for low-performance paths like ->writepage()
1717 * To allow many writing processes to fit into a single transaction,
1718 * the caller should calculate credits under i_data_sem and
1719 * pass the actual path.
1720 */
1721int ext4_ext_calc_credits_for_insert(struct inode *inode,
1722						struct ext4_ext_path *path)
1723{
1724	int depth, needed;
1725
1726	if (path) {
1727		/* probably there is space in leaf? */
1728		depth = ext_depth(inode);
1729		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1730				< le16_to_cpu(path[depth].p_hdr->eh_max))
1731			return 1;
1732	}
1733
1734	/*
1735	 * given 32-bit logical block (4294967296 blocks), max. tree
1736	 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1737	 * Let's also add one more level for imbalance.
1738	 */
1739	depth = 5;
1740
1741	/* allocation of new data block(s) */
1742	needed = 2;
1743
1744	/*
1745	 * tree can be full, so it would need to grow in depth:
1746	 * we need one credit to modify old root, credits for
1747	 * new root will be added in split accounting
1748	 */
1749	needed += 1;
1750
1751	/*
1752	 * Index split can happen, we would need:
1753	 *    allocate intermediate indexes (bitmap + group)
1754	 *  + change two blocks at each level, but root (already included)
1755	 */
1756	needed += (depth * 2) + (depth * 2);
1757
1758	/* any allocation modifies superblock */
1759	needed += 1;
1760
1761	return needed;
1762}
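
/*
 * Worked example of the estimate above when no path is supplied:
 * depth is taken as 5, so
 *
 *	needed = 2 (data blocks) + 1 (root) + (5 * 2 + 5 * 2) + 1 (sb)
 *	       = 24 credits
 *
 * which is the worst case the caller has to reserve before it knows
 * where in the tree the insertion will land.
 */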
1763
1764static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1765				struct ext4_extent *ex,
1766				ext4_lblk_t from, ext4_lblk_t to)
1767{
1768	struct buffer_head *bh;
1769	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
1770	int i, metadata = 0;
1771
1772	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1773		metadata = 1;
1774#ifdef EXTENTS_STATS
1775	{
1776		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1777		spin_lock(&sbi->s_ext_stats_lock);
1778		sbi->s_ext_blocks += ee_len;
1779		sbi->s_ext_extents++;
1780		if (ee_len < sbi->s_ext_min)
1781			sbi->s_ext_min = ee_len;
1782		if (ee_len > sbi->s_ext_max)
1783			sbi->s_ext_max = ee_len;
1784		if (ext_depth(inode) > sbi->s_depth_max)
1785			sbi->s_depth_max = ext_depth(inode);
1786		spin_unlock(&sbi->s_ext_stats_lock);
1787	}
1788#endif
1789	if (from >= le32_to_cpu(ex->ee_block)
1790	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
1791		/* tail removal */
1792		ext4_lblk_t num;
1793		ext4_fsblk_t start;
1794
1795		num = le32_to_cpu(ex->ee_block) + ee_len - from;
1796		start = ext_pblock(ex) + ee_len - num;
1797		ext_debug("free last %u blocks starting %llu\n", num, start);
1798		for (i = 0; i < num; i++) {
1799			bh = sb_find_get_block(inode->i_sb, start + i);
1800			ext4_forget(handle, 0, inode, bh, start + i);
1801		}
1802		ext4_free_blocks(handle, inode, start, num, metadata);
1803	} else if (from == le32_to_cpu(ex->ee_block)
1804		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
1805		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
1806			from, to, le32_to_cpu(ex->ee_block), ee_len);
1807	} else {
1808		printk(KERN_INFO "strange request: removal(2) "
1809				"%u-%u from %u:%u\n",
1810				from, to, le32_to_cpu(ex->ee_block), ee_len);
1811	}
1812	return 0;
1813}
1814
1815static int
1816ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1817		struct ext4_ext_path *path, ext4_lblk_t start)
1818{
1819	int err = 0, correct_index = 0;
1820	int depth = ext_depth(inode), credits;
1821	struct ext4_extent_header *eh;
1822	ext4_lblk_t a, b, block;
1823	unsigned num;
1824	ext4_lblk_t ex_ee_block;
1825	unsigned short ex_ee_len;
1826	unsigned uninitialized = 0;
1827	struct ext4_extent *ex;
1828
1829	/* the header must be checked already in ext4_ext_remove_space() */
1830	ext_debug("truncate since %u in leaf\n", start);
1831	if (!path[depth].p_hdr)
1832		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1833	eh = path[depth].p_hdr;
1834	BUG_ON(eh == NULL);
1835
1836	/* find where to start removing */
1837	ex = EXT_LAST_EXTENT(eh);
1838
1839	ex_ee_block = le32_to_cpu(ex->ee_block);
1840	if (ext4_ext_is_uninitialized(ex))
1841		uninitialized = 1;
1842	ex_ee_len = ext4_ext_get_actual_len(ex);
1843
1844	while (ex >= EXT_FIRST_EXTENT(eh) &&
1845			ex_ee_block + ex_ee_len > start) {
1846		ext_debug("remove ext %u:%u\n", ex_ee_block, ex_ee_len);
1847		path[depth].p_ext = ex;
1848
1849		a = ex_ee_block > start ? ex_ee_block : start;
1850		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
1851			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
1852
1853		ext_debug("  border %u:%u\n", a, b);
1854
1855		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
1856			block = 0;
1857			num = 0;
1858			BUG();
1859		} else if (a != ex_ee_block) {
1860			/* remove tail of the extent */
1861			block = ex_ee_block;
1862			num = a - block;
1863		} else if (b != ex_ee_block + ex_ee_len - 1) {
1864			/* remove head of the extent */
1865			block = a;
1866			num = b - a;
1867			/* there is no "make a hole" API yet */
1868			BUG();
1869		} else {
1870			/* remove whole extent: excellent! */
1871			block = ex_ee_block;
1872			num = 0;
1873			BUG_ON(a != ex_ee_block);
1874			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1875		}
1876
1877		/* at present, extent can't cross block group: */
1878		/* leaf + bitmap + group desc + sb + inode */
1879		credits = 5;
1880		if (ex == EXT_FIRST_EXTENT(eh)) {
1881			correct_index = 1;
1882			credits += (ext_depth(inode)) + 1;
1883		}
1884#ifdef CONFIG_QUOTA
1885		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1886#endif
1887
1888		handle = ext4_ext_journal_restart(handle, credits);
1889		if (IS_ERR(handle)) {
1890			err = PTR_ERR(handle);
1891			goto out;
1892		}
1893
1894		err = ext4_ext_get_access(handle, inode, path + depth);
1895		if (err)
1896			goto out;
1897
1898		err = ext4_remove_blocks(handle, inode, ex, a, b);
1899		if (err)
1900			goto out;
1901
1902		if (num == 0) {
1903			/* this extent is removed; mark slot entirely unused */
1904			ext4_ext_store_pblock(ex, 0);
1905			eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1906		}
1907
1908		ex->ee_block = cpu_to_le32(block);
1909		ex->ee_len = cpu_to_le16(num);
1910		/*
1911		 * Do not mark uninitialized if all the blocks in the
1912		 * extent have been removed.
1913		 */
1914		if (uninitialized && num)
1915			ext4_ext_mark_uninitialized(ex);
1916
1917		err = ext4_ext_dirty(handle, inode, path + depth);
1918		if (err)
1919			goto out;
1920
1921		ext_debug("new extent: %u:%u:%llu\n", block, num,
1922				ext_pblock(ex));
1923		ex--;
1924		ex_ee_block = le32_to_cpu(ex->ee_block);
1925		ex_ee_len = ext4_ext_get_actual_len(ex);
1926	}
1927
1928	if (correct_index && eh->eh_entries)
1929		err = ext4_ext_correct_indexes(handle, inode, path);
1930
1931	/* if this leaf is free, then we should
1932	 * remove it from index block above */
1933	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
1934		err = ext4_ext_rm_idx(handle, inode, path + depth);
1935
1936out:
1937	return err;
1938}
1939
1940/*
1941 * ext4_ext_more_to_rm:
1942 * returns 1 if current index has to be freed (even partial)
1943 */
1944static int
1945ext4_ext_more_to_rm(struct ext4_ext_path *path)
1946{
1947	BUG_ON(path->p_idx == NULL);
1948
1949	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
1950		return 0;
1951
1952	/*
1953	 * if truncate on deeper level happened, it wasn't partial,
1954	 * so we have to consider current index for truncation
1955	 */
1956	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
1957		return 0;
1958	return 1;
1959}
1960
1961static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
1962{
1963	struct super_block *sb = inode->i_sb;
1964	int depth = ext_depth(inode);
1965	struct ext4_ext_path *path;
1966	handle_t *handle;
1967	int i = 0, err = 0;
1968
1969	ext_debug("truncate since %u\n", start);
1970
1971	/* the first extent we free will probably be the last one in its block */
1972	handle = ext4_journal_start(inode, depth + 1);
1973	if (IS_ERR(handle))
1974		return PTR_ERR(handle);
1975
1976	ext4_ext_invalidate_cache(inode);
1977
1978	/*
1979	 * We start scanning from right side, freeing all the blocks
1980	 * after i_size and walking into the tree depth-wise.
1981	 */
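	/*
	 * path[] doubles as an explicit stack for this iterative,
	 * depth-first walk: path[i] records where the scan stands at tree
	 * level i, and i == depth means the current block is a leaf.
	 */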
	/* a journal handle is already running here, so avoid fs reclaim */
1982	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
1983	if (path == NULL) {
1984		ext4_journal_stop(handle);
1985		return -ENOMEM;
1986	}
1987	path[0].p_hdr = ext_inode_hdr(inode);
1988	if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
1989		err = -EIO;
1990		goto out;
1991	}
1992	path[0].p_depth = depth;
1993
1994	while (i >= 0 && err == 0) {
1995		if (i == depth) {
1996			/* this is leaf block */
1997			err = ext4_ext_rm_leaf(handle, inode, path, start);
1998			/* root level has p_bh == NULL, brelse() eats this */
1999			brelse(path[i].p_bh);
2000			path[i].p_bh = NULL;
2001			i--;
2002			continue;
2003		}
2004
2005		/* this is index block */
2006		if (!path[i].p_hdr) {
2007			ext_debug("initialize header\n");
2008			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2009		}
2010
2011		if (!path[i].p_idx) {
2012			/* this level hasn't been touched yet */
2013			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2014			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2015			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2016				  path[i].p_hdr,
2017				  le16_to_cpu(path[i].p_hdr->eh_entries));
2018		} else {
2019			/* we were already here; look at the next index */
2020			path[i].p_idx--;
2021		}
2022
2023		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2024				i, EXT_FIRST_INDEX(path[i].p_hdr),
2025				path[i].p_idx);
2026		if (ext4_ext_more_to_rm(path + i)) {
2027			struct buffer_head *bh;
2028			/* go to the next level */
2029			ext_debug("move to level %d (block %llu)\n",
2030				  i + 1, idx_pblock(path[i].p_idx));
2031			memset(path + i + 1, 0, sizeof(*path));
2032			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
2033			if (!bh) {
2034				/* should we reset i_size? */
2035				err = -EIO;
2036				break;
2037			}
2038			if (WARN_ON(i + 1 > depth)) {
				/* don't leak the buffer on this error path */
				brelse(bh);
2039				err = -EIO;
2040				break;
2041			}
2042			if (ext4_ext_check_header(inode, ext_block_hdr(bh),
2043							depth - i - 1)) {
				brelse(bh);
2044				err = -EIO;
2045				break;
2046			}
2047			path[i + 1].p_bh = bh;
2048
2049			/* save the actual number of index entries, since this
2050			 * number can change on the next iteration */
2051			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2052			i++;
2053		} else {
2054			/* we finished processing this index, go up */
2055			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2056				/* index is empty, remove it;
2057				 * the handle was already prepared (credits
2058				 * extended) when the leaf below was removed */
2059				err = ext4_ext_rm_idx(handle, inode, path + i);
2060			}
2061			/* root level has p_bh == NULL, brelse() eats this */
2062			brelse(path[i].p_bh);
2063			path[i].p_bh = NULL;
2064			i--;
2065			ext_debug("return to level %d\n", i);
2066		}
2067	}
2068
2069	/* TODO: flexible tree reduction should be here */
2070	if (path->p_hdr->eh_entries == 0) {
2071		/*
2072		 * truncating to zero freed the whole tree,
2073		 * so we need to correct eh_depth
2074		 */
2075		err = ext4_ext_get_access(handle, inode, path);
2076		if (err == 0) {
2077			ext_inode_hdr(inode)->eh_depth = 0;
2078			ext_inode_hdr(inode)->eh_max =
2079				cpu_to_le16(ext4_ext_space_root(inode));
2080			err = ext4_ext_dirty(handle, inode, path);
2081		}
2082	}
2083out:
2084	ext4_ext_tree_changed(inode);
2085	ext4_ext_drop_refs(path);
2086	kfree(path);
2087	ext4_journal_stop(handle);
2088
2089	return err;
2090}
2091
2092/*
2093 * called at mount time
2094 */
2095void ext4_ext_init(struct super_block *sb)
2096{
2097	/*
2098	 * possible initialization would be here
2099	 */
2100
2101	if (test_opt(sb, EXTENTS)) {
2102		printk(KERN_INFO "EXT4-fs: file extents enabled");
2103#ifdef AGGRESSIVE_TEST
2104		printk(", aggressive tests");
2105#endif
2106#ifdef CHECK_BINSEARCH
2107		printk(", check binsearch");
2108#endif
2109#ifdef EXTENTS_STATS
2110		printk(", stats");
2111#endif
2112		printk("\n");
2113#ifdef EXTENTS_STATS
2114		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2115		EXT4_SB(sb)->s_ext_min = 1 << 30;
2116		EXT4_SB(sb)->s_ext_max = 0;
2117#endif
2118	}
2119}
2120
2121/*
2122 * called at umount time
2123 */
2124void ext4_ext_release(struct super_block *sb)
2125{
2126	if (!test_opt(sb, EXTENTS))
2127		return;
2128
2129#ifdef EXTENTS_STATS
2130	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2131		struct ext4_sb_info *sbi = EXT4_SB(sb);
2132		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2133			sbi->s_ext_blocks, sbi->s_ext_extents,
2134			sbi->s_ext_blocks / sbi->s_ext_extents);
2135		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2136			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2137	}
2138#endif
2139}
2140
2141/*
2142 * This function is called by ext4_ext_get_blocks() if someone tries to write
2143 * to an uninitialized extent. It may result in splitting the uninitialized
2144 * extent into multiple extents (up to three - one initialized and two
2145 * uninitialized).
2146 * There are three possibilities:
2147 *   a> There is no split required: Entire extent should be initialized
2148 *   b> Splits into two extents: Write is happening at either end of the extent
2149 *   c> Splits into three extents: Someone is writing in the middle of the extent
2150 */
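/*
 * A rough sketch of the three cases (for illustration only), where "ex"
 * spans [ee_block, ee_block + ee_len - 1] and the write covers
 * [iblock, iblock + max_blocks - 1]:
 *
 *  a> whole extent written:   |=========== initialized ===========|
 *
 *  b> write at the start:     |==== init (ex2) ====|= uninit (ex3) =|
 *     write at the end:       |= uninit (ex1) =|==== init (ex2) ====|
 *
 *  c> write in the middle:    |uninit|====== init ======|= uninit ==|
 *                               ex1          ex2            ex3
 */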
2151static int ext4_ext_convert_to_initialized(handle_t *handle,
2152						struct inode *inode,
2153						struct ext4_ext_path *path,
2154						ext4_lblk_t iblock,
2155						unsigned long max_blocks)
2156{
2157	struct ext4_extent *ex, newex;
2158	struct ext4_extent *ex1 = NULL;
2159	struct ext4_extent *ex2 = NULL;
2160	struct ext4_extent *ex3 = NULL;
2161	struct ext4_extent_header *eh;
2162	ext4_lblk_t ee_block;
2163	unsigned int allocated, ee_len, depth;
2164	ext4_fsblk_t newblock;
2165	int err = 0;
2166	int ret = 0;
2167
2168	depth = ext_depth(inode);
2169	eh = path[depth].p_hdr;
2170	ex = path[depth].p_ext;
2171	ee_block = le32_to_cpu(ex->ee_block);
2172	ee_len = ext4_ext_get_actual_len(ex);
2173	allocated = ee_len - (iblock - ee_block);
2174	newblock = iblock - ee_block + ext_pblock(ex);
2175	ex2 = ex;
2176
2177	err = ext4_ext_get_access(handle, inode, path + depth);
2178	if (err)
2179		goto out;
2180
2181	/* ex1: ee_block to iblock - 1 : uninitialized */
2182	if (iblock > ee_block) {
2183		ex1 = ex;
2184		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2185		ext4_ext_mark_uninitialized(ex1);
2186		ex2 = &newex;
2187	}
2188	/*
2189	 * for sanity, update the length of the ex2 extent before
2190	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2191	 * overlap of blocks.
2192	 */
2193	if (!ex1 && allocated > max_blocks)
2194		ex2->ee_len = cpu_to_le16(max_blocks);
2195	/* ex3: iblock + max_blocks to ee_block + ee_len - 1 : uninitialized */
2196	if (allocated > max_blocks) {
2197		unsigned int newdepth;
2198		ex3 = &newex;
2199		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2200		ext4_ext_store_pblock(ex3, newblock + max_blocks);
2201		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2202		ext4_ext_mark_uninitialized(ex3);
2203		err = ext4_ext_insert_extent(handle, inode, path, ex3);
2204		if (err)
2205			goto out;
2206		/*
2207		 * The depth, and hence eh & ex might change
2208		 * as part of the insert above.
2209		 */
2210		newdepth = ext_depth(inode);
2211		if (newdepth != depth) {
2212			depth = newdepth;
2213			ext4_ext_drop_refs(path);
2214			path = ext4_ext_find_extent(inode, iblock, path);
2215			if (IS_ERR(path)) {
2216				err = PTR_ERR(path);
2217				goto out;
2218			}
2219			eh = path[depth].p_hdr;
2220			ex = path[depth].p_ext;
2221			if (ex2 != &newex)
2222				ex2 = ex;
2223
2224			err = ext4_ext_get_access(handle, inode, path + depth);
2225			if (err)
2226				goto out;
2227		}
2228		allocated = max_blocks;
2229	}
2230	/*
2231	 * If there was a change of depth as part of the
2232	 * insertion of ex3 above, we need to update the length
2233	 * of the ex1 extent again here
2234	 */
2235	if (ex1 && ex1 != ex) {
2236		ex1 = ex;
2237		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2238		ext4_ext_mark_uninitialized(ex1);
2239		ex2 = &newex;
2240	}
2241	/* ex2: iblock to iblock + max_blocks - 1 : initialized */
2242	ex2->ee_block = cpu_to_le32(iblock);
2243	ext4_ext_store_pblock(ex2, newblock);
2244	ex2->ee_len = cpu_to_le16(allocated);
2245	if (ex2 != ex)
2246		goto insert;
2247	/*
2248	 * New (initialized) extent starts from the first block
2249	 * in the current extent. i.e., ex2 == ex
2250	 * We have to see if it can be merged with the extent
2251	 * on the left.
2252	 */
2253	if (ex2 > EXT_FIRST_EXTENT(eh)) {
2254		/*
2255		 * To merge left, pass "ex2 - 1" to try_to_merge(),
2256		 * since it merges towards right _only_.
2257		 */
2258		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2259		if (ret) {
2260			err = ext4_ext_correct_indexes(handle, inode, path);
2261			if (err)
2262				goto out;
2263			depth = ext_depth(inode);
2264			ex2--;
2265		}
2266	}
2267	/*
2268	 * Try to merge towards the right. This is needed only when
2269	 * the whole extent is being written to,
2270	 * i.e. ex2 == ex and ex3 == NULL.
2271	 */
2272	if (!ex3) {
2273		ret = ext4_ext_try_to_merge(inode, path, ex2);
2274		if (ret) {
2275			err = ext4_ext_correct_indexes(handle, inode, path);
2276			if (err)
2277				goto out;
2278		}
2279	}
2280	/* Mark modified extent as dirty */
2281	err = ext4_ext_dirty(handle, inode, path + depth);
2282	goto out;
2283insert:
2284	err = ext4_ext_insert_extent(handle, inode, path, &newex);
2285out:
2286	return err ? err : allocated;
2287}
2288
2289/*
2290 * Needs to be called with
2291 * down_read(&EXT4_I(inode)->i_data_sem) held if not allocating a file system
2292 * block (i.e., create is zero); otherwise with down_write(&EXT4_I(inode)->i_data_sem)
2293 */
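/*
 * A typical caller might look roughly like this (hypothetical snippet; in
 * this tree the locking is handled by wrappers such as
 * ext4_get_blocks_wrap()):
 *
 *	down_read(&EXT4_I(inode)->i_data_sem);
 *	ret = ext4_ext_get_blocks(handle, inode, iblock, max_blocks,
 *				  bh_result, 0, 0);
 *	up_read(&EXT4_I(inode)->i_data_sem);
 *
 * A return value greater than zero is the number of blocks mapped;
 * a negative value is an error.
 */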
2294int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2295			ext4_lblk_t iblock,
2296			unsigned long max_blocks, struct buffer_head *bh_result,
2297			int create, int extend_disksize)
2298{
2299	struct ext4_ext_path *path = NULL;
2300	struct ext4_extent_header *eh;
2301	struct ext4_extent newex, *ex;
2302	ext4_fsblk_t goal, newblock;
2303	int err = 0, depth, ret;
2304	unsigned long allocated = 0;
2305	struct ext4_allocation_request ar;
2306
2307	__clear_bit(BH_New, &bh_result->b_state);
2308	ext_debug("blocks %u/%lu requested for inode %lu\n",
2309			iblock, max_blocks, inode->i_ino);
2310
2311	/* check in cache */
2312	goal = ext4_ext_in_cache(inode, iblock, &newex);
2313	if (goal) {
2314		if (goal == EXT4_EXT_CACHE_GAP) {
2315			if (!create) {
2316				/*
2317				 * block isn't allocated yet and
2318				 * user doesn't want to allocate it
2319				 */
2320				goto out2;
2321			}
2322			/* we should allocate requested block */
2323		} else if (goal == EXT4_EXT_CACHE_EXTENT) {
2324			/* block is already allocated */
2325			newblock = iblock
2326				   - le32_to_cpu(newex.ee_block)
2327				   + ext_pblock(&newex);
2328			/* number of remaining blocks in the extent */
2329			allocated = ext4_ext_get_actual_len(&newex) -
2330					(iblock - le32_to_cpu(newex.ee_block));
2331			goto out;
2332		} else {
2333			BUG();
2334		}
2335	}
2336
2337	/* find extent for this block */
2338	path = ext4_ext_find_extent(inode, iblock, NULL);
2339	if (IS_ERR(path)) {
2340		err = PTR_ERR(path);
2341		path = NULL;
2342		goto out2;
2343	}
2344
2345	depth = ext_depth(inode);
2346
2347	/*
2348	 * consistent leaf must not be empty;
2349	 * this situation is possible, though, _during_ tree modification;
2350	 * this is why assert can't be put in ext4_ext_find_extent()
2351	 */
2352	BUG_ON(path[depth].p_ext == NULL && depth != 0);
2353	eh = path[depth].p_hdr;
2354
2355	ex = path[depth].p_ext;
2356	if (ex) {
2357		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
2358		ext4_fsblk_t ee_start = ext_pblock(ex);
2359		unsigned short ee_len;
2360
2361		/*
2362		 * Uninitialized extents are treated as holes, except that
2363		 * we split out initialized portions during a write.
2364		 */
2365		ee_len = ext4_ext_get_actual_len(ex);
2366		/* if found extent covers block, simply return it */
2367		if (iblock >= ee_block && iblock < ee_block + ee_len) {
2368			newblock = iblock - ee_block + ee_start;
2369			/* number of remaining blocks in the extent */
2370			allocated = ee_len - (iblock - ee_block);
2371			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
2372					ee_block, ee_len, newblock);
2373
2374			/* Do not put uninitialized extent in the cache */
2375			if (!ext4_ext_is_uninitialized(ex)) {
2376				ext4_ext_put_in_cache(inode, ee_block,
2377							ee_len, ee_start,
2378							EXT4_EXT_CACHE_EXTENT);
2379				goto out;
2380			}
2381			if (create == EXT4_CREATE_UNINITIALIZED_EXT)
2382				goto out;
2383			if (!create)
2384				goto out2;
2385
2386			ret = ext4_ext_convert_to_initialized(handle, inode,
2387								path, iblock,
2388								max_blocks);
2389			if (ret <= 0) {
2390				err = ret;
2391				goto out2;
2392			} else
2393				allocated = ret;
2394			goto outnew;
2395		}
2396	}
2397
2398	/*
2399	 * requested block isn't allocated yet;
2400	 * we don't create blocks here if the create flag is zero
2401	 */
2402	if (!create) {
2403		/*
2404		 * put the gap we just found into the cache to speed up
2405		 * subsequent requests
2406		 */
2407		ext4_ext_put_gap_in_cache(inode, path, iblock);
2408		goto out2;
2409	}
2410	/*
2411	 * Okay, we need to do block allocation.  Lazily initialize the block
2412	 * allocation info here if necessary.
2413	 */
2414	if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2415		ext4_init_block_alloc_info(inode);
2416
2417	/* find neighbour allocated blocks */
2418	ar.lleft = iblock;
2419	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
2420	if (err)
2421		goto out2;
2422	ar.lright = iblock;
2423	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
2424	if (err)
2425		goto out2;
2426
2427	/*
2428	 * See if request is beyond maximum number of blocks we can have in
2429	 * a single extent. For an initialized extent this limit is
2430	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
2431	 * EXT_UNINIT_MAX_LEN.
2432	 */
2433	if (max_blocks > EXT_INIT_MAX_LEN &&
2434	    create != EXT4_CREATE_UNINITIALIZED_EXT)
2435		max_blocks = EXT_INIT_MAX_LEN;
2436	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
2437		 create == EXT4_CREATE_UNINITIALIZED_EXT)
2438		max_blocks = EXT_UNINIT_MAX_LEN;
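	/*
	 * Note: ee_len is a 16-bit field whose high bit marks an extent
	 * uninitialized, so these limits work out to 32768 blocks for an
	 * initialized extent and 32767 for an uninitialized one (see the
	 * EXT_INIT_MAX_LEN / EXT_UNINIT_MAX_LEN definitions).
	 */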
2439
2440	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
2441	newex.ee_block = cpu_to_le32(iblock);
2442	newex.ee_len = cpu_to_le16(max_blocks);
2443	err = ext4_ext_check_overlap(inode, &newex, path);
2444	if (err)
2445		allocated = ext4_ext_get_actual_len(&newex);
2446	else
2447		allocated = max_blocks;
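	/*
	 * Note: a non-zero return from ext4_ext_check_overlap() is not a
	 * failure here; it means the requested range would have run into
	 * the next extent and newex.ee_len was clipped, so only the
	 * clipped length is allocated.
	 */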
2448
2449	/* allocate new block */
2450	ar.inode = inode;
2451	ar.goal = ext4_ext_find_goal(inode, path, iblock);
2452	ar.logical = iblock;
2453	ar.len = allocated;
2454	if (S_ISREG(inode->i_mode))
2455		ar.flags = EXT4_MB_HINT_DATA;
2456	else
2457		/* disable in-core preallocation for non-regular files */
2458		ar.flags = 0;
2459	newblock = ext4_mb_new_blocks(handle, &ar, &err);
2460	if (!newblock)
2461		goto out2;
2462	ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2463			goal, newblock, allocated);
2464
2465	/* try to insert new extent into found leaf and return */
2466	ext4_ext_store_pblock(&newex, newblock);
2467	newex.ee_len = cpu_to_le16(ar.len);
2468	if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
2469		ext4_ext_mark_uninitialized(&newex);
2470	err = ext4_ext_insert_extent(handle, inode, path, &newex);
2471	if (err) {
2472		/* free data blocks we just allocated */
2473		/* not a good idea to call discard here directly,
2474		 * but otherwise we'd need to call it on every free() */
2475		ext4_mb_discard_inode_preallocations(inode);
2476		ext4_free_blocks(handle, inode, ext_pblock(&newex),
2477					ext4_ext_get_actual_len(&newex), 0);
2478		goto out2;
2479	}
2480
2481	if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
2482		EXT4_I(inode)->i_disksize = inode->i_size;
2483
2484	/* previous routine could use block we allocated */
2485	newblock = ext_pblock(&newex);
2486	allocated = ext4_ext_get_actual_len(&newex);
2487outnew:
2488	__set_bit(BH_New, &bh_result->b_state);
2489
2490	/* Cache only when it is _not_ an uninitialized extent */
2491	if (create != EXT4_CREATE_UNINITIALIZED_EXT)
2492		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2493						EXT4_EXT_CACHE_EXTENT);
2494out:
2495	if (allocated > max_blocks)
2496		allocated = max_blocks;
2497	ext4_ext_show_leaf(inode, path);
2498	__set_bit(BH_Mapped, &bh_result->b_state);
2499	bh_result->b_bdev = inode->i_sb->s_bdev;
2500	bh_result->b_blocknr = newblock;
2501out2:
2502	if (path) {
2503		ext4_ext_drop_refs(path);
2504		kfree(path);
2505	}
2506	return err ? err : allocated;
2507}
2508
2509void ext4_ext_truncate(struct inode * inode, struct page *page)
2510{
2511	struct address_space *mapping = inode->i_mapping;
2512	struct super_block *sb = inode->i_sb;
2513	ext4_lblk_t last_block;
2514	handle_t *handle;
2515	int err = 0;
2516
2517	/*
2518	 * the first extent we free will probably be the last one in its block
2519	 */
2520	err = ext4_writepage_trans_blocks(inode) + 3;
2521	handle = ext4_journal_start(inode, err);
2522	if (IS_ERR(handle)) {
2523		if (page) {
2524			clear_highpage(page);
2525			flush_dcache_page(page);
2526			unlock_page(page);
2527			page_cache_release(page);
2528		}
2529		return;
2530	}
2531
2532	if (page)
2533		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2534
2535	down_write(&EXT4_I(inode)->i_data_sem);
2536	ext4_ext_invalidate_cache(inode);
2537
2538	ext4_mb_discard_inode_preallocations(inode);
2539
2540	/*
2541	 * TODO: optimization is possible here.
2542	 * Probably we need not scan at all,
2543	 * because page truncation is enough.
2544	 */
2545	if (ext4_orphan_add(handle, inode))
2546		goto out_stop;
2547
2548	/* we have to know where to truncate from in the crash case */
2549	EXT4_I(inode)->i_disksize = inode->i_size;
2550	ext4_mark_inode_dirty(handle, inode);
2551
2552	last_block = (inode->i_size + sb->s_blocksize - 1)
2553			>> EXT4_BLOCK_SIZE_BITS(sb);
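	/*
	 * For example (hypothetical sizes): with 4KiB blocks and
	 * i_size == 10000, last_block == (10000 + 4095) >> 12 == 3, so
	 * logical blocks 0..2 (bytes 0..12287) are kept and the removal
	 * below starts at block 3.
	 */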
2554	err = ext4_ext_remove_space(inode, last_block);
2555
2556	/* In a multi-transaction truncate, we only make the final
2557	 * transaction synchronous.
2558	 */
2559	if (IS_SYNC(inode))
2560		handle->h_sync = 1;
2561
2562out_stop:
2563	/*
2564	 * If this was a simple ftruncate() and the file will remain alive,
2565	 * then we need to clear up the orphan record which we created above.
2566	 * However, if this was a real unlink then we were called by
2567	 * ext4_delete_inode(), and we allow that function to clean up the
2568	 * orphan info for us.
2569	 */
2570	if (inode->i_nlink)
2571		ext4_orphan_del(handle, inode);
2572
2573	up_write(&EXT4_I(inode)->i_data_sem);
2574	ext4_journal_stop(handle);
2575}
2576
2577/*
2578 * ext4_ext_writepage_trans_blocks:
2579 * calculate max number of blocks we could modify
2580 * in order to allocate new block for an inode
2581 */
2582int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2583{
2584	int needed;
2585
2586	needed = ext4_ext_calc_credits_for_insert(inode, NULL);
2587
2588	/* caller wants to allocate num blocks, but note it includes sb */
2589	needed = needed * num - (num - 1);
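	/*
	 * For example (illustrative figures): if a single insert is
	 * estimated at 12 credits, num == 3 yields 12 * 3 - (3 - 1) = 34,
	 * since the superblock buffer counted in each per-insert estimate
	 * only has to be journalled once.
	 */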
2590
2591#ifdef CONFIG_QUOTA
2592	needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2593#endif
2594
2595	return needed;
2596}
2597
2598/*
2599 * preallocate space for a file. This implements ext4's fallocate inode
2600 * operation, which gets called from the sys_fallocate system call.
2601 * For block-mapped files, posix_fallocate should fall back to the method
2602 * of writing zeroes to the required new blocks (the same behavior that is
2603 * expected of file systems that do not support the fallocate() system call).
2604 */
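/*
 * A worked example of the offset/len arithmetic below (assuming 4KiB
 * blocks, i.e. blkbits == 12, and that EXT4_BLOCK_ALIGN() rounds up to the
 * next block boundary): offset == 5000, len == 10000 gives
 * block == 5000 >> 12 == 1 and
 * max_blocks == (EXT4_BLOCK_ALIGN(15000, 12) >> 12) - 1 == 4 - 1 == 3,
 * i.e. logical blocks 1..3 are preallocated.
 */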
2605long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2606{
2607	handle_t *handle;
2608	ext4_lblk_t block;
2609	unsigned long max_blocks;
2610	ext4_fsblk_t nblocks = 0;
2611	int ret = 0;
2612	int ret2 = 0;
2613	int retries = 0;
2614	struct buffer_head map_bh;
2615	unsigned int credits, blkbits = inode->i_blkbits;
2616
2617	/*
2618	 * currently supporting (pre)allocate mode for extent-based
2619	 * files _only_
2620	 */
2621	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
2622		return -EOPNOTSUPP;
2623
2624	/* preallocation to directories is currently not supported */
2625	if (S_ISDIR(inode->i_mode))
2626		return -ENODEV;
2627
2628	block = offset >> blkbits;
2629	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
2630			- block;
2631
2632	/*
2633	 * credits to insert 1 extent into extent tree + buffers to be able to
2634	 * modify 1 super block, 1 block bitmap and 1 group descriptor.
2635	 */
2636	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
2637	mutex_lock(&inode->i_mutex);
2638retry:
2639	while (ret >= 0 && ret < max_blocks) {
2640		block = block + ret;
2641		max_blocks = max_blocks - ret;
2642		handle = ext4_journal_start(inode, credits);
2643		if (IS_ERR(handle)) {
2644			ret = PTR_ERR(handle);
2645			break;
2646		}
2647
2648		ret = ext4_get_blocks_wrap(handle, inode, block,
2649					  max_blocks, &map_bh,
2650					  EXT4_CREATE_UNINITIALIZED_EXT, 0);
2651		WARN_ON(ret <= 0);
2652		if (ret <= 0) {
2653			ext4_error(inode->i_sb, "ext4_fallocate",
2654				    "ext4_ext_get_blocks returned error: "
2655				    "inode#%lu, block=%u, max_blocks=%lu",
2656				    inode->i_ino, block, max_blocks);
2657			ret = -EIO;
2658			ext4_mark_inode_dirty(handle, inode);
2659			ret2 = ext4_journal_stop(handle);
2660			break;
2661		}
2662		if (ret > 0) {
2663			/* check wrap through sign-bit/zero here */
2664			if ((block + ret) < 0 || (block + ret) < block) {
2665				ret = -EIO;
2666				ext4_mark_inode_dirty(handle, inode);
2667				ret2 = ext4_journal_stop(handle);
2668				break;
2669			}
2670			if (buffer_new(&map_bh) && ((block + ret) >
2671			    (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits)
2672			    >> blkbits)))
2673					nblocks = nblocks + ret;
2674		}
2675
2676		/* Update ctime if new blocks get allocated */
2677		if (nblocks) {
2678			struct timespec now;
2679
2680			now = current_fs_time(inode->i_sb);
2681			if (!timespec_equal(&inode->i_ctime, &now))
2682				inode->i_ctime = now;
2683		}
2684
2685		ext4_mark_inode_dirty(handle, inode);
2686		ret2 = ext4_journal_stop(handle);
2687		if (ret2)
2688			break;
2689	}
2690
2691	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2692		goto retry;
2693
2694	/*
2695	 * Time to update the file size.
2696	 * Update only when preallocation was requested beyond the file size.
2697	 */
2698	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
2699	    (offset + len) > i_size_read(inode)) {
2700		if (ret > 0) {
2701			/*
2702			 * if no error, we assume preallocation succeeded
2703			 * completely
2704			 */
2705			i_size_write(inode, offset + len);
2706			EXT4_I(inode)->i_disksize = i_size_read(inode);
2707		} else if (ret < 0 && nblocks) {
2708			/* Handle partial allocation scenario */
2709			loff_t newsize;
2710
2711			newsize  = (nblocks << blkbits) + i_size_read(inode);
2712			i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits));
2713			EXT4_I(inode)->i_disksize = i_size_read(inode);
2714		}
2715	}
2716
2717	mutex_unlock(&inode->i_mutex);
2718	return ret > 0 ? ret2 : ret;
2719}
2720