extents.c revision c2ea3fde61f1df1dbf062345f23277dcd6f01dfe
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include "ext4_jbd2.h"
44#include "ext4_extents.h"
45
46
47/*
48 * ext_pblock:
49 * combine low and high parts of physical block number into ext4_fsblk_t
50 */
51static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
52{
53	ext4_fsblk_t block;
54
55	block = le32_to_cpu(ex->ee_start_lo);
56	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
57	return block;
58}
59
60/*
61 * idx_pblock:
62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63 */
64ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65{
66	ext4_fsblk_t block;
67
68	block = le32_to_cpu(ix->ei_leaf_lo);
69	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
70	return block;
71}
72
73/*
74 * ext4_ext_store_pblock:
75 * stores a large physical block number into an extent struct,
76 * breaking it into parts
77 */
78void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79{
80	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
82}
83
84/*
85 * ext4_idx_store_pblock:
86 * stores a large physical block number into an index struct,
87 * breaking it into parts
88 */
89static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
90{
91	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
92	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93}
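/*
 * Worked example for the helpers above (illustrative, not part of the
 * original source): the 48-bit physical block 0x123456789A is stored as
 * ee_start_lo = 0x3456789A and ee_start_hi = 0x0012; ext_pblock()
 * recombines them as 0x3456789A | ((ext4_fsblk_t)0x0012 << 32), i.e.
 * 0x123456789A again.  The "<< 31) << 1" form is just a two-step way of
 * shifting by 32.
 */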
94
95static int ext4_ext_journal_restart(handle_t *handle, int needed)
96{
97	int err;
98
99	if (handle->h_buffer_credits > needed)
100		return 0;
101	err = ext4_journal_extend(handle, needed);
102	if (err <= 0)
103		return err;
104	return ext4_journal_restart(handle, needed);
105}
106
107/*
108 * could return:
109 *  - EROFS
110 *  - ENOMEM
111 */
112static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
113				struct ext4_ext_path *path)
114{
115	if (path->p_bh) {
116		/* path points to block */
117		return ext4_journal_get_write_access(handle, path->p_bh);
118	}
119	/* path points to leaf/index in inode body */
120	/* we use in-core data, no need to protect them */
121	return 0;
122}
123
124/*
125 * could return:
126 *  - EROFS
127 *  - ENOMEM
128 *  - EIO
129 */
130static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
131				struct ext4_ext_path *path)
132{
133	int err;
134	if (path->p_bh) {
135		/* path points to block */
136		err = ext4_journal_dirty_metadata(handle, path->p_bh);
137	} else {
138		/* path points to leaf/index in inode body */
139		err = ext4_mark_inode_dirty(handle, inode);
140	}
141	return err;
142}
143
144static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
145			      struct ext4_ext_path *path,
146			      ext4_lblk_t block)
147{
148	struct ext4_inode_info *ei = EXT4_I(inode);
149	ext4_fsblk_t bg_start;
150	ext4_fsblk_t last_block;
151	ext4_grpblk_t colour;
152	int depth;
153
154	if (path) {
155		struct ext4_extent *ex;
156		depth = path->p_depth;
157
158		/* try to predict block placement */
159		ex = path[depth].p_ext;
160		if (ex)
161			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
162
163		/* it looks like index is empty;
164		 * try to find starting block from index itself */
165		if (path[depth].p_bh)
166			return path[depth].p_bh->b_blocknr;
167	}
168
169	/* OK. use inode's group */
170	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
171		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
172	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
173
174	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
175		colour = (current->pid % 16) *
176			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
177	else
178		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
179	return bg_start + colour + block;
180}
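/*
 * Illustrative example of the fallback heuristic above (assumed values,
 * not from the original source): with 32768 blocks per group, an inode in
 * block group 3, s_first_data_block == 0 and current->pid % 16 == 5,
 * bg_start == 3 * 32768 == 98304 and colour == 5 * (32768 / 16) == 10240,
 * so the suggested goal is 98304 + 10240 + block.  The pid-based colour
 * spreads allocations of different processes across the group.
 */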
181
182/*
183 * Allocation for a metadata block
184 */
185static ext4_fsblk_t
186ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
187			struct ext4_ext_path *path,
188			struct ext4_extent *ex, int *err)
189{
190	ext4_fsblk_t goal, newblock;
191
192	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
193	newblock = ext4_new_meta_block(handle, inode, goal, err);
194	return newblock;
195}
196
197static int ext4_ext_space_block(struct inode *inode)
198{
199	int size;
200
201	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
202			/ sizeof(struct ext4_extent);
203#ifdef AGGRESSIVE_TEST
204	if (size > 6)
205		size = 6;
206#endif
207	return size;
208}
209
210static int ext4_ext_space_block_idx(struct inode *inode)
211{
212	int size;
213
214	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
215			/ sizeof(struct ext4_extent_idx);
216#ifdef AGGRESSIVE_TEST
217	if (size > 5)
218		size = 5;
219#endif
220	return size;
221}
222
223static int ext4_ext_space_root(struct inode *inode)
224{
225	int size;
226
227	size = sizeof(EXT4_I(inode)->i_data);
228	size -= sizeof(struct ext4_extent_header);
229	size /= sizeof(struct ext4_extent);
230#ifdef AGGRESSIVE_TEST
231	if (size > 3)
232		size = 3;
233#endif
234	return size;
235}
236
237static int ext4_ext_space_root_idx(struct inode *inode)
238{
239	int size;
240
241	size = sizeof(EXT4_I(inode)->i_data);
242	size -= sizeof(struct ext4_extent_header);
243	size /= sizeof(struct ext4_extent_idx);
244#ifdef AGGRESSIVE_TEST
245	if (size > 4)
246		size = 4;
247#endif
248	return size;
249}
250
251/*
252 * Calculate the number of metadata blocks needed
253 * to allocate @blocks
254 * Worst case is one block per extent
255 */
256int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
257{
258	int lcap, icap, rcap, leafs, idxs, num;
259	int newextents = blocks;
260
261	rcap = ext4_ext_space_root_idx(inode);
262	lcap = ext4_ext_space_block(inode);
263	icap = ext4_ext_space_block_idx(inode);
264
265	/* number of new leaf blocks needed */
266	num = leafs = (newextents + lcap - 1) / lcap;
267
268	/*
269	 * Worst case, we need separate index block(s)
270	 * to link all new leaf blocks
271	 */
272	idxs = (leafs + icap - 1) / icap;
273	do {
274		num += idxs;
275		idxs = (idxs + icap - 1) / icap;
276	} while (idxs > rcap);
277
278	return num;
279}
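/*
 * Worked example (illustrative, assuming a 4 KiB block size and 12-byte
 * on-disk extent/index/header structures): lcap == icap == (4096 - 12) / 12
 * == 340 and rcap == 4.  For blocks == 1000 the worst case is 1000
 * single-block extents, so leafs == 3 leaf blocks; a single index block
 * links them and already fits under the root, giving num == 3 + 1 == 4
 * metadata blocks.
 */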
280
281static int
282ext4_ext_max_entries(struct inode *inode, int depth)
283{
284	int max;
285
286	if (depth == ext_depth(inode)) {
287		if (depth == 0)
288			max = ext4_ext_space_root(inode);
289		else
290			max = ext4_ext_space_root_idx(inode);
291	} else {
292		if (depth == 0)
293			max = ext4_ext_space_block(inode);
294		else
295			max = ext4_ext_space_block_idx(inode);
296	}
297
298	return max;
299}
300
301static int __ext4_ext_check_header(const char *function, struct inode *inode,
302					struct ext4_extent_header *eh,
303					int depth)
304{
305	const char *error_msg;
306	int max = 0;
307
308	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
309		error_msg = "invalid magic";
310		goto corrupted;
311	}
312	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
313		error_msg = "unexpected eh_depth";
314		goto corrupted;
315	}
316	if (unlikely(eh->eh_max == 0)) {
317		error_msg = "invalid eh_max";
318		goto corrupted;
319	}
320	max = ext4_ext_max_entries(inode, depth);
321	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
322		error_msg = "too large eh_max";
323		goto corrupted;
324	}
325	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
326		error_msg = "invalid eh_entries";
327		goto corrupted;
328	}
329	return 0;
330
331corrupted:
332	ext4_error(inode->i_sb, function,
333			"bad header in inode #%lu: %s - magic %x, "
334			"entries %u, max %u(%u), depth %u(%u)",
335			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
336			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
337			max, le16_to_cpu(eh->eh_depth), depth);
338
339	return -EIO;
340}
341
342#define ext4_ext_check_header(inode, eh, depth)	\
343	__ext4_ext_check_header(__func__, inode, eh, depth)
344
345#ifdef EXT_DEBUG
346static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
347{
348	int k, l = path->p_depth;
349
350	ext_debug("path:");
351	for (k = 0; k <= l; k++, path++) {
352		if (path->p_idx) {
353		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
354			    idx_pblock(path->p_idx));
355		} else if (path->p_ext) {
356			ext_debug("  %d:%d:%llu ",
357				  le32_to_cpu(path->p_ext->ee_block),
358				  ext4_ext_get_actual_len(path->p_ext),
359				  ext_pblock(path->p_ext));
360		} else
361			ext_debug("  []");
362	}
363	ext_debug("\n");
364}
365
366static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
367{
368	int depth = ext_depth(inode);
369	struct ext4_extent_header *eh;
370	struct ext4_extent *ex;
371	int i;
372
373	if (!path)
374		return;
375
376	eh = path[depth].p_hdr;
377	ex = EXT_FIRST_EXTENT(eh);
378
379	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
380		ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
381			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
382	}
383	ext_debug("\n");
384}
385#else
386#define ext4_ext_show_path(inode, path)
387#define ext4_ext_show_leaf(inode, path)
388#endif
389
390void ext4_ext_drop_refs(struct ext4_ext_path *path)
391{
392	int depth = path->p_depth;
393	int i;
394
395	for (i = 0; i <= depth; i++, path++)
396		if (path->p_bh) {
397			brelse(path->p_bh);
398			path->p_bh = NULL;
399		}
400}
401
402/*
403 * ext4_ext_binsearch_idx:
404 * binary search for the closest index of the given block
405 * the header must be checked before calling this
406 */
407static void
408ext4_ext_binsearch_idx(struct inode *inode,
409			struct ext4_ext_path *path, ext4_lblk_t block)
410{
411	struct ext4_extent_header *eh = path->p_hdr;
412	struct ext4_extent_idx *r, *l, *m;
413
414
415	ext_debug("binsearch for %u(idx):  ", block);
416
417	l = EXT_FIRST_INDEX(eh) + 1;
418	r = EXT_LAST_INDEX(eh);
419	while (l <= r) {
420		m = l + (r - l) / 2;
421		if (block < le32_to_cpu(m->ei_block))
422			r = m - 1;
423		else
424			l = m + 1;
425		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
426				m, le32_to_cpu(m->ei_block),
427				r, le32_to_cpu(r->ei_block));
428	}
429
430	path->p_idx = l - 1;
431	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
432		  idx_pblock(path->p_idx));
433
434#ifdef CHECK_BINSEARCH
435	{
436		struct ext4_extent_idx *chix, *ix;
437		int k;
438
439		chix = ix = EXT_FIRST_INDEX(eh);
440		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
441		  if (k != 0 &&
442		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
443				printk(KERN_DEBUG "k=%d, ix=0x%p, "
444				       "first=0x%p\n", k,
445				       ix, EXT_FIRST_INDEX(eh));
446				printk(KERN_DEBUG "%u <= %u\n",
447				       le32_to_cpu(ix->ei_block),
448				       le32_to_cpu(ix[-1].ei_block));
449			}
450			BUG_ON(k && le32_to_cpu(ix->ei_block)
451					   <= le32_to_cpu(ix[-1].ei_block));
452			if (block < le32_to_cpu(ix->ei_block))
453				break;
454			chix = ix;
455		}
456		BUG_ON(chix != path->p_idx);
457	}
458#endif
459
460}
461
462/*
463 * ext4_ext_binsearch:
464 * binary search for closest extent of the given block
465 * the header must be checked before calling this
466 */
467static void
468ext4_ext_binsearch(struct inode *inode,
469		struct ext4_ext_path *path, ext4_lblk_t block)
470{
471	struct ext4_extent_header *eh = path->p_hdr;
472	struct ext4_extent *r, *l, *m;
473
474	if (eh->eh_entries == 0) {
475		/*
476		 * this leaf is empty:
477		 * we get such a leaf in split/add case
478		 */
479		return;
480	}
481
482	ext_debug("binsearch for %u:  ", block);
483
484	l = EXT_FIRST_EXTENT(eh) + 1;
485	r = EXT_LAST_EXTENT(eh);
486
487	while (l <= r) {
488		m = l + (r - l) / 2;
489		if (block < le32_to_cpu(m->ee_block))
490			r = m - 1;
491		else
492			l = m + 1;
493		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
494				m, le32_to_cpu(m->ee_block),
495				r, le32_to_cpu(r->ee_block));
496	}
497
498	path->p_ext = l - 1;
499	ext_debug("  -> %d:%llu:%d ",
500			le32_to_cpu(path->p_ext->ee_block),
501			ext_pblock(path->p_ext),
502			ext4_ext_get_actual_len(path->p_ext));
503
504#ifdef CHECK_BINSEARCH
505	{
506		struct ext4_extent *chex, *ex;
507		int k;
508
509		chex = ex = EXT_FIRST_EXTENT(eh);
510		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
511			BUG_ON(k && le32_to_cpu(ex->ee_block)
512					  <= le32_to_cpu(ex[-1].ee_block));
513			if (block < le32_to_cpu(ex->ee_block))
514				break;
515			chex = ex;
516		}
517		BUG_ON(chex != path->p_ext);
518	}
519#endif
520
521}
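/*
 * Example of the convergence above (illustrative): with extents starting
 * at logical blocks 0, 100 and 200, a search for block 150 terminates with
 * l pointing at the 200-extent, so path->p_ext = l - 1 selects the
 * 100-extent -- the rightmost extent whose ee_block is <= the requested
 * block.  The same invariant holds for ext4_ext_binsearch_idx() above.
 */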
522
523int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
524{
525	struct ext4_extent_header *eh;
526
527	eh = ext_inode_hdr(inode);
528	eh->eh_depth = 0;
529	eh->eh_entries = 0;
530	eh->eh_magic = EXT4_EXT_MAGIC;
531	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
532	ext4_mark_inode_dirty(handle, inode);
533	ext4_ext_invalidate_cache(inode);
534	return 0;
535}
536
537struct ext4_ext_path *
538ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
539					struct ext4_ext_path *path)
540{
541	struct ext4_extent_header *eh;
542	struct buffer_head *bh;
543	short int depth, i, ppos = 0, alloc = 0;
544
545	eh = ext_inode_hdr(inode);
546	depth = ext_depth(inode);
547	if (ext4_ext_check_header(inode, eh, depth))
548		return ERR_PTR(-EIO);
549
550
551	/* account possible depth increase */
552	if (!path) {
553		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
554				GFP_NOFS);
555		if (!path)
556			return ERR_PTR(-ENOMEM);
557		alloc = 1;
558	}
559	path[0].p_hdr = eh;
560	path[0].p_bh = NULL;
561
562	i = depth;
563	/* walk through the tree */
564	while (i) {
565		ext_debug("depth %d: num %d, max %d\n",
566			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
567
568		ext4_ext_binsearch_idx(inode, path + ppos, block);
569		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
570		path[ppos].p_depth = i;
571		path[ppos].p_ext = NULL;
572
573		bh = sb_bread(inode->i_sb, path[ppos].p_block);
574		if (!bh)
575			goto err;
576
577		eh = ext_block_hdr(bh);
578		ppos++;
579		BUG_ON(ppos > depth);
580		path[ppos].p_bh = bh;
581		path[ppos].p_hdr = eh;
582		i--;
583
584		if (ext4_ext_check_header(inode, eh, i))
585			goto err;
586	}
587
588	path[ppos].p_depth = i;
589	path[ppos].p_ext = NULL;
590	path[ppos].p_idx = NULL;
591
592	/* find extent */
593	ext4_ext_binsearch(inode, path + ppos, block);
594	/* if not an empty leaf */
595	if (path[ppos].p_ext)
596		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
597
598	ext4_ext_show_path(inode, path);
599
600	return path;
601
602err:
603	ext4_ext_drop_refs(path);
604	if (alloc)
605		kfree(path);
606	return ERR_PTR(-EIO);
607}
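/*
 * Sketch of the resulting path array (illustrative): for a depth-2 tree,
 * path[0] describes the root header in the inode body (p_bh == NULL),
 * path[1] the intermediate index block reached via path[0].p_idx, and
 * path[2] the leaf, with path[2].p_ext pointing at the extent that covers
 * (or precedes) the requested block.  Callers pair a successful lookup
 * with ext4_ext_drop_refs() plus kfree() of the path, as the error paths
 * in this file do.
 */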
608
609/*
610 * ext4_ext_insert_index:
611 * insert new index [@logical;@ptr] into the block at @curp;
612 * check where to insert: before @curp or after @curp
613 */
614static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
615				struct ext4_ext_path *curp,
616				int logical, ext4_fsblk_t ptr)
617{
618	struct ext4_extent_idx *ix;
619	int len, err;
620
621	err = ext4_ext_get_access(handle, inode, curp);
622	if (err)
623		return err;
624
625	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
626	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
627	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
628		/* insert after */
629		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
630			len = (len - 1) * sizeof(struct ext4_extent_idx);
631			len = len < 0 ? 0 : len;
632			ext_debug("insert new index %d after: %llu. "
633					"move %d from 0x%p to 0x%p\n",
634					logical, ptr, len,
635					(curp->p_idx + 1), (curp->p_idx + 2));
636			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
637		}
638		ix = curp->p_idx + 1;
639	} else {
640		/* insert before */
641		len = len * sizeof(struct ext4_extent_idx);
642		len = len < 0 ? 0 : len;
643		ext_debug("insert new index %d before: %llu. "
644				"move %d from 0x%p to 0x%p\n",
645				logical, ptr, len,
646				curp->p_idx, (curp->p_idx + 1));
647		memmove(curp->p_idx + 1, curp->p_idx, len);
648		ix = curp->p_idx;
649	}
650
651	ix->ei_block = cpu_to_le32(logical);
652	ext4_idx_store_pblock(ix, ptr);
653	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
654
655	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
656			     > le16_to_cpu(curp->p_hdr->eh_max));
657	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
658
659	err = ext4_ext_dirty(handle, inode, curp);
660	ext4_std_error(inode->i_sb, err);
661
662	return err;
663}
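/*
 * Example (illustrative): an index block holding entries for logical
 * blocks {0, 300, 700} with curp->p_idx at the 300 entry.  Inserting
 * logical == 500 takes the "insert after" branch: the 700 entry (and the
 * slots after it) is shifted one place right and the new {500, ptr} entry
 * lands between 300 and 700.  With logical == 100 and the same curp, the
 * "insert before" branch would shift 300 and 700 right and write the new
 * entry at curp->p_idx itself.
 */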
664
665/*
666 * ext4_ext_split:
667 * inserts new subtree into the path, using free index entry
668 * at depth @at:
669 * - allocates all needed blocks (new leaf and all intermediate index blocks)
670 * - makes decision where to split
671 * - moves remaining extents and index entries (right to the split point)
672 *   into the newly allocated blocks
673 * - initializes subtree
674 */
675static int ext4_ext_split(handle_t *handle, struct inode *inode,
676				struct ext4_ext_path *path,
677				struct ext4_extent *newext, int at)
678{
679	struct buffer_head *bh = NULL;
680	int depth = ext_depth(inode);
681	struct ext4_extent_header *neh;
682	struct ext4_extent_idx *fidx;
683	struct ext4_extent *ex;
684	int i = at, k, m, a;
685	ext4_fsblk_t newblock, oldblock;
686	__le32 border;
687	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
688	int err = 0;
689
690	/* make decision: where to split? */
691	/* FIXME: now decision is simplest: at current extent */
692
693	/* if current leaf will be split, then we should use
694	 * border from split point */
695	BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
696	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
697		border = path[depth].p_ext[1].ee_block;
698		ext_debug("leaf will be split."
699				" next leaf starts at %d\n",
700				  le32_to_cpu(border));
701	} else {
702		border = newext->ee_block;
703		ext_debug("leaf will be added."
704				" next leaf starts at %d\n",
705				le32_to_cpu(border));
706	}
707
708	/*
709	 * If error occurs, then we break processing
710	 * If an error occurs, we break processing
711	 * and mark the filesystem read-only. The index won't
712	 * be inserted and the tree will remain in a consistent
713	 * state. The next mount will repair the buffers too.
714
715	/*
716	 * Get array to track all allocated blocks.
717	 * We need this to handle errors and to free
718	 * the allocated blocks on failure.
719	 */
720	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
721	if (!ablocks)
722		return -ENOMEM;
723
724	/* allocate all needed blocks */
725	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
726	for (a = 0; a < depth - at; a++) {
727		newblock = ext4_ext_new_meta_block(handle, inode, path,
728						   newext, &err);
729		if (newblock == 0)
730			goto cleanup;
731		ablocks[a] = newblock;
732	}
733
734	/* initialize new leaf */
735	newblock = ablocks[--a];
736	BUG_ON(newblock == 0);
737	bh = sb_getblk(inode->i_sb, newblock);
738	if (!bh) {
739		err = -EIO;
740		goto cleanup;
741	}
742	lock_buffer(bh);
743
744	err = ext4_journal_get_create_access(handle, bh);
745	if (err)
746		goto cleanup;
747
748	neh = ext_block_hdr(bh);
749	neh->eh_entries = 0;
750	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
751	neh->eh_magic = EXT4_EXT_MAGIC;
752	neh->eh_depth = 0;
753	ex = EXT_FIRST_EXTENT(neh);
754
755	/* move remainder of path[depth] to the new leaf */
756	BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
757	/* start copy from next extent */
758	/* TODO: we could do it by single memmove */
759	m = 0;
760	path[depth].p_ext++;
761	while (path[depth].p_ext <=
762			EXT_MAX_EXTENT(path[depth].p_hdr)) {
763		ext_debug("move %d:%llu:%d in new leaf %llu\n",
764				le32_to_cpu(path[depth].p_ext->ee_block),
765				ext_pblock(path[depth].p_ext),
766				ext4_ext_get_actual_len(path[depth].p_ext),
767				newblock);
768		/*memmove(ex++, path[depth].p_ext++,
769				sizeof(struct ext4_extent));
770		neh->eh_entries++;*/
771		path[depth].p_ext++;
772		m++;
773	}
774	if (m) {
775		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
776		le16_add_cpu(&neh->eh_entries, m);
777	}
778
779	set_buffer_uptodate(bh);
780	unlock_buffer(bh);
781
782	err = ext4_journal_dirty_metadata(handle, bh);
783	if (err)
784		goto cleanup;
785	brelse(bh);
786	bh = NULL;
787
788	/* correct old leaf */
789	if (m) {
790		err = ext4_ext_get_access(handle, inode, path + depth);
791		if (err)
792			goto cleanup;
793		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
794		err = ext4_ext_dirty(handle, inode, path + depth);
795		if (err)
796			goto cleanup;
797
798	}
799
800	/* create intermediate indexes */
801	k = depth - at - 1;
802	BUG_ON(k < 0);
803	if (k)
804		ext_debug("create %d intermediate indices\n", k);
805	/* insert new index into current index block */
806	/* current depth stored in i var */
807	i = depth - 1;
808	while (k--) {
809		oldblock = newblock;
810		newblock = ablocks[--a];
811		bh = sb_getblk(inode->i_sb, newblock);
812		if (!bh) {
813			err = -EIO;
814			goto cleanup;
815		}
816		lock_buffer(bh);
817
818		err = ext4_journal_get_create_access(handle, bh);
819		if (err)
820			goto cleanup;
821
822		neh = ext_block_hdr(bh);
823		neh->eh_entries = cpu_to_le16(1);
824		neh->eh_magic = EXT4_EXT_MAGIC;
825		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
826		neh->eh_depth = cpu_to_le16(depth - i);
827		fidx = EXT_FIRST_INDEX(neh);
828		fidx->ei_block = border;
829		ext4_idx_store_pblock(fidx, oldblock);
830
831		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
832				i, newblock, le32_to_cpu(border), oldblock);
833		/* copy indexes */
834		m = 0;
835		path[i].p_idx++;
836
837		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
838				EXT_MAX_INDEX(path[i].p_hdr));
839		BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
840				EXT_LAST_INDEX(path[i].p_hdr));
841		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
842			ext_debug("%d: move %d:%llu in new index %llu\n", i,
843					le32_to_cpu(path[i].p_idx->ei_block),
844					idx_pblock(path[i].p_idx),
845					newblock);
846			/*memmove(++fidx, path[i].p_idx++,
847					sizeof(struct ext4_extent_idx));
848			neh->eh_entries++;
849			BUG_ON(neh->eh_entries > neh->eh_max);*/
850			path[i].p_idx++;
851			m++;
852		}
853		if (m) {
854			memmove(++fidx, path[i].p_idx - m,
855				sizeof(struct ext4_extent_idx) * m);
856			le16_add_cpu(&neh->eh_entries, m);
857		}
858		set_buffer_uptodate(bh);
859		unlock_buffer(bh);
860
861		err = ext4_journal_dirty_metadata(handle, bh);
862		if (err)
863			goto cleanup;
864		brelse(bh);
865		bh = NULL;
866
867		/* correct old index */
868		if (m) {
869			err = ext4_ext_get_access(handle, inode, path + i);
870			if (err)
871				goto cleanup;
872			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
873			err = ext4_ext_dirty(handle, inode, path + i);
874			if (err)
875				goto cleanup;
876		}
877
878		i--;
879	}
880
881	/* insert new index */
882	err = ext4_ext_insert_index(handle, inode, path + at,
883				    le32_to_cpu(border), newblock);
884
885cleanup:
886	if (bh) {
887		if (buffer_locked(bh))
888			unlock_buffer(bh);
889		brelse(bh);
890	}
891
892	if (err) {
893		/* free all allocated blocks in error case */
894		for (i = 0; i < depth; i++) {
895			if (!ablocks[i])
896				continue;
897			ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
898		}
899	}
900	kfree(ablocks);
901
902	return err;
903}
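/*
 * Illustrative example of a split: for a depth-1 tree whose only leaf is
 * full but whose root still has a free index slot (at == 0), one new leaf
 * block is allocated, the extents to the right of path[1].p_ext are moved
 * into it, and ext4_ext_insert_index() links the new leaf into the root
 * under the @border key.  When depth - at > 1, the same copy-and-link
 * step is repeated once per intermediate index level.
 */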
904
905/*
906 * ext4_ext_grow_indepth:
907 * implements tree growing procedure:
908 * - allocates new block
909 * - moves top-level data (index block or leaf) into the new block
910 * - initializes new top-level, creating index that points to the
911 *   just created block
912 */
913static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
914					struct ext4_ext_path *path,
915					struct ext4_extent *newext)
916{
917	struct ext4_ext_path *curp = path;
918	struct ext4_extent_header *neh;
919	struct ext4_extent_idx *fidx;
920	struct buffer_head *bh;
921	ext4_fsblk_t newblock;
922	int err = 0;
923
924	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
925	if (newblock == 0)
926		return err;
927
928	bh = sb_getblk(inode->i_sb, newblock);
929	if (!bh) {
930		err = -EIO;
931		ext4_std_error(inode->i_sb, err);
932		return err;
933	}
934	lock_buffer(bh);
935
936	err = ext4_journal_get_create_access(handle, bh);
937	if (err) {
938		unlock_buffer(bh);
939		goto out;
940	}
941
942	/* move top-level index/leaf into new block */
943	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
944
945	/* set size of new block */
946	neh = ext_block_hdr(bh);
947	/* old root could have indexes or leaves,
948	 * so calculate eh_max the right way */
949	if (ext_depth(inode))
950	  neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
951	else
952	  neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
953	neh->eh_magic = EXT4_EXT_MAGIC;
954	set_buffer_uptodate(bh);
955	unlock_buffer(bh);
956
957	err = ext4_journal_dirty_metadata(handle, bh);
958	if (err)
959		goto out;
960
961	/* create index in new top-level index: num,max,pointer */
962	err = ext4_ext_get_access(handle, inode, curp);
963	if (err)
964		goto out;
965
966	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
967	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
968	curp->p_hdr->eh_entries = cpu_to_le16(1);
969	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
970
971	if (path[0].p_hdr->eh_depth)
972		curp->p_idx->ei_block =
973			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
974	else
975		curp->p_idx->ei_block =
976			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
977	ext4_idx_store_pblock(curp->p_idx, newblock);
978
979	neh = ext_inode_hdr(inode);
980	fidx = EXT_FIRST_INDEX(neh);
981	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
982		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
983		  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
984
985	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
986	err = ext4_ext_dirty(handle, inode, curp);
987out:
988	brelse(bh);
989
990	return err;
991}
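/*
 * Illustrative example: growing an inline (depth 0) tree.  The extents
 * stored in the inode's i_data are copied verbatim into a freshly
 * allocated block, the in-inode header is rewritten as a root holding a
 * single index entry that points at that block, and eh_depth becomes 1.
 * Repeating the procedure on a full depth-1 root yields depth 2, and so
 * on.
 */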
992
993/*
994 * ext4_ext_create_new_leaf:
995 * finds an empty index slot and adds a new leaf.
996 * if no free index entry is found, the tree is grown in depth instead.
997 */
998static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
999					struct ext4_ext_path *path,
1000					struct ext4_extent *newext)
1001{
1002	struct ext4_ext_path *curp;
1003	int depth, i, err = 0;
1004
1005repeat:
1006	i = depth = ext_depth(inode);
1007
1008	/* walk up to the tree and look for free index entry */
1009	curp = path + depth;
1010	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1011		i--;
1012		curp--;
1013	}
1014
1015	/* we use already allocated block for index block,
1016	 * so subsequent data blocks should be contiguous */
1017	if (EXT_HAS_FREE_INDEX(curp)) {
1018		/* if we found index with free entry, then use that
1019		 * entry: create all needed subtree and add new leaf */
1020		err = ext4_ext_split(handle, inode, path, newext, i);
1021		if (err)
1022			goto out;
1023
1024		/* refill path */
1025		ext4_ext_drop_refs(path);
1026		path = ext4_ext_find_extent(inode,
1027				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1028				    path);
1029		if (IS_ERR(path))
1030			err = PTR_ERR(path);
1031	} else {
1032		/* tree is full, time to grow in depth */
1033		err = ext4_ext_grow_indepth(handle, inode, path, newext);
1034		if (err)
1035			goto out;
1036
1037		/* refill path */
1038		ext4_ext_drop_refs(path);
1039		path = ext4_ext_find_extent(inode,
1040				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1041				    path);
1042		if (IS_ERR(path)) {
1043			err = PTR_ERR(path);
1044			goto out;
1045		}
1046
1047		/*
1048		 * only first (depth 0 -> 1) produces free space;
1049		 * in all other cases we have to split the grown tree
1050		 */
1051		depth = ext_depth(inode);
1052		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1053			/* now we need to split */
1054			goto repeat;
1055		}
1056	}
1057
1058out:
1059	return err;
1060}
1061
1062/*
1063 * search for the closest allocated block to the left of *logical
1064 * and return it at @logical, with its physical address at @phys;
1065 * if *logical is the smallest allocated block, the function
1066 * returns 0 at @phys
1067 * return value contains 0 (success) or error code
1068 */
1069int
1070ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1071			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1072{
1073	struct ext4_extent_idx *ix;
1074	struct ext4_extent *ex;
1075	int depth, ee_len;
1076
1077	BUG_ON(path == NULL);
1078	depth = path->p_depth;
1079	*phys = 0;
1080
1081	if (depth == 0 && path->p_ext == NULL)
1082		return 0;
1083
1084	/* usually the extent in the path covers blocks smaller
1085	 * than *logical, but it can be that the extent is the
1086	 * first one in the file */
1087
1088	ex = path[depth].p_ext;
1089	ee_len = ext4_ext_get_actual_len(ex);
1090	if (*logical < le32_to_cpu(ex->ee_block)) {
1091		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1092		while (--depth >= 0) {
1093			ix = path[depth].p_idx;
1094			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1095		}
1096		return 0;
1097	}
1098
1099	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1100
1101	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1102	*phys = ext_pblock(ex) + ee_len - 1;
1103	return 0;
1104}
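/*
 * Example (illustrative): searching left from *logical == 215 when the
 * path ends at extent [lblk 200, len 10] (covering 200..209) sets
 * *logical to 209 and *phys to the extent's last physical block, i.e.
 * the closest allocated block to the left of the hole.  If the hole
 * precedes the first extent of the file, 0 is returned at @phys.
 */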
1105
1106/*
1107 * search for the closest allocated block to the right of *logical
1108 * and return it at @logical, with its physical address at @phys;
1109 * if *logical is the largest allocated block, the function
1110 * returns 0 at @phys
1111 * return value contains 0 (success) or error code
1112 */
1113int
1114ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1115			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1116{
1117	struct buffer_head *bh = NULL;
1118	struct ext4_extent_header *eh;
1119	struct ext4_extent_idx *ix;
1120	struct ext4_extent *ex;
1121	ext4_fsblk_t block;
1122	int depth, ee_len;
1123
1124	BUG_ON(path == NULL);
1125	depth = path->p_depth;
1126	*phys = 0;
1127
1128	if (depth == 0 && path->p_ext == NULL)
1129		return 0;
1130
1131	/* usually the extent in the path covers blocks smaller
1132	 * than *logical, but it can be that the extent is the
1133	 * first one in the file */
1134
1135	ex = path[depth].p_ext;
1136	ee_len = ext4_ext_get_actual_len(ex);
1137	if (*logical < le32_to_cpu(ex->ee_block)) {
1138		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1139		while (--depth >= 0) {
1140			ix = path[depth].p_idx;
1141			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1142		}
1143		*logical = le32_to_cpu(ex->ee_block);
1144		*phys = ext_pblock(ex);
1145		return 0;
1146	}
1147
1148	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1149
1150	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1151		/* next allocated block in this leaf */
1152		ex++;
1153		*logical = le32_to_cpu(ex->ee_block);
1154		*phys = ext_pblock(ex);
1155		return 0;
1156	}
1157
1158	/* go up and search for index to the right */
1159	while (--depth >= 0) {
1160		ix = path[depth].p_idx;
1161		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1162			break;
1163	}
1164
1165	if (depth < 0) {
1166		/* we've gone up to the root and
1167		 * found no index to the right */
1168		return 0;
1169	}
1170
1171	/* we've found index to the right, let's
1172	 * follow it and find the closest allocated
1173	 * block to the right */
1174	ix++;
1175	block = idx_pblock(ix);
1176	while (++depth < path->p_depth) {
1177		bh = sb_bread(inode->i_sb, block);
1178		if (bh == NULL)
1179			return -EIO;
1180		eh = ext_block_hdr(bh);
1181		if (ext4_ext_check_header(inode, eh, depth)) {
1182			put_bh(bh);
1183			return -EIO;
1184		}
1185		ix = EXT_FIRST_INDEX(eh);
1186		block = idx_pblock(ix);
1187		put_bh(bh);
1188	}
1189
1190	bh = sb_bread(inode->i_sb, block);
1191	if (bh == NULL)
1192		return -EIO;
1193	eh = ext_block_hdr(bh);
1194	if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
1195		put_bh(bh);
1196		return -EIO;
1197	}
1198	ex = EXT_FIRST_EXTENT(eh);
1199	*logical = le32_to_cpu(ex->ee_block);
1200	*phys = ext_pblock(ex);
1201	put_bh(bh);
1202	return 0;
1203
1204}
1205
1206/*
1207 * ext4_ext_next_allocated_block:
1208 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1209 * NOTE: it considers block number from index entry as
1210 * allocated block. Thus, index entries have to be consistent
1211 * with leaves.
1212 */
1213static ext4_lblk_t
1214ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1215{
1216	int depth;
1217
1218	BUG_ON(path == NULL);
1219	depth = path->p_depth;
1220
1221	if (depth == 0 && path->p_ext == NULL)
1222		return EXT_MAX_BLOCK;
1223
1224	while (depth >= 0) {
1225		if (depth == path->p_depth) {
1226			/* leaf */
1227			if (path[depth].p_ext !=
1228					EXT_LAST_EXTENT(path[depth].p_hdr))
1229			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1230		} else {
1231			/* index */
1232			if (path[depth].p_idx !=
1233					EXT_LAST_INDEX(path[depth].p_hdr))
1234			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1235		}
1236		depth--;
1237	}
1238
1239	return EXT_MAX_BLOCK;
1240}
1241
1242/*
1243 * ext4_ext_next_leaf_block:
1244 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1245 */
1246static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1247					struct ext4_ext_path *path)
1248{
1249	int depth;
1250
1251	BUG_ON(path == NULL);
1252	depth = path->p_depth;
1253
1254	/* a zero-depth tree has no leaf blocks at all */
1255	if (depth == 0)
1256		return EXT_MAX_BLOCK;
1257
1258	/* go to index block */
1259	depth--;
1260
1261	while (depth >= 0) {
1262		if (path[depth].p_idx !=
1263				EXT_LAST_INDEX(path[depth].p_hdr))
1264			return (ext4_lblk_t)
1265				le32_to_cpu(path[depth].p_idx[1].ei_block);
1266		depth--;
1267	}
1268
1269	return EXT_MAX_BLOCK;
1270}
1271
1272/*
1273 * ext4_ext_correct_indexes:
1274 * if leaf gets modified and modified extent is first in the leaf,
1275 * then we have to correct all indexes above.
1276 * TODO: do we need to correct tree in all cases?
1277 */
1278static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1279				struct ext4_ext_path *path)
1280{
1281	struct ext4_extent_header *eh;
1282	int depth = ext_depth(inode);
1283	struct ext4_extent *ex;
1284	__le32 border;
1285	int k, err = 0;
1286
1287	eh = path[depth].p_hdr;
1288	ex = path[depth].p_ext;
1289	BUG_ON(ex == NULL);
1290	BUG_ON(eh == NULL);
1291
1292	if (depth == 0) {
1293		/* there is no tree at all */
1294		return 0;
1295	}
1296
1297	if (ex != EXT_FIRST_EXTENT(eh)) {
1298		/* we correct the tree only if the first extent in the leaf was modified */
1299		return 0;
1300	}
1301
1302	/*
1303	 * TODO: we need correction if border is smaller than current one
1304	 */
1305	k = depth - 1;
1306	border = path[depth].p_ext->ee_block;
1307	err = ext4_ext_get_access(handle, inode, path + k);
1308	if (err)
1309		return err;
1310	path[k].p_idx->ei_block = border;
1311	err = ext4_ext_dirty(handle, inode, path + k);
1312	if (err)
1313		return err;
1314
1315	while (k--) {
1316		/* change all left-side indexes */
1317		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1318			break;
1319		err = ext4_ext_get_access(handle, inode, path + k);
1320		if (err)
1321			break;
1322		path[k].p_idx->ei_block = border;
1323		err = ext4_ext_dirty(handle, inode, path + k);
1324		if (err)
1325			break;
1326	}
1327
1328	return err;
1329}
1330
1331static int
1332ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1333				struct ext4_extent *ex2)
1334{
1335	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1336
1337	/*
1338	 * Make sure that either both extents are uninitialized, or
1339	 * both are _not_.
1340	 */
1341	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1342		return 0;
1343
1344	if (ext4_ext_is_uninitialized(ex1))
1345		max_len = EXT_UNINIT_MAX_LEN;
1346	else
1347		max_len = EXT_INIT_MAX_LEN;
1348
1349	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1350	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1351
1352	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1353			le32_to_cpu(ex2->ee_block))
1354		return 0;
1355
1356	/*
1357	 * To allow future support for preallocated extents to be added
1358	 * as an RO_COMPAT feature, refuse to merge two extents if
1359	 * this can result in the top bit of ee_len being set.
1360	 */
1361	if (ext1_ee_len + ext2_ee_len > max_len)
1362		return 0;
1363#ifdef AGGRESSIVE_TEST
1364	if (ext1_ee_len >= 4)
1365		return 0;
1366#endif
1367
1368	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1369		return 1;
1370	return 0;
1371}
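/*
 * Example (illustrative values): ex1 = [lblk 100, len 8, pblk 5000] and
 * ex2 = [lblk 108, len 4, pblk 5008] are both initialized, logically and
 * physically contiguous, and 8 + 4 <= EXT_INIT_MAX_LEN, so they can be
 * merged into [lblk 100, len 12, pblk 5000].  Had ex2 started at
 * pblk 6000, the physical-contiguity check would reject the merge.
 */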
1372
1373/*
1374 * This function tries to merge the "ex" extent to the next extent in the tree.
1375 * It always tries to merge towards right. If you want to merge towards
1376 * left, pass "ex - 1" as argument instead of "ex".
1377 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1378 * 1 if they got merged.
1379 */
1380int ext4_ext_try_to_merge(struct inode *inode,
1381			  struct ext4_ext_path *path,
1382			  struct ext4_extent *ex)
1383{
1384	struct ext4_extent_header *eh;
1385	unsigned int depth, len;
1386	int merge_done = 0;
1387	int uninitialized = 0;
1388
1389	depth = ext_depth(inode);
1390	BUG_ON(path[depth].p_hdr == NULL);
1391	eh = path[depth].p_hdr;
1392
1393	while (ex < EXT_LAST_EXTENT(eh)) {
1394		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1395			break;
1396		/* merge with next extent! */
1397		if (ext4_ext_is_uninitialized(ex))
1398			uninitialized = 1;
1399		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1400				+ ext4_ext_get_actual_len(ex + 1));
1401		if (uninitialized)
1402			ext4_ext_mark_uninitialized(ex);
1403
1404		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1405			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1406				* sizeof(struct ext4_extent);
1407			memmove(ex + 1, ex + 2, len);
1408		}
1409		le16_add_cpu(&eh->eh_entries, -1);
1410		merge_done = 1;
1411		WARN_ON(eh->eh_entries == 0);
1412		if (!eh->eh_entries)
1413			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
1414			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
1415	}
1416
1417	return merge_done;
1418}
1419
1420/*
1421 * check if a portion of the "newext" extent overlaps with an
1422 * existing extent.
1423 *
1424 * If there is an overlap discovered, it updates the length of the newext
1425 * such that there will be no overlap, and then returns 1.
1426 * If there is no overlap found, it returns 0.
1427 */
1428unsigned int ext4_ext_check_overlap(struct inode *inode,
1429				    struct ext4_extent *newext,
1430				    struct ext4_ext_path *path)
1431{
1432	ext4_lblk_t b1, b2;
1433	unsigned int depth, len1;
1434	unsigned int ret = 0;
1435
1436	b1 = le32_to_cpu(newext->ee_block);
1437	len1 = ext4_ext_get_actual_len(newext);
1438	depth = ext_depth(inode);
1439	if (!path[depth].p_ext)
1440		goto out;
1441	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1442
1443	/*
1444	 * get the next allocated block if the extent in the path
1445	 * is before the requested block(s)
1446	 */
1447	if (b2 < b1) {
1448		b2 = ext4_ext_next_allocated_block(path);
1449		if (b2 == EXT_MAX_BLOCK)
1450			goto out;
1451	}
1452
1453	/* check for wrap through zero on extent logical start block*/
1454	if (b1 + len1 < b1) {
1455		len1 = EXT_MAX_BLOCK - b1;
1456		newext->ee_len = cpu_to_le16(len1);
1457		ret = 1;
1458	}
1459
1460	/* check for overlap */
1461	if (b1 + len1 > b2) {
1462		newext->ee_len = cpu_to_le16(b2 - b1);
1463		ret = 1;
1464	}
1465out:
1466	return ret;
1467}
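/*
 * Example (illustrative): newext covers logical blocks 100..149
 * (b1 == 100, len1 == 50) and the next allocated block found via the path
 * is b2 == 120.  Since 100 + 50 > 120, ee_len is trimmed to
 * 120 - 100 == 20 blocks and 1 is returned, so the caller only allocates
 * the non-overlapping part.
 */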
1468
1469/*
1470 * ext4_ext_insert_extent:
1471 * tries to merge the requested extent into the existing extent or
1472 * inserts requested extent as new one into the tree,
1473 * creating new leaf in the no-space case.
1474 */
1475int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1476				struct ext4_ext_path *path,
1477				struct ext4_extent *newext)
1478{
1479	struct ext4_extent_header *eh;
1480	struct ext4_extent *ex, *fex;
1481	struct ext4_extent *nearex; /* nearest extent */
1482	struct ext4_ext_path *npath = NULL;
1483	int depth, len, err;
1484	ext4_lblk_t next;
1485	unsigned uninitialized = 0;
1486
1487	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
1488	depth = ext_depth(inode);
1489	ex = path[depth].p_ext;
1490	BUG_ON(path[depth].p_hdr == NULL);
1491
1492	/* try to insert block into found extent and return */
1493	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1494		ext_debug("append %d block to %d:%d (from %llu)\n",
1495				ext4_ext_get_actual_len(newext),
1496				le32_to_cpu(ex->ee_block),
1497				ext4_ext_get_actual_len(ex), ext_pblock(ex));
1498		err = ext4_ext_get_access(handle, inode, path + depth);
1499		if (err)
1500			return err;
1501
1502		/*
1503		 * ext4_can_extents_be_merged should have checked that either
1504		 * both extents are uninitialized, or both aren't. Thus we
1505		 * need to check only one of them here.
1506		 */
1507		if (ext4_ext_is_uninitialized(ex))
1508			uninitialized = 1;
1509		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1510					+ ext4_ext_get_actual_len(newext));
1511		if (uninitialized)
1512			ext4_ext_mark_uninitialized(ex);
1513		eh = path[depth].p_hdr;
1514		nearex = ex;
1515		goto merge;
1516	}
1517
1518repeat:
1519	depth = ext_depth(inode);
1520	eh = path[depth].p_hdr;
1521	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1522		goto has_space;
1523
1524	/* probably next leaf has space for us? */
1525	fex = EXT_LAST_EXTENT(eh);
1526	next = ext4_ext_next_leaf_block(inode, path);
1527	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1528	    && next != EXT_MAX_BLOCK) {
1529		ext_debug("next leaf block - %d\n", next);
1530		BUG_ON(npath != NULL);
1531		npath = ext4_ext_find_extent(inode, next, NULL);
1532		if (IS_ERR(npath))
1533			return PTR_ERR(npath);
1534		BUG_ON(npath->p_depth != path->p_depth);
1535		eh = npath[depth].p_hdr;
1536		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1537			ext_debug("next leaf isn't full(%d)\n",
1538				  le16_to_cpu(eh->eh_entries));
1539			path = npath;
1540			goto repeat;
1541		}
1542		ext_debug("next leaf has no free space(%d,%d)\n",
1543			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1544	}
1545
1546	/*
1547	 * There is no free space in the found leaf.
1548	 * We're gonna add a new leaf in the tree.
1549	 */
1550	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1551	if (err)
1552		goto cleanup;
1553	depth = ext_depth(inode);
1554	eh = path[depth].p_hdr;
1555
1556has_space:
1557	nearex = path[depth].p_ext;
1558
1559	err = ext4_ext_get_access(handle, inode, path + depth);
1560	if (err)
1561		goto cleanup;
1562
1563	if (!nearex) {
1564		/* there is no extent in this leaf, create first one */
1565		ext_debug("first extent in the leaf: %d:%llu:%d\n",
1566				le32_to_cpu(newext->ee_block),
1567				ext_pblock(newext),
1568				ext4_ext_get_actual_len(newext));
1569		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1570	} else if (le32_to_cpu(newext->ee_block)
1571			   > le32_to_cpu(nearex->ee_block)) {
1572/*		BUG_ON(newext->ee_block == nearex->ee_block); */
1573		if (nearex != EXT_LAST_EXTENT(eh)) {
1574			len = EXT_MAX_EXTENT(eh) - nearex;
1575			len = (len - 1) * sizeof(struct ext4_extent);
1576			len = len < 0 ? 0 : len;
1577			ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
1578					"move %d from 0x%p to 0x%p\n",
1579					le32_to_cpu(newext->ee_block),
1580					ext_pblock(newext),
1581					ext4_ext_get_actual_len(newext),
1582					nearex, len, nearex + 1, nearex + 2);
1583			memmove(nearex + 2, nearex + 1, len);
1584		}
1585		path[depth].p_ext = nearex + 1;
1586	} else {
1587		BUG_ON(newext->ee_block == nearex->ee_block);
1588		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1589		len = len < 0 ? 0 : len;
1590		ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
1591				"move %d from 0x%p to 0x%p\n",
1592				le32_to_cpu(newext->ee_block),
1593				ext_pblock(newext),
1594				ext4_ext_get_actual_len(newext),
1595				nearex, len, nearex + 1, nearex + 2);
1596		memmove(nearex + 1, nearex, len);
1597		path[depth].p_ext = nearex;
1598	}
1599
1600	le16_add_cpu(&eh->eh_entries, 1);
1601	nearex = path[depth].p_ext;
1602	nearex->ee_block = newext->ee_block;
1603	ext4_ext_store_pblock(nearex, ext_pblock(newext));
1604	nearex->ee_len = newext->ee_len;
1605
1606merge:
1607	/* try to merge extents to the right */
1608	ext4_ext_try_to_merge(inode, path, nearex);
1609
1610	/* try to merge extents to the left */
1611
1612	/* time to correct all indexes above */
1613	err = ext4_ext_correct_indexes(handle, inode, path);
1614	if (err)
1615		goto cleanup;
1616
1617	err = ext4_ext_dirty(handle, inode, path + depth);
1618
1619cleanup:
1620	if (npath) {
1621		ext4_ext_drop_refs(npath);
1622		kfree(npath);
1623	}
1624	ext4_ext_tree_changed(inode);
1625	ext4_ext_invalidate_cache(inode);
1626	return err;
1627}
1628
1629static void
1630ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1631			__u32 len, ext4_fsblk_t start, int type)
1632{
1633	struct ext4_ext_cache *cex;
1634	BUG_ON(len == 0);
1635	cex = &EXT4_I(inode)->i_cached_extent;
1636	cex->ec_type = type;
1637	cex->ec_block = block;
1638	cex->ec_len = len;
1639	cex->ec_start = start;
1640}
1641
1642/*
1643 * ext4_ext_put_gap_in_cache:
1644 * calculate boundaries of the gap that the requested block fits into
1645 * and cache this gap
1646 */
1647static void
1648ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1649				ext4_lblk_t block)
1650{
1651	int depth = ext_depth(inode);
1652	unsigned long len;
1653	ext4_lblk_t lblock;
1654	struct ext4_extent *ex;
1655
1656	ex = path[depth].p_ext;
1657	if (ex == NULL) {
1658		/* there is no extent yet, so gap is [0;-] */
1659		lblock = 0;
1660		len = EXT_MAX_BLOCK;
1661		ext_debug("cache gap(whole file):");
1662	} else if (block < le32_to_cpu(ex->ee_block)) {
1663		lblock = block;
1664		len = le32_to_cpu(ex->ee_block) - block;
1665		ext_debug("cache gap(before): %u [%u:%u]",
1666				block,
1667				le32_to_cpu(ex->ee_block),
1668				 ext4_ext_get_actual_len(ex));
1669	} else if (block >= le32_to_cpu(ex->ee_block)
1670			+ ext4_ext_get_actual_len(ex)) {
1671		ext4_lblk_t next;
1672		lblock = le32_to_cpu(ex->ee_block)
1673			+ ext4_ext_get_actual_len(ex);
1674
1675		next = ext4_ext_next_allocated_block(path);
1676		ext_debug("cache gap(after): [%u:%u] %u",
1677				le32_to_cpu(ex->ee_block),
1678				ext4_ext_get_actual_len(ex),
1679				block);
1680		BUG_ON(next == lblock);
1681		len = next - lblock;
1682	} else {
1683		lblock = len = 0;
1684		BUG();
1685	}
1686
1687	ext_debug(" -> %u:%lu\n", lblock, len);
1688	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1689}
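/*
 * Example (illustrative): with extents covering logical blocks 0..9 and
 * 50..59, a lookup of block 20 finds the 0..9 extent in the path, so the
 * gap [10, len 40] (up to the next allocated block, 50) is cached as
 * EXT4_EXT_CACHE_GAP; later lookups that land in the hole can then be
 * answered from the cache without walking the tree again.
 */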
1690
1691static int
1692ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1693			struct ext4_extent *ex)
1694{
1695	struct ext4_ext_cache *cex;
1696
1697	cex = &EXT4_I(inode)->i_cached_extent;
1698
1699	/* has cache valid data? */
1700	if (cex->ec_type == EXT4_EXT_CACHE_NO)
1701		return EXT4_EXT_CACHE_NO;
1702
1703	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1704			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1705	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1706		ex->ee_block = cpu_to_le32(cex->ec_block);
1707		ext4_ext_store_pblock(ex, cex->ec_start);
1708		ex->ee_len = cpu_to_le16(cex->ec_len);
1709		ext_debug("%u cached by %u:%u:%llu\n",
1710				block,
1711				cex->ec_block, cex->ec_len, cex->ec_start);
1712		return cex->ec_type;
1713	}
1714
1715	/* not in cache */
1716	return EXT4_EXT_CACHE_NO;
1717}
1718
1719/*
1720 * ext4_ext_rm_idx:
1721 * removes index from the index block.
1722 * It's used in truncate case only, thus all requests are for
1723 * last index in the block only.
1724 */
1725static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1726			struct ext4_ext_path *path)
1727{
1728	struct buffer_head *bh;
1729	int err;
1730	ext4_fsblk_t leaf;
1731
1732	/* free index block */
1733	path--;
1734	leaf = idx_pblock(path->p_idx);
1735	BUG_ON(path->p_hdr->eh_entries == 0);
1736	err = ext4_ext_get_access(handle, inode, path);
1737	if (err)
1738		return err;
1739	le16_add_cpu(&path->p_hdr->eh_entries, -1);
1740	err = ext4_ext_dirty(handle, inode, path);
1741	if (err)
1742		return err;
1743	ext_debug("index is empty, remove it, free block %llu\n", leaf);
1744	bh = sb_find_get_block(inode->i_sb, leaf);
1745	ext4_forget(handle, 1, inode, bh, leaf);
1746	ext4_free_blocks(handle, inode, leaf, 1, 1);
1747	return err;
1748}
1749
1750/*
1751 * ext4_ext_calc_credits_for_single_extent:
1752 * This routine returns the maximum number of credits needed to insert
1753 * an extent into the extent tree.
1754 * When the actual path is passed, the caller should calculate the
1755 * credits under i_data_sem.
1756 */
1757int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1758						struct ext4_ext_path *path)
1759{
1760	if (path) {
1761		int depth = ext_depth(inode);
1762		int ret = 0;
1763
1764		/* probably there is space in leaf? */
1765		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1766				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
1767
1768			/*
1769			 *  There is some space in the leaf; no need
1770			 *  to account for the leaf block credit.
1771			 *
1772			 *  Bitmaps, block group descriptor blocks
1773			 *  and other metadata blocks still need to
1774			 *  be accounted for.
1775			 */
1776			/* 1 bitmap, 1 block group descriptor */
1777			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1778		}
1779	}
1780
1781	return ext4_chunk_trans_blocks(inode, nrblocks);
1782}
1783
1784/*
1785 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
1786 *
1787 * if nrblocks fit in a single extent (chunk flag is 1), then in the
1788 * worst case each index/leaf block on every tree level needs to be changed;
1789 * if the tree splits due to inserting a new extent, then the old
1790 * index/leaf blocks need to be updated too.
1791 *
1792 * If the nrblocks are discontiguous, they could cause
1793 * the whole tree to split more than once, but this is really rare.
1794 */
1795int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
1796{
1797	int index;
1798	int depth = ext_depth(inode);
1799
1800	if (chunk)
1801		index = depth * 2;
1802	else
1803		index = depth * 3;
1804
1805	return index;
1806}
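/*
 * Example (illustrative): for a depth-2 tree a single contiguous chunk
 * (chunk == 1) is charged 2 * 2 == 4 index/leaf blocks -- each level of
 * the current path plus what a split might add -- while a discontiguous
 * request is charged 3 * 2 == 6 to leave headroom for more than one
 * split.
 */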
1807
1808static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1809				struct ext4_extent *ex,
1810				ext4_lblk_t from, ext4_lblk_t to)
1811{
1812	struct buffer_head *bh;
1813	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
1814	int i, metadata = 0;
1815
1816	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1817		metadata = 1;
1818#ifdef EXTENTS_STATS
1819	{
1820		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1821		spin_lock(&sbi->s_ext_stats_lock);
1822		sbi->s_ext_blocks += ee_len;
1823		sbi->s_ext_extents++;
1824		if (ee_len < sbi->s_ext_min)
1825			sbi->s_ext_min = ee_len;
1826		if (ee_len > sbi->s_ext_max)
1827			sbi->s_ext_max = ee_len;
1828		if (ext_depth(inode) > sbi->s_depth_max)
1829			sbi->s_depth_max = ext_depth(inode);
1830		spin_unlock(&sbi->s_ext_stats_lock);
1831	}
1832#endif
1833	if (from >= le32_to_cpu(ex->ee_block)
1834	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
1835		/* tail removal */
1836		ext4_lblk_t num;
1837		ext4_fsblk_t start;
1838
1839		num = le32_to_cpu(ex->ee_block) + ee_len - from;
1840		start = ext_pblock(ex) + ee_len - num;
1841		ext_debug("free last %u blocks starting %llu\n", num, start);
1842		for (i = 0; i < num; i++) {
1843			bh = sb_find_get_block(inode->i_sb, start + i);
1844			ext4_forget(handle, 0, inode, bh, start + i);
1845		}
1846		ext4_free_blocks(handle, inode, start, num, metadata);
1847	} else if (from == le32_to_cpu(ex->ee_block)
1848		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
1849		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
1850			from, to, le32_to_cpu(ex->ee_block), ee_len);
1851	} else {
1852		printk(KERN_INFO "strange request: removal(2) "
1853				"%u-%u from %u:%u\n",
1854				from, to, le32_to_cpu(ex->ee_block), ee_len);
1855	}
1856	return 0;
1857}
1858
1859static int
1860ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1861		struct ext4_ext_path *path, ext4_lblk_t start)
1862{
1863	int err = 0, correct_index = 0;
1864	int depth = ext_depth(inode), credits;
1865	struct ext4_extent_header *eh;
1866	ext4_lblk_t a, b, block;
1867	unsigned num;
1868	ext4_lblk_t ex_ee_block;
1869	unsigned short ex_ee_len;
1870	unsigned uninitialized = 0;
1871	struct ext4_extent *ex;
1872
1873	/* the header must be checked already in ext4_ext_remove_space() */
1874	ext_debug("truncate since %u in leaf\n", start);
1875	if (!path[depth].p_hdr)
1876		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1877	eh = path[depth].p_hdr;
1878	BUG_ON(eh == NULL);
1879
1880	/* find where to start removing */
1881	ex = EXT_LAST_EXTENT(eh);
1882
1883	ex_ee_block = le32_to_cpu(ex->ee_block);
1884	if (ext4_ext_is_uninitialized(ex))
1885		uninitialized = 1;
1886	ex_ee_len = ext4_ext_get_actual_len(ex);
1887
1888	while (ex >= EXT_FIRST_EXTENT(eh) &&
1889			ex_ee_block + ex_ee_len > start) {
1890		ext_debug("remove ext %u:%u\n", ex_ee_block, ex_ee_len);
1891		path[depth].p_ext = ex;
1892
1893		a = ex_ee_block > start ? ex_ee_block : start;
1894		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
1895			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
1896
1897		ext_debug("  border %u:%u\n", a, b);
1898
1899		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
1900			block = 0;
1901			num = 0;
1902			BUG();
1903		} else if (a != ex_ee_block) {
1904			/* remove tail of the extent */
1905			block = ex_ee_block;
1906			num = a - block;
1907		} else if (b != ex_ee_block + ex_ee_len - 1) {
1908			/* remove head of the extent */
1909			block = a;
1910			num = b - a;
1911			/* there is no "make a hole" API yet */
1912			BUG();
1913		} else {
1914			/* remove whole extent: excellent! */
1915			block = ex_ee_block;
1916			num = 0;
1917			BUG_ON(a != ex_ee_block);
1918			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1919		}
1920
1921		/*
1922		 * 3 for leaf, sb, and inode plus 2 (bmap and group
1923		 * descriptor) for each block group; assume two block
1924		 * groups plus ex_ee_len/blocks_per_block_group for
1925		 * the worst case
1926		 */
1927		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
1928		if (ex == EXT_FIRST_EXTENT(eh)) {
1929			correct_index = 1;
1930			credits += (ext_depth(inode)) + 1;
1931		}
1932		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1933
1934		err = ext4_ext_journal_restart(handle, credits);
1935		if (err)
1936			goto out;
1937
1938		err = ext4_ext_get_access(handle, inode, path + depth);
1939		if (err)
1940			goto out;
1941
1942		err = ext4_remove_blocks(handle, inode, ex, a, b);
1943		if (err)
1944			goto out;
1945
1946		if (num == 0) {
1947			/* this extent is removed; mark slot entirely unused */
1948			ext4_ext_store_pblock(ex, 0);
1949			le16_add_cpu(&eh->eh_entries, -1);
1950		}
1951
1952		ex->ee_block = cpu_to_le32(block);
1953		ex->ee_len = cpu_to_le16(num);
1954		/*
1955		 * Do not mark uninitialized if all the blocks in the
1956		 * extent have been removed.
1957		 */
1958		if (uninitialized && num)
1959			ext4_ext_mark_uninitialized(ex);
1960
1961		err = ext4_ext_dirty(handle, inode, path + depth);
1962		if (err)
1963			goto out;
1964
1965		ext_debug("new extent: %u:%u:%llu\n", block, num,
1966				ext_pblock(ex));
1967		ex--;
1968		ex_ee_block = le32_to_cpu(ex->ee_block);
1969		ex_ee_len = ext4_ext_get_actual_len(ex);
1970	}
1971
1972	if (correct_index && eh->eh_entries)
1973		err = ext4_ext_correct_indexes(handle, inode, path);
1974
1975	/* if this leaf is now free, then we should
1976	 * remove it from the index block above */
1977	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
1978		err = ext4_ext_rm_idx(handle, inode, path + depth);
1979
1980out:
1981	return err;
1982}
1983
1984/*
1985 * ext4_ext_more_to_rm:
1986 * returns 1 if current index has to be freed (even partial)
1987 */
1988static int
1989ext4_ext_more_to_rm(struct ext4_ext_path *path)
1990{
1991	BUG_ON(path->p_idx == NULL);
1992
1993	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
1994		return 0;
1995
1996	/*
1997	 * if the entry count is unchanged, the deeper-level truncate was
1998	 * partial, so there is nothing more to remove through this index
1999	 */
2000	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2001		return 0;
2002	return 1;
2003}
2004
2005static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2006{
2007	struct super_block *sb = inode->i_sb;
2008	int depth = ext_depth(inode);
2009	struct ext4_ext_path *path;
2010	handle_t *handle;
2011	int i = 0, err = 0;
2012
2013	ext_debug("truncate since %u\n", start);
2014
2015	/* probably first extent we're gonna free will be last in block */
2016	handle = ext4_journal_start(inode, depth + 1);
2017	if (IS_ERR(handle))
2018		return PTR_ERR(handle);
2019
2020	ext4_ext_invalidate_cache(inode);
2021
2022	/*
2023	 * We start scanning from right side, freeing all the blocks
2024	 * after i_size and walking into the tree depth-wise.
2025	 */
2026	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2027	if (path == NULL) {
2028		ext4_journal_stop(handle);
2029		return -ENOMEM;
2030	}
2031	path[0].p_hdr = ext_inode_hdr(inode);
2032	if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
2033		err = -EIO;
2034		goto out;
2035	}
2036	path[0].p_depth = depth;
2037
2038	while (i >= 0 && err == 0) {
2039		if (i == depth) {
2040			/* this is leaf block */
2041			err = ext4_ext_rm_leaf(handle, inode, path, start);
2042			/* root level has p_bh == NULL, brelse() eats this */
2043			brelse(path[i].p_bh);
2044			path[i].p_bh = NULL;
2045			i--;
2046			continue;
2047		}
2048
2049		/* this is index block */
2050		if (!path[i].p_hdr) {
2051			ext_debug("initialize header\n");
2052			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2053		}
2054
2055		if (!path[i].p_idx) {
2056			/* this level hasn't been touched yet */
2057			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2058			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2059			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2060				  path[i].p_hdr,
2061				  le16_to_cpu(path[i].p_hdr->eh_entries));
2062		} else {
2063			/* already visited; move to the previous index (right-to-left scan) */
2064			path[i].p_idx--;
2065		}
2066
2067		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2068				i, EXT_FIRST_INDEX(path[i].p_hdr),
2069				path[i].p_idx);
2070		if (ext4_ext_more_to_rm(path + i)) {
2071			struct buffer_head *bh;
2072			/* go to the next level */
2073			ext_debug("move to level %d (block %llu)\n",
2074				  i + 1, idx_pblock(path[i].p_idx));
2075			memset(path + i + 1, 0, sizeof(*path));
2076			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
2077			if (!bh) {
2078				/* should we reset i_size? */
2079				err = -EIO;
2080				break;
2081			}
2082			if (WARN_ON(i + 1 > depth)) {
				/* bh has not been attached to the path yet */
				brelse(bh);
2083				err = -EIO;
2084				break;
2085			}
2086			if (ext4_ext_check_header(inode, ext_block_hdr(bh),
2087							depth - i - 1)) {
				brelse(bh);
2088				err = -EIO;
2089				break;
2090			}
2091			path[i + 1].p_bh = bh;
2092
2093			/* save the current number of index entries; it
2094			 * may shrink if the subtree below gets freed */
2095			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2096			i++;
2097		} else {
2098			/* we finished processing this index, go up */
2099			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2100				/* index is empty, remove it; the
2101				 * handle has already been prepared while
2102				 * removing the leaf */
2103				err = ext4_ext_rm_idx(handle, inode, path + i);
2104			}
2105			/* root level has p_bh == NULL, brelse() eats this */
2106			brelse(path[i].p_bh);
2107			path[i].p_bh = NULL;
2108			i--;
2109			ext_debug("return to level %d\n", i);
2110		}
2111	}
2112
2113	/* TODO: flexible tree reduction should be here */
2114	if (path->p_hdr->eh_entries == 0) {
2115		/*
2116		 * truncate to zero freed the whole tree,
2117		 * so we need to correct eh_depth
2118		 */
2119		err = ext4_ext_get_access(handle, inode, path);
2120		if (err == 0) {
2121			ext_inode_hdr(inode)->eh_depth = 0;
2122			ext_inode_hdr(inode)->eh_max =
2123				cpu_to_le16(ext4_ext_space_root(inode));
2124			err = ext4_ext_dirty(handle, inode, path);
2125		}
2126	}
2127out:
2128	ext4_ext_tree_changed(inode);
2129	ext4_ext_drop_refs(path);
2130	kfree(path);
2131	ext4_journal_stop(handle);
2132
2133	return err;
2134}
2135
2136/*
2137 * called at mount time
2138 */
2139void ext4_ext_init(struct super_block *sb)
2140{
2141	/*
2142	 * possible initialization would be here
2143	 */
2144
2145	if (test_opt(sb, EXTENTS)) {
2146		printk(KERN_INFO "EXT4-fs: file extents enabled");
2147#ifdef AGGRESSIVE_TEST
2148		printk(", aggressive tests");
2149#endif
2150#ifdef CHECK_BINSEARCH
2151		printk(", check binsearch");
2152#endif
2153#ifdef EXTENTS_STATS
2154		printk(", stats");
2155#endif
2156		printk("\n");
2157#ifdef EXTENTS_STATS
2158		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2159		EXT4_SB(sb)->s_ext_min = 1 << 30;
2160		EXT4_SB(sb)->s_ext_max = 0;
2161#endif
2162	}
2163}
2164
2165/*
2166 * called at umount time
2167 */
2168void ext4_ext_release(struct super_block *sb)
2169{
2170	if (!test_opt(sb, EXTENTS))
2171		return;
2172
2173#ifdef EXTENTS_STATS
2174	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2175		struct ext4_sb_info *sbi = EXT4_SB(sb);
2176		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2177			sbi->s_ext_blocks, sbi->s_ext_extents,
2178			sbi->s_ext_blocks / sbi->s_ext_extents);
2179		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2180			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2181	}
2182#endif
2183}
2184
2185static void bi_complete(struct bio *bio, int error)
2186{
2187	complete((struct completion *)bio->bi_private);
2188}
2189
2190/* FIXME!! we need to try to merge to left or right after zero-out  */
2191static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2192{
2193	int ret = -EIO;
2194	struct bio *bio;
2195	int blkbits, blocksize;
2196	sector_t ee_pblock;
2197	struct completion event;
2198	unsigned int ee_len, len, done, offset;
2199
2200
2201	blkbits   = inode->i_blkbits;
2202	blocksize = inode->i_sb->s_blocksize;
2203	ee_len    = ext4_ext_get_actual_len(ex);
2204	ee_pblock = ext_pblock(ex);
2205
2206	/* convert ee_pblock to 512 byte sectors */
2207	ee_pblock = ee_pblock << (blkbits - 9);
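	/*
	 * e.g. with 4KiB filesystem blocks (blkbits == 12) a block spans
	 * 2^(12 - 9) = 8 sectors of 512 bytes, so the shift above simply
	 * multiplies the block number by 8.
	 */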
2208
2209	while (ee_len > 0) {
2210
2211		if (ee_len > BIO_MAX_PAGES)
2212			len = BIO_MAX_PAGES;
2213		else
2214			len = ee_len;
2215
2216		bio = bio_alloc(GFP_NOIO, len);
2217		if (!bio)
2218			return -ENOMEM;
2219		bio->bi_sector = ee_pblock;
2220		bio->bi_bdev   = inode->i_sb->s_bdev;
2221
2222		done = 0;
2223		offset = 0;
2224		while (done < len) {
2225			ret = bio_add_page(bio, ZERO_PAGE(0),
2226							blocksize, offset);
2227			if (ret != blocksize) {
2228				/*
2229				 * We can't add any more pages because of
2230				 * hardware limitations.  Start a new bio.
2231				 */
2232				break;
2233			}
2234			done++;
2235			offset += blocksize;
2236			if (offset >= PAGE_CACHE_SIZE)
2237				offset = 0;
2238		}
2239
2240		init_completion(&event);
2241		bio->bi_private = &event;
2242		bio->bi_end_io = bi_complete;
2243		submit_bio(WRITE, bio);
2244		wait_for_completion(&event);
2245
2246		if (test_bit(BIO_UPTODATE, &bio->bi_flags))
2247			ret = 0;
2248		else {
2249			ret = -EIO;
			/* don't leak the bio when bailing out on I/O error */
			bio_put(bio);
2250			break;
2251		}
2252		bio_put(bio);
2253		ee_len    -= done;
2254		ee_pblock += done  << (blkbits - 9);
2255	}
2256	return ret;
2257}
2258
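/*
 * An uninitialized extent (or the piece of it being written) no longer
 * than this many blocks is zeroed out in place instead of being split,
 * on the assumption that zeroing a few blocks is cheaper than the extra
 * extent-tree manipulation (see the checks below).
 */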
2259#define EXT4_EXT_ZERO_LEN 7
2260
2261/*
2262 * This function is called by ext4_ext_get_blocks() if someone tries to write
2263 * to an uninitialized extent. It may result in splitting the uninitialized
2264 * extent into multiple extents (up to three - one initialized and two
2265 * uninitialized).
2266 * There are three possibilities:
2267 *   a> There is no split required: Entire extent should be initialized
2268 *   b> Splits in two extents: Write is happening at either end of the extent
2269 *   c> Splits in three extents: Someone is writing in the middle of the extent
2270 */
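/*
 * Illustrative (hypothetical) numbers: for an uninitialized extent covering
 * logical blocks 100..199, a write to blocks 130..139 is case c> and leaves
 * 100..129 uninitialized, 130..139 initialized and 140..199 uninitialized;
 * a write to 100..149 is case b>, and a write to 100..199 is case a>.
 */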
2271static int ext4_ext_convert_to_initialized(handle_t *handle,
2272						struct inode *inode,
2273						struct ext4_ext_path *path,
2274						ext4_lblk_t iblock,
2275						unsigned long max_blocks)
2276{
2277	struct ext4_extent *ex, newex, orig_ex;
2278	struct ext4_extent *ex1 = NULL;
2279	struct ext4_extent *ex2 = NULL;
2280	struct ext4_extent *ex3 = NULL;
2281	struct ext4_extent_header *eh;
2282	ext4_lblk_t ee_block;
2283	unsigned int allocated, ee_len, depth;
2284	ext4_fsblk_t newblock;
2285	int err = 0;
2286	int ret = 0;
2287
2288	depth = ext_depth(inode);
2289	eh = path[depth].p_hdr;
2290	ex = path[depth].p_ext;
2291	ee_block = le32_to_cpu(ex->ee_block);
2292	ee_len = ext4_ext_get_actual_len(ex);
2293	allocated = ee_len - (iblock - ee_block);
2294	newblock = iblock - ee_block + ext_pblock(ex);
2295	ex2 = ex;
2296	orig_ex.ee_block = ex->ee_block;
2297	orig_ex.ee_len   = cpu_to_le16(ee_len);
2298	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
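	/*
	 * orig_ex is a private copy of the original extent, used by the
	 * zeroout fallbacks and the fix_extent_len error path below to
	 * restore the on-disk extent if a split fails.
	 */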
2299
2300	err = ext4_ext_get_access(handle, inode, path + depth);
2301	if (err)
2302		goto out;
2303	/* If the extent is at most 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2304	if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
2305		err =  ext4_ext_zeroout(inode, &orig_ex);
2306		if (err)
2307			goto fix_extent_len;
2308		/* update the extent length and mark as initialized */
2309		ex->ee_block = orig_ex.ee_block;
2310		ex->ee_len   = orig_ex.ee_len;
2311		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2312		ext4_ext_dirty(handle, inode, path + depth);
2313		/* zeroed the full extent */
2314		return allocated;
2315	}
2316
2317	/* ex1: ee_block to iblock - 1 : uninitialized */
2318	if (iblock > ee_block) {
2319		ex1 = ex;
2320		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2321		ext4_ext_mark_uninitialized(ex1);
2322		ex2 = &newex;
2323	}
2324	/*
2325	 * for sanity, update the length of the ex2 extent before
2326	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2327	 * overlap of blocks.
2328	 */
2329	if (!ex1 && allocated > max_blocks)
2330		ex2->ee_len = cpu_to_le16(max_blocks);
2331	/* ex3: from iblock + max_blocks to ee_block + ee_len - 1 : uninitialized */
2332	if (allocated > max_blocks) {
2333		unsigned int newdepth;
2334		/* If allocated is at most EXT4_EXT_ZERO_LEN blocks, zero out directly */
2335		if (allocated <= EXT4_EXT_ZERO_LEN) {
2336			/*
2337			 * iblock == ee_block is handled by the zeroout
2338			 * at the beginning.
2339			 * Mark first half uninitialized.
2340			 * Mark second half initialized and zero out the
2341			 * initialized extent
2342			 */
2343			ex->ee_block = orig_ex.ee_block;
2344			ex->ee_len   = cpu_to_le16(ee_len - allocated);
2345			ext4_ext_mark_uninitialized(ex);
2346			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2347			ext4_ext_dirty(handle, inode, path + depth);
2348
2349			ex3 = &newex;
2350			ex3->ee_block = cpu_to_le32(iblock);
2351			ext4_ext_store_pblock(ex3, newblock);
2352			ex3->ee_len = cpu_to_le16(allocated);
2353			err = ext4_ext_insert_extent(handle, inode, path, ex3);
2354			if (err == -ENOSPC) {
2355				err =  ext4_ext_zeroout(inode, &orig_ex);
2356				if (err)
2357					goto fix_extent_len;
2358				ex->ee_block = orig_ex.ee_block;
2359				ex->ee_len   = orig_ex.ee_len;
2360				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2361				ext4_ext_dirty(handle, inode, path + depth);
2362				/* blocks available from iblock */
2363				return allocated;
2364
2365			} else if (err)
2366				goto fix_extent_len;
2367
2368			/*
2369			 * We need to zero out the second half because
2370			 * a fallocate request can update the file size, and
2371			 * converting the second half to an initialized extent
2372			 * without zeroing it would leak junk data to user
2373			 * space.
2374			 */
2375			err =  ext4_ext_zeroout(inode, ex3);
2376			if (err) {
2377				/*
2378				 * Mark the second half as uninit and return
2379				 * the error; the insert above may have changed
2380				 * the extent tree, so look the extent up again.
2381				 */
2382				depth = ext_depth(inode);
2383				ext4_ext_drop_refs(path);
2384				path = ext4_ext_find_extent(inode,
2385								iblock, path);
2386				if (IS_ERR(path)) {
2387					err = PTR_ERR(path);
2388					return err;
2389				}
2390				/* get the second half extent details */
2391				ex = path[depth].p_ext;
2392				err = ext4_ext_get_access(handle, inode,
2393								path + depth);
2394				if (err)
2395					return err;
2396				ext4_ext_mark_uninitialized(ex);
2397				ext4_ext_dirty(handle, inode, path + depth);
2398				return err;
2399			}
2400
2401			/* zeroed the second half */
2402			return allocated;
2403		}
2404		ex3 = &newex;
2405		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2406		ext4_ext_store_pblock(ex3, newblock + max_blocks);
2407		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2408		ext4_ext_mark_uninitialized(ex3);
2409		err = ext4_ext_insert_extent(handle, inode, path, ex3);
2410		if (err == -ENOSPC) {
2411			err =  ext4_ext_zeroout(inode, &orig_ex);
2412			if (err)
2413				goto fix_extent_len;
2414			/* update the extent length and mark as initialized */
2415			ex->ee_block = orig_ex.ee_block;
2416			ex->ee_len   = orig_ex.ee_len;
2417			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2418			ext4_ext_dirty(handle, inode, path + depth);
2419			/* zeroed the full extent */
2420			/* blocks available from iblock */
2421			return allocated;
2422
2423		} else if (err)
2424			goto fix_extent_len;
2425		/*
2426		 * The depth, and hence eh & ex might change
2427		 * as part of the insert above.
2428		 */
2429		newdepth = ext_depth(inode);
2430		/*
2431		 * update the extent length after the successful insert of the
2432		 * split extent
2433		 */
2434		orig_ex.ee_len = cpu_to_le16(ee_len -
2435						ext4_ext_get_actual_len(ex3));
2436		depth = newdepth;
2437		ext4_ext_drop_refs(path);
2438		path = ext4_ext_find_extent(inode, iblock, path);
2439		if (IS_ERR(path)) {
2440			err = PTR_ERR(path);
2441			goto out;
2442		}
2443		eh = path[depth].p_hdr;
2444		ex = path[depth].p_ext;
2445		if (ex2 != &newex)
2446			ex2 = ex;
2447
2448		err = ext4_ext_get_access(handle, inode, path + depth);
2449		if (err)
2450			goto out;
2451
2452		allocated = max_blocks;
2453
2454		/* If the extent is at most EXT4_EXT_ZERO_LEN blocks and we are
2455		 * trying to insert an extent in the middle, zero out directly;
2456		 * otherwise give the extent a chance to merge to the left
2457		 */
2458		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2459							iblock != ee_block) {
2460			err =  ext4_ext_zeroout(inode, &orig_ex);
2461			if (err)
2462				goto fix_extent_len;
2463			/* update the extent length and mark as initialized */
2464			ex->ee_block = orig_ex.ee_block;
2465			ex->ee_len   = orig_ex.ee_len;
2466			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2467			ext4_ext_dirty(handle, inode, path + depth);
2468			/* zero out the first half */
2469			/* blocks available from iblock */
2470			return allocated;
2471		}
2472	}
2473	/*
2474	 * If there was a change of depth as part of the
2475	 * insertion of ex3 above, we need to update the length
2476	 * of the ex1 extent again here
2477	 */
2478	if (ex1 && ex1 != ex) {
2479		ex1 = ex;
2480		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2481		ext4_ext_mark_uninitialized(ex1);
2482		ex2 = &newex;
2483	}
2484	/* ex2: iblock to iblock + maxblocks-1 : initialised */
2485	ex2->ee_block = cpu_to_le32(iblock);
2486	ext4_ext_store_pblock(ex2, newblock);
2487	ex2->ee_len = cpu_to_le16(allocated);
2488	if (ex2 != ex)
2489		goto insert;
2490	/*
2491	 * The new (initialized) extent starts from the first block
2492	 * of the current extent, i.e., ex2 == ex.
2493	 * We have to see if it can be merged with the extent
2494	 * on the left.
2495	 */
2496	if (ex2 > EXT_FIRST_EXTENT(eh)) {
2497		/*
2498		 * To merge left, pass "ex2 - 1" to try_to_merge(),
2499		 * since it merges towards right _only_.
2500		 */
2501		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2502		if (ret) {
2503			err = ext4_ext_correct_indexes(handle, inode, path);
2504			if (err)
2505				goto out;
2506			depth = ext_depth(inode);
2507			ex2--;
2508		}
2509	}
2510	/*
2511	 * Try to merge towards the right. This might be required
2512	 * only when the whole extent is being written to.
2513	 * i.e. ex2 == ex and ex3 == NULL.
2514	 */
2515	if (!ex3) {
2516		ret = ext4_ext_try_to_merge(inode, path, ex2);
2517		if (ret) {
2518			err = ext4_ext_correct_indexes(handle, inode, path);
2519			if (err)
2520				goto out;
2521		}
2522	}
2523	/* Mark modified extent as dirty */
2524	err = ext4_ext_dirty(handle, inode, path + depth);
2525	goto out;
2526insert:
2527	err = ext4_ext_insert_extent(handle, inode, path, &newex);
2528	if (err == -ENOSPC) {
2529		err =  ext4_ext_zeroout(inode, &orig_ex);
2530		if (err)
2531			goto fix_extent_len;
2532		/* update the extent length and mark as initialized */
2533		ex->ee_block = orig_ex.ee_block;
2534		ex->ee_len   = orig_ex.ee_len;
2535		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2536		ext4_ext_dirty(handle, inode, path + depth);
2537		/* zero out the first half */
2538		return allocated;
2539	} else if (err)
2540		goto fix_extent_len;
2541out:
2542	return err ? err : allocated;
2543
2544fix_extent_len:
2545	ex->ee_block = orig_ex.ee_block;
2546	ex->ee_len   = orig_ex.ee_len;
2547	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2548	ext4_ext_mark_uninitialized(ex);
2549	ext4_ext_dirty(handle, inode, path + depth);
2550	return err;
2551}
2552
2553/*
2554 * Block allocation/map/preallocation routine for extent-based files
2555 *
2556 *
2557 * Needs to be called with
2558 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
2559 * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
2560 *
2561 * return > 0, number of blocks already mapped/allocated
2562 *          if create == 0 and these are pre-allocated blocks
2563 *          	buffer head is unmapped
2564 *          otherwise blocks are mapped
2565 *
2566 * return = 0, if a plain lookup failed (blocks have not been allocated)
2567 *          buffer head is unmapped
2568 *
2569 * return < 0, error case.
2570 */
2571int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2572			ext4_lblk_t iblock,
2573			unsigned long max_blocks, struct buffer_head *bh_result,
2574			int create, int extend_disksize)
2575{
2576	struct ext4_ext_path *path = NULL;
2577	struct ext4_extent_header *eh;
2578	struct ext4_extent newex, *ex;
2579	ext4_fsblk_t goal, newblock;
2580	int err = 0, depth, ret;
2581	unsigned long allocated = 0;
2582	struct ext4_allocation_request ar;
2583	loff_t disksize;
2584
2585	__clear_bit(BH_New, &bh_result->b_state);
2586	ext_debug("blocks %u/%lu requested for inode %lu\n",
2587			iblock, max_blocks, inode->i_ino);
2588
2589	/* check in cache */
2590	goal = ext4_ext_in_cache(inode, iblock, &newex);
2591	if (goal) {
2592		if (goal == EXT4_EXT_CACHE_GAP) {
2593			if (!create) {
2594				/*
2595				 * block isn't allocated yet and
2596				 * user doesn't want to allocate it
2597				 */
2598				goto out2;
2599			}
2600			/* we should allocate requested block */
2601		} else if (goal == EXT4_EXT_CACHE_EXTENT) {
2602			/* block is already allocated */
2603			newblock = iblock
2604				   - le32_to_cpu(newex.ee_block)
2605				   + ext_pblock(&newex);
2606			/* number of remaining blocks in the extent */
2607			allocated = ext4_ext_get_actual_len(&newex) -
2608					(iblock - le32_to_cpu(newex.ee_block));
2609			goto out;
2610		} else {
2611			BUG();
2612		}
2613	}
2614
2615	/* find extent for this block */
2616	path = ext4_ext_find_extent(inode, iblock, NULL);
2617	if (IS_ERR(path)) {
2618		err = PTR_ERR(path);
2619		path = NULL;
2620		goto out2;
2621	}
2622
2623	depth = ext_depth(inode);
2624
2625	/*
2626	 * consistent leaf must not be empty;
2627	 * this situation is possible, though, _during_ tree modification;
2628	 * this is why assert can't be put in ext4_ext_find_extent()
2629	 */
2630	BUG_ON(path[depth].p_ext == NULL && depth != 0);
2631	eh = path[depth].p_hdr;
2632
2633	ex = path[depth].p_ext;
2634	if (ex) {
2635		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
2636		ext4_fsblk_t ee_start = ext_pblock(ex);
2637		unsigned short ee_len;
2638
2639		/*
2640		 * Uninitialized extents are treated as holes, except that
2641		 * we split out initialized portions during a write.
2642		 */
2643		ee_len = ext4_ext_get_actual_len(ex);
2644		/* if found extent covers block, simply return it */
2645		if (iblock >= ee_block && iblock < ee_block + ee_len) {
2646			newblock = iblock - ee_block + ee_start;
2647			/* number of remaining blocks in the extent */
2648			allocated = ee_len - (iblock - ee_block);
2649			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
2650					ee_block, ee_len, newblock);
2651
2652			/* Do not put uninitialized extent in the cache */
2653			if (!ext4_ext_is_uninitialized(ex)) {
2654				ext4_ext_put_in_cache(inode, ee_block,
2655							ee_len, ee_start,
2656							EXT4_EXT_CACHE_EXTENT);
2657				goto out;
2658			}
2659			if (create == EXT4_CREATE_UNINITIALIZED_EXT)
2660				goto out;
2661			if (!create) {
2662				/*
2663				 * We have blocks reserved already.  We
2664				 * return allocated blocks so that delalloc
2665				 * won't do block reservation for us.  But
2666				 * the buffer head will be unmapped so that
2667				 * a read from the block returns 0s.
2668				 */
2669				if (allocated > max_blocks)
2670					allocated = max_blocks;
2671				set_buffer_unwritten(bh_result);
2672				goto out2;
2673			}
2674
2675			ret = ext4_ext_convert_to_initialized(handle, inode,
2676								path, iblock,
2677								max_blocks);
2678			if (ret <= 0) {
2679				err = ret;
2680				goto out2;
2681			} else
2682				allocated = ret;
2683			goto outnew;
2684		}
2685	}
2686
2687	/*
2688	 * requested block isn't allocated yet;
2689	 * we must not allocate it if the create flag is zero
2690	 */
2691	if (!create) {
2692		/*
2693		 * put just found gap into cache to speed up
2694		 * subsequent requests
2695		 */
2696		ext4_ext_put_gap_in_cache(inode, path, iblock);
2697		goto out2;
2698	}
2699	/*
2700	 * Okay, we need to do block allocation.
2701	 */
2702
2703	/* find neighbour allocated blocks */
2704	ar.lleft = iblock;
2705	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
2706	if (err)
2707		goto out2;
2708	ar.lright = iblock;
2709	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
2710	if (err)
2711		goto out2;
2712
2713	/*
2714	 * See if request is beyond maximum number of blocks we can have in
2715	 * a single extent. For an initialized extent this limit is
2716	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
2717	 * EXT_UNINIT_MAX_LEN.
2718	 */
2719	if (max_blocks > EXT_INIT_MAX_LEN &&
2720	    create != EXT4_CREATE_UNINITIALIZED_EXT)
2721		max_blocks = EXT_INIT_MAX_LEN;
2722	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
2723		 create == EXT4_CREATE_UNINITIALIZED_EXT)
2724		max_blocks = EXT_UNINIT_MAX_LEN;
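	/*
	 * The on-disk length field (ee_len) is 16 bits and its high bit
	 * flags an uninitialized extent, so a single initialized extent can
	 * cover up to 32768 blocks (EXT_INIT_MAX_LEN) while an uninitialized
	 * one is limited to 32767 (EXT_UNINIT_MAX_LEN).
	 */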
2725
2726	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
2727	newex.ee_block = cpu_to_le32(iblock);
2728	newex.ee_len = cpu_to_le16(max_blocks);
2729	err = ext4_ext_check_overlap(inode, &newex, path);
2730	if (err)
2731		allocated = ext4_ext_get_actual_len(&newex);
2732	else
2733		allocated = max_blocks;
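	/*
	 * A non-zero return from ext4_ext_check_overlap() means the proposed
	 * range would run into an existing extent on the right and
	 * newex.ee_len has been trimmed accordingly, so 'allocated' is the
	 * length we can actually insert.
	 */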
2734
2735	/* allocate new block */
2736	ar.inode = inode;
2737	ar.goal = ext4_ext_find_goal(inode, path, iblock);
2738	ar.logical = iblock;
2739	ar.len = allocated;
2740	if (S_ISREG(inode->i_mode))
2741		ar.flags = EXT4_MB_HINT_DATA;
2742	else
2743		/* disable in-core preallocation for non-regular files */
2744		ar.flags = 0;
2745	newblock = ext4_mb_new_blocks(handle, &ar, &err);
2746	if (!newblock)
2747		goto out2;
2748	ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2749			goal, newblock, allocated);
2750
2751	/* try to insert new extent into found leaf and return */
2752	ext4_ext_store_pblock(&newex, newblock);
2753	newex.ee_len = cpu_to_le16(ar.len);
2754	if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
2755		ext4_ext_mark_uninitialized(&newex);
2756	err = ext4_ext_insert_extent(handle, inode, path, &newex);
2757	if (err) {
2758		/* free data blocks we just allocated */
2759		/* not a good idea to call discard here directly,
2760		 * but otherwise we'd need to call it on every free() */
2761		ext4_discard_preallocations(inode);
2762		ext4_free_blocks(handle, inode, ext_pblock(&newex),
2763					ext4_ext_get_actual_len(&newex), 0);
2764		goto out2;
2765	}
2766
2767	/* previous routine could use block we allocated */
2768	newblock = ext_pblock(&newex);
2769	allocated = ext4_ext_get_actual_len(&newex);
2770outnew:
2771	if (extend_disksize) {
2772		disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
2773		if (disksize > i_size_read(inode))
2774			disksize = i_size_read(inode);
2775		if (disksize > EXT4_I(inode)->i_disksize)
2776			EXT4_I(inode)->i_disksize = disksize;
2777	}
2778
2779	set_buffer_new(bh_result);
2780
2781	/* Cache only when it is _not_ an uninitialized extent */
2782	if (create != EXT4_CREATE_UNINITIALIZED_EXT)
2783		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2784						EXT4_EXT_CACHE_EXTENT);
2785out:
2786	if (allocated > max_blocks)
2787		allocated = max_blocks;
2788	ext4_ext_show_leaf(inode, path);
2789	set_buffer_mapped(bh_result);
2790	bh_result->b_bdev = inode->i_sb->s_bdev;
2791	bh_result->b_blocknr = newblock;
2792out2:
2793	if (path) {
2794		ext4_ext_drop_refs(path);
2795		kfree(path);
2796	}
2797	return err ? err : allocated;
2798}
2799
2800void ext4_ext_truncate(struct inode *inode)
2801{
2802	struct address_space *mapping = inode->i_mapping;
2803	struct super_block *sb = inode->i_sb;
2804	ext4_lblk_t last_block;
2805	handle_t *handle;
2806	int err = 0;
2807
2808	/*
2809	 * probably first extent we're gonna free will be last in block
2810	 */
2811	err = ext4_writepage_trans_blocks(inode);
2812	handle = ext4_journal_start(inode, err);
2813	if (IS_ERR(handle))
2814		return;
2815
2816	if (inode->i_size & (sb->s_blocksize - 1))
2817		ext4_block_truncate_page(handle, mapping, inode->i_size);
2818
2819	if (ext4_orphan_add(handle, inode))
2820		goto out_stop;
2821
2822	down_write(&EXT4_I(inode)->i_data_sem);
2823	ext4_ext_invalidate_cache(inode);
2824
2825	ext4_discard_preallocations(inode);
2826
2827	/*
2828	 * TODO: optimization is possible here.
2829	 * Probably we need not scan at all,
2830	 * because page truncation is enough.
2831	 */
2832
2833	/* we have to know where to truncate from in case of a crash */
2834	EXT4_I(inode)->i_disksize = inode->i_size;
2835	ext4_mark_inode_dirty(handle, inode);
2836
2837	last_block = (inode->i_size + sb->s_blocksize - 1)
2838			>> EXT4_BLOCK_SIZE_BITS(sb);
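	/*
	 * Example (illustrative): with 4KiB blocks and i_size = 10000 bytes,
	 * last_block = (10000 + 4095) >> 12 = 3, so logical blocks 3 and up
	 * are removed; the partially used block 2 is kept and its tail was
	 * zeroed by ext4_block_truncate_page() above.
	 */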
2839	err = ext4_ext_remove_space(inode, last_block);
2840
2841	/* In a multi-transaction truncate, we only make the final
2842	 * transaction synchronous.
2843	 */
2844	if (IS_SYNC(inode))
2845		handle->h_sync = 1;
2846
2847out_stop:
2848	up_write(&EXT4_I(inode)->i_data_sem);
2849	/*
2850	 * If this was a simple ftruncate() and the file will remain alive,
2851	 * then we need to clear up the orphan record which we created above.
2852	 * However, if this was a real unlink then we were called by
2853	 * ext4_delete_inode(), and we allow that function to clean up the
2854	 * orphan info for us.
2855	 */
2856	if (inode->i_nlink)
2857		ext4_orphan_del(handle, inode);
2858
2859	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
2860	ext4_mark_inode_dirty(handle, inode);
2861	ext4_journal_stop(handle);
2862}
2863
2864static void ext4_falloc_update_inode(struct inode *inode,
2865				int mode, loff_t new_size, int update_ctime)
2866{
2867	struct timespec now;
2868
2869	if (update_ctime) {
2870		now = current_fs_time(inode->i_sb);
2871		if (!timespec_equal(&inode->i_ctime, &now))
2872			inode->i_ctime = now;
2873	}
2874	/*
2875	 * Update only when preallocation was requested beyond
2876	 * the file size.
2877	 */
2878	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2879		if (new_size > i_size_read(inode))
2880			i_size_write(inode, new_size);
2881		if (new_size > EXT4_I(inode)->i_disksize)
2882			ext4_update_i_disksize(inode, new_size);
2883	}
2884
2885}
2886
2887/*
2888 * preallocate space for a file. This implements ext4's fallocate inode
2889 * operation, which gets called from the sys_fallocate system call.
2890 * For block-mapped files, posix_fallocate should fall back to the method
2891 * of writing zeroes to the required new blocks (the same behavior that is
2892 * expected of file systems which do not support the fallocate() system call).
2893 */
2894long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2895{
2896	handle_t *handle;
2897	ext4_lblk_t block;
2898	loff_t new_size;
2899	unsigned long max_blocks;
2900	int ret = 0;
2901	int ret2 = 0;
2902	int retries = 0;
2903	struct buffer_head map_bh;
2904	unsigned int credits, blkbits = inode->i_blkbits;
2905
2906	/*
2907	 * currently supporting (pre)allocate mode for extent-based
2908	 * files _only_
2909	 */
2910	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
2911		return -EOPNOTSUPP;
2912
2913	/* preallocation to directories is currently not supported */
2914	if (S_ISDIR(inode->i_mode))
2915		return -ENODEV;
2916
2917	block = offset >> blkbits;
2918	/*
2919	 * We can't just convert len to max_blocks because the range may not
2920	 * be block aligned, e.g. blocksize = 4096, offset = 3072, len = 2048
2921	 */
2922	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
2923							- block;
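	/*
	 * Worked example of the computation above: with blkbits = 12,
	 * offset = 3072 and len = 2048 we get block = 0 and
	 * EXT4_BLOCK_ALIGN(5120, 12) >> 12 = 2, so max_blocks = 2 even
	 * though len >> blkbits alone would give 0.
	 */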
2924	/*
2925	 * credits to insert 1 extent into extent tree
2926	 */
2927	credits = ext4_chunk_trans_blocks(inode, max_blocks);
2928	mutex_lock(&inode->i_mutex);
2929retry:
2930	while (ret >= 0 && ret < max_blocks) {
2931		block = block + ret;
2932		max_blocks = max_blocks - ret;
2933		handle = ext4_journal_start(inode, credits);
2934		if (IS_ERR(handle)) {
2935			ret = PTR_ERR(handle);
2936			break;
2937		}
2938		ret = ext4_get_blocks_wrap(handle, inode, block,
2939					  max_blocks, &map_bh,
2940					  EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
2941		if (ret <= 0) {
2942#ifdef EXT4FS_DEBUG
2943			WARN_ON(ret <= 0);
2944			printk(KERN_ERR "%s: ext4_ext_get_blocks "
2945				    "returned error inode#%lu, block=%u, "
2946				    "max_blocks=%lu", __func__,
2947				    inode->i_ino, block, max_blocks);
2948#endif
2949			ext4_mark_inode_dirty(handle, inode);
2950			ret2 = ext4_journal_stop(handle);
2951			break;
2952		}
2953		if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
2954						blkbits) >> blkbits))
2955			new_size = offset + len;
2956		else
2957			new_size = ((loff_t) block + ret) << blkbits;
2958
2959		ext4_falloc_update_inode(inode, mode, new_size,
2960						buffer_new(&map_bh));
2961		ext4_mark_inode_dirty(handle, inode);
2962		ret2 = ext4_journal_stop(handle);
2963		if (ret2)
2964			break;
2965	}
2966	if (ret == -ENOSPC &&
2967			ext4_should_retry_alloc(inode->i_sb, &retries)) {
2968		ret = 0;
2969		goto retry;
2970	}
2971	mutex_unlock(&inode->i_mutex);
2972	return ret > 0 ? ret2 : ret;
2973}
2974