extents.c revision 515f41c33a9d44a964264c9511ad2c869af1fac3
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
44#include "ext4_jbd2.h"
45#include "ext4_extents.h"
46
47
48/*
49 * ext_pblock:
50 * combine low and high parts of physical block number into ext4_fsblk_t
51 */
52ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
53{
54	ext4_fsblk_t block;
55
56	block = le32_to_cpu(ex->ee_start_lo);
57	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
58	return block;
59}
60
61/*
62 * idx_pblock:
63 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
64 */
65ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
66{
67	ext4_fsblk_t block;
68
69	block = le32_to_cpu(ix->ei_leaf_lo);
70	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
71	return block;
72}
73
74/*
75 * ext4_ext_store_pblock:
76 * stores a large physical block number into an extent struct,
77 * breaking it into parts
78 */
79void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
80{
81	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
82	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
83}
84
85/*
86 * ext4_idx_store_pblock:
87 * stores a large physical block number into an index struct,
88 * breaking it into parts
89 */
90static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91{
92	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
93	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94}
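/*
 * Illustrative note (not part of the original source): ext4 physical
 * block numbers are 48 bits wide, split into a 32-bit low part and a
 * 16-bit high part in the on-disk extent and index structures.  The
 * shift is written as "<< 31) << 1" rather than "<< 32", presumably so
 * the expression stays well defined even if the intermediate type were
 * only 32 bits wide.  As a sketch, pb = 0x123456789abc would be stored
 * as ee_start_lo = 0x56789abc and ee_start_hi = 0x1234, and
 * ext_pblock() reassembles the same 48-bit value.
 */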
95
96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97					    struct inode *inode,
98					    int needed)
99{
100	int err;
101
102	if (!ext4_handle_valid(handle))
103		return 0;
104	if (handle->h_buffer_credits > needed)
105		return 0;
106	err = ext4_journal_extend(handle, needed);
107	if (err <= 0)
108		return err;
109	err = ext4_truncate_restart_trans(handle, inode, needed);
110	/*
111	 * We have dropped i_data_sem so someone might have cached again
112	 * an extent we are going to truncate.
113	 */
114	ext4_ext_invalidate_cache(inode);
115
116	return err;
117}
118
119/*
120 * could return:
121 *  - EROFS
122 *  - ENOMEM
123 */
124static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
125				struct ext4_ext_path *path)
126{
127	if (path->p_bh) {
128		/* path points to block */
129		return ext4_journal_get_write_access(handle, path->p_bh);
130	}
131	/* path points to leaf/index in inode body */
132	/* we use in-core data, no need to protect it */
133	return 0;
134}
135
136/*
137 * could return:
138 *  - EROFS
139 *  - ENOMEM
140 *  - EIO
141 */
142static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
143				struct ext4_ext_path *path)
144{
145	int err;
146	if (path->p_bh) {
147		/* path points to block */
148		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
149	} else {
150		/* path points to leaf/index in inode body */
151		err = ext4_mark_inode_dirty(handle, inode);
152	}
153	return err;
154}
155
156static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
157			      struct ext4_ext_path *path,
158			      ext4_lblk_t block)
159{
160	struct ext4_inode_info *ei = EXT4_I(inode);
161	ext4_fsblk_t bg_start;
162	ext4_fsblk_t last_block;
163	ext4_grpblk_t colour;
164	ext4_group_t block_group;
165	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
166	int depth;
167
168	if (path) {
169		struct ext4_extent *ex;
170		depth = path->p_depth;
171
172		/* try to predict block placement */
173		ex = path[depth].p_ext;
174		if (ex)
175			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
176
177		/* it looks like index is empty;
178		 * try to find starting block from index itself */
179		if (path[depth].p_bh)
180			return path[depth].p_bh->b_blocknr;
181	}
182
183	/* OK. use inode's group */
184	block_group = ei->i_block_group;
185	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186		/*
187		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188		 * block groups per flexgroup, reserve the first block
189		 * group for directories and special files.  Regular
190		 * files will start at the second block group.  This
191		 * tends to speed up directory access and improves
192		 * fsck times.
193		 */
194		block_group &= ~(flex_size-1);
195		if (S_ISREG(inode->i_mode))
196			block_group++;
197	}
198	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
199		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
200	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
201
202	/*
203	 * If we are doing delayed allocation, we don't need to take
204	 * colour into account.
205	 */
206	if (test_opt(inode->i_sb, DELALLOC))
207		return bg_start;
208
209	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
210		colour = (current->pid % 16) *
211			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
212	else
213		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
214	return bg_start + colour + block;
215}
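/*
 * Illustrative example (not part of the original source): with the
 * common 32768 blocks per group and delayed allocation disabled, a
 * caller whose current->pid % 16 == 5 gets
 * colour = 5 * (32768 / 16) = 10240, spreading the allocation goals
 * of different processes across the block group to reduce contention.
 */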
216
217/*
218 * Allocation for a metadata block
219 */
220static ext4_fsblk_t
221ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
222			struct ext4_ext_path *path,
223			struct ext4_extent *ex, int *err)
224{
225	ext4_fsblk_t goal, newblock;
226
227	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
228	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
229	return newblock;
230}
231
232static inline int ext4_ext_space_block(struct inode *inode, int check)
233{
234	int size;
235
236	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
237			/ sizeof(struct ext4_extent);
238	if (!check) {
239#ifdef AGGRESSIVE_TEST
240		if (size > 6)
241			size = 6;
242#endif
243	}
244	return size;
245}
246
247static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
248{
249	int size;
250
251	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
252			/ sizeof(struct ext4_extent_idx);
253	if (!check) {
254#ifdef AGGRESSIVE_TEST
255		if (size > 5)
256			size = 5;
257#endif
258	}
259	return size;
260}
261
262static inline int ext4_ext_space_root(struct inode *inode, int check)
263{
264	int size;
265
266	size = sizeof(EXT4_I(inode)->i_data);
267	size -= sizeof(struct ext4_extent_header);
268	size /= sizeof(struct ext4_extent);
269	if (!check) {
270#ifdef AGGRESSIVE_TEST
271		if (size > 3)
272			size = 3;
273#endif
274	}
275	return size;
276}
277
278static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
279{
280	int size;
281
282	size = sizeof(EXT4_I(inode)->i_data);
283	size -= sizeof(struct ext4_extent_header);
284	size /= sizeof(struct ext4_extent_idx);
285	if (!check) {
286#ifdef AGGRESSIVE_TEST
287		if (size > 4)
288			size = 4;
289#endif
290	}
291	return size;
292}
293
294/*
295 * Calculate the number of metadata blocks needed
296 * to allocate @blocks
297 * Worst case is one block per extent
298 */
299int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
300{
301	int lcap, icap, rcap, leafs, idxs, num;
302	int newextents = blocks;
303
304	rcap = ext4_ext_space_root_idx(inode, 0);
305	lcap = ext4_ext_space_block(inode, 0);
306	icap = ext4_ext_space_block_idx(inode, 0);
307
308	/* number of new leaf blocks needed */
309	num = leafs = (newextents + lcap - 1) / lcap;
310
311	/*
312	 * Worst case, we need separate index block(s)
313	 * to link all new leaf blocks
314	 */
315	idxs = (leafs + icap - 1) / icap;
316	do {
317		num += idxs;
318		idxs = (idxs + icap - 1) / icap;
319	} while (idxs > rcap);
320
321	return num;
322}
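/*
 * Worked example (illustrative, not part of the original source),
 * assuming a 4KB block size and 12-byte header/extent/index records:
 * lcap = icap = (4096 - 12) / 12 = 340, so blocks = 1000 new extents
 * need leafs = DIV_ROUND_UP(1000, 340) = 3 leaf blocks plus
 * DIV_ROUND_UP(3, 340) = 1 index block, i.e. 4 metadata blocks in the
 * worst case.
 */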
323
324static int
325ext4_ext_max_entries(struct inode *inode, int depth)
326{
327	int max;
328
329	if (depth == ext_depth(inode)) {
330		if (depth == 0)
331			max = ext4_ext_space_root(inode, 1);
332		else
333			max = ext4_ext_space_root_idx(inode, 1);
334	} else {
335		if (depth == 0)
336			max = ext4_ext_space_block(inode, 1);
337		else
338			max = ext4_ext_space_block_idx(inode, 1);
339	}
340
341	return max;
342}
343
344static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
345{
346	ext4_fsblk_t block = ext_pblock(ext);
347	int len = ext4_ext_get_actual_len(ext);
348
349	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
350}
351
352static int ext4_valid_extent_idx(struct inode *inode,
353				struct ext4_extent_idx *ext_idx)
354{
355	ext4_fsblk_t block = idx_pblock(ext_idx);
356
357	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
358}
359
360static int ext4_valid_extent_entries(struct inode *inode,
361				struct ext4_extent_header *eh,
362				int depth)
363{
364	struct ext4_extent *ext;
365	struct ext4_extent_idx *ext_idx;
366	unsigned short entries;
367	if (eh->eh_entries == 0)
368		return 1;
369
370	entries = le16_to_cpu(eh->eh_entries);
371
372	if (depth == 0) {
373		/* leaf entries */
374		ext = EXT_FIRST_EXTENT(eh);
375		while (entries) {
376			if (!ext4_valid_extent(inode, ext))
377				return 0;
378			ext++;
379			entries--;
380		}
381	} else {
382		ext_idx = EXT_FIRST_INDEX(eh);
383		while (entries) {
384			if (!ext4_valid_extent_idx(inode, ext_idx))
385				return 0;
386			ext_idx++;
387			entries--;
388		}
389	}
390	return 1;
391}
392
393static int __ext4_ext_check(const char *function, struct inode *inode,
394					struct ext4_extent_header *eh,
395					int depth)
396{
397	const char *error_msg;
398	int max = 0;
399
400	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
401		error_msg = "invalid magic";
402		goto corrupted;
403	}
404	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
405		error_msg = "unexpected eh_depth";
406		goto corrupted;
407	}
408	if (unlikely(eh->eh_max == 0)) {
409		error_msg = "invalid eh_max";
410		goto corrupted;
411	}
412	max = ext4_ext_max_entries(inode, depth);
413	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
414		error_msg = "too large eh_max";
415		goto corrupted;
416	}
417	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
418		error_msg = "invalid eh_entries";
419		goto corrupted;
420	}
421	if (!ext4_valid_extent_entries(inode, eh, depth)) {
422		error_msg = "invalid extent entries";
423		goto corrupted;
424	}
425	return 0;
426
427corrupted:
428	ext4_error(inode->i_sb, function,
429			"bad header/extent in inode #%lu: %s - magic %x, "
430			"entries %u, max %u(%u), depth %u(%u)",
431			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
432			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
433			max, le16_to_cpu(eh->eh_depth), depth);
434
435	return -EIO;
436}
437
438#define ext4_ext_check(inode, eh, depth)	\
439	__ext4_ext_check(__func__, inode, eh, depth)
440
441int ext4_ext_check_inode(struct inode *inode)
442{
443	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
444}
445
446#ifdef EXT_DEBUG
447static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
448{
449	int k, l = path->p_depth;
450
451	ext_debug("path:");
452	for (k = 0; k <= l; k++, path++) {
453		if (path->p_idx) {
454		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
455			    idx_pblock(path->p_idx));
456		} else if (path->p_ext) {
457			ext_debug("  %d:[%d]%d:%llu ",
458				  le32_to_cpu(path->p_ext->ee_block),
459				  ext4_ext_is_uninitialized(path->p_ext),
460				  ext4_ext_get_actual_len(path->p_ext),
461				  ext_pblock(path->p_ext));
462		} else
463			ext_debug("  []");
464	}
465	ext_debug("\n");
466}
467
468static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
469{
470	int depth = ext_depth(inode);
471	struct ext4_extent_header *eh;
472	struct ext4_extent *ex;
473	int i;
474
475	if (!path)
476		return;
477
478	eh = path[depth].p_hdr;
479	ex = EXT_FIRST_EXTENT(eh);
480
481	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
482
483	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
484		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
485			  ext4_ext_is_uninitialized(ex),
486			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
487	}
488	ext_debug("\n");
489}
490#else
491#define ext4_ext_show_path(inode, path)
492#define ext4_ext_show_leaf(inode, path)
493#endif
494
495void ext4_ext_drop_refs(struct ext4_ext_path *path)
496{
497	int depth = path->p_depth;
498	int i;
499
500	for (i = 0; i <= depth; i++, path++)
501		if (path->p_bh) {
502			brelse(path->p_bh);
503			path->p_bh = NULL;
504		}
505}
506
507/*
508 * ext4_ext_binsearch_idx:
509 * binary search for the closest index of the given block
510 * the header must be checked before calling this
511 */
512static void
513ext4_ext_binsearch_idx(struct inode *inode,
514			struct ext4_ext_path *path, ext4_lblk_t block)
515{
516	struct ext4_extent_header *eh = path->p_hdr;
517	struct ext4_extent_idx *r, *l, *m;
518
519
520	ext_debug("binsearch for %u(idx):  ", block);
521
522	l = EXT_FIRST_INDEX(eh) + 1;
523	r = EXT_LAST_INDEX(eh);
524	while (l <= r) {
525		m = l + (r - l) / 2;
526		if (block < le32_to_cpu(m->ei_block))
527			r = m - 1;
528		else
529			l = m + 1;
530		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
531				m, le32_to_cpu(m->ei_block),
532				r, le32_to_cpu(r->ei_block));
533	}
534
535	path->p_idx = l - 1;
536	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
537		  idx_pblock(path->p_idx));
538
539#ifdef CHECK_BINSEARCH
540	{
541		struct ext4_extent_idx *chix, *ix;
542		int k;
543
544		chix = ix = EXT_FIRST_INDEX(eh);
545		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
546		  if (k != 0 &&
547		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
548				printk(KERN_DEBUG "k=%d, ix=0x%p, "
549				       "first=0x%p\n", k,
550				       ix, EXT_FIRST_INDEX(eh));
551				printk(KERN_DEBUG "%u <= %u\n",
552				       le32_to_cpu(ix->ei_block),
553				       le32_to_cpu(ix[-1].ei_block));
554			}
555			BUG_ON(k && le32_to_cpu(ix->ei_block)
556					   <= le32_to_cpu(ix[-1].ei_block));
557			if (block < le32_to_cpu(ix->ei_block))
558				break;
559			chix = ix;
560		}
561		BUG_ON(chix != path->p_idx);
562	}
563#endif
564
565}
566
567/*
568 * ext4_ext_binsearch:
569 * binary search for closest extent of the given block
570 * the header must be checked before calling this
571 */
572static void
573ext4_ext_binsearch(struct inode *inode,
574		struct ext4_ext_path *path, ext4_lblk_t block)
575{
576	struct ext4_extent_header *eh = path->p_hdr;
577	struct ext4_extent *r, *l, *m;
578
579	if (eh->eh_entries == 0) {
580		/*
581		 * this leaf is empty:
582		 * we get such a leaf in split/add case
583		 */
584		return;
585	}
586
587	ext_debug("binsearch for %u:  ", block);
588
589	l = EXT_FIRST_EXTENT(eh) + 1;
590	r = EXT_LAST_EXTENT(eh);
591
592	while (l <= r) {
593		m = l + (r - l) / 2;
594		if (block < le32_to_cpu(m->ee_block))
595			r = m - 1;
596		else
597			l = m + 1;
598		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
599				m, le32_to_cpu(m->ee_block),
600				r, le32_to_cpu(r->ee_block));
601	}
602
603	path->p_ext = l - 1;
604	ext_debug("  -> %d:%llu:[%d]%d ",
605			le32_to_cpu(path->p_ext->ee_block),
606			ext_pblock(path->p_ext),
607			ext4_ext_is_uninitialized(path->p_ext),
608			ext4_ext_get_actual_len(path->p_ext));
609
610#ifdef CHECK_BINSEARCH
611	{
612		struct ext4_extent *chex, *ex;
613		int k;
614
615		chex = ex = EXT_FIRST_EXTENT(eh);
616		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
617			BUG_ON(k && le32_to_cpu(ex->ee_block)
618					  <= le32_to_cpu(ex[-1].ee_block));
619			if (block < le32_to_cpu(ex->ee_block))
620				break;
621			chex = ex;
622		}
623		BUG_ON(chex != path->p_ext);
624	}
625#endif
626
627}
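/*
 * Illustrative note (not part of the original source): both binary
 * searches above keep the invariant that every entry left of 'l'
 * starts at or before 'block' and every entry from 'l' onward starts
 * after it, so 'l - 1' is the right-most entry whose starting block
 * is <= block.  E.g. for a leaf with extents starting at logical
 * blocks 0, 100 and 200, a search for block 150 leaves path->p_ext
 * pointing at the extent that starts at 100.
 */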
628
629int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
630{
631	struct ext4_extent_header *eh;
632
633	eh = ext_inode_hdr(inode);
634	eh->eh_depth = 0;
635	eh->eh_entries = 0;
636	eh->eh_magic = EXT4_EXT_MAGIC;
637	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
638	ext4_mark_inode_dirty(handle, inode);
639	ext4_ext_invalidate_cache(inode);
640	return 0;
641}
642
643struct ext4_ext_path *
644ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
645					struct ext4_ext_path *path)
646{
647	struct ext4_extent_header *eh;
648	struct buffer_head *bh;
649	short int depth, i, ppos = 0, alloc = 0;
650
651	eh = ext_inode_hdr(inode);
652	depth = ext_depth(inode);
653
654	/* account possible depth increase */
655	if (!path) {
656		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
657				GFP_NOFS);
658		if (!path)
659			return ERR_PTR(-ENOMEM);
660		alloc = 1;
661	}
662	path[0].p_hdr = eh;
663	path[0].p_bh = NULL;
664
665	i = depth;
666	/* walk through the tree */
667	while (i) {
668		int need_to_validate = 0;
669
670		ext_debug("depth %d: num %d, max %d\n",
671			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
672
673		ext4_ext_binsearch_idx(inode, path + ppos, block);
674		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
675		path[ppos].p_depth = i;
676		path[ppos].p_ext = NULL;
677
678		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
679		if (unlikely(!bh))
680			goto err;
681		if (!bh_uptodate_or_lock(bh)) {
682			if (bh_submit_read(bh) < 0) {
683				put_bh(bh);
684				goto err;
685			}
686			/* validate the extent entries */
687			need_to_validate = 1;
688		}
689		eh = ext_block_hdr(bh);
690		ppos++;
691		BUG_ON(ppos > depth);
692		path[ppos].p_bh = bh;
693		path[ppos].p_hdr = eh;
694		i--;
695
696		if (need_to_validate && ext4_ext_check(inode, eh, i))
697			goto err;
698	}
699
700	path[ppos].p_depth = i;
701	path[ppos].p_ext = NULL;
702	path[ppos].p_idx = NULL;
703
704	/* find extent */
705	ext4_ext_binsearch(inode, path + ppos, block);
706	/* if not an empty leaf */
707	if (path[ppos].p_ext)
708		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
709
710	ext4_ext_show_path(inode, path);
711
712	return path;
713
714err:
715	ext4_ext_drop_refs(path);
716	if (alloc)
717		kfree(path);
718	return ERR_PTR(-EIO);
719}
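/*
 * Illustrative note (not part of the original source): for a depth-2
 * tree the path returned above looks like
 *	path[0] - root header in the inode body (p_idx set, p_bh == NULL)
 *	path[1] - intermediate index block      (p_idx set, p_bh held)
 *	path[2] - leaf block                    (p_ext set, p_bh held)
 * and callers are expected to release the buffer references with
 * ext4_ext_drop_refs() and, if the array was allocated here, kfree()
 * it afterwards.
 */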
720
721/*
722 * ext4_ext_insert_index:
723 * insert new index [@logical;@ptr] into the block at @curp;
724 * check where to insert: before @curp or after @curp
725 */
726int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
727				struct ext4_ext_path *curp,
728				int logical, ext4_fsblk_t ptr)
729{
730	struct ext4_extent_idx *ix;
731	int len, err;
732
733	err = ext4_ext_get_access(handle, inode, curp);
734	if (err)
735		return err;
736
737	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
738	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
739	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
740		/* insert after */
741		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
742			len = (len - 1) * sizeof(struct ext4_extent_idx);
743			len = len < 0 ? 0 : len;
744			ext_debug("insert new index %d after: %llu. "
745					"move %d from 0x%p to 0x%p\n",
746					logical, ptr, len,
747					(curp->p_idx + 1), (curp->p_idx + 2));
748			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
749		}
750		ix = curp->p_idx + 1;
751	} else {
752		/* insert before */
753		len = len * sizeof(struct ext4_extent_idx);
754		len = len < 0 ? 0 : len;
755		ext_debug("insert new index %d before: %llu. "
756				"move %d from 0x%p to 0x%p\n",
757				logical, ptr, len,
758				curp->p_idx, (curp->p_idx + 1));
759		memmove(curp->p_idx + 1, curp->p_idx, len);
760		ix = curp->p_idx;
761	}
762
763	ix->ei_block = cpu_to_le32(logical);
764	ext4_idx_store_pblock(ix, ptr);
765	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
766
767	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
768			     > le16_to_cpu(curp->p_hdr->eh_max));
769	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
770
771	err = ext4_ext_dirty(handle, inode, curp);
772	ext4_std_error(inode->i_sb, err);
773
774	return err;
775}
776
777/*
778 * ext4_ext_split:
779 * inserts new subtree into the path, using free index entry
780 * at depth @at:
781 * - allocates all needed blocks (new leaf and all intermediate index blocks)
782 * - makes decision where to split
783 * - moves remaining extents and index entries (right to the split point)
784 *   into the newly allocated blocks
785 * - initializes subtree
786 */
787static int ext4_ext_split(handle_t *handle, struct inode *inode,
788				struct ext4_ext_path *path,
789				struct ext4_extent *newext, int at)
790{
791	struct buffer_head *bh = NULL;
792	int depth = ext_depth(inode);
793	struct ext4_extent_header *neh;
794	struct ext4_extent_idx *fidx;
795	struct ext4_extent *ex;
796	int i = at, k, m, a;
797	ext4_fsblk_t newblock, oldblock;
798	__le32 border;
799	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
800	int err = 0;
801
802	/* make decision: where to split? */
803	/* FIXME: now decision is simplest: at current extent */
804
805	/* if current leaf will be split, then we should use
806	 * border from split point */
807	BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
808	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
809		border = path[depth].p_ext[1].ee_block;
810		ext_debug("leaf will be split."
811				" next leaf starts at %d\n",
812				  le32_to_cpu(border));
813	} else {
814		border = newext->ee_block;
815		ext_debug("leaf will be added."
816				" next leaf starts at %d\n",
817				le32_to_cpu(border));
818	}
819
820	/*
821	 * If an error occurs, we stop processing and mark the
822	 * filesystem read-only. The index won't be inserted and the
823	 * tree will remain in a consistent state. The next mount will
824	 * repair the buffers too.
825	 */
826
827	/*
828	 * Get an array to track all allocated blocks.
829	 * We need this so we can free those blocks if
830	 * an error occurs.
831	 */
832	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
833	if (!ablocks)
834		return -ENOMEM;
835
836	/* allocate all needed blocks */
837	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
838	for (a = 0; a < depth - at; a++) {
839		newblock = ext4_ext_new_meta_block(handle, inode, path,
840						   newext, &err);
841		if (newblock == 0)
842			goto cleanup;
843		ablocks[a] = newblock;
844	}
845
846	/* initialize new leaf */
847	newblock = ablocks[--a];
848	BUG_ON(newblock == 0);
849	bh = sb_getblk(inode->i_sb, newblock);
850	if (!bh) {
851		err = -EIO;
852		goto cleanup;
853	}
854	lock_buffer(bh);
855
856	err = ext4_journal_get_create_access(handle, bh);
857	if (err)
858		goto cleanup;
859
860	neh = ext_block_hdr(bh);
861	neh->eh_entries = 0;
862	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
863	neh->eh_magic = EXT4_EXT_MAGIC;
864	neh->eh_depth = 0;
865	ex = EXT_FIRST_EXTENT(neh);
866
867	/* move remainder of path[depth] to the new leaf */
868	BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
869	/* start copy from next extent */
870	/* TODO: we could do it by single memmove */
871	m = 0;
872	path[depth].p_ext++;
873	while (path[depth].p_ext <=
874			EXT_MAX_EXTENT(path[depth].p_hdr)) {
875		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
876				le32_to_cpu(path[depth].p_ext->ee_block),
877				ext_pblock(path[depth].p_ext),
878				ext4_ext_is_uninitialized(path[depth].p_ext),
879				ext4_ext_get_actual_len(path[depth].p_ext),
880				newblock);
881		/*memmove(ex++, path[depth].p_ext++,
882				sizeof(struct ext4_extent));
883		neh->eh_entries++;*/
884		path[depth].p_ext++;
885		m++;
886	}
887	if (m) {
888		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
889		le16_add_cpu(&neh->eh_entries, m);
890	}
891
892	set_buffer_uptodate(bh);
893	unlock_buffer(bh);
894
895	err = ext4_handle_dirty_metadata(handle, inode, bh);
896	if (err)
897		goto cleanup;
898	brelse(bh);
899	bh = NULL;
900
901	/* correct old leaf */
902	if (m) {
903		err = ext4_ext_get_access(handle, inode, path + depth);
904		if (err)
905			goto cleanup;
906		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
907		err = ext4_ext_dirty(handle, inode, path + depth);
908		if (err)
909			goto cleanup;
910
911	}
912
913	/* create intermediate indexes */
914	k = depth - at - 1;
915	BUG_ON(k < 0);
916	if (k)
917		ext_debug("create %d intermediate indices\n", k);
918	/* insert new index into current index block */
919	/* current depth stored in i var */
920	i = depth - 1;
921	while (k--) {
922		oldblock = newblock;
923		newblock = ablocks[--a];
924		bh = sb_getblk(inode->i_sb, newblock);
925		if (!bh) {
926			err = -EIO;
927			goto cleanup;
928		}
929		lock_buffer(bh);
930
931		err = ext4_journal_get_create_access(handle, bh);
932		if (err)
933			goto cleanup;
934
935		neh = ext_block_hdr(bh);
936		neh->eh_entries = cpu_to_le16(1);
937		neh->eh_magic = EXT4_EXT_MAGIC;
938		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
939		neh->eh_depth = cpu_to_le16(depth - i);
940		fidx = EXT_FIRST_INDEX(neh);
941		fidx->ei_block = border;
942		ext4_idx_store_pblock(fidx, oldblock);
943
944		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
945				i, newblock, le32_to_cpu(border), oldblock);
946		/* copy indexes */
947		m = 0;
948		path[i].p_idx++;
949
950		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
951				EXT_MAX_INDEX(path[i].p_hdr));
952		BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
953				EXT_LAST_INDEX(path[i].p_hdr));
954		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
955			ext_debug("%d: move %d:%llu in new index %llu\n", i,
956					le32_to_cpu(path[i].p_idx->ei_block),
957					idx_pblock(path[i].p_idx),
958					newblock);
959			/*memmove(++fidx, path[i].p_idx++,
960					sizeof(struct ext4_extent_idx));
961			neh->eh_entries++;
962			BUG_ON(neh->eh_entries > neh->eh_max);*/
963			path[i].p_idx++;
964			m++;
965		}
966		if (m) {
967			memmove(++fidx, path[i].p_idx - m,
968				sizeof(struct ext4_extent_idx) * m);
969			le16_add_cpu(&neh->eh_entries, m);
970		}
971		set_buffer_uptodate(bh);
972		unlock_buffer(bh);
973
974		err = ext4_handle_dirty_metadata(handle, inode, bh);
975		if (err)
976			goto cleanup;
977		brelse(bh);
978		bh = NULL;
979
980		/* correct old index */
981		if (m) {
982			err = ext4_ext_get_access(handle, inode, path + i);
983			if (err)
984				goto cleanup;
985			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
986			err = ext4_ext_dirty(handle, inode, path + i);
987			if (err)
988				goto cleanup;
989		}
990
991		i--;
992	}
993
994	/* insert new index */
995	err = ext4_ext_insert_index(handle, inode, path + at,
996				    le32_to_cpu(border), newblock);
997
998cleanup:
999	if (bh) {
1000		if (buffer_locked(bh))
1001			unlock_buffer(bh);
1002		brelse(bh);
1003	}
1004
1005	if (err) {
1006		/* free all allocated blocks in error case */
1007		for (i = 0; i < depth; i++) {
1008			if (!ablocks[i])
1009				continue;
1010			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
1011					 EXT4_FREE_BLOCKS_METADATA);
1012		}
1013	}
1014	kfree(ablocks);
1015
1016	return err;
1017}
1018
1019/*
1020 * ext4_ext_grow_indepth:
1021 * implements tree growing procedure:
1022 * - allocates new block
1023 * - moves top-level data (index block or leaf) into the new block
1024 * - initializes new top-level, creating index that points to the
1025 *   just created block
1026 */
1027static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1028					struct ext4_ext_path *path,
1029					struct ext4_extent *newext)
1030{
1031	struct ext4_ext_path *curp = path;
1032	struct ext4_extent_header *neh;
1033	struct ext4_extent_idx *fidx;
1034	struct buffer_head *bh;
1035	ext4_fsblk_t newblock;
1036	int err = 0;
1037
1038	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
1039	if (newblock == 0)
1040		return err;
1041
1042	bh = sb_getblk(inode->i_sb, newblock);
1043	if (!bh) {
1044		err = -EIO;
1045		ext4_std_error(inode->i_sb, err);
1046		return err;
1047	}
1048	lock_buffer(bh);
1049
1050	err = ext4_journal_get_create_access(handle, bh);
1051	if (err) {
1052		unlock_buffer(bh);
1053		goto out;
1054	}
1055
1056	/* move top-level index/leaf into new block */
1057	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
1058
1059	/* set size of new block */
1060	neh = ext_block_hdr(bh);
1061	/* the old root could have indexes or leaves,
1062	 * so calculate eh_max the right way */
1063	if (ext_depth(inode))
1064		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1065	else
1066		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1067	neh->eh_magic = EXT4_EXT_MAGIC;
1068	set_buffer_uptodate(bh);
1069	unlock_buffer(bh);
1070
1071	err = ext4_handle_dirty_metadata(handle, inode, bh);
1072	if (err)
1073		goto out;
1074
1075	/* create index in new top-level index: num,max,pointer */
1076	err = ext4_ext_get_access(handle, inode, curp);
1077	if (err)
1078		goto out;
1079
1080	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1081	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1082	curp->p_hdr->eh_entries = cpu_to_le16(1);
1083	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1084
1085	if (path[0].p_hdr->eh_depth)
1086		curp->p_idx->ei_block =
1087			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1088	else
1089		curp->p_idx->ei_block =
1090			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1091	ext4_idx_store_pblock(curp->p_idx, newblock);
1092
1093	neh = ext_inode_hdr(inode);
1094	fidx = EXT_FIRST_INDEX(neh);
1095	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1096		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1097		  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
1098
1099	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1100	err = ext4_ext_dirty(handle, inode, curp);
1101out:
1102	brelse(bh);
1103
1104	return err;
1105}
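/*
 * Illustrative note (not part of the original source): growing in
 * depth copies the current root (the extent header kept in the
 * inode's i_data) into a freshly allocated block and turns the
 * in-inode root into a single index entry pointing at that block, so
 * a full depth-0 inline leaf becomes a depth-1 tree without touching
 * any data blocks.
 */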
1106
1107/*
1108 * ext4_ext_create_new_leaf:
1109 * finds an empty index entry and adds a new leaf.
1110 * if no free index entry is found, then it requests growing the tree in depth.
1111 */
1112static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1113					struct ext4_ext_path *path,
1114					struct ext4_extent *newext)
1115{
1116	struct ext4_ext_path *curp;
1117	int depth, i, err = 0;
1118
1119repeat:
1120	i = depth = ext_depth(inode);
1121
1122	/* walk up the tree and look for a free index entry */
1123	curp = path + depth;
1124	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1125		i--;
1126		curp--;
1127	}
1128
1129	/* we use already allocated block for index block,
1130	 * so subsequent data blocks should be contiguous */
1131	if (EXT_HAS_FREE_INDEX(curp)) {
1132		/* if we found index with free entry, then use that
1133		 * entry: create all needed subtree and add new leaf */
1134		err = ext4_ext_split(handle, inode, path, newext, i);
1135		if (err)
1136			goto out;
1137
1138		/* refill path */
1139		ext4_ext_drop_refs(path);
1140		path = ext4_ext_find_extent(inode,
1141				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1142				    path);
1143		if (IS_ERR(path))
1144			err = PTR_ERR(path);
1145	} else {
1146		/* tree is full, time to grow in depth */
1147		err = ext4_ext_grow_indepth(handle, inode, path, newext);
1148		if (err)
1149			goto out;
1150
1151		/* refill path */
1152		ext4_ext_drop_refs(path);
1153		path = ext4_ext_find_extent(inode,
1154				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1155				    path);
1156		if (IS_ERR(path)) {
1157			err = PTR_ERR(path);
1158			goto out;
1159		}
1160
1161		/*
1162		 * only the first grow (depth 0 -> 1) produces free space;
1163		 * in all other cases we have to split the grown tree
1164		 */
1165		depth = ext_depth(inode);
1166		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1167			/* now we need to split */
1168			goto repeat;
1169		}
1170	}
1171
1172out:
1173	return err;
1174}
1175
1176/*
1177 * search the closest allocated block to the left for *logical
1178 * and returns it at @logical + its physical address at @phys.
1179 * if *logical is the smallest allocated block, the function
1180 * returns 0 at @phys.
1181 * the return value is 0 (success) or an error code.
1182 */
1183int
1184ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1185			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1186{
1187	struct ext4_extent_idx *ix;
1188	struct ext4_extent *ex;
1189	int depth, ee_len;
1190
1191	BUG_ON(path == NULL);
1192	depth = path->p_depth;
1193	*phys = 0;
1194
1195	if (depth == 0 && path->p_ext == NULL)
1196		return 0;
1197
1198	/* usually the extent in the path covers blocks smaller
1199	 * than *logical, but it can be that the extent is the
1200	 * first one in the file */
1201
1202	ex = path[depth].p_ext;
1203	ee_len = ext4_ext_get_actual_len(ex);
1204	if (*logical < le32_to_cpu(ex->ee_block)) {
1205		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1206		while (--depth >= 0) {
1207			ix = path[depth].p_idx;
1208			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1209		}
1210		return 0;
1211	}
1212
1213	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1214
1215	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1216	*phys = ext_pblock(ex) + ee_len - 1;
1217	return 0;
1218}
1219
1220/*
1221 * search the closest allocated block to the right for *logical
1222 * and returns it at @logical + its physical address at @phys.
1223 * if there is no allocated block to the right of *logical, the
1224 * function returns 0 at @phys.
1225 * the return value is 0 (success) or an error code.
1226 */
1227int
1228ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1229			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1230{
1231	struct buffer_head *bh = NULL;
1232	struct ext4_extent_header *eh;
1233	struct ext4_extent_idx *ix;
1234	struct ext4_extent *ex;
1235	ext4_fsblk_t block;
1236	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1237	int ee_len;
1238
1239	BUG_ON(path == NULL);
1240	depth = path->p_depth;
1241	*phys = 0;
1242
1243	if (depth == 0 && path->p_ext == NULL)
1244		return 0;
1245
1246	/* usually the extent in the path covers blocks smaller
1247	 * than *logical, but it can be that the extent is the
1248	 * first one in the file */
1249
1250	ex = path[depth].p_ext;
1251	ee_len = ext4_ext_get_actual_len(ex);
1252	if (*logical < le32_to_cpu(ex->ee_block)) {
1253		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1254		while (--depth >= 0) {
1255			ix = path[depth].p_idx;
1256			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1257		}
1258		*logical = le32_to_cpu(ex->ee_block);
1259		*phys = ext_pblock(ex);
1260		return 0;
1261	}
1262
1263	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1264
1265	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1266		/* next allocated block in this leaf */
1267		ex++;
1268		*logical = le32_to_cpu(ex->ee_block);
1269		*phys = ext_pblock(ex);
1270		return 0;
1271	}
1272
1273	/* go up and search for index to the right */
1274	while (--depth >= 0) {
1275		ix = path[depth].p_idx;
1276		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1277			goto got_index;
1278	}
1279
1280	/* we've gone up to the root and found no index to the right */
1281	return 0;
1282
1283got_index:
1284	/* we've found index to the right, let's
1285	 * follow it and find the closest allocated
1286	 * block to the right */
1287	ix++;
1288	block = idx_pblock(ix);
1289	while (++depth < path->p_depth) {
1290		bh = sb_bread(inode->i_sb, block);
1291		if (bh == NULL)
1292			return -EIO;
1293		eh = ext_block_hdr(bh);
1294		/* subtract from p_depth to get proper eh_depth */
1295		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1296			put_bh(bh);
1297			return -EIO;
1298		}
1299		ix = EXT_FIRST_INDEX(eh);
1300		block = idx_pblock(ix);
1301		put_bh(bh);
1302	}
1303
1304	bh = sb_bread(inode->i_sb, block);
1305	if (bh == NULL)
1306		return -EIO;
1307	eh = ext_block_hdr(bh);
1308	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1309		put_bh(bh);
1310		return -EIO;
1311	}
1312	ex = EXT_FIRST_EXTENT(eh);
1313	*logical = le32_to_cpu(ex->ee_block);
1314	*phys = ext_pblock(ex);
1315	put_bh(bh);
1316	return 0;
1317}
1318
1319/*
1320 * ext4_ext_next_allocated_block:
1321 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1322 * NOTE: it considers block number from index entry as
1323 * allocated block. Thus, index entries have to be consistent
1324 * with leaves.
1325 */
1326static ext4_lblk_t
1327ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1328{
1329	int depth;
1330
1331	BUG_ON(path == NULL);
1332	depth = path->p_depth;
1333
1334	if (depth == 0 && path->p_ext == NULL)
1335		return EXT_MAX_BLOCK;
1336
1337	while (depth >= 0) {
1338		if (depth == path->p_depth) {
1339			/* leaf */
1340			if (path[depth].p_ext !=
1341					EXT_LAST_EXTENT(path[depth].p_hdr))
1342			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1343		} else {
1344			/* index */
1345			if (path[depth].p_idx !=
1346					EXT_LAST_INDEX(path[depth].p_hdr))
1347			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1348		}
1349		depth--;
1350	}
1351
1352	return EXT_MAX_BLOCK;
1353}
1354
1355/*
1356 * ext4_ext_next_leaf_block:
1357 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1358 */
1359static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1360					struct ext4_ext_path *path)
1361{
1362	int depth;
1363
1364	BUG_ON(path == NULL);
1365	depth = path->p_depth;
1366
1367	/* zero-tree has no leaf blocks at all */
1368	if (depth == 0)
1369		return EXT_MAX_BLOCK;
1370
1371	/* go to index block */
1372	depth--;
1373
1374	while (depth >= 0) {
1375		if (path[depth].p_idx !=
1376				EXT_LAST_INDEX(path[depth].p_hdr))
1377			return (ext4_lblk_t)
1378				le32_to_cpu(path[depth].p_idx[1].ei_block);
1379		depth--;
1380	}
1381
1382	return EXT_MAX_BLOCK;
1383}
1384
1385/*
1386 * ext4_ext_correct_indexes:
1387 * if the leaf gets modified and the modified extent is first in the leaf,
1388 * then we have to correct all indexes above.
1389 * TODO: do we need to correct tree in all cases?
1390 */
1391static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1392				struct ext4_ext_path *path)
1393{
1394	struct ext4_extent_header *eh;
1395	int depth = ext_depth(inode);
1396	struct ext4_extent *ex;
1397	__le32 border;
1398	int k, err = 0;
1399
1400	eh = path[depth].p_hdr;
1401	ex = path[depth].p_ext;
1402	BUG_ON(ex == NULL);
1403	BUG_ON(eh == NULL);
1404
1405	if (depth == 0) {
1406		/* there is no tree at all */
1407		return 0;
1408	}
1409
1410	if (ex != EXT_FIRST_EXTENT(eh)) {
1411		/* we only correct the tree if the first extent in the leaf was modified */
1412		return 0;
1413	}
1414
1415	/*
1416	 * TODO: we need correction if border is smaller than current one
1417	 */
1418	k = depth - 1;
1419	border = path[depth].p_ext->ee_block;
1420	err = ext4_ext_get_access(handle, inode, path + k);
1421	if (err)
1422		return err;
1423	path[k].p_idx->ei_block = border;
1424	err = ext4_ext_dirty(handle, inode, path + k);
1425	if (err)
1426		return err;
1427
1428	while (k--) {
1429		/* change all left-side indexes */
1430		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1431			break;
1432		err = ext4_ext_get_access(handle, inode, path + k);
1433		if (err)
1434			break;
1435		path[k].p_idx->ei_block = border;
1436		err = ext4_ext_dirty(handle, inode, path + k);
1437		if (err)
1438			break;
1439	}
1440
1441	return err;
1442}
1443
1444int
1445ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1446				struct ext4_extent *ex2)
1447{
1448	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1449
1450	/*
1451	 * Make sure that either both extents are uninitialized, or
1452	 * both are _not_.
1453	 */
1454	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1455		return 0;
1456
1457	if (ext4_ext_is_uninitialized(ex1))
1458		max_len = EXT_UNINIT_MAX_LEN;
1459	else
1460		max_len = EXT_INIT_MAX_LEN;
1461
1462	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1463	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1464
1465	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1466			le32_to_cpu(ex2->ee_block))
1467		return 0;
1468
1469	/*
1470	 * To allow future support for preallocated extents to be added
1471	 * as an RO_COMPAT feature, refuse to merge two extents if
1472	 * this can result in the top bit of ee_len being set.
1473	 */
1474	if (ext1_ee_len + ext2_ee_len > max_len)
1475		return 0;
1476#ifdef AGGRESSIVE_TEST
1477	if (ext1_ee_len >= 4)
1478		return 0;
1479#endif
1480
1481	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1482		return 1;
1483	return 0;
1484}
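/*
 * Illustrative example (not part of the original source): two
 * initialized extents {lblock 100, len 50, pblock 1000} and
 * {lblock 150, len 20, pblock 1050} can be merged because both the
 * logical and the physical ranges are contiguous and the combined
 * length (70) stays within EXT_INIT_MAX_LEN; marking only one of them
 * uninitialized would make them unmergeable.
 */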
1485
1486/*
1487 * This function tries to merge the "ex" extent to the next extent in the tree.
1488 * It always tries to merge towards right. If you want to merge towards
1489 * left, pass "ex - 1" as argument instead of "ex".
1490 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1491 * 1 if they got merged.
1492 */
1493int ext4_ext_try_to_merge(struct inode *inode,
1494			  struct ext4_ext_path *path,
1495			  struct ext4_extent *ex)
1496{
1497	struct ext4_extent_header *eh;
1498	unsigned int depth, len;
1499	int merge_done = 0;
1500	int uninitialized = 0;
1501
1502	depth = ext_depth(inode);
1503	BUG_ON(path[depth].p_hdr == NULL);
1504	eh = path[depth].p_hdr;
1505
1506	while (ex < EXT_LAST_EXTENT(eh)) {
1507		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1508			break;
1509		/* merge with next extent! */
1510		if (ext4_ext_is_uninitialized(ex))
1511			uninitialized = 1;
1512		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1513				+ ext4_ext_get_actual_len(ex + 1));
1514		if (uninitialized)
1515			ext4_ext_mark_uninitialized(ex);
1516
1517		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1518			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1519				* sizeof(struct ext4_extent);
1520			memmove(ex + 1, ex + 2, len);
1521		}
1522		le16_add_cpu(&eh->eh_entries, -1);
1523		merge_done = 1;
1524		WARN_ON(eh->eh_entries == 0);
1525		if (!eh->eh_entries)
1526			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
1527			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
1528	}
1529
1530	return merge_done;
1531}
1532
1533/*
1534 * check if a portion of the "newext" extent overlaps with an
1535 * existing extent.
1536 *
1537 * If there is an overlap discovered, it updates the length of the newext
1538 * such that there will be no overlap, and then returns 1.
1539 * If there is no overlap found, it returns 0.
1540 */
1541unsigned int ext4_ext_check_overlap(struct inode *inode,
1542				    struct ext4_extent *newext,
1543				    struct ext4_ext_path *path)
1544{
1545	ext4_lblk_t b1, b2;
1546	unsigned int depth, len1;
1547	unsigned int ret = 0;
1548
1549	b1 = le32_to_cpu(newext->ee_block);
1550	len1 = ext4_ext_get_actual_len(newext);
1551	depth = ext_depth(inode);
1552	if (!path[depth].p_ext)
1553		goto out;
1554	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1555
1556	/*
1557	 * get the next allocated block if the extent in the path
1558	 * is before the requested block(s)
1559	 */
1560	if (b2 < b1) {
1561		b2 = ext4_ext_next_allocated_block(path);
1562		if (b2 == EXT_MAX_BLOCK)
1563			goto out;
1564	}
1565
1566	/* check for wrap through zero on extent logical start block*/
1567	if (b1 + len1 < b1) {
1568		len1 = EXT_MAX_BLOCK - b1;
1569		newext->ee_len = cpu_to_le16(len1);
1570		ret = 1;
1571	}
1572
1573	/* check for overlap */
1574	if (b1 + len1 > b2) {
1575		newext->ee_len = cpu_to_le16(b2 - b1);
1576		ret = 1;
1577	}
1578out:
1579	return ret;
1580}
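/*
 * Illustrative example (not part of the original source): if the
 * caller asks to insert newext = {lblock 100, len 50} while the tree
 * already holds an extent starting at lblock 120, the check above
 * trims newext->ee_len to 20 (covering blocks 100..119) and returns 1
 * so only the non-overlapping head is inserted.
 */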
1581
1582/*
1583 * ext4_ext_insert_extent:
1584 * tries to merge the requested extent into an existing extent or
1585 * inserts the requested extent as a new one into the tree,
1586 * creating a new leaf in the no-space case.
1587 */
1588int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1589				struct ext4_ext_path *path,
1590				struct ext4_extent *newext, int flag)
1591{
1592	struct ext4_extent_header *eh;
1593	struct ext4_extent *ex, *fex;
1594	struct ext4_extent *nearex; /* nearest extent */
1595	struct ext4_ext_path *npath = NULL;
1596	int depth, len, err;
1597	ext4_lblk_t next;
1598	unsigned uninitialized = 0;
1599
1600	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
1601	depth = ext_depth(inode);
1602	ex = path[depth].p_ext;
1603	BUG_ON(path[depth].p_hdr == NULL);
1604
1605	/* try to insert block into found extent and return */
1606	if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1607		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1608		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1609				ext4_ext_is_uninitialized(newext),
1610				ext4_ext_get_actual_len(newext),
1611				le32_to_cpu(ex->ee_block),
1612				ext4_ext_is_uninitialized(ex),
1613				ext4_ext_get_actual_len(ex), ext_pblock(ex));
1614		err = ext4_ext_get_access(handle, inode, path + depth);
1615		if (err)
1616			return err;
1617
1618		/*
1619		 * ext4_can_extents_be_merged should have checked that either
1620		 * both extents are uninitialized, or both aren't. Thus we
1621		 * need to check only one of them here.
1622		 */
1623		if (ext4_ext_is_uninitialized(ex))
1624			uninitialized = 1;
1625		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1626					+ ext4_ext_get_actual_len(newext));
1627		if (uninitialized)
1628			ext4_ext_mark_uninitialized(ex);
1629		eh = path[depth].p_hdr;
1630		nearex = ex;
1631		goto merge;
1632	}
1633
1634repeat:
1635	depth = ext_depth(inode);
1636	eh = path[depth].p_hdr;
1637	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1638		goto has_space;
1639
1640	/* probably next leaf has space for us? */
1641	fex = EXT_LAST_EXTENT(eh);
1642	next = ext4_ext_next_leaf_block(inode, path);
1643	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1644	    && next != EXT_MAX_BLOCK) {
1645		ext_debug("next leaf block - %d\n", next);
1646		BUG_ON(npath != NULL);
1647		npath = ext4_ext_find_extent(inode, next, NULL);
1648		if (IS_ERR(npath))
1649			return PTR_ERR(npath);
1650		BUG_ON(npath->p_depth != path->p_depth);
1651		eh = npath[depth].p_hdr;
1652		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1653			ext_debug("next leaf isn't full(%d)\n",
1654				  le16_to_cpu(eh->eh_entries));
1655			path = npath;
1656			goto repeat;
1657		}
1658		ext_debug("next leaf has no free space(%d,%d)\n",
1659			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1660	}
1661
1662	/*
1663	 * There is no free space in the found leaf.
1664	 * We're going to add a new leaf to the tree.
1665	 */
1666	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1667	if (err)
1668		goto cleanup;
1669	depth = ext_depth(inode);
1670	eh = path[depth].p_hdr;
1671
1672has_space:
1673	nearex = path[depth].p_ext;
1674
1675	err = ext4_ext_get_access(handle, inode, path + depth);
1676	if (err)
1677		goto cleanup;
1678
1679	if (!nearex) {
1680		/* there is no extent in this leaf, create first one */
1681		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1682				le32_to_cpu(newext->ee_block),
1683				ext_pblock(newext),
1684				ext4_ext_is_uninitialized(newext),
1685				ext4_ext_get_actual_len(newext));
1686		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1687	} else if (le32_to_cpu(newext->ee_block)
1688			   > le32_to_cpu(nearex->ee_block)) {
1689/*		BUG_ON(newext->ee_block == nearex->ee_block); */
1690		if (nearex != EXT_LAST_EXTENT(eh)) {
1691			len = EXT_MAX_EXTENT(eh) - nearex;
1692			len = (len - 1) * sizeof(struct ext4_extent);
1693			len = len < 0 ? 0 : len;
1694			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1695					"move %d from 0x%p to 0x%p\n",
1696					le32_to_cpu(newext->ee_block),
1697					ext_pblock(newext),
1698					ext4_ext_is_uninitialized(newext),
1699					ext4_ext_get_actual_len(newext),
1700					nearex, len, nearex + 1, nearex + 2);
1701			memmove(nearex + 2, nearex + 1, len);
1702		}
1703		path[depth].p_ext = nearex + 1;
1704	} else {
1705		BUG_ON(newext->ee_block == nearex->ee_block);
1706		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1707		len = len < 0 ? 0 : len;
1708		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1709				"move %d from 0x%p to 0x%p\n",
1710				le32_to_cpu(newext->ee_block),
1711				ext_pblock(newext),
1712				ext4_ext_is_uninitialized(newext),
1713				ext4_ext_get_actual_len(newext),
1714				nearex, len, nearex + 1, nearex + 2);
1715		memmove(nearex + 1, nearex, len);
1716		path[depth].p_ext = nearex;
1717	}
1718
1719	le16_add_cpu(&eh->eh_entries, 1);
1720	nearex = path[depth].p_ext;
1721	nearex->ee_block = newext->ee_block;
1722	ext4_ext_store_pblock(nearex, ext_pblock(newext));
1723	nearex->ee_len = newext->ee_len;
1724
1725merge:
1726	/* try to merge extents to the right */
1727	if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1728		ext4_ext_try_to_merge(inode, path, nearex);
1729
1730	/* try to merge extents to the left */
1731
1732	/* time to correct all indexes above */
1733	err = ext4_ext_correct_indexes(handle, inode, path);
1734	if (err)
1735		goto cleanup;
1736
1737	err = ext4_ext_dirty(handle, inode, path + depth);
1738
1739cleanup:
1740	if (npath) {
1741		ext4_ext_drop_refs(npath);
1742		kfree(npath);
1743	}
1744	ext4_ext_invalidate_cache(inode);
1745	return err;
1746}
1747
1748int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1749			ext4_lblk_t num, ext_prepare_callback func,
1750			void *cbdata)
1751{
1752	struct ext4_ext_path *path = NULL;
1753	struct ext4_ext_cache cbex;
1754	struct ext4_extent *ex;
1755	ext4_lblk_t next, start = 0, end = 0;
1756	ext4_lblk_t last = block + num;
1757	int depth, exists, err = 0;
1758
1759	BUG_ON(func == NULL);
1760	BUG_ON(inode == NULL);
1761
1762	while (block < last && block != EXT_MAX_BLOCK) {
1763		num = last - block;
1764		/* find extent for this block */
1765		down_read(&EXT4_I(inode)->i_data_sem);
1766		path = ext4_ext_find_extent(inode, block, path);
1767		up_read(&EXT4_I(inode)->i_data_sem);
1768		if (IS_ERR(path)) {
1769			err = PTR_ERR(path);
1770			path = NULL;
1771			break;
1772		}
1773
1774		depth = ext_depth(inode);
1775		BUG_ON(path[depth].p_hdr == NULL);
1776		ex = path[depth].p_ext;
1777		next = ext4_ext_next_allocated_block(path);
1778
1779		exists = 0;
1780		if (!ex) {
1781			/* there is no extent yet, so try to allocate
1782			 * all requested space */
1783			start = block;
1784			end = block + num;
1785		} else if (le32_to_cpu(ex->ee_block) > block) {
1786			/* need to allocate space before found extent */
1787			start = block;
1788			end = le32_to_cpu(ex->ee_block);
1789			if (block + num < end)
1790				end = block + num;
1791		} else if (block >= le32_to_cpu(ex->ee_block)
1792					+ ext4_ext_get_actual_len(ex)) {
1793			/* need to allocate space after found extent */
1794			start = block;
1795			end = block + num;
1796			if (end >= next)
1797				end = next;
1798		} else if (block >= le32_to_cpu(ex->ee_block)) {
1799			/*
1800			 * some part of requested space is covered
1801			 * by found extent
1802			 */
1803			start = block;
1804			end = le32_to_cpu(ex->ee_block)
1805				+ ext4_ext_get_actual_len(ex);
1806			if (block + num < end)
1807				end = block + num;
1808			exists = 1;
1809		} else {
1810			BUG();
1811		}
1812		BUG_ON(end <= start);
1813
1814		if (!exists) {
1815			cbex.ec_block = start;
1816			cbex.ec_len = end - start;
1817			cbex.ec_start = 0;
1818			cbex.ec_type = EXT4_EXT_CACHE_GAP;
1819		} else {
1820			cbex.ec_block = le32_to_cpu(ex->ee_block);
1821			cbex.ec_len = ext4_ext_get_actual_len(ex);
1822			cbex.ec_start = ext_pblock(ex);
1823			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1824		}
1825
1826		BUG_ON(cbex.ec_len == 0);
1827		err = func(inode, path, &cbex, ex, cbdata);
1828		ext4_ext_drop_refs(path);
1829
1830		if (err < 0)
1831			break;
1832
1833		if (err == EXT_REPEAT)
1834			continue;
1835		else if (err == EXT_BREAK) {
1836			err = 0;
1837			break;
1838		}
1839
1840		if (ext_depth(inode) != depth) {
1841			/* depth has changed; we have to reallocate the path */
1842			kfree(path);
1843			path = NULL;
1844		}
1845
1846		block = cbex.ec_block + cbex.ec_len;
1847	}
1848
1849	if (path) {
1850		ext4_ext_drop_refs(path);
1851		kfree(path);
1852	}
1853
1854	return err;
1855}
1856
1857static void
1858ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1859			__u32 len, ext4_fsblk_t start, int type)
1860{
1861	struct ext4_ext_cache *cex;
1862	BUG_ON(len == 0);
1863	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1864	cex = &EXT4_I(inode)->i_cached_extent;
1865	cex->ec_type = type;
1866	cex->ec_block = block;
1867	cex->ec_len = len;
1868	cex->ec_start = start;
1869	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1870}
1871
1872/*
1873 * ext4_ext_put_gap_in_cache:
1874 * calculate boundaries of the gap that the requested block fits into
1875 * and cache this gap
1876 */
1877static void
1878ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1879				ext4_lblk_t block)
1880{
1881	int depth = ext_depth(inode);
1882	unsigned long len;
1883	ext4_lblk_t lblock;
1884	struct ext4_extent *ex;
1885
1886	ex = path[depth].p_ext;
1887	if (ex == NULL) {
1888		/* there is no extent yet, so gap is [0;-] */
1889		lblock = 0;
1890		len = EXT_MAX_BLOCK;
1891		ext_debug("cache gap(whole file):");
1892	} else if (block < le32_to_cpu(ex->ee_block)) {
1893		lblock = block;
1894		len = le32_to_cpu(ex->ee_block) - block;
1895		ext_debug("cache gap(before): %u [%u:%u]",
1896				block,
1897				le32_to_cpu(ex->ee_block),
1898				 ext4_ext_get_actual_len(ex));
1899	} else if (block >= le32_to_cpu(ex->ee_block)
1900			+ ext4_ext_get_actual_len(ex)) {
1901		ext4_lblk_t next;
1902		lblock = le32_to_cpu(ex->ee_block)
1903			+ ext4_ext_get_actual_len(ex);
1904
1905		next = ext4_ext_next_allocated_block(path);
1906		ext_debug("cache gap(after): [%u:%u] %u",
1907				le32_to_cpu(ex->ee_block),
1908				ext4_ext_get_actual_len(ex),
1909				block);
1910		BUG_ON(next == lblock);
1911		len = next - lblock;
1912	} else {
1913		lblock = len = 0;
1914		BUG();
1915	}
1916
1917	ext_debug(" -> %u:%lu\n", lblock, len);
1918	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1919}
1920
1921static int
1922ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1923			struct ext4_extent *ex)
1924{
1925	struct ext4_ext_cache *cex;
1926	int ret = EXT4_EXT_CACHE_NO;
1927
1928	/*
1929	 * We borrow i_block_reservation_lock to protect i_cached_extent
1930	 */
1931	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1932	cex = &EXT4_I(inode)->i_cached_extent;
1933
1934	/* has cache valid data? */
1935	if (cex->ec_type == EXT4_EXT_CACHE_NO)
1936		goto errout;
1937
1938	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1939			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1940	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1941		ex->ee_block = cpu_to_le32(cex->ec_block);
1942		ext4_ext_store_pblock(ex, cex->ec_start);
1943		ex->ee_len = cpu_to_le16(cex->ec_len);
1944		ext_debug("%u cached by %u:%u:%llu\n",
1945				block,
1946				cex->ec_block, cex->ec_len, cex->ec_start);
1947		ret = cex->ec_type;
1948	}
1949errout:
1950	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1951	return ret;
1952}
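/*
 * Illustrative note (not part of the original source): i_cached_extent
 * is a single-entry cache holding either the most recently mapped
 * extent or the gap around the last lookup.  A hit avoids another tree
 * walk for the common sequential case: after caching the extent
 * {block 100, len 8}, lookups for blocks 100..107 are answered
 * directly from the cache.
 */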
1953
1954/*
1955 * ext4_ext_rm_idx:
1956 * removes the index from the index block.
1957 * It's used only in the truncate case, thus all requests are for
1958 * the last index in the block.
1959 */
1960static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1961			struct ext4_ext_path *path)
1962{
1963	int err;
1964	ext4_fsblk_t leaf;
1965
1966	/* free index block */
1967	path--;
1968	leaf = idx_pblock(path->p_idx);
1969	BUG_ON(path->p_hdr->eh_entries == 0);
1970	err = ext4_ext_get_access(handle, inode, path);
1971	if (err)
1972		return err;
1973	le16_add_cpu(&path->p_hdr->eh_entries, -1);
1974	err = ext4_ext_dirty(handle, inode, path);
1975	if (err)
1976		return err;
1977	ext_debug("index is empty, remove it, free block %llu\n", leaf);
1978	ext4_free_blocks(handle, inode, 0, leaf, 1,
1979			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1980	return err;
1981}
1982
1983/*
1984 * ext4_ext_calc_credits_for_single_extent:
1985 * This routine returns max. credits that needed to insert an extent
1986 * to the extent tree.
1987 * When pass the actual path, the caller should calculate credits
1988 * under i_data_sem.
1989 */
1990int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1991						struct ext4_ext_path *path)
1992{
1993	if (path) {
1994		int depth = ext_depth(inode);
1995		int ret = 0;
1996
1997		/* probably there is space in leaf? */
1998		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1999				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2000
2001			/*
2002			 *  There is some space in the leaf, so there is no
2003			 *  need to account for a leaf block credit.
2004			 *
2005			 *  Bitmaps, block group descriptor blocks and other
2006			 *  metadata blocks still need to be
2007			 *  accounted for.
2008			 */
2009			/* 1 bitmap, 1 block group descriptor */
2010			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2011			return ret;
2012		}
2013	}
2014
2015	return ext4_chunk_trans_blocks(inode, nrblocks);
2016}
2017
2018/*
2019 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
2020 *
2021 * If nrblocks fit in a single extent (chunk flag is 1), then in the
2022 * worst case each tree level's index/leaf needs to be changed; if the
2023 * tree splits due to the insertion of a new extent, then the old
2024 * index/leaf blocks need to be updated as well.
2025 *
2026 * If the nrblocks are discontiguous, they could cause
2027 * the whole tree to split more than once, but this is really rare.
2028 */
2029int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2030{
2031	int index;
2032	int depth = ext_depth(inode);
2033
2034	if (chunk)
2035		index = depth * 2;
2036	else
2037		index = depth * 3;
2038
2039	return index;
2040}
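
/*
 * Illustrative example (not from the original source): for a tree of
 * depth 2, a contiguous (chunk) modification is charged 2 * 2 = 4
 * index/leaf blocks, while a discontiguous request is charged 3 * 2 = 6
 * to allow for a possible extra split at each level.
 */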
2041
2042static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2043				struct ext4_extent *ex,
2044				ext4_lblk_t from, ext4_lblk_t to)
2045{
2046	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2047	int flags = EXT4_FREE_BLOCKS_FORGET;
2048
2049	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2050		flags |= EXT4_FREE_BLOCKS_METADATA;
2051#ifdef EXTENTS_STATS
2052	{
2053		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2054		spin_lock(&sbi->s_ext_stats_lock);
2055		sbi->s_ext_blocks += ee_len;
2056		sbi->s_ext_extents++;
2057		if (ee_len < sbi->s_ext_min)
2058			sbi->s_ext_min = ee_len;
2059		if (ee_len > sbi->s_ext_max)
2060			sbi->s_ext_max = ee_len;
2061		if (ext_depth(inode) > sbi->s_depth_max)
2062			sbi->s_depth_max = ext_depth(inode);
2063		spin_unlock(&sbi->s_ext_stats_lock);
2064	}
2065#endif
2066	if (from >= le32_to_cpu(ex->ee_block)
2067	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2068		/* tail removal */
2069		ext4_lblk_t num;
2070		ext4_fsblk_t start;
2071
2072		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2073		start = ext_pblock(ex) + ee_len - num;
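		/*
		 * Worked example (illustrative): an extent at logical block
		 * 100, length 10, physical block 5000, truncated from
		 * logical 104 gives num = 100 + 10 - 104 = 6 and
		 * start = 5000 + 10 - 6 = 5004, i.e. physical blocks
		 * 5004..5009 backing logical 104..109 are freed below.
		 */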
2074		ext_debug("free last %u blocks starting %llu\n", num, start);
2075		ext4_free_blocks(handle, inode, 0, start, num, flags);
2076	} else if (from == le32_to_cpu(ex->ee_block)
2077		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2078		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
2079			from, to, le32_to_cpu(ex->ee_block), ee_len);
2080	} else {
2081		printk(KERN_INFO "strange request: removal(2) "
2082				"%u-%u from %u:%u\n",
2083				from, to, le32_to_cpu(ex->ee_block), ee_len);
2084	}
2085	return 0;
2086}
2087
2088static int
2089ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2090		struct ext4_ext_path *path, ext4_lblk_t start)
2091{
2092	int err = 0, correct_index = 0;
2093	int depth = ext_depth(inode), credits;
2094	struct ext4_extent_header *eh;
2095	ext4_lblk_t a, b, block;
2096	unsigned num;
2097	ext4_lblk_t ex_ee_block;
2098	unsigned short ex_ee_len;
2099	unsigned uninitialized = 0;
2100	struct ext4_extent *ex;
2101
2102	/* the header must be checked already in ext4_ext_remove_space() */
2103	ext_debug("truncate since %u in leaf\n", start);
2104	if (!path[depth].p_hdr)
2105		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2106	eh = path[depth].p_hdr;
2107	BUG_ON(eh == NULL);
2108
2109	/* find where to start removing */
2110	ex = EXT_LAST_EXTENT(eh);
2111
2112	ex_ee_block = le32_to_cpu(ex->ee_block);
2113	ex_ee_len = ext4_ext_get_actual_len(ex);
2114
2115	while (ex >= EXT_FIRST_EXTENT(eh) &&
2116			ex_ee_block + ex_ee_len > start) {
2117
2118		if (ext4_ext_is_uninitialized(ex))
2119			uninitialized = 1;
2120		else
2121			uninitialized = 0;
2122
2123		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2124			 uninitialized, ex_ee_len);
2125		path[depth].p_ext = ex;
2126
2127		a = ex_ee_block > start ? ex_ee_block : start;
2128		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
2129			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
2130
2131		ext_debug("  border %u:%u\n", a, b);
2132
2133		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
2134			block = 0;
2135			num = 0;
2136			BUG();
2137		} else if (a != ex_ee_block) {
2138			/* remove tail of the extent */
2139			block = ex_ee_block;
2140			num = a - block;
2141		} else if (b != ex_ee_block + ex_ee_len - 1) {
2142			/* remove head of the extent */
2143			block = a;
2144			num = b - a;
2145			/* there is no "make a hole" API yet */
2146			BUG();
2147		} else {
2148			/* remove whole extent: excellent! */
2149			block = ex_ee_block;
2150			num = 0;
2151			BUG_ON(a != ex_ee_block);
2152			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
2153		}
2154
2155		/*
2156		 * 3 for the leaf, sb, and inode, plus 2 (bitmap and group
2157		 * descriptor) for each block group; assume two block
2158		 * groups plus ex_ee_len/blocks_per_block_group more for
2159		 * the worst case
2160		 */
2161		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2162		if (ex == EXT_FIRST_EXTENT(eh)) {
2163			correct_index = 1;
2164			credits += (ext_depth(inode)) + 1;
2165		}
2166		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
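		/*
		 * Worked example (illustrative): with 32768 blocks per
		 * group (4k blocks) and ex_ee_len = 70000, the per-group
		 * term above is 2 * (70000 / 32768) = 4, so credits = 11
		 * before the index-correction and quota additions.
		 */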
2167
2168		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2169		if (err)
2170			goto out;
2171
2172		err = ext4_ext_get_access(handle, inode, path + depth);
2173		if (err)
2174			goto out;
2175
2176		err = ext4_remove_blocks(handle, inode, ex, a, b);
2177		if (err)
2178			goto out;
2179
2180		if (num == 0) {
2181			/* this extent is removed; mark slot entirely unused */
2182			ext4_ext_store_pblock(ex, 0);
2183			le16_add_cpu(&eh->eh_entries, -1);
2184		}
2185
2186		ex->ee_block = cpu_to_le32(block);
2187		ex->ee_len = cpu_to_le16(num);
2188		/*
2189		 * Do not mark uninitialized if all the blocks in the
2190		 * extent have been removed.
2191		 */
2192		if (uninitialized && num)
2193			ext4_ext_mark_uninitialized(ex);
2194
2195		err = ext4_ext_dirty(handle, inode, path + depth);
2196		if (err)
2197			goto out;
2198
2199		ext_debug("new extent: %u:%u:%llu\n", block, num,
2200				ext_pblock(ex));
2201		ex--;
2202		ex_ee_block = le32_to_cpu(ex->ee_block);
2203		ex_ee_len = ext4_ext_get_actual_len(ex);
2204	}
2205
2206	if (correct_index && eh->eh_entries)
2207		err = ext4_ext_correct_indexes(handle, inode, path);
2208
2209	/* if this leaf is freed, then we should
2210	 * remove it from the index block above */
2211	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2212		err = ext4_ext_rm_idx(handle, inode, path + depth);
2213
2214out:
2215	return err;
2216}
2217
2218/*
2219 * ext4_ext_more_to_rm:
2220 * returns 1 if current index has to be freed (even partial)
2221 */
2222static int
2223ext4_ext_more_to_rm(struct ext4_ext_path *path)
2224{
2225	BUG_ON(path->p_idx == NULL);
2226
2227	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2228		return 0;
2229
2230	/*
2231	 * if truncation at a deeper level happened, it wasn't partial,
2232	 * so we have to consider the current index for truncation
2233	 */
2234	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2235		return 0;
2236	return 1;
2237}
2238
2239static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2240{
2241	struct super_block *sb = inode->i_sb;
2242	int depth = ext_depth(inode);
2243	struct ext4_ext_path *path;
2244	handle_t *handle;
2245	int i = 0, err = 0;
2246
2247	ext_debug("truncate since %u\n", start);
2248
2249	/* probably the first extent we are going to free will be the last in the block */
2250	handle = ext4_journal_start(inode, depth + 1);
2251	if (IS_ERR(handle))
2252		return PTR_ERR(handle);
2253
2254	ext4_ext_invalidate_cache(inode);
2255
2256	/*
2257	 * We start scanning from right side, freeing all the blocks
2258	 * after i_size and walking into the tree depth-wise.
2259	 */
2260	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2261	if (path == NULL) {
2262		ext4_journal_stop(handle);
2263		return -ENOMEM;
2264	}
2265	path[0].p_hdr = ext_inode_hdr(inode);
2266	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2267		err = -EIO;
2268		goto out;
2269	}
2270	path[0].p_depth = depth;
2271
2272	while (i >= 0 && err == 0) {
2273		if (i == depth) {
2274			/* this is leaf block */
2275			err = ext4_ext_rm_leaf(handle, inode, path, start);
2276			/* root level has p_bh == NULL, brelse() eats this */
2277			brelse(path[i].p_bh);
2278			path[i].p_bh = NULL;
2279			i--;
2280			continue;
2281		}
2282
2283		/* this is index block */
2284		if (!path[i].p_hdr) {
2285			ext_debug("initialize header\n");
2286			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2287		}
2288
2289		if (!path[i].p_idx) {
2290			/* this level hasn't been touched yet */
2291			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2292			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2293			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2294				  path[i].p_hdr,
2295				  le16_to_cpu(path[i].p_hdr->eh_entries));
2296		} else {
2297			/* we were already here, look at the next index */
2298			path[i].p_idx--;
2299		}
2300
2301		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2302				i, EXT_FIRST_INDEX(path[i].p_hdr),
2303				path[i].p_idx);
2304		if (ext4_ext_more_to_rm(path + i)) {
2305			struct buffer_head *bh;
2306			/* go to the next level */
2307			ext_debug("move to level %d (block %llu)\n",
2308				  i + 1, idx_pblock(path[i].p_idx));
2309			memset(path + i + 1, 0, sizeof(*path));
2310			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
2311			if (!bh) {
2312				/* should we reset i_size? */
2313				err = -EIO;
2314				break;
2315			}
2316			if (WARN_ON(i + 1 > depth)) {
2317				err = -EIO;
2318				break;
2319			}
2320			if (ext4_ext_check(inode, ext_block_hdr(bh),
2321							depth - i - 1)) {
2322				err = -EIO;
2323				break;
2324			}
2325			path[i + 1].p_bh = bh;
2326
2327			/* save actual number of indexes since this
2328			 * number is changed at the next iteration */
2329			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2330			i++;
2331		} else {
2332			/* we finished processing this index, go up */
2333			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2334				/* index is empty, remove it;
2335				 * handle must already be prepared by the
2336				 * earlier leaf truncation */
2337				err = ext4_ext_rm_idx(handle, inode, path + i);
2338			}
2339			/* root level has p_bh == NULL, brelse() eats this */
2340			brelse(path[i].p_bh);
2341			path[i].p_bh = NULL;
2342			i--;
2343			ext_debug("return to level %d\n", i);
2344		}
2345	}
2346
2347	/* TODO: flexible tree reduction should be here */
2348	if (path->p_hdr->eh_entries == 0) {
2349		/*
2350		 * truncate to zero freed all the tree,
2351		 * so we need to correct eh_depth
2352		 */
2353		err = ext4_ext_get_access(handle, inode, path);
2354		if (err == 0) {
2355			ext_inode_hdr(inode)->eh_depth = 0;
2356			ext_inode_hdr(inode)->eh_max =
2357				cpu_to_le16(ext4_ext_space_root(inode, 0));
2358			err = ext4_ext_dirty(handle, inode, path);
2359		}
2360	}
2361out:
2362	ext4_ext_drop_refs(path);
2363	kfree(path);
2364	ext4_journal_stop(handle);
2365
2366	return err;
2367}
2368
2369/*
2370 * called at mount time
2371 */
2372void ext4_ext_init(struct super_block *sb)
2373{
2374	/*
2375	 * possible initialization would be here
2376	 */
2377
2378	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2379#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2380		printk(KERN_INFO "EXT4-fs: file extents enabled");
2381#ifdef AGGRESSIVE_TEST
2382		printk(", aggressive tests");
2383#endif
2384#ifdef CHECK_BINSEARCH
2385		printk(", check binsearch");
2386#endif
2387#ifdef EXTENTS_STATS
2388		printk(", stats");
2389#endif
2390		printk("\n");
2391#endif
2392#ifdef EXTENTS_STATS
2393		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2394		EXT4_SB(sb)->s_ext_min = 1 << 30;
2395		EXT4_SB(sb)->s_ext_max = 0;
2396#endif
2397	}
2398}
2399
2400/*
2401 * called at umount time
2402 */
2403void ext4_ext_release(struct super_block *sb)
2404{
2405	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2406		return;
2407
2408#ifdef EXTENTS_STATS
2409	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2410		struct ext4_sb_info *sbi = EXT4_SB(sb);
2411		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2412			sbi->s_ext_blocks, sbi->s_ext_extents,
2413			sbi->s_ext_blocks / sbi->s_ext_extents);
2414		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2415			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2416	}
2417#endif
2418}
2419
2420static void bi_complete(struct bio *bio, int error)
2421{
2422	complete((struct completion *)bio->bi_private);
2423}
2424
2425/* FIXME!! we need to try to merge to left or right after zero-out  */
2426static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2427{
2428	int ret = -EIO;
2429	struct bio *bio;
2430	int blkbits, blocksize;
2431	sector_t ee_pblock;
2432	struct completion event;
2433	unsigned int ee_len, len, done, offset;
2434
2435
2436	blkbits   = inode->i_blkbits;
2437	blocksize = inode->i_sb->s_blocksize;
2438	ee_len    = ext4_ext_get_actual_len(ex);
2439	ee_pblock = ext_pblock(ex);
2440
2441	/* convert ee_pblock to 512 byte sectors */
2442	ee_pblock = ee_pblock << (blkbits - 9);
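	/* e.g. with 4k blocks blkbits = 12, so the shift by 3 above turns
	 * each filesystem block into 8 512-byte sectors */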
2443
2444	while (ee_len > 0) {
2445
2446		if (ee_len > BIO_MAX_PAGES)
2447			len = BIO_MAX_PAGES;
2448		else
2449			len = ee_len;
2450
2451		bio = bio_alloc(GFP_NOIO, len);
2452		bio->bi_sector = ee_pblock;
2453		bio->bi_bdev   = inode->i_sb->s_bdev;
2454
2455		done = 0;
2456		offset = 0;
2457		while (done < len) {
2458			ret = bio_add_page(bio, ZERO_PAGE(0),
2459							blocksize, offset);
2460			if (ret != blocksize) {
2461				/*
2462				 * We can't add any more pages because of
2463				 * hardware limitations.  Start a new bio.
2464				 */
2465				break;
2466			}
2467			done++;
2468			offset += blocksize;
2469			if (offset >= PAGE_CACHE_SIZE)
2470				offset = 0;
2471		}
2472
2473		init_completion(&event);
2474		bio->bi_private = &event;
2475		bio->bi_end_io = bi_complete;
2476		submit_bio(WRITE, bio);
2477		wait_for_completion(&event);
2478
2479		if (test_bit(BIO_UPTODATE, &bio->bi_flags))
2480			ret = 0;
2481		else {
2482			ret = -EIO;
2483			break;
2484		}
2485		bio_put(bio);
2486		ee_len    -= done;
2487		ee_pblock += done  << (blkbits - 9);
2488	}
2489	return ret;
2490}
2491
2492#define EXT4_EXT_ZERO_LEN 7
2493/*
2494 * This function is called by ext4_ext_get_blocks() if someone tries to write
2495 * to an uninitialized extent. It may result in splitting the uninitialized
2496 * extent into multiple extents (up to three - one initialized and two
2497 * uninitialized).
2498 * There are three possibilities:
2499 *   a> There is no split required: the entire extent should be initialized
2500 *   b> Split into two extents: the write happens at either end of the extent
2501 *   c> Split into three extents: someone is writing in the middle of the extent
2502 */
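/*
 * Illustrative layout for case c> above (added for clarity, not from the
 * original source):
 *
 *   before:  [ .................. uninitialized .................. )
 *   after:   [ ex1: uninit )[ ex2: initialized )[ ex3: uninit )
 *                            ^iblock             ^iblock + max_blocks
 */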
2503static int ext4_ext_convert_to_initialized(handle_t *handle,
2504						struct inode *inode,
2505						struct ext4_ext_path *path,
2506						ext4_lblk_t iblock,
2507						unsigned int max_blocks)
2508{
2509	struct ext4_extent *ex, newex, orig_ex;
2510	struct ext4_extent *ex1 = NULL;
2511	struct ext4_extent *ex2 = NULL;
2512	struct ext4_extent *ex3 = NULL;
2513	struct ext4_extent_header *eh;
2514	ext4_lblk_t ee_block;
2515	unsigned int allocated, ee_len, depth;
2516	ext4_fsblk_t newblock;
2517	int err = 0;
2518	int ret = 0;
2519
2520	depth = ext_depth(inode);
2521	eh = path[depth].p_hdr;
2522	ex = path[depth].p_ext;
2523	ee_block = le32_to_cpu(ex->ee_block);
2524	ee_len = ext4_ext_get_actual_len(ex);
2525	allocated = ee_len - (iblock - ee_block);
2526	newblock = iblock - ee_block + ext_pblock(ex);
2527	ex2 = ex;
2528	orig_ex.ee_block = ex->ee_block;
2529	orig_ex.ee_len   = cpu_to_le16(ee_len);
2530	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2531
2532	err = ext4_ext_get_access(handle, inode, path + depth);
2533	if (err)
2534		goto out;
2535	/* If the extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2536	if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
2537		err =  ext4_ext_zeroout(inode, &orig_ex);
2538		if (err)
2539			goto fix_extent_len;
2540		/* update the extent length and mark as initialized */
2541		ex->ee_block = orig_ex.ee_block;
2542		ex->ee_len   = orig_ex.ee_len;
2543		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2544		ext4_ext_dirty(handle, inode, path + depth);
2545		/* zeroed the full extent */
2546		return allocated;
2547	}
2548
2549	/* ex1: ee_block to iblock - 1 : uninitialized */
2550	if (iblock > ee_block) {
2551		ex1 = ex;
2552		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2553		ext4_ext_mark_uninitialized(ex1);
2554		ex2 = &newex;
2555	}
2556	/*
2557	 * for sanity, update the length of the ex2 extent before
2558	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2559	 * overlap of blocks.
2560	 */
2561	if (!ex1 && allocated > max_blocks)
2562		ex2->ee_len = cpu_to_le16(max_blocks);
2563	/* ex3: to ee_block + ee_len : uninitialised */
2564	if (allocated > max_blocks) {
2565		unsigned int newdepth;
2566		/* If the extent has less than EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2567		if (allocated <= EXT4_EXT_ZERO_LEN) {
2568			/*
2569			 * iblock == ee_block is handled by the zeroout
2570			 * at the beginning.
2571			 * Mark the first half uninitialized.
2572			 * Mark the second half initialized and zero out the
2573			 * initialized extent.
2574			 */
2575			ex->ee_block = orig_ex.ee_block;
2576			ex->ee_len   = cpu_to_le16(ee_len - allocated);
2577			ext4_ext_mark_uninitialized(ex);
2578			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2579			ext4_ext_dirty(handle, inode, path + depth);
2580
2581			ex3 = &newex;
2582			ex3->ee_block = cpu_to_le32(iblock);
2583			ext4_ext_store_pblock(ex3, newblock);
2584			ex3->ee_len = cpu_to_le16(allocated);
2585			err = ext4_ext_insert_extent(handle, inode, path,
2586							ex3, 0);
2587			if (err == -ENOSPC) {
2588				err =  ext4_ext_zeroout(inode, &orig_ex);
2589				if (err)
2590					goto fix_extent_len;
2591				ex->ee_block = orig_ex.ee_block;
2592				ex->ee_len   = orig_ex.ee_len;
2593				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2594				ext4_ext_dirty(handle, inode, path + depth);
2595				/* blocks available from iblock */
2596				return allocated;
2597
2598			} else if (err)
2599				goto fix_extent_len;
2600
2601			/*
2602			 * We need to zero out the second half because
2603			 * a fallocate request can update the file size and
2604			 * converting the second half to an initialized extent
2605			 * would mean that we could leak some junk data to user
2606			 * space.
2607			 */
2608			err =  ext4_ext_zeroout(inode, ex3);
2609			if (err) {
2610				/*
2611				 * We should actually mark the
2612				 * second half as uninitialized and return an
2613				 * error; the insert would have changed the extent
2614				 */
2615				depth = ext_depth(inode);
2616				ext4_ext_drop_refs(path);
2617				path = ext4_ext_find_extent(inode,
2618								iblock, path);
2619				if (IS_ERR(path)) {
2620					err = PTR_ERR(path);
2621					return err;
2622				}
2623				/* get the second half extent details */
2624				ex = path[depth].p_ext;
2625				err = ext4_ext_get_access(handle, inode,
2626								path + depth);
2627				if (err)
2628					return err;
2629				ext4_ext_mark_uninitialized(ex);
2630				ext4_ext_dirty(handle, inode, path + depth);
2631				return err;
2632			}
2633
2634			/* zeroed the second half */
2635			return allocated;
2636		}
2637		ex3 = &newex;
2638		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2639		ext4_ext_store_pblock(ex3, newblock + max_blocks);
2640		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2641		ext4_ext_mark_uninitialized(ex3);
2642		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2643		if (err == -ENOSPC) {
2644			err =  ext4_ext_zeroout(inode, &orig_ex);
2645			if (err)
2646				goto fix_extent_len;
2647			/* update the extent length and mark as initialized */
2648			ex->ee_block = orig_ex.ee_block;
2649			ex->ee_len   = orig_ex.ee_len;
2650			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2651			ext4_ext_dirty(handle, inode, path + depth);
2652			/* zeroed the full extent */
2653			/* blocks available from iblock */
2654			return allocated;
2655
2656		} else if (err)
2657			goto fix_extent_len;
2658		/*
2659		 * The depth, and hence eh & ex might change
2660		 * as part of the insert above.
2661		 */
2662		newdepth = ext_depth(inode);
2663		/*
2664		 * update the extent length after successful insert of the
2665		 * split extent
2666		 */
2667		orig_ex.ee_len = cpu_to_le16(ee_len -
2668						ext4_ext_get_actual_len(ex3));
2669		depth = newdepth;
2670		ext4_ext_drop_refs(path);
2671		path = ext4_ext_find_extent(inode, iblock, path);
2672		if (IS_ERR(path)) {
2673			err = PTR_ERR(path);
2674			goto out;
2675		}
2676		eh = path[depth].p_hdr;
2677		ex = path[depth].p_ext;
2678		if (ex2 != &newex)
2679			ex2 = ex;
2680
2681		err = ext4_ext_get_access(handle, inode, path + depth);
2682		if (err)
2683			goto out;
2684
2685		allocated = max_blocks;
2686
2687		/* If the extent has less than EXT4_EXT_ZERO_LEN blocks and we are
2688		 * trying to insert an extent in the middle, zero it out directly;
2689		 * otherwise give the extent a chance to merge to the left.
2690		 */
2691		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2692							iblock != ee_block) {
2693			err =  ext4_ext_zeroout(inode, &orig_ex);
2694			if (err)
2695				goto fix_extent_len;
2696			/* update the extent length and mark as initialized */
2697			ex->ee_block = orig_ex.ee_block;
2698			ex->ee_len   = orig_ex.ee_len;
2699			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2700			ext4_ext_dirty(handle, inode, path + depth);
2701			/* zero out the first half */
2702			/* blocks available from iblock */
2703			return allocated;
2704		}
2705	}
2706	/*
2707	 * If there was a change of depth as part of the
2708	 * insertion of ex3 above, we need to update the length
2709	 * of the ex1 extent again here
2710	 */
2711	if (ex1 && ex1 != ex) {
2712		ex1 = ex;
2713		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2714		ext4_ext_mark_uninitialized(ex1);
2715		ex2 = &newex;
2716	}
2717	/* ex2: iblock to iblock + maxblocks-1 : initialised */
2718	ex2->ee_block = cpu_to_le32(iblock);
2719	ext4_ext_store_pblock(ex2, newblock);
2720	ex2->ee_len = cpu_to_le16(allocated);
2721	if (ex2 != ex)
2722		goto insert;
2723	/*
2724	 * New (initialized) extent starts from the first block
2725	 * in the current extent. i.e., ex2 == ex
2726	 * We have to see if it can be merged with the extent
2727	 * on the left.
2728	 */
2729	if (ex2 > EXT_FIRST_EXTENT(eh)) {
2730		/*
2731		 * To merge left, pass "ex2 - 1" to try_to_merge(),
2732		 * since it merges towards right _only_.
2733		 */
2734		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2735		if (ret) {
2736			err = ext4_ext_correct_indexes(handle, inode, path);
2737			if (err)
2738				goto out;
2739			depth = ext_depth(inode);
2740			ex2--;
2741		}
2742	}
2743	/*
2744	 * Try to Merge towards right. This might be required
2745	 * only when the whole extent is being written to.
2746	 * i.e. ex2 == ex and ex3 == NULL.
2747	 */
2748	if (!ex3) {
2749		ret = ext4_ext_try_to_merge(inode, path, ex2);
2750		if (ret) {
2751			err = ext4_ext_correct_indexes(handle, inode, path);
2752			if (err)
2753				goto out;
2754		}
2755	}
2756	/* Mark modified extent as dirty */
2757	err = ext4_ext_dirty(handle, inode, path + depth);
2758	goto out;
2759insert:
2760	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2761	if (err == -ENOSPC) {
2762		err =  ext4_ext_zeroout(inode, &orig_ex);
2763		if (err)
2764			goto fix_extent_len;
2765		/* update the extent length and mark as initialized */
2766		ex->ee_block = orig_ex.ee_block;
2767		ex->ee_len   = orig_ex.ee_len;
2768		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2769		ext4_ext_dirty(handle, inode, path + depth);
2770		/* zero out the first half */
2771		return allocated;
2772	} else if (err)
2773		goto fix_extent_len;
2774out:
2775	ext4_ext_show_leaf(inode, path);
2776	return err ? err : allocated;
2777
2778fix_extent_len:
2779	ex->ee_block = orig_ex.ee_block;
2780	ex->ee_len   = orig_ex.ee_len;
2781	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2782	ext4_ext_mark_uninitialized(ex);
2783	ext4_ext_dirty(handle, inode, path + depth);
2784	return err;
2785}
2786
2787/*
2788 * This function is called by ext4_ext_get_blocks() from
2789 * ext4_get_blocks_dio_write() when DIO is used to write
2790 * to an uninitialized extent.
2791 *
2792 * Writing to an uninitialized extent may result in splitting the
2793 * uninitialized extent into multiple initialized/uninitialized extents (up to three).
2794 * There are three possibilities:
2795 *   a> There is no split required: the entire extent should be uninitialized
2796 *   b> Split into two extents: the write happens at either end of the extent
2797 *   c> Split into three extents: someone is writing in the middle of the extent
2798 *
2799 * One or more index blocks may be needed if the extent tree grows after
2800 * the uninitialized extent is split. To prevent ENOSPC from occurring when
2801 * the IO completes, we need to split the uninitialized extent before DIO
2802 * submits the IO. The uninitialized extent handled here will be split
2803 * into at most three uninitialized extents. After the IO completes, the part
2804 * being filled will be converted to initialized by the end_io callback
2805 * via ext4_convert_unwritten_extents().
2806 *
2807 * Returns the size of the uninitialized extent to be written on success.
2808 */
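/*
 * Illustrative flow (added for clarity, based on the description above):
 * the DIO write path calls this with EXT4_GET_BLOCKS_DIO_CREATE_EXT
 * before the bio is submitted; every resulting piece stays uninitialized,
 * and only the end_io path (ext4_convert_unwritten_extents()) flips the
 * written range to initialized once the data is safely on disk.
 */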
2809static int ext4_split_unwritten_extents(handle_t *handle,
2810					struct inode *inode,
2811					struct ext4_ext_path *path,
2812					ext4_lblk_t iblock,
2813					unsigned int max_blocks,
2814					int flags)
2815{
2816	struct ext4_extent *ex, newex, orig_ex;
2817	struct ext4_extent *ex1 = NULL;
2818	struct ext4_extent *ex2 = NULL;
2819	struct ext4_extent *ex3 = NULL;
2820	struct ext4_extent_header *eh;
2821	ext4_lblk_t ee_block;
2822	unsigned int allocated, ee_len, depth;
2823	ext4_fsblk_t newblock;
2824	int err = 0;
2825
2826	ext_debug("ext4_split_unwritten_extents: inode %lu,"
2827		  "iblock %llu, max_blocks %u\n", inode->i_ino,
2828		  (unsigned long long)iblock, max_blocks);
2829	depth = ext_depth(inode);
2830	eh = path[depth].p_hdr;
2831	ex = path[depth].p_ext;
2832	ee_block = le32_to_cpu(ex->ee_block);
2833	ee_len = ext4_ext_get_actual_len(ex);
2834	allocated = ee_len - (iblock - ee_block);
2835	newblock = iblock - ee_block + ext_pblock(ex);
2836	ex2 = ex;
2837	orig_ex.ee_block = ex->ee_block;
2838	orig_ex.ee_len   = cpu_to_le16(ee_len);
2839	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2840
2841	/*
2842	 * If the uninitialized extent begins at the same logical
2843	 * block where the write begins, and the write completely
2844	 * covers the extent, then we don't need to split it.
2845	 */
2846	if ((iblock == ee_block) && (allocated <= max_blocks))
2847		return allocated;
2848
2849	err = ext4_ext_get_access(handle, inode, path + depth);
2850	if (err)
2851		goto out;
2852	/* ex1: ee_block to iblock - 1 : uninitialized */
2853	if (iblock > ee_block) {
2854		ex1 = ex;
2855		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2856		ext4_ext_mark_uninitialized(ex1);
2857		ex2 = &newex;
2858	}
2859	/*
2860	 * for sanity, update the length of the ex2 extent before
2861	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2862	 * overlap of blocks.
2863	 */
2864	if (!ex1 && allocated > max_blocks)
2865		ex2->ee_len = cpu_to_le16(max_blocks);
2866	/* ex3: to ee_block + ee_len : uninitialised */
2867	if (allocated > max_blocks) {
2868		unsigned int newdepth;
2869		ex3 = &newex;
2870		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2871		ext4_ext_store_pblock(ex3, newblock + max_blocks);
2872		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2873		ext4_ext_mark_uninitialized(ex3);
2874		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2875		if (err == -ENOSPC) {
2876			err =  ext4_ext_zeroout(inode, &orig_ex);
2877			if (err)
2878				goto fix_extent_len;
2879			/* update the extent length and mark as initialized */
2880			ex->ee_block = orig_ex.ee_block;
2881			ex->ee_len   = orig_ex.ee_len;
2882			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2883			ext4_ext_dirty(handle, inode, path + depth);
2884			/* zeroed the full extent */
2885			/* blocks available from iblock */
2886			return allocated;
2887
2888		} else if (err)
2889			goto fix_extent_len;
2890		/*
2891		 * The depth, and hence eh & ex might change
2892		 * as part of the insert above.
2893		 */
2894		newdepth = ext_depth(inode);
2895		/*
2896		 * update the extent length after successful insert of the
2897		 * split extent
2898		 */
2899		orig_ex.ee_len = cpu_to_le16(ee_len -
2900						ext4_ext_get_actual_len(ex3));
2901		depth = newdepth;
2902		ext4_ext_drop_refs(path);
2903		path = ext4_ext_find_extent(inode, iblock, path);
2904		if (IS_ERR(path)) {
2905			err = PTR_ERR(path);
2906			goto out;
2907		}
2908		eh = path[depth].p_hdr;
2909		ex = path[depth].p_ext;
2910		if (ex2 != &newex)
2911			ex2 = ex;
2912
2913		err = ext4_ext_get_access(handle, inode, path + depth);
2914		if (err)
2915			goto out;
2916
2917		allocated = max_blocks;
2918	}
2919	/*
2920	 * If there was a change of depth as part of the
2921	 * insertion of ex3 above, we need to update the length
2922	 * of the ex1 extent again here
2923	 */
2924	if (ex1 && ex1 != ex) {
2925		ex1 = ex;
2926		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2927		ext4_ext_mark_uninitialized(ex1);
2928		ex2 = &newex;
2929	}
2930	/*
2931	 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
2932	 * uninitialised still.
2933	 */
2934	ex2->ee_block = cpu_to_le32(iblock);
2935	ext4_ext_store_pblock(ex2, newblock);
2936	ex2->ee_len = cpu_to_le16(allocated);
2937	ext4_ext_mark_uninitialized(ex2);
2938	if (ex2 != ex)
2939		goto insert;
2940	/* Mark modified extent as dirty */
2941	err = ext4_ext_dirty(handle, inode, path + depth);
2942	ext_debug("out here\n");
2943	goto out;
2944insert:
2945	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2946	if (err == -ENOSPC) {
2947		err =  ext4_ext_zeroout(inode, &orig_ex);
2948		if (err)
2949			goto fix_extent_len;
2950		/* update the extent length and mark as initialized */
2951		ex->ee_block = orig_ex.ee_block;
2952		ex->ee_len   = orig_ex.ee_len;
2953		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2954		ext4_ext_dirty(handle, inode, path + depth);
2955		/* zero out the first half */
2956		return allocated;
2957	} else if (err)
2958		goto fix_extent_len;
2959out:
2960	ext4_ext_show_leaf(inode, path);
2961	return err ? err : allocated;
2962
2963fix_extent_len:
2964	ex->ee_block = orig_ex.ee_block;
2965	ex->ee_len   = orig_ex.ee_len;
2966	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2967	ext4_ext_mark_uninitialized(ex);
2968	ext4_ext_dirty(handle, inode, path + depth);
2969	return err;
2970}

2971static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2972					      struct inode *inode,
2973					      struct ext4_ext_path *path)
2974{
2975	struct ext4_extent *ex;
2976	struct ext4_extent_header *eh;
2977	int depth;
2978	int err = 0;
2979	int ret = 0;
2980
2981	depth = ext_depth(inode);
2982	eh = path[depth].p_hdr;
2983	ex = path[depth].p_ext;
2984
2985	err = ext4_ext_get_access(handle, inode, path + depth);
2986	if (err)
2987		goto out;
2988	/* first mark the extent as initialized */
2989	ext4_ext_mark_initialized(ex);
2990
2991	/*
2992	 * We have to see if it can be merged with the extent
2993	 * on the left.
2994	 */
2995	if (ex > EXT_FIRST_EXTENT(eh)) {
2996		/*
2997		 * To merge left, pass "ex - 1" to try_to_merge(),
2998		 * since it merges towards right _only_.
2999		 */
3000		ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3001		if (ret) {
3002			err = ext4_ext_correct_indexes(handle, inode, path);
3003			if (err)
3004				goto out;
3005			depth = ext_depth(inode);
3006			ex--;
3007		}
3008	}
3009	/*
3010	 * Try to Merge towards right.
3011	 */
3012	ret = ext4_ext_try_to_merge(inode, path, ex);
3013	if (ret) {
3014		err = ext4_ext_correct_indexes(handle, inode, path);
3015		if (err)
3016			goto out;
3017		depth = ext_depth(inode);
3018	}
3019	/* Mark modified extent as dirty */
3020	err = ext4_ext_dirty(handle, inode, path + depth);
3021out:
3022	ext4_ext_show_leaf(inode, path);
3023	return err;
3024}
3025
3026static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3027			sector_t block, int count)
3028{
3029	int i;
3030	for (i = 0; i < count; i++)
3031		unmap_underlying_metadata(bdev, block + i);
3032}
3033
3034static int
3035ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3036			ext4_lblk_t iblock, unsigned int max_blocks,
3037			struct ext4_ext_path *path, int flags,
3038			unsigned int allocated, struct buffer_head *bh_result,
3039			ext4_fsblk_t newblock)
3040{
3041	int ret = 0;
3042	int err = 0;
3043	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3044
3045	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3046		  "block %llu, max_blocks %u, flags %d, allocated %u",
3047		  inode->i_ino, (unsigned long long)iblock, max_blocks,
3048		  flags, allocated);
3049	ext4_ext_show_leaf(inode, path);
3050
3051	/* DIO get_block() before submit the IO, split the extent */
3052	if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3053		ret = ext4_split_unwritten_extents(handle,
3054						inode, path, iblock,
3055						max_blocks, flags);
3056		/*
3057		 * Flag the inode (non-AIO case) or the end_io struct (AIO
3058		 * case) so that this IO gets converted to written when the
3059		 * IO is completed
3060		 */
3061		if (io)
3062			io->flag = DIO_AIO_UNWRITTEN;
3063		else
3064			EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
3065		goto out;
3066	}
3067	/* async DIO end_io complete, convert the filled extent to written */
3068	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3069		ret = ext4_convert_unwritten_extents_dio(handle, inode,
3070							path);
3071		if (ret >= 0)
3072			ext4_update_inode_fsync_trans(handle, inode, 1);
3073		goto out2;
3074	}
3075	/* buffered IO case */
3076	/*
3077	 * repeated fallocate creation request:
3078	 * we already have an unwritten extent
3079	 */
3080	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3081		goto map_out;
3082
3083	/* buffered READ or buffered write_begin() lookup */
3084	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3085		/*
3086		 * We have blocks reserved already.  We
3087		 * return allocated blocks so that delalloc
3088		 * won't do block reservation for us.  But
3089		 * the buffer head will be unmapped so that
3090		 * a read from the block returns 0s.
3091		 */
3092		set_buffer_unwritten(bh_result);
3093		goto out1;
3094	}
3095
3096	/* buffered write, writepage time, convert*/
3097	ret = ext4_ext_convert_to_initialized(handle, inode,
3098						path, iblock,
3099						max_blocks);
3100	if (ret >= 0)
3101		ext4_update_inode_fsync_trans(handle, inode, 1);
3102out:
3103	if (ret <= 0) {
3104		err = ret;
3105		goto out2;
3106	} else
3107		allocated = ret;
3108	set_buffer_new(bh_result);
3109	/*
3110	 * if we allocated more blocks than requested
3111	 * we need to make sure we unmap the extra block
3112	 * allocated. The actual needed block will get
3113	 * unmapped later when we find the buffer_head marked
3114	 * new.
3115	 */
3116	if (allocated > max_blocks) {
3117		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3118					newblock + max_blocks,
3119					allocated - max_blocks);
3120	}
3121map_out:
3122	set_buffer_mapped(bh_result);
3123out1:
3124	if (allocated > max_blocks)
3125		allocated = max_blocks;
3126	ext4_ext_show_leaf(inode, path);
3127	bh_result->b_bdev = inode->i_sb->s_bdev;
3128	bh_result->b_blocknr = newblock;
3129out2:
3130	if (path) {
3131		ext4_ext_drop_refs(path);
3132		kfree(path);
3133	}
3134	return err ? err : allocated;
3135}

3136/*
3137 * Block allocation/map/preallocation routine for extents based files
3138 *
3139 *
3140 * Need to be called with
3141 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3142 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3143 *
3144 * return > 0, number of blocks already mapped/allocated
3145 *          if create == 0 and these are pre-allocated blocks
3146 *          	buffer head is unmapped
3147 *          otherwise blocks are mapped
3148 *
3149 * return = 0, if plain look up failed (blocks have not been allocated)
3150 *          buffer head is unmapped
3151 *
3152 * return < 0, error case.
3153 */
3154int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3155			ext4_lblk_t iblock,
3156			unsigned int max_blocks, struct buffer_head *bh_result,
3157			int flags)
3158{
3159	struct ext4_ext_path *path = NULL;
3160	struct ext4_extent_header *eh;
3161	struct ext4_extent newex, *ex;
3162	ext4_fsblk_t newblock;
3163	int err = 0, depth, ret, cache_type;
3164	unsigned int allocated = 0;
3165	struct ext4_allocation_request ar;
3166	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3167
3168	__clear_bit(BH_New, &bh_result->b_state);
3169	ext_debug("blocks %u/%u requested for inode %lu\n",
3170			iblock, max_blocks, inode->i_ino);
3171
3172	/* check in cache */
3173	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
3174	if (cache_type) {
3175		if (cache_type == EXT4_EXT_CACHE_GAP) {
3176			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3177				/*
3178				 * block isn't allocated yet and
3179				 * user doesn't want to allocate it
3180				 */
3181				goto out2;
3182			}
3183			/* we should allocate requested block */
3184		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3185			/* block is already allocated */
3186			newblock = iblock
3187				   - le32_to_cpu(newex.ee_block)
3188				   + ext_pblock(&newex);
3189			/* number of remaining blocks in the extent */
3190			allocated = ext4_ext_get_actual_len(&newex) -
3191					(iblock - le32_to_cpu(newex.ee_block));
3192			goto out;
3193		} else {
3194			BUG();
3195		}
3196	}
3197
3198	/* find extent for this block */
3199	path = ext4_ext_find_extent(inode, iblock, NULL);
3200	if (IS_ERR(path)) {
3201		err = PTR_ERR(path);
3202		path = NULL;
3203		goto out2;
3204	}
3205
3206	depth = ext_depth(inode);
3207
3208	/*
3209	 * consistent leaf must not be empty;
3210	 * this situation is possible, though, _during_ tree modification;
3211	 * this is why assert can't be put in ext4_ext_find_extent()
3212	 */
3213	if (path[depth].p_ext == NULL && depth != 0) {
3214		ext4_error(inode->i_sb, __func__, "bad extent address "
3215			   "inode: %lu, iblock: %d, depth: %d",
3216			   inode->i_ino, iblock, depth);
3217		err = -EIO;
3218		goto out2;
3219	}
3220	eh = path[depth].p_hdr;
3221
3222	ex = path[depth].p_ext;
3223	if (ex) {
3224		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3225		ext4_fsblk_t ee_start = ext_pblock(ex);
3226		unsigned short ee_len;
3227
3228		/*
3229		 * Uninitialized extents are treated as holes, except that
3230		 * we split out initialized portions during a write.
3231		 */
3232		ee_len = ext4_ext_get_actual_len(ex);
3233		/* if found extent covers block, simply return it */
3234		if (iblock >= ee_block && iblock < ee_block + ee_len) {
3235			newblock = iblock - ee_block + ee_start;
3236			/* number of remaining blocks in the extent */
3237			allocated = ee_len - (iblock - ee_block);
3238			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
3239					ee_block, ee_len, newblock);
3240
3241			/* Do not put uninitialized extent in the cache */
3242			if (!ext4_ext_is_uninitialized(ex)) {
3243				ext4_ext_put_in_cache(inode, ee_block,
3244							ee_len, ee_start,
3245							EXT4_EXT_CACHE_EXTENT);
3246				goto out;
3247			}
3248			ret = ext4_ext_handle_uninitialized_extents(handle,
3249					inode, iblock, max_blocks, path,
3250					flags, allocated, bh_result, newblock);
3251			return ret;
3252		}
3253	}
3254
3255	/*
3256	 * requested block isn't allocated yet;
3257	 * we can't try to create the block if the create flag is zero
3258	 */
3259	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3260		/*
3261		 * put just found gap into cache to speed up
3262		 * subsequent requests
3263		 */
3264		ext4_ext_put_gap_in_cache(inode, path, iblock);
3265		goto out2;
3266	}
3267	/*
3268	 * Okay, we need to do block allocation.
3269	 */
3270
3271	/* find neighbour allocated blocks */
3272	ar.lleft = iblock;
3273	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3274	if (err)
3275		goto out2;
3276	ar.lright = iblock;
3277	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3278	if (err)
3279		goto out2;
3280
3281	/*
3282	 * See if request is beyond maximum number of blocks we can have in
3283	 * a single extent. For an initialized extent this limit is
3284	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3285	 * EXT_UNINIT_MAX_LEN.
3286	 */
3287	if (max_blocks > EXT_INIT_MAX_LEN &&
3288	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3289		max_blocks = EXT_INIT_MAX_LEN;
3290	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
3291		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3292		max_blocks = EXT_UNINIT_MAX_LEN;
3293
3294	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
3295	newex.ee_block = cpu_to_le32(iblock);
3296	newex.ee_len = cpu_to_le16(max_blocks);
3297	err = ext4_ext_check_overlap(inode, &newex, path);
3298	if (err)
3299		allocated = ext4_ext_get_actual_len(&newex);
3300	else
3301		allocated = max_blocks;
3302
3303	/* allocate new block */
3304	ar.inode = inode;
3305	ar.goal = ext4_ext_find_goal(inode, path, iblock);
3306	ar.logical = iblock;
3307	ar.len = allocated;
3308	if (S_ISREG(inode->i_mode))
3309		ar.flags = EXT4_MB_HINT_DATA;
3310	else
3311		/* disable in-core preallocation for non-regular files */
3312		ar.flags = 0;
3313	newblock = ext4_mb_new_blocks(handle, &ar, &err);
3314	if (!newblock)
3315		goto out2;
3316	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3317		  ar.goal, newblock, allocated);
3318
3319	/* try to insert new extent into found leaf and return */
3320	ext4_ext_store_pblock(&newex, newblock);
3321	newex.ee_len = cpu_to_le16(ar.len);
3322	/* Mark uninitialized */
3323	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3324		ext4_ext_mark_uninitialized(&newex);
3325		/*
3326		 * An io_end structure is created for every async
3327		 * direct IO write to the middle of the file.
3328		 * To avoid an unnecessary conversion for every AIO DIO rewrite
3329		 * to the middle of the file, here we flag only the IO that
3330		 * really needs the conversion.
3331		 * For the non-async direct IO case, flag the inode state
3332		 * so that we perform the conversion when the IO is done.
3333		 */
3334		if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3335			if (io)
3336				io->flag = DIO_AIO_UNWRITTEN;
3337			else
3338				EXT4_I(inode)->i_state |=
3339					EXT4_STATE_DIO_UNWRITTEN;
3340		}
3341	}
3342	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3343	if (err) {
3344		/* free data blocks we just allocated */
3345		/* not a good idea to call discard here directly,
3346		 * but otherwise we'd need to call it every free() */
3347		ext4_discard_preallocations(inode);
3348		ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
3349				 ext4_ext_get_actual_len(&newex), 0);
3350		goto out2;
3351	}
3352
3353	/* the previous routine could have used the block we allocated */
3354	newblock = ext_pblock(&newex);
3355	allocated = ext4_ext_get_actual_len(&newex);
3356	set_buffer_new(bh_result);
3357
3358	/*
3359	 * Cache the extent and update transaction to commit on fdatasync only
3360	 * when it is _not_ an uninitialized extent.
3361	 */
3362	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3363		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
3364						EXT4_EXT_CACHE_EXTENT);
3365		ext4_update_inode_fsync_trans(handle, inode, 1);
3366	} else
3367		ext4_update_inode_fsync_trans(handle, inode, 0);
3368out:
3369	if (allocated > max_blocks)
3370		allocated = max_blocks;
3371	ext4_ext_show_leaf(inode, path);
3372	set_buffer_mapped(bh_result);
3373	bh_result->b_bdev = inode->i_sb->s_bdev;
3374	bh_result->b_blocknr = newblock;
3375out2:
3376	if (path) {
3377		ext4_ext_drop_refs(path);
3378		kfree(path);
3379	}
3380	return err ? err : allocated;
3381}
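
/*
 * Illustrative caller sketch (hypothetical, not part of this file):
 * ext4_map_example() is an assumed name showing how the return
 * convention documented above is consumed for a plain lookup (no
 * allocation).  A NULL handle is assumed to be acceptable here since
 * nothing is journalled on the lookup path.  Kept under #if 0 because
 * it is only an example.
 */
#if 0
static int ext4_map_example(struct inode *inode, ext4_lblk_t lblk,
			    unsigned int max_blocks)
{
	struct buffer_head bh;
	int ret;

	bh.b_state = 0;
	/* flags == 0: EXT4_GET_BLOCKS_CREATE is not set, nothing is allocated */
	down_read(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_get_blocks(NULL, inode, lblk, max_blocks, &bh, 0);
	up_read(&EXT4_I(inode)->i_data_sem);

	if (ret > 0)		/* ret blocks mapped, starting at bh.b_blocknr */
		return ret;
	if (ret == 0)		/* hole: blocks have not been allocated */
		return 0;
	return ret;		/* error */
}
#endif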
3382
3383void ext4_ext_truncate(struct inode *inode)
3384{
3385	struct address_space *mapping = inode->i_mapping;
3386	struct super_block *sb = inode->i_sb;
3387	ext4_lblk_t last_block;
3388	handle_t *handle;
3389	int err = 0;
3390
3391	/*
3392	 * probably the first extent we are going to free will be the last in the block
3393	 */
3394	err = ext4_writepage_trans_blocks(inode);
3395	handle = ext4_journal_start(inode, err);
3396	if (IS_ERR(handle))
3397		return;
3398
3399	if (inode->i_size & (sb->s_blocksize - 1))
3400		ext4_block_truncate_page(handle, mapping, inode->i_size);
3401
3402	if (ext4_orphan_add(handle, inode))
3403		goto out_stop;
3404
3405	down_write(&EXT4_I(inode)->i_data_sem);
3406	ext4_ext_invalidate_cache(inode);
3407
3408	ext4_discard_preallocations(inode);
3409
3410	/*
3411	 * TODO: optimization is possible here.
3412	 * Probably we need not scan at all,
3413	 * because page truncation is enough.
3414	 */
3415
3416	/* we have to know where to truncate from in crash case */
3417	EXT4_I(inode)->i_disksize = inode->i_size;
3418	ext4_mark_inode_dirty(handle, inode);
3419
3420	last_block = (inode->i_size + sb->s_blocksize - 1)
3421			>> EXT4_BLOCK_SIZE_BITS(sb);
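	/*
	 * Worked example (illustrative): with 4k blocks and i_size = 5000,
	 * last_block = (5000 + 4095) >> 12 = 2, so removal starts at the
	 * first logical block lying entirely beyond EOF.
	 */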
3422	err = ext4_ext_remove_space(inode, last_block);
3423
3424	/* In a multi-transaction truncate, we only make the final
3425	 * transaction synchronous.
3426	 */
3427	if (IS_SYNC(inode))
3428		ext4_handle_sync(handle);
3429
3430out_stop:
3431	up_write(&EXT4_I(inode)->i_data_sem);
3432	/*
3433	 * If this was a simple ftruncate() and the file will remain alive,
3434	 * then we need to clear up the orphan record which we created above.
3435	 * However, if this was a real unlink then we were called by
3436	 * ext4_delete_inode(), and we allow that function to clean up the
3437	 * orphan info for us.
3438	 */
3439	if (inode->i_nlink)
3440		ext4_orphan_del(handle, inode);
3441
3442	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3443	ext4_mark_inode_dirty(handle, inode);
3444	ext4_journal_stop(handle);
3445}
3446
3447static void ext4_falloc_update_inode(struct inode *inode,
3448				int mode, loff_t new_size, int update_ctime)
3449{
3450	struct timespec now;
3451
3452	if (update_ctime) {
3453		now = current_fs_time(inode->i_sb);
3454		if (!timespec_equal(&inode->i_ctime, &now))
3455			inode->i_ctime = now;
3456	}
3457	/*
3458	 * Update only when preallocation was requested beyond
3459	 * the file size.
3460	 */
3461	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3462		if (new_size > i_size_read(inode))
3463			i_size_write(inode, new_size);
3464		if (new_size > EXT4_I(inode)->i_disksize)
3465			ext4_update_i_disksize(inode, new_size);
3466	}
3467
3468}
3469
3470/*
3471 * preallocate space for a file. This implements ext4's fallocate inode
3472 * operation, which gets called from the sys_fallocate system call.
3473 * For block-mapped files, posix_fallocate should fall back to the method
3474 * of writing zeroes to the required new blocks (the same behavior which is
3475 * expected for file systems which do not support the fallocate() system call).
3476 */
3477long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3478{
3479	handle_t *handle;
3480	ext4_lblk_t block;
3481	loff_t new_size;
3482	unsigned int max_blocks;
3483	int ret = 0;
3484	int ret2 = 0;
3485	int retries = 0;
3486	struct buffer_head map_bh;
3487	unsigned int credits, blkbits = inode->i_blkbits;
3488
3489	/*
3490	 * currently supporting (pre)allocate mode for extent-based
3491	 * files _only_
3492	 */
3493	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3494		return -EOPNOTSUPP;
3495
3496	/* preallocation to directories is currently not supported */
3497	if (S_ISDIR(inode->i_mode))
3498		return -ENODEV;
3499
3500	block = offset >> blkbits;
3501	/*
3502	 * We can't just convert len to max_blocks because
3503	 * We can't just convert len to max_blocks because the range may not be
3504	 * block aligned: e.g. blocksize = 4096, offset = 3072, len = 2048 spans two blocks
3505	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3506							- block;
3507	/*
3508	 * credits to insert 1 extent into extent tree
3509	 */
3510	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3511	mutex_lock(&inode->i_mutex);
3512retry:
3513	while (ret >= 0 && ret < max_blocks) {
3514		block = block + ret;
3515		max_blocks = max_blocks - ret;
3516		handle = ext4_journal_start(inode, credits);
3517		if (IS_ERR(handle)) {
3518			ret = PTR_ERR(handle);
3519			break;
3520		}
3521		map_bh.b_state = 0;
3522		ret = ext4_get_blocks(handle, inode, block,
3523				      max_blocks, &map_bh,
3524				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3525		if (ret <= 0) {
3526#ifdef EXT4FS_DEBUG
3527			WARN_ON(ret <= 0);
3528			printk(KERN_ERR "%s: ext4_ext_get_blocks "
3529				    "returned error inode#%lu, block=%u, "
3530				    "max_blocks=%u", __func__,
3531				    inode->i_ino, block, max_blocks);
3532#endif
3533			ext4_mark_inode_dirty(handle, inode);
3534			ret2 = ext4_journal_stop(handle);
3535			break;
3536		}
3537		if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3538						blkbits) >> blkbits))
3539			new_size = offset + len;
3540		else
3541			new_size = (block + ret) << blkbits;
3542
3543		ext4_falloc_update_inode(inode, mode, new_size,
3544						buffer_new(&map_bh));
3545		ext4_mark_inode_dirty(handle, inode);
3546		ret2 = ext4_journal_stop(handle);
3547		if (ret2)
3548			break;
3549	}
3550	if (ret == -ENOSPC &&
3551			ext4_should_retry_alloc(inode->i_sb, &retries)) {
3552		ret = 0;
3553		goto retry;
3554	}
3555	mutex_unlock(&inode->i_mutex);
3556	return ret > 0 ? ret2 : ret;
3557}
3558
3559/*
3560 * This function converts a range of blocks to written extents.
3561 * The caller of this function will pass the start offset and the size;
3562 * all unwritten extents within this range will be converted to
3563 * written extents.
3564 *
3565 * This function is called from the direct IO end_io callback
3566 * function, to convert the fallocated extents after the IO is completed.
3567 * Returns 0 on success.
3568 */
3569int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3570				    loff_t len)
3571{
3572	handle_t *handle;
3573	ext4_lblk_t block;
3574	unsigned int max_blocks;
3575	int ret = 0;
3576	int ret2 = 0;
3577	struct buffer_head map_bh;
3578	unsigned int credits, blkbits = inode->i_blkbits;
3579
3580	block = offset >> blkbits;
3581	/*
3582	 * We can't just convert len to max_blocks because the range may not be
3583	 * block aligned: e.g. blocksize = 4096, offset = 3072, len = 2048 spans two blocks
3584	 */
3585	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3586							- block;
3587	/*
3588	 * credits to insert 1 extent into extent tree
3589	 */
3590	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3591	while (ret >= 0 && ret < max_blocks) {
3592		block = block + ret;
3593		max_blocks = max_blocks - ret;
3594		handle = ext4_journal_start(inode, credits);
3595		if (IS_ERR(handle)) {
3596			ret = PTR_ERR(handle);
3597			break;
3598		}
3599		map_bh.b_state = 0;
3600		ret = ext4_get_blocks(handle, inode, block,
3601				      max_blocks, &map_bh,
3602				      EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3603		if (ret <= 0) {
3604			WARN_ON(ret <= 0);
3605			printk(KERN_ERR "%s: ext4_ext_get_blocks "
3606				    "returned error inode#%lu, block=%u, "
3607				    "max_blocks=%u", __func__,
3608				    inode->i_ino, block, max_blocks);
3609		}
3610		ext4_mark_inode_dirty(handle, inode);
3611		ret2 = ext4_journal_stop(handle);
3612		if (ret <= 0 || ret2)
3613			break;
3614	}
3615	return ret > 0 ? ret2 : ret;
3616}

3617/*
3618 * Callback function called for each extent to gather FIEMAP information.
3619 */
3620static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3621		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
3622		       void *data)
3623{
3624	struct fiemap_extent_info *fieinfo = data;
3625	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3626	__u64	logical;
3627	__u64	physical;
3628	__u64	length;
3629	__u32	flags = 0;
3630	int	error;
3631
3632	logical =  (__u64)newex->ec_block << blksize_bits;
3633
3634	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3635		pgoff_t offset;
3636		struct page *page;
3637		struct buffer_head *bh = NULL;
3638
3639		offset = logical >> PAGE_SHIFT;
3640		page = find_get_page(inode->i_mapping, offset);
3641		if (!page || !page_has_buffers(page))
3642			return EXT_CONTINUE;
3643
3644		bh = page_buffers(page);
3645
3646		if (!bh)
3647			return EXT_CONTINUE;
3648
3649		if (buffer_delay(bh)) {
3650			flags |= FIEMAP_EXTENT_DELALLOC;
3651			page_cache_release(page);
3652		} else {
3653			page_cache_release(page);
3654			return EXT_CONTINUE;
3655		}
3656	}
3657
3658	physical = (__u64)newex->ec_start << blksize_bits;
3659	length =   (__u64)newex->ec_len << blksize_bits;
3660
3661	if (ex && ext4_ext_is_uninitialized(ex))
3662		flags |= FIEMAP_EXTENT_UNWRITTEN;
3663
3664	/*
3665	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3666	 *
3667	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3668	 * this also indicates no more allocated blocks.
3669	 *
3670	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3671	 */
3672	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3673	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3674		loff_t size = i_size_read(inode);
3675		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3676
3677		flags |= FIEMAP_EXTENT_LAST;
3678		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3679		    logical+length > size)
3680			length = (size - logical + bs - 1) & ~(bs-1);
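			/* e.g. with bs = 4096, a 1000-byte delalloc tail past
			 * "logical" is still reported as one full 4096-byte block */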
3681	}
3682
3683	error = fiemap_fill_next_extent(fieinfo, logical, physical,
3684					length, flags);
3685	if (error < 0)
3686		return error;
3687	if (error == 1)
3688		return EXT_BREAK;
3689
3690	return EXT_CONTINUE;
3691}
3692
3693/* fiemap flags we can handle specified here */
3694#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3695
3696static int ext4_xattr_fiemap(struct inode *inode,
3697				struct fiemap_extent_info *fieinfo)
3698{
3699	__u64 physical = 0;
3700	__u64 length;
3701	__u32 flags = FIEMAP_EXTENT_LAST;
3702	int blockbits = inode->i_sb->s_blocksize_bits;
3703	int error = 0;
3704
3705	/* in-inode? */
3706	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
3707		struct ext4_iloc iloc;
3708		int offset;	/* offset of xattr in inode */
3709
3710		error = ext4_get_inode_loc(inode, &iloc);
3711		if (error)
3712			return error;
3713		physical = iloc.bh->b_blocknr << blockbits;
3714		offset = EXT4_GOOD_OLD_INODE_SIZE +
3715				EXT4_I(inode)->i_extra_isize;
3716		physical += offset;
3717		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3718		flags |= FIEMAP_EXTENT_DATA_INLINE;
3719	} else { /* external block */
3720		physical = EXT4_I(inode)->i_file_acl << blockbits;
3721		length = inode->i_sb->s_blocksize;
3722	}
3723
3724	if (physical)
3725		error = fiemap_fill_next_extent(fieinfo, 0, physical,
3726						length, flags);
3727	return (error < 0 ? error : 0);
3728}
3729
3730int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3731		__u64 start, __u64 len)
3732{
3733	ext4_lblk_t start_blk;
3734	ext4_lblk_t len_blks;
3735	int error = 0;
3736
3737	/* fallback to generic here if not in extents fmt */
3738	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3739		return generic_block_fiemap(inode, fieinfo, start, len,
3740			ext4_get_block);
3741
3742	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3743		return -EBADR;
3744
3745	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3746		error = ext4_xattr_fiemap(inode, fieinfo);
3747	} else {
3748		start_blk = start >> inode->i_sb->s_blocksize_bits;
3749		len_blks = len >> inode->i_sb->s_blocksize_bits;
3750
3751		/*
3752		 * Walk the extent tree gathering extent information.
3753		 * ext4_ext_fiemap_cb will push extents back to user.
3754		 */
3755		error = ext4_ext_walk_space(inode, start_blk, len_blks,
3756					  ext4_ext_fiemap_cb, fieinfo);
3757	}
3758
3759	return error;
3760}
3761
3762