extents.c revision 63fedaf1c2dd546da22ff5b4ae06e9b4efba5146
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public Licens
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/fs.h>
33#include <linux/time.h>
34#include <linux/jbd2.h>
35#include <linux/highuid.h>
36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
38#include <linux/string.h>
39#include <linux/slab.h>
40#include <linux/falloc.h>
41#include <asm/uaccess.h>
42#include <linux/fiemap.h>
43#include "ext4_jbd2.h"
44
45#include <trace/events/ext4.h>
46
47/*
48 * used by extent splitting.
49 */
50#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
51					due to ENOSPC */
52#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
53#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
54
55static __le32 ext4_extent_block_csum(struct inode *inode,
56				     struct ext4_extent_header *eh)
57{
58	struct ext4_inode_info *ei = EXT4_I(inode);
59	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
60	__u32 csum;
61
62	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
63			   EXT4_EXTENT_TAIL_OFFSET(eh));
64	return cpu_to_le32(csum);
65}
66
67static int ext4_extent_block_csum_verify(struct inode *inode,
68					 struct ext4_extent_header *eh)
69{
70	struct ext4_extent_tail *et;
71
72	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
73		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
74		return 1;
75
76	et = find_ext4_extent_tail(eh);
77	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
78		return 0;
79	return 1;
80}
81
82static void ext4_extent_block_csum_set(struct inode *inode,
83				       struct ext4_extent_header *eh)
84{
85	struct ext4_extent_tail *et;
86
87	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
88		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
89		return;
90
91	et = find_ext4_extent_tail(eh);
92	et->et_checksum = ext4_extent_block_csum(inode, eh);
93}
94
95static int ext4_split_extent(handle_t *handle,
96				struct inode *inode,
97				struct ext4_ext_path *path,
98				struct ext4_map_blocks *map,
99				int split_flag,
100				int flags);
101
102static int ext4_split_extent_at(handle_t *handle,
103			     struct inode *inode,
104			     struct ext4_ext_path *path,
105			     ext4_lblk_t split,
106			     int split_flag,
107			     int flags);
108
109static int ext4_ext_truncate_extend_restart(handle_t *handle,
110					    struct inode *inode,
111					    int needed)
112{
113	int err;
114
115	if (!ext4_handle_valid(handle))
116		return 0;
117	if (handle->h_buffer_credits > needed)
118		return 0;
119	err = ext4_journal_extend(handle, needed);
120	if (err <= 0)
121		return err;
122	err = ext4_truncate_restart_trans(handle, inode, needed);
123	if (err == 0)
124		err = -EAGAIN;
125
126	return err;
127}
128
129/*
130 * could return:
131 *  - EROFS
132 *  - ENOMEM
133 */
134static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
135				struct ext4_ext_path *path)
136{
137	if (path->p_bh) {
138		/* path points to block */
139		return ext4_journal_get_write_access(handle, path->p_bh);
140	}
141	/* path points to leaf/index in inode body */
142	/* we use in-core data, no need to protect them */
143	return 0;
144}
145
146/*
147 * could return:
148 *  - EROFS
149 *  - ENOMEM
150 *  - EIO
151 */
152#define ext4_ext_dirty(handle, inode, path) \
153		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
154static int __ext4_ext_dirty(const char *where, unsigned int line,
155			    handle_t *handle, struct inode *inode,
156			    struct ext4_ext_path *path)
157{
158	int err;
159	if (path->p_bh) {
160		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
161		/* path points to block */
162		err = __ext4_handle_dirty_metadata(where, line, handle,
163						   inode, path->p_bh);
164	} else {
165		/* path points to leaf/index in inode body */
166		err = ext4_mark_inode_dirty(handle, inode);
167	}
168	return err;
169}
170
171static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
172			      struct ext4_ext_path *path,
173			      ext4_lblk_t block)
174{
175	if (path) {
176		int depth = path->p_depth;
177		struct ext4_extent *ex;
178
179		/*
180		 * Try to predict block placement assuming that we are
181		 * filling in a file which will eventually be
182		 * non-sparse --- i.e., in the case of libbfd writing
183		 * an ELF object sections out-of-order but in a way
184		 * the eventually results in a contiguous object or
185		 * executable file, or some database extending a table
186		 * space file.  However, this is actually somewhat
187		 * non-ideal if we are writing a sparse file such as
188		 * qemu or KVM writing a raw image file that is going
189		 * to stay fairly sparse, since it will end up
190		 * fragmenting the file system's free space.  Maybe we
191		 * should have some hueristics or some way to allow
192		 * userspace to pass a hint to file system,
193		 * especially if the latter case turns out to be
194		 * common.
195		 */
196		ex = path[depth].p_ext;
197		if (ex) {
198			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
199			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
200
201			if (block > ext_block)
202				return ext_pblk + (block - ext_block);
203			else
204				return ext_pblk - (ext_block - block);
205		}
206
207		/* it looks like index is empty;
208		 * try to find starting block from index itself */
209		if (path[depth].p_bh)
210			return path[depth].p_bh->b_blocknr;
211	}
212
213	/* OK. use inode's group */
214	return ext4_inode_to_goal_block(inode);
215}
216
217/*
218 * Allocation for a meta data block
219 */
220static ext4_fsblk_t
221ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
222			struct ext4_ext_path *path,
223			struct ext4_extent *ex, int *err, unsigned int flags)
224{
225	ext4_fsblk_t goal, newblock;
226
227	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
228	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
229					NULL, err);
230	return newblock;
231}
232
233static inline int ext4_ext_space_block(struct inode *inode, int check)
234{
235	int size;
236
237	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
238			/ sizeof(struct ext4_extent);
239#ifdef AGGRESSIVE_TEST
240	if (!check && size > 6)
241		size = 6;
242#endif
243	return size;
244}
245
246static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
247{
248	int size;
249
250	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
251			/ sizeof(struct ext4_extent_idx);
252#ifdef AGGRESSIVE_TEST
253	if (!check && size > 5)
254		size = 5;
255#endif
256	return size;
257}
258
259static inline int ext4_ext_space_root(struct inode *inode, int check)
260{
261	int size;
262
263	size = sizeof(EXT4_I(inode)->i_data);
264	size -= sizeof(struct ext4_extent_header);
265	size /= sizeof(struct ext4_extent);
266#ifdef AGGRESSIVE_TEST
267	if (!check && size > 3)
268		size = 3;
269#endif
270	return size;
271}
272
273static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
274{
275	int size;
276
277	size = sizeof(EXT4_I(inode)->i_data);
278	size -= sizeof(struct ext4_extent_header);
279	size /= sizeof(struct ext4_extent_idx);
280#ifdef AGGRESSIVE_TEST
281	if (!check && size > 4)
282		size = 4;
283#endif
284	return size;
285}
286
287/*
288 * Calculate the number of metadata blocks needed
289 * to allocate @blocks
290 * Worse case is one block per extent
291 */
292int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
293{
294	struct ext4_inode_info *ei = EXT4_I(inode);
295	int idxs;
296
297	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
298		/ sizeof(struct ext4_extent_idx));
299
300	/*
301	 * If the new delayed allocation block is contiguous with the
302	 * previous da block, it can share index blocks with the
303	 * previous block, so we only need to allocate a new index
304	 * block every idxs leaf blocks.  At ldxs**2 blocks, we need
305	 * an additional index block, and at ldxs**3 blocks, yet
306	 * another index blocks.
307	 */
308	if (ei->i_da_metadata_calc_len &&
309	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
310		int num = 0;
311
312		if ((ei->i_da_metadata_calc_len % idxs) == 0)
313			num++;
314		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
315			num++;
316		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
317			num++;
318			ei->i_da_metadata_calc_len = 0;
319		} else
320			ei->i_da_metadata_calc_len++;
321		ei->i_da_metadata_calc_last_lblock++;
322		return num;
323	}
324
325	/*
326	 * In the worst case we need a new set of index blocks at
327	 * every level of the inode's extent tree.
328	 */
329	ei->i_da_metadata_calc_len = 1;
330	ei->i_da_metadata_calc_last_lblock = lblock;
331	return ext_depth(inode) + 1;
332}
333
334static int
335ext4_ext_max_entries(struct inode *inode, int depth)
336{
337	int max;
338
339	if (depth == ext_depth(inode)) {
340		if (depth == 0)
341			max = ext4_ext_space_root(inode, 1);
342		else
343			max = ext4_ext_space_root_idx(inode, 1);
344	} else {
345		if (depth == 0)
346			max = ext4_ext_space_block(inode, 1);
347		else
348			max = ext4_ext_space_block_idx(inode, 1);
349	}
350
351	return max;
352}
353
354static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
355{
356	ext4_fsblk_t block = ext4_ext_pblock(ext);
357	int len = ext4_ext_get_actual_len(ext);
358
359	if (len == 0)
360		return 0;
361	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
362}
363
364static int ext4_valid_extent_idx(struct inode *inode,
365				struct ext4_extent_idx *ext_idx)
366{
367	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
368
369	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
370}
371
372static int ext4_valid_extent_entries(struct inode *inode,
373				struct ext4_extent_header *eh,
374				int depth)
375{
376	unsigned short entries;
377	if (eh->eh_entries == 0)
378		return 1;
379
380	entries = le16_to_cpu(eh->eh_entries);
381
382	if (depth == 0) {
383		/* leaf entries */
384		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
385		while (entries) {
386			if (!ext4_valid_extent(inode, ext))
387				return 0;
388			ext++;
389			entries--;
390		}
391	} else {
392		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
393		while (entries) {
394			if (!ext4_valid_extent_idx(inode, ext_idx))
395				return 0;
396			ext_idx++;
397			entries--;
398		}
399	}
400	return 1;
401}
402
403static int __ext4_ext_check(const char *function, unsigned int line,
404			    struct inode *inode, struct ext4_extent_header *eh,
405			    int depth)
406{
407	const char *error_msg;
408	int max = 0;
409
410	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
411		error_msg = "invalid magic";
412		goto corrupted;
413	}
414	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
415		error_msg = "unexpected eh_depth";
416		goto corrupted;
417	}
418	if (unlikely(eh->eh_max == 0)) {
419		error_msg = "invalid eh_max";
420		goto corrupted;
421	}
422	max = ext4_ext_max_entries(inode, depth);
423	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
424		error_msg = "too large eh_max";
425		goto corrupted;
426	}
427	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
428		error_msg = "invalid eh_entries";
429		goto corrupted;
430	}
431	if (!ext4_valid_extent_entries(inode, eh, depth)) {
432		error_msg = "invalid extent entries";
433		goto corrupted;
434	}
435	/* Verify checksum on non-root extent tree nodes */
436	if (ext_depth(inode) != depth &&
437	    !ext4_extent_block_csum_verify(inode, eh)) {
438		error_msg = "extent tree corrupted";
439		goto corrupted;
440	}
441	return 0;
442
443corrupted:
444	ext4_error_inode(inode, function, line, 0,
445			"bad header/extent: %s - magic %x, "
446			"entries %u, max %u(%u), depth %u(%u)",
447			error_msg, le16_to_cpu(eh->eh_magic),
448			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
449			max, le16_to_cpu(eh->eh_depth), depth);
450
451	return -EIO;
452}
453
454#define ext4_ext_check(inode, eh, depth)	\
455	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
456
457int ext4_ext_check_inode(struct inode *inode)
458{
459	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
460}
461
462static int __ext4_ext_check_block(const char *function, unsigned int line,
463				  struct inode *inode,
464				  struct ext4_extent_header *eh,
465				  int depth,
466				  struct buffer_head *bh)
467{
468	int ret;
469
470	if (buffer_verified(bh))
471		return 0;
472	ret = ext4_ext_check(inode, eh, depth);
473	if (ret)
474		return ret;
475	set_buffer_verified(bh);
476	return ret;
477}
478
479#define ext4_ext_check_block(inode, eh, depth, bh)	\
480	__ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
481
482#ifdef EXT_DEBUG
483static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
484{
485	int k, l = path->p_depth;
486
487	ext_debug("path:");
488	for (k = 0; k <= l; k++, path++) {
489		if (path->p_idx) {
490		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
491			    ext4_idx_pblock(path->p_idx));
492		} else if (path->p_ext) {
493			ext_debug("  %d:[%d]%d:%llu ",
494				  le32_to_cpu(path->p_ext->ee_block),
495				  ext4_ext_is_uninitialized(path->p_ext),
496				  ext4_ext_get_actual_len(path->p_ext),
497				  ext4_ext_pblock(path->p_ext));
498		} else
499			ext_debug("  []");
500	}
501	ext_debug("\n");
502}
503
504static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
505{
506	int depth = ext_depth(inode);
507	struct ext4_extent_header *eh;
508	struct ext4_extent *ex;
509	int i;
510
511	if (!path)
512		return;
513
514	eh = path[depth].p_hdr;
515	ex = EXT_FIRST_EXTENT(eh);
516
517	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
518
519	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
520		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
521			  ext4_ext_is_uninitialized(ex),
522			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
523	}
524	ext_debug("\n");
525}
526
527static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
528			ext4_fsblk_t newblock, int level)
529{
530	int depth = ext_depth(inode);
531	struct ext4_extent *ex;
532
533	if (depth != level) {
534		struct ext4_extent_idx *idx;
535		idx = path[level].p_idx;
536		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
537			ext_debug("%d: move %d:%llu in new index %llu\n", level,
538					le32_to_cpu(idx->ei_block),
539					ext4_idx_pblock(idx),
540					newblock);
541			idx++;
542		}
543
544		return;
545	}
546
547	ex = path[depth].p_ext;
548	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
549		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
550				le32_to_cpu(ex->ee_block),
551				ext4_ext_pblock(ex),
552				ext4_ext_is_uninitialized(ex),
553				ext4_ext_get_actual_len(ex),
554				newblock);
555		ex++;
556	}
557}
558
559#else
560#define ext4_ext_show_path(inode, path)
561#define ext4_ext_show_leaf(inode, path)
562#define ext4_ext_show_move(inode, path, newblock, level)
563#endif
564
565void ext4_ext_drop_refs(struct ext4_ext_path *path)
566{
567	int depth = path->p_depth;
568	int i;
569
570	for (i = 0; i <= depth; i++, path++)
571		if (path->p_bh) {
572			brelse(path->p_bh);
573			path->p_bh = NULL;
574		}
575}
576
577/*
578 * ext4_ext_binsearch_idx:
579 * binary search for the closest index of the given block
580 * the header must be checked before calling this
581 */
582static void
583ext4_ext_binsearch_idx(struct inode *inode,
584			struct ext4_ext_path *path, ext4_lblk_t block)
585{
586	struct ext4_extent_header *eh = path->p_hdr;
587	struct ext4_extent_idx *r, *l, *m;
588
589
590	ext_debug("binsearch for %u(idx):  ", block);
591
592	l = EXT_FIRST_INDEX(eh) + 1;
593	r = EXT_LAST_INDEX(eh);
594	while (l <= r) {
595		m = l + (r - l) / 2;
596		if (block < le32_to_cpu(m->ei_block))
597			r = m - 1;
598		else
599			l = m + 1;
600		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
601				m, le32_to_cpu(m->ei_block),
602				r, le32_to_cpu(r->ei_block));
603	}
604
605	path->p_idx = l - 1;
606	ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
607		  ext4_idx_pblock(path->p_idx));
608
609#ifdef CHECK_BINSEARCH
610	{
611		struct ext4_extent_idx *chix, *ix;
612		int k;
613
614		chix = ix = EXT_FIRST_INDEX(eh);
615		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
616		  if (k != 0 &&
617		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
618				printk(KERN_DEBUG "k=%d, ix=0x%p, "
619				       "first=0x%p\n", k,
620				       ix, EXT_FIRST_INDEX(eh));
621				printk(KERN_DEBUG "%u <= %u\n",
622				       le32_to_cpu(ix->ei_block),
623				       le32_to_cpu(ix[-1].ei_block));
624			}
625			BUG_ON(k && le32_to_cpu(ix->ei_block)
626					   <= le32_to_cpu(ix[-1].ei_block));
627			if (block < le32_to_cpu(ix->ei_block))
628				break;
629			chix = ix;
630		}
631		BUG_ON(chix != path->p_idx);
632	}
633#endif
634
635}
636
637/*
638 * ext4_ext_binsearch:
639 * binary search for closest extent of the given block
640 * the header must be checked before calling this
641 */
642static void
643ext4_ext_binsearch(struct inode *inode,
644		struct ext4_ext_path *path, ext4_lblk_t block)
645{
646	struct ext4_extent_header *eh = path->p_hdr;
647	struct ext4_extent *r, *l, *m;
648
649	if (eh->eh_entries == 0) {
650		/*
651		 * this leaf is empty:
652		 * we get such a leaf in split/add case
653		 */
654		return;
655	}
656
657	ext_debug("binsearch for %u:  ", block);
658
659	l = EXT_FIRST_EXTENT(eh) + 1;
660	r = EXT_LAST_EXTENT(eh);
661
662	while (l <= r) {
663		m = l + (r - l) / 2;
664		if (block < le32_to_cpu(m->ee_block))
665			r = m - 1;
666		else
667			l = m + 1;
668		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
669				m, le32_to_cpu(m->ee_block),
670				r, le32_to_cpu(r->ee_block));
671	}
672
673	path->p_ext = l - 1;
674	ext_debug("  -> %d:%llu:[%d]%d ",
675			le32_to_cpu(path->p_ext->ee_block),
676			ext4_ext_pblock(path->p_ext),
677			ext4_ext_is_uninitialized(path->p_ext),
678			ext4_ext_get_actual_len(path->p_ext));
679
680#ifdef CHECK_BINSEARCH
681	{
682		struct ext4_extent *chex, *ex;
683		int k;
684
685		chex = ex = EXT_FIRST_EXTENT(eh);
686		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
687			BUG_ON(k && le32_to_cpu(ex->ee_block)
688					  <= le32_to_cpu(ex[-1].ee_block));
689			if (block < le32_to_cpu(ex->ee_block))
690				break;
691			chex = ex;
692		}
693		BUG_ON(chex != path->p_ext);
694	}
695#endif
696
697}
698
699int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
700{
701	struct ext4_extent_header *eh;
702
703	eh = ext_inode_hdr(inode);
704	eh->eh_depth = 0;
705	eh->eh_entries = 0;
706	eh->eh_magic = EXT4_EXT_MAGIC;
707	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
708	ext4_mark_inode_dirty(handle, inode);
709	ext4_ext_invalidate_cache(inode);
710	return 0;
711}
712
713struct ext4_ext_path *
714ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
715					struct ext4_ext_path *path)
716{
717	struct ext4_extent_header *eh;
718	struct buffer_head *bh;
719	short int depth, i, ppos = 0, alloc = 0;
720
721	eh = ext_inode_hdr(inode);
722	depth = ext_depth(inode);
723
724	/* account possible depth increase */
725	if (!path) {
726		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
727				GFP_NOFS);
728		if (!path)
729			return ERR_PTR(-ENOMEM);
730		alloc = 1;
731	}
732	path[0].p_hdr = eh;
733	path[0].p_bh = NULL;
734
735	i = depth;
736	/* walk through the tree */
737	while (i) {
738		ext_debug("depth %d: num %d, max %d\n",
739			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
740
741		ext4_ext_binsearch_idx(inode, path + ppos, block);
742		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
743		path[ppos].p_depth = i;
744		path[ppos].p_ext = NULL;
745
746		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
747		if (unlikely(!bh))
748			goto err;
749		if (!bh_uptodate_or_lock(bh)) {
750			trace_ext4_ext_load_extent(inode, block,
751						path[ppos].p_block);
752			if (bh_submit_read(bh) < 0) {
753				put_bh(bh);
754				goto err;
755			}
756		}
757		eh = ext_block_hdr(bh);
758		ppos++;
759		if (unlikely(ppos > depth)) {
760			put_bh(bh);
761			EXT4_ERROR_INODE(inode,
762					 "ppos %d > depth %d", ppos, depth);
763			goto err;
764		}
765		path[ppos].p_bh = bh;
766		path[ppos].p_hdr = eh;
767		i--;
768
769		if (ext4_ext_check_block(inode, eh, i, bh))
770			goto err;
771	}
772
773	path[ppos].p_depth = i;
774	path[ppos].p_ext = NULL;
775	path[ppos].p_idx = NULL;
776
777	/* find extent */
778	ext4_ext_binsearch(inode, path + ppos, block);
779	/* if not an empty leaf */
780	if (path[ppos].p_ext)
781		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
782
783	ext4_ext_show_path(inode, path);
784
785	return path;
786
787err:
788	ext4_ext_drop_refs(path);
789	if (alloc)
790		kfree(path);
791	return ERR_PTR(-EIO);
792}
793
794/*
795 * ext4_ext_insert_index:
796 * insert new index [@logical;@ptr] into the block at @curp;
797 * check where to insert: before @curp or after @curp
798 */
799static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
800				 struct ext4_ext_path *curp,
801				 int logical, ext4_fsblk_t ptr)
802{
803	struct ext4_extent_idx *ix;
804	int len, err;
805
806	err = ext4_ext_get_access(handle, inode, curp);
807	if (err)
808		return err;
809
810	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
811		EXT4_ERROR_INODE(inode,
812				 "logical %d == ei_block %d!",
813				 logical, le32_to_cpu(curp->p_idx->ei_block));
814		return -EIO;
815	}
816
817	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
818			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
819		EXT4_ERROR_INODE(inode,
820				 "eh_entries %d >= eh_max %d!",
821				 le16_to_cpu(curp->p_hdr->eh_entries),
822				 le16_to_cpu(curp->p_hdr->eh_max));
823		return -EIO;
824	}
825
826	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
827		/* insert after */
828		ext_debug("insert new index %d after: %llu\n", logical, ptr);
829		ix = curp->p_idx + 1;
830	} else {
831		/* insert before */
832		ext_debug("insert new index %d before: %llu\n", logical, ptr);
833		ix = curp->p_idx;
834	}
835
836	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
837	BUG_ON(len < 0);
838	if (len > 0) {
839		ext_debug("insert new index %d: "
840				"move %d indices from 0x%p to 0x%p\n",
841				logical, len, ix, ix + 1);
842		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
843	}
844
845	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
846		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
847		return -EIO;
848	}
849
850	ix->ei_block = cpu_to_le32(logical);
851	ext4_idx_store_pblock(ix, ptr);
852	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
853
854	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
855		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
856		return -EIO;
857	}
858
859	err = ext4_ext_dirty(handle, inode, curp);
860	ext4_std_error(inode->i_sb, err);
861
862	return err;
863}
864
865/*
866 * ext4_ext_split:
867 * inserts new subtree into the path, using free index entry
868 * at depth @at:
869 * - allocates all needed blocks (new leaf and all intermediate index blocks)
870 * - makes decision where to split
871 * - moves remaining extents and index entries (right to the split point)
872 *   into the newly allocated blocks
873 * - initializes subtree
874 */
875static int ext4_ext_split(handle_t *handle, struct inode *inode,
876			  unsigned int flags,
877			  struct ext4_ext_path *path,
878			  struct ext4_extent *newext, int at)
879{
880	struct buffer_head *bh = NULL;
881	int depth = ext_depth(inode);
882	struct ext4_extent_header *neh;
883	struct ext4_extent_idx *fidx;
884	int i = at, k, m, a;
885	ext4_fsblk_t newblock, oldblock;
886	__le32 border;
887	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
888	int err = 0;
889
890	/* make decision: where to split? */
891	/* FIXME: now decision is simplest: at current extent */
892
893	/* if current leaf will be split, then we should use
894	 * border from split point */
895	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
896		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
897		return -EIO;
898	}
899	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
900		border = path[depth].p_ext[1].ee_block;
901		ext_debug("leaf will be split."
902				" next leaf starts at %d\n",
903				  le32_to_cpu(border));
904	} else {
905		border = newext->ee_block;
906		ext_debug("leaf will be added."
907				" next leaf starts at %d\n",
908				le32_to_cpu(border));
909	}
910
911	/*
912	 * If error occurs, then we break processing
913	 * and mark filesystem read-only. index won't
914	 * be inserted and tree will be in consistent
915	 * state. Next mount will repair buffers too.
916	 */
917
918	/*
919	 * Get array to track all allocated blocks.
920	 * We need this to handle errors and free blocks
921	 * upon them.
922	 */
923	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
924	if (!ablocks)
925		return -ENOMEM;
926
927	/* allocate all needed blocks */
928	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
929	for (a = 0; a < depth - at; a++) {
930		newblock = ext4_ext_new_meta_block(handle, inode, path,
931						   newext, &err, flags);
932		if (newblock == 0)
933			goto cleanup;
934		ablocks[a] = newblock;
935	}
936
937	/* initialize new leaf */
938	newblock = ablocks[--a];
939	if (unlikely(newblock == 0)) {
940		EXT4_ERROR_INODE(inode, "newblock == 0!");
941		err = -EIO;
942		goto cleanup;
943	}
944	bh = sb_getblk(inode->i_sb, newblock);
945	if (!bh) {
946		err = -EIO;
947		goto cleanup;
948	}
949	lock_buffer(bh);
950
951	err = ext4_journal_get_create_access(handle, bh);
952	if (err)
953		goto cleanup;
954
955	neh = ext_block_hdr(bh);
956	neh->eh_entries = 0;
957	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
958	neh->eh_magic = EXT4_EXT_MAGIC;
959	neh->eh_depth = 0;
960
961	/* move remainder of path[depth] to the new leaf */
962	if (unlikely(path[depth].p_hdr->eh_entries !=
963		     path[depth].p_hdr->eh_max)) {
964		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
965				 path[depth].p_hdr->eh_entries,
966				 path[depth].p_hdr->eh_max);
967		err = -EIO;
968		goto cleanup;
969	}
970	/* start copy from next extent */
971	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
972	ext4_ext_show_move(inode, path, newblock, depth);
973	if (m) {
974		struct ext4_extent *ex;
975		ex = EXT_FIRST_EXTENT(neh);
976		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
977		le16_add_cpu(&neh->eh_entries, m);
978	}
979
980	ext4_extent_block_csum_set(inode, neh);
981	set_buffer_uptodate(bh);
982	unlock_buffer(bh);
983
984	err = ext4_handle_dirty_metadata(handle, inode, bh);
985	if (err)
986		goto cleanup;
987	brelse(bh);
988	bh = NULL;
989
990	/* correct old leaf */
991	if (m) {
992		err = ext4_ext_get_access(handle, inode, path + depth);
993		if (err)
994			goto cleanup;
995		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
996		err = ext4_ext_dirty(handle, inode, path + depth);
997		if (err)
998			goto cleanup;
999
1000	}
1001
1002	/* create intermediate indexes */
1003	k = depth - at - 1;
1004	if (unlikely(k < 0)) {
1005		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1006		err = -EIO;
1007		goto cleanup;
1008	}
1009	if (k)
1010		ext_debug("create %d intermediate indices\n", k);
1011	/* insert new index into current index block */
1012	/* current depth stored in i var */
1013	i = depth - 1;
1014	while (k--) {
1015		oldblock = newblock;
1016		newblock = ablocks[--a];
1017		bh = sb_getblk(inode->i_sb, newblock);
1018		if (!bh) {
1019			err = -EIO;
1020			goto cleanup;
1021		}
1022		lock_buffer(bh);
1023
1024		err = ext4_journal_get_create_access(handle, bh);
1025		if (err)
1026			goto cleanup;
1027
1028		neh = ext_block_hdr(bh);
1029		neh->eh_entries = cpu_to_le16(1);
1030		neh->eh_magic = EXT4_EXT_MAGIC;
1031		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1032		neh->eh_depth = cpu_to_le16(depth - i);
1033		fidx = EXT_FIRST_INDEX(neh);
1034		fidx->ei_block = border;
1035		ext4_idx_store_pblock(fidx, oldblock);
1036
1037		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1038				i, newblock, le32_to_cpu(border), oldblock);
1039
1040		/* move remainder of path[i] to the new index block */
1041		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1042					EXT_LAST_INDEX(path[i].p_hdr))) {
1043			EXT4_ERROR_INODE(inode,
1044					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1045					 le32_to_cpu(path[i].p_ext->ee_block));
1046			err = -EIO;
1047			goto cleanup;
1048		}
1049		/* start copy indexes */
1050		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1051		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1052				EXT_MAX_INDEX(path[i].p_hdr));
1053		ext4_ext_show_move(inode, path, newblock, i);
1054		if (m) {
1055			memmove(++fidx, path[i].p_idx,
1056				sizeof(struct ext4_extent_idx) * m);
1057			le16_add_cpu(&neh->eh_entries, m);
1058		}
1059		ext4_extent_block_csum_set(inode, neh);
1060		set_buffer_uptodate(bh);
1061		unlock_buffer(bh);
1062
1063		err = ext4_handle_dirty_metadata(handle, inode, bh);
1064		if (err)
1065			goto cleanup;
1066		brelse(bh);
1067		bh = NULL;
1068
1069		/* correct old index */
1070		if (m) {
1071			err = ext4_ext_get_access(handle, inode, path + i);
1072			if (err)
1073				goto cleanup;
1074			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1075			err = ext4_ext_dirty(handle, inode, path + i);
1076			if (err)
1077				goto cleanup;
1078		}
1079
1080		i--;
1081	}
1082
1083	/* insert new index */
1084	err = ext4_ext_insert_index(handle, inode, path + at,
1085				    le32_to_cpu(border), newblock);
1086
1087cleanup:
1088	if (bh) {
1089		if (buffer_locked(bh))
1090			unlock_buffer(bh);
1091		brelse(bh);
1092	}
1093
1094	if (err) {
1095		/* free all allocated blocks in error case */
1096		for (i = 0; i < depth; i++) {
1097			if (!ablocks[i])
1098				continue;
1099			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1100					 EXT4_FREE_BLOCKS_METADATA);
1101		}
1102	}
1103	kfree(ablocks);
1104
1105	return err;
1106}
1107
1108/*
1109 * ext4_ext_grow_indepth:
1110 * implements tree growing procedure:
1111 * - allocates new block
1112 * - moves top-level data (index block or leaf) into the new block
1113 * - initializes new top-level, creating index that points to the
1114 *   just created block
1115 */
1116static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1117				 unsigned int flags,
1118				 struct ext4_extent *newext)
1119{
1120	struct ext4_extent_header *neh;
1121	struct buffer_head *bh;
1122	ext4_fsblk_t newblock;
1123	int err = 0;
1124
1125	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1126		newext, &err, flags);
1127	if (newblock == 0)
1128		return err;
1129
1130	bh = sb_getblk(inode->i_sb, newblock);
1131	if (!bh) {
1132		err = -EIO;
1133		ext4_std_error(inode->i_sb, err);
1134		return err;
1135	}
1136	lock_buffer(bh);
1137
1138	err = ext4_journal_get_create_access(handle, bh);
1139	if (err) {
1140		unlock_buffer(bh);
1141		goto out;
1142	}
1143
1144	/* move top-level index/leaf into new block */
1145	memmove(bh->b_data, EXT4_I(inode)->i_data,
1146		sizeof(EXT4_I(inode)->i_data));
1147
1148	/* set size of new block */
1149	neh = ext_block_hdr(bh);
1150	/* old root could have indexes or leaves
1151	 * so calculate e_max right way */
1152	if (ext_depth(inode))
1153		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1154	else
1155		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1156	neh->eh_magic = EXT4_EXT_MAGIC;
1157	ext4_extent_block_csum_set(inode, neh);
1158	set_buffer_uptodate(bh);
1159	unlock_buffer(bh);
1160
1161	err = ext4_handle_dirty_metadata(handle, inode, bh);
1162	if (err)
1163		goto out;
1164
1165	/* Update top-level index: num,max,pointer */
1166	neh = ext_inode_hdr(inode);
1167	neh->eh_entries = cpu_to_le16(1);
1168	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1169	if (neh->eh_depth == 0) {
1170		/* Root extent block becomes index block */
1171		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1172		EXT_FIRST_INDEX(neh)->ei_block =
1173			EXT_FIRST_EXTENT(neh)->ee_block;
1174	}
1175	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1176		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1177		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1178		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1179
1180	neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
1181	ext4_mark_inode_dirty(handle, inode);
1182out:
1183	brelse(bh);
1184
1185	return err;
1186}
1187
1188/*
1189 * ext4_ext_create_new_leaf:
1190 * finds empty index and adds new leaf.
1191 * if no free index is found, then it requests in-depth growing.
1192 */
1193static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1194				    unsigned int flags,
1195				    struct ext4_ext_path *path,
1196				    struct ext4_extent *newext)
1197{
1198	struct ext4_ext_path *curp;
1199	int depth, i, err = 0;
1200
1201repeat:
1202	i = depth = ext_depth(inode);
1203
1204	/* walk up to the tree and look for free index entry */
1205	curp = path + depth;
1206	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1207		i--;
1208		curp--;
1209	}
1210
1211	/* we use already allocated block for index block,
1212	 * so subsequent data blocks should be contiguous */
1213	if (EXT_HAS_FREE_INDEX(curp)) {
1214		/* if we found index with free entry, then use that
1215		 * entry: create all needed subtree and add new leaf */
1216		err = ext4_ext_split(handle, inode, flags, path, newext, i);
1217		if (err)
1218			goto out;
1219
1220		/* refill path */
1221		ext4_ext_drop_refs(path);
1222		path = ext4_ext_find_extent(inode,
1223				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1224				    path);
1225		if (IS_ERR(path))
1226			err = PTR_ERR(path);
1227	} else {
1228		/* tree is full, time to grow in depth */
1229		err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1230		if (err)
1231			goto out;
1232
1233		/* refill path */
1234		ext4_ext_drop_refs(path);
1235		path = ext4_ext_find_extent(inode,
1236				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1237				    path);
1238		if (IS_ERR(path)) {
1239			err = PTR_ERR(path);
1240			goto out;
1241		}
1242
1243		/*
1244		 * only first (depth 0 -> 1) produces free space;
1245		 * in all other cases we have to split the grown tree
1246		 */
1247		depth = ext_depth(inode);
1248		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1249			/* now we need to split */
1250			goto repeat;
1251		}
1252	}
1253
1254out:
1255	return err;
1256}
1257
1258/*
1259 * search the closest allocated block to the left for *logical
1260 * and returns it at @logical + it's physical address at @phys
1261 * if *logical is the smallest allocated block, the function
1262 * returns 0 at @phys
1263 * return value contains 0 (success) or error code
1264 */
1265static int ext4_ext_search_left(struct inode *inode,
1266				struct ext4_ext_path *path,
1267				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1268{
1269	struct ext4_extent_idx *ix;
1270	struct ext4_extent *ex;
1271	int depth, ee_len;
1272
1273	if (unlikely(path == NULL)) {
1274		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1275		return -EIO;
1276	}
1277	depth = path->p_depth;
1278	*phys = 0;
1279
1280	if (depth == 0 && path->p_ext == NULL)
1281		return 0;
1282
1283	/* usually extent in the path covers blocks smaller
1284	 * then *logical, but it can be that extent is the
1285	 * first one in the file */
1286
1287	ex = path[depth].p_ext;
1288	ee_len = ext4_ext_get_actual_len(ex);
1289	if (*logical < le32_to_cpu(ex->ee_block)) {
1290		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1291			EXT4_ERROR_INODE(inode,
1292					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1293					 *logical, le32_to_cpu(ex->ee_block));
1294			return -EIO;
1295		}
1296		while (--depth >= 0) {
1297			ix = path[depth].p_idx;
1298			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1299				EXT4_ERROR_INODE(inode,
1300				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1301				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1302				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1303		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1304				  depth);
1305				return -EIO;
1306			}
1307		}
1308		return 0;
1309	}
1310
1311	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1312		EXT4_ERROR_INODE(inode,
1313				 "logical %d < ee_block %d + ee_len %d!",
1314				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1315		return -EIO;
1316	}
1317
1318	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1319	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1320	return 0;
1321}
1322
1323/*
1324 * search the closest allocated block to the right for *logical
1325 * and returns it at @logical + it's physical address at @phys
1326 * if *logical is the largest allocated block, the function
1327 * returns 0 at @phys
1328 * return value contains 0 (success) or error code
1329 */
1330static int ext4_ext_search_right(struct inode *inode,
1331				 struct ext4_ext_path *path,
1332				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1333				 struct ext4_extent **ret_ex)
1334{
1335	struct buffer_head *bh = NULL;
1336	struct ext4_extent_header *eh;
1337	struct ext4_extent_idx *ix;
1338	struct ext4_extent *ex;
1339	ext4_fsblk_t block;
1340	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1341	int ee_len;
1342
1343	if (unlikely(path == NULL)) {
1344		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1345		return -EIO;
1346	}
1347	depth = path->p_depth;
1348	*phys = 0;
1349
1350	if (depth == 0 && path->p_ext == NULL)
1351		return 0;
1352
1353	/* usually extent in the path covers blocks smaller
1354	 * then *logical, but it can be that extent is the
1355	 * first one in the file */
1356
1357	ex = path[depth].p_ext;
1358	ee_len = ext4_ext_get_actual_len(ex);
1359	if (*logical < le32_to_cpu(ex->ee_block)) {
1360		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1361			EXT4_ERROR_INODE(inode,
1362					 "first_extent(path[%d].p_hdr) != ex",
1363					 depth);
1364			return -EIO;
1365		}
1366		while (--depth >= 0) {
1367			ix = path[depth].p_idx;
1368			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1369				EXT4_ERROR_INODE(inode,
1370						 "ix != EXT_FIRST_INDEX *logical %d!",
1371						 *logical);
1372				return -EIO;
1373			}
1374		}
1375		goto found_extent;
1376	}
1377
1378	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1379		EXT4_ERROR_INODE(inode,
1380				 "logical %d < ee_block %d + ee_len %d!",
1381				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1382		return -EIO;
1383	}
1384
1385	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1386		/* next allocated block in this leaf */
1387		ex++;
1388		goto found_extent;
1389	}
1390
1391	/* go up and search for index to the right */
1392	while (--depth >= 0) {
1393		ix = path[depth].p_idx;
1394		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1395			goto got_index;
1396	}
1397
1398	/* we've gone up to the root and found no index to the right */
1399	return 0;
1400
1401got_index:
1402	/* we've found index to the right, let's
1403	 * follow it and find the closest allocated
1404	 * block to the right */
1405	ix++;
1406	block = ext4_idx_pblock(ix);
1407	while (++depth < path->p_depth) {
1408		bh = sb_bread(inode->i_sb, block);
1409		if (bh == NULL)
1410			return -EIO;
1411		eh = ext_block_hdr(bh);
1412		/* subtract from p_depth to get proper eh_depth */
1413		if (ext4_ext_check_block(inode, eh,
1414					 path->p_depth - depth, bh)) {
1415			put_bh(bh);
1416			return -EIO;
1417		}
1418		ix = EXT_FIRST_INDEX(eh);
1419		block = ext4_idx_pblock(ix);
1420		put_bh(bh);
1421	}
1422
1423	bh = sb_bread(inode->i_sb, block);
1424	if (bh == NULL)
1425		return -EIO;
1426	eh = ext_block_hdr(bh);
1427	if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
1428		put_bh(bh);
1429		return -EIO;
1430	}
1431	ex = EXT_FIRST_EXTENT(eh);
1432found_extent:
1433	*logical = le32_to_cpu(ex->ee_block);
1434	*phys = ext4_ext_pblock(ex);
1435	*ret_ex = ex;
1436	if (bh)
1437		put_bh(bh);
1438	return 0;
1439}
1440
1441/*
1442 * ext4_ext_next_allocated_block:
1443 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1444 * NOTE: it considers block number from index entry as
1445 * allocated block. Thus, index entries have to be consistent
1446 * with leaves.
1447 */
1448static ext4_lblk_t
1449ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1450{
1451	int depth;
1452
1453	BUG_ON(path == NULL);
1454	depth = path->p_depth;
1455
1456	if (depth == 0 && path->p_ext == NULL)
1457		return EXT_MAX_BLOCKS;
1458
1459	while (depth >= 0) {
1460		if (depth == path->p_depth) {
1461			/* leaf */
1462			if (path[depth].p_ext &&
1463				path[depth].p_ext !=
1464					EXT_LAST_EXTENT(path[depth].p_hdr))
1465			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1466		} else {
1467			/* index */
1468			if (path[depth].p_idx !=
1469					EXT_LAST_INDEX(path[depth].p_hdr))
1470			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1471		}
1472		depth--;
1473	}
1474
1475	return EXT_MAX_BLOCKS;
1476}
1477
1478/*
1479 * ext4_ext_next_leaf_block:
1480 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1481 */
1482static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1483{
1484	int depth;
1485
1486	BUG_ON(path == NULL);
1487	depth = path->p_depth;
1488
1489	/* zero-tree has no leaf blocks at all */
1490	if (depth == 0)
1491		return EXT_MAX_BLOCKS;
1492
1493	/* go to index block */
1494	depth--;
1495
1496	while (depth >= 0) {
1497		if (path[depth].p_idx !=
1498				EXT_LAST_INDEX(path[depth].p_hdr))
1499			return (ext4_lblk_t)
1500				le32_to_cpu(path[depth].p_idx[1].ei_block);
1501		depth--;
1502	}
1503
1504	return EXT_MAX_BLOCKS;
1505}
1506
1507/*
1508 * ext4_ext_correct_indexes:
1509 * if leaf gets modified and modified extent is first in the leaf,
1510 * then we have to correct all indexes above.
1511 * TODO: do we need to correct tree in all cases?
1512 */
1513static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1514				struct ext4_ext_path *path)
1515{
1516	struct ext4_extent_header *eh;
1517	int depth = ext_depth(inode);
1518	struct ext4_extent *ex;
1519	__le32 border;
1520	int k, err = 0;
1521
1522	eh = path[depth].p_hdr;
1523	ex = path[depth].p_ext;
1524
1525	if (unlikely(ex == NULL || eh == NULL)) {
1526		EXT4_ERROR_INODE(inode,
1527				 "ex %p == NULL or eh %p == NULL", ex, eh);
1528		return -EIO;
1529	}
1530
1531	if (depth == 0) {
1532		/* there is no tree at all */
1533		return 0;
1534	}
1535
1536	if (ex != EXT_FIRST_EXTENT(eh)) {
1537		/* we correct tree if first leaf got modified only */
1538		return 0;
1539	}
1540
1541	/*
1542	 * TODO: we need correction if border is smaller than current one
1543	 */
1544	k = depth - 1;
1545	border = path[depth].p_ext->ee_block;
1546	err = ext4_ext_get_access(handle, inode, path + k);
1547	if (err)
1548		return err;
1549	path[k].p_idx->ei_block = border;
1550	err = ext4_ext_dirty(handle, inode, path + k);
1551	if (err)
1552		return err;
1553
1554	while (k--) {
1555		/* change all left-side indexes */
1556		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1557			break;
1558		err = ext4_ext_get_access(handle, inode, path + k);
1559		if (err)
1560			break;
1561		path[k].p_idx->ei_block = border;
1562		err = ext4_ext_dirty(handle, inode, path + k);
1563		if (err)
1564			break;
1565	}
1566
1567	return err;
1568}
1569
1570int
1571ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1572				struct ext4_extent *ex2)
1573{
1574	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1575
1576	/*
1577	 * Make sure that either both extents are uninitialized, or
1578	 * both are _not_.
1579	 */
1580	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1581		return 0;
1582
1583	if (ext4_ext_is_uninitialized(ex1))
1584		max_len = EXT_UNINIT_MAX_LEN;
1585	else
1586		max_len = EXT_INIT_MAX_LEN;
1587
1588	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1589	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1590
1591	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1592			le32_to_cpu(ex2->ee_block))
1593		return 0;
1594
1595	/*
1596	 * To allow future support for preallocated extents to be added
1597	 * as an RO_COMPAT feature, refuse to merge to extents if
1598	 * this can result in the top bit of ee_len being set.
1599	 */
1600	if (ext1_ee_len + ext2_ee_len > max_len)
1601		return 0;
1602#ifdef AGGRESSIVE_TEST
1603	if (ext1_ee_len >= 4)
1604		return 0;
1605#endif
1606
1607	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1608		return 1;
1609	return 0;
1610}
1611
1612/*
1613 * This function tries to merge the "ex" extent to the next extent in the tree.
1614 * It always tries to merge towards right. If you want to merge towards
1615 * left, pass "ex - 1" as argument instead of "ex".
1616 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1617 * 1 if they got merged.
1618 */
1619static int ext4_ext_try_to_merge_right(struct inode *inode,
1620				 struct ext4_ext_path *path,
1621				 struct ext4_extent *ex)
1622{
1623	struct ext4_extent_header *eh;
1624	unsigned int depth, len;
1625	int merge_done = 0;
1626	int uninitialized = 0;
1627
1628	depth = ext_depth(inode);
1629	BUG_ON(path[depth].p_hdr == NULL);
1630	eh = path[depth].p_hdr;
1631
1632	while (ex < EXT_LAST_EXTENT(eh)) {
1633		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1634			break;
1635		/* merge with next extent! */
1636		if (ext4_ext_is_uninitialized(ex))
1637			uninitialized = 1;
1638		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1639				+ ext4_ext_get_actual_len(ex + 1));
1640		if (uninitialized)
1641			ext4_ext_mark_uninitialized(ex);
1642
1643		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1644			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1645				* sizeof(struct ext4_extent);
1646			memmove(ex + 1, ex + 2, len);
1647		}
1648		le16_add_cpu(&eh->eh_entries, -1);
1649		merge_done = 1;
1650		WARN_ON(eh->eh_entries == 0);
1651		if (!eh->eh_entries)
1652			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1653	}
1654
1655	return merge_done;
1656}
1657
1658/*
1659 * This function does a very simple check to see if we can collapse
1660 * an extent tree with a single extent tree leaf block into the inode.
1661 */
1662static void ext4_ext_try_to_merge_up(handle_t *handle,
1663				     struct inode *inode,
1664				     struct ext4_ext_path *path)
1665{
1666	size_t s;
1667	unsigned max_root = ext4_ext_space_root(inode, 0);
1668	ext4_fsblk_t blk;
1669
1670	if ((path[0].p_depth != 1) ||
1671	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1672	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1673		return;
1674
1675	/*
1676	 * We need to modify the block allocation bitmap and the block
1677	 * group descriptor to release the extent tree block.  If we
1678	 * can't get the journal credits, give up.
1679	 */
1680	if (ext4_journal_extend(handle, 2))
1681		return;
1682
1683	/*
1684	 * Copy the extent data up to the inode
1685	 */
1686	blk = ext4_idx_pblock(path[0].p_idx);
1687	s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1688		sizeof(struct ext4_extent_idx);
1689	s += sizeof(struct ext4_extent_header);
1690
1691	memcpy(path[0].p_hdr, path[1].p_hdr, s);
1692	path[0].p_depth = 0;
1693	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1694		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1695	path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1696
1697	brelse(path[1].p_bh);
1698	ext4_free_blocks(handle, inode, NULL, blk, 1,
1699			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1700}
1701
1702/*
1703 * This function tries to merge the @ex extent to neighbours in the tree.
1704 * return 1 if merge left else 0.
1705 */
1706static void ext4_ext_try_to_merge(handle_t *handle,
1707				  struct inode *inode,
1708				  struct ext4_ext_path *path,
1709				  struct ext4_extent *ex) {
1710	struct ext4_extent_header *eh;
1711	unsigned int depth;
1712	int merge_done = 0;
1713
1714	depth = ext_depth(inode);
1715	BUG_ON(path[depth].p_hdr == NULL);
1716	eh = path[depth].p_hdr;
1717
1718	if (ex > EXT_FIRST_EXTENT(eh))
1719		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1720
1721	if (!merge_done)
1722		(void) ext4_ext_try_to_merge_right(inode, path, ex);
1723
1724	ext4_ext_try_to_merge_up(handle, inode, path);
1725}
1726
1727/*
1728 * check if a portion of the "newext" extent overlaps with an
1729 * existing extent.
1730 *
1731 * If there is an overlap discovered, it updates the length of the newext
1732 * such that there will be no overlap, and then returns 1.
1733 * If there is no overlap found, it returns 0.
1734 */
1735static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1736					   struct inode *inode,
1737					   struct ext4_extent *newext,
1738					   struct ext4_ext_path *path)
1739{
1740	ext4_lblk_t b1, b2;
1741	unsigned int depth, len1;
1742	unsigned int ret = 0;
1743
1744	b1 = le32_to_cpu(newext->ee_block);
1745	len1 = ext4_ext_get_actual_len(newext);
1746	depth = ext_depth(inode);
1747	if (!path[depth].p_ext)
1748		goto out;
1749	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1750	b2 &= ~(sbi->s_cluster_ratio - 1);
1751
1752	/*
1753	 * get the next allocated block if the extent in the path
1754	 * is before the requested block(s)
1755	 */
1756	if (b2 < b1) {
1757		b2 = ext4_ext_next_allocated_block(path);
1758		if (b2 == EXT_MAX_BLOCKS)
1759			goto out;
1760		b2 &= ~(sbi->s_cluster_ratio - 1);
1761	}
1762
1763	/* check for wrap through zero on extent logical start block*/
1764	if (b1 + len1 < b1) {
1765		len1 = EXT_MAX_BLOCKS - b1;
1766		newext->ee_len = cpu_to_le16(len1);
1767		ret = 1;
1768	}
1769
1770	/* check for overlap */
1771	if (b1 + len1 > b2) {
1772		newext->ee_len = cpu_to_le16(b2 - b1);
1773		ret = 1;
1774	}
1775out:
1776	return ret;
1777}
1778
1779/*
1780 * ext4_ext_insert_extent:
1781 * tries to merge requsted extent into the existing extent or
1782 * inserts requested extent as new one into the tree,
1783 * creating new leaf in the no-space case.
1784 */
1785int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1786				struct ext4_ext_path *path,
1787				struct ext4_extent *newext, int flag)
1788{
1789	struct ext4_extent_header *eh;
1790	struct ext4_extent *ex, *fex;
1791	struct ext4_extent *nearex; /* nearest extent */
1792	struct ext4_ext_path *npath = NULL;
1793	int depth, len, err;
1794	ext4_lblk_t next;
1795	unsigned uninitialized = 0;
1796	int flags = 0;
1797
1798	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1799		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1800		return -EIO;
1801	}
1802	depth = ext_depth(inode);
1803	ex = path[depth].p_ext;
1804	if (unlikely(path[depth].p_hdr == NULL)) {
1805		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1806		return -EIO;
1807	}
1808
1809	/* try to insert block into found extent and return */
1810	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1811		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1812		ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
1813			  ext4_ext_is_uninitialized(newext),
1814			  ext4_ext_get_actual_len(newext),
1815			  le32_to_cpu(ex->ee_block),
1816			  ext4_ext_is_uninitialized(ex),
1817			  ext4_ext_get_actual_len(ex),
1818			  ext4_ext_pblock(ex));
1819		err = ext4_ext_get_access(handle, inode, path + depth);
1820		if (err)
1821			return err;
1822
1823		/*
1824		 * ext4_can_extents_be_merged should have checked that either
1825		 * both extents are uninitialized, or both aren't. Thus we
1826		 * need to check only one of them here.
1827		 */
1828		if (ext4_ext_is_uninitialized(ex))
1829			uninitialized = 1;
1830		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1831					+ ext4_ext_get_actual_len(newext));
1832		if (uninitialized)
1833			ext4_ext_mark_uninitialized(ex);
1834		eh = path[depth].p_hdr;
1835		nearex = ex;
1836		goto merge;
1837	}
1838
1839	depth = ext_depth(inode);
1840	eh = path[depth].p_hdr;
1841	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1842		goto has_space;
1843
1844	/* probably next leaf has space for us? */
1845	fex = EXT_LAST_EXTENT(eh);
1846	next = EXT_MAX_BLOCKS;
1847	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1848		next = ext4_ext_next_leaf_block(path);
1849	if (next != EXT_MAX_BLOCKS) {
1850		ext_debug("next leaf block - %u\n", next);
1851		BUG_ON(npath != NULL);
1852		npath = ext4_ext_find_extent(inode, next, NULL);
1853		if (IS_ERR(npath))
1854			return PTR_ERR(npath);
1855		BUG_ON(npath->p_depth != path->p_depth);
1856		eh = npath[depth].p_hdr;
1857		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1858			ext_debug("next leaf isn't full(%d)\n",
1859				  le16_to_cpu(eh->eh_entries));
1860			path = npath;
1861			goto has_space;
1862		}
1863		ext_debug("next leaf has no free space(%d,%d)\n",
1864			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1865	}
1866
1867	/*
1868	 * There is no free space in the found leaf.
1869	 * We're gonna add a new leaf in the tree.
1870	 */
1871	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1872		flags = EXT4_MB_USE_ROOT_BLOCKS;
1873	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1874	if (err)
1875		goto cleanup;
1876	depth = ext_depth(inode);
1877	eh = path[depth].p_hdr;
1878
1879has_space:
1880	nearex = path[depth].p_ext;
1881
1882	err = ext4_ext_get_access(handle, inode, path + depth);
1883	if (err)
1884		goto cleanup;
1885
1886	if (!nearex) {
1887		/* there is no extent in this leaf, create first one */
1888		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
1889				le32_to_cpu(newext->ee_block),
1890				ext4_ext_pblock(newext),
1891				ext4_ext_is_uninitialized(newext),
1892				ext4_ext_get_actual_len(newext));
1893		nearex = EXT_FIRST_EXTENT(eh);
1894	} else {
1895		if (le32_to_cpu(newext->ee_block)
1896			   > le32_to_cpu(nearex->ee_block)) {
1897			/* Insert after */
1898			ext_debug("insert %u:%llu:[%d]%d before: "
1899					"nearest %p\n",
1900					le32_to_cpu(newext->ee_block),
1901					ext4_ext_pblock(newext),
1902					ext4_ext_is_uninitialized(newext),
1903					ext4_ext_get_actual_len(newext),
1904					nearex);
1905			nearex++;
1906		} else {
1907			/* Insert before */
1908			BUG_ON(newext->ee_block == nearex->ee_block);
1909			ext_debug("insert %u:%llu:[%d]%d after: "
1910					"nearest %p\n",
1911					le32_to_cpu(newext->ee_block),
1912					ext4_ext_pblock(newext),
1913					ext4_ext_is_uninitialized(newext),
1914					ext4_ext_get_actual_len(newext),
1915					nearex);
1916		}
1917		len = EXT_LAST_EXTENT(eh) - nearex + 1;
1918		if (len > 0) {
1919			ext_debug("insert %u:%llu:[%d]%d: "
1920					"move %d extents from 0x%p to 0x%p\n",
1921					le32_to_cpu(newext->ee_block),
1922					ext4_ext_pblock(newext),
1923					ext4_ext_is_uninitialized(newext),
1924					ext4_ext_get_actual_len(newext),
1925					len, nearex, nearex + 1);
1926			memmove(nearex + 1, nearex,
1927				len * sizeof(struct ext4_extent));
1928		}
1929	}
1930
1931	le16_add_cpu(&eh->eh_entries, 1);
1932	path[depth].p_ext = nearex;
1933	nearex->ee_block = newext->ee_block;
1934	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1935	nearex->ee_len = newext->ee_len;
1936
1937merge:
1938	/* try to merge extents */
1939	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1940		ext4_ext_try_to_merge(handle, inode, path, nearex);
1941
1942
1943	/* time to correct all indexes above */
1944	err = ext4_ext_correct_indexes(handle, inode, path);
1945	if (err)
1946		goto cleanup;
1947
1948	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
1949
1950cleanup:
1951	if (npath) {
1952		ext4_ext_drop_refs(npath);
1953		kfree(npath);
1954	}
1955	ext4_ext_invalidate_cache(inode);
1956	return err;
1957}
1958
1959static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1960			       ext4_lblk_t num, ext_prepare_callback func,
1961			       void *cbdata)
1962{
1963	struct ext4_ext_path *path = NULL;
1964	struct ext4_ext_cache cbex;
1965	struct ext4_extent *ex;
1966	ext4_lblk_t next, start = 0, end = 0;
1967	ext4_lblk_t last = block + num;
1968	int depth, exists, err = 0;
1969
1970	BUG_ON(func == NULL);
1971	BUG_ON(inode == NULL);
1972
1973	while (block < last && block != EXT_MAX_BLOCKS) {
1974		num = last - block;
1975		/* find extent for this block */
1976		down_read(&EXT4_I(inode)->i_data_sem);
1977		path = ext4_ext_find_extent(inode, block, path);
1978		up_read(&EXT4_I(inode)->i_data_sem);
1979		if (IS_ERR(path)) {
1980			err = PTR_ERR(path);
1981			path = NULL;
1982			break;
1983		}
1984
1985		depth = ext_depth(inode);
1986		if (unlikely(path[depth].p_hdr == NULL)) {
1987			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1988			err = -EIO;
1989			break;
1990		}
1991		ex = path[depth].p_ext;
1992		next = ext4_ext_next_allocated_block(path);
1993
1994		exists = 0;
1995		if (!ex) {
1996			/* there is no extent yet, so try to allocate
1997			 * all requested space */
1998			start = block;
1999			end = block + num;
2000		} else if (le32_to_cpu(ex->ee_block) > block) {
2001			/* need to allocate space before found extent */
2002			start = block;
2003			end = le32_to_cpu(ex->ee_block);
2004			if (block + num < end)
2005				end = block + num;
2006		} else if (block >= le32_to_cpu(ex->ee_block)
2007					+ ext4_ext_get_actual_len(ex)) {
2008			/* need to allocate space after found extent */
2009			start = block;
2010			end = block + num;
2011			if (end >= next)
2012				end = next;
2013		} else if (block >= le32_to_cpu(ex->ee_block)) {
2014			/*
2015			 * some part of requested space is covered
2016			 * by found extent
2017			 */
2018			start = block;
2019			end = le32_to_cpu(ex->ee_block)
2020				+ ext4_ext_get_actual_len(ex);
2021			if (block + num < end)
2022				end = block + num;
2023			exists = 1;
2024		} else {
2025			BUG();
2026		}
2027		BUG_ON(end <= start);
2028
2029		if (!exists) {
2030			cbex.ec_block = start;
2031			cbex.ec_len = end - start;
2032			cbex.ec_start = 0;
2033		} else {
2034			cbex.ec_block = le32_to_cpu(ex->ee_block);
2035			cbex.ec_len = ext4_ext_get_actual_len(ex);
2036			cbex.ec_start = ext4_ext_pblock(ex);
2037		}
2038
2039		if (unlikely(cbex.ec_len == 0)) {
2040			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
2041			err = -EIO;
2042			break;
2043		}
2044		err = func(inode, next, &cbex, ex, cbdata);
2045		ext4_ext_drop_refs(path);
2046
2047		if (err < 0)
2048			break;
2049
2050		if (err == EXT_REPEAT)
2051			continue;
2052		else if (err == EXT_BREAK) {
2053			err = 0;
2054			break;
2055		}
2056
2057		if (ext_depth(inode) != depth) {
2058			/* depth was changed. we have to realloc path */
2059			kfree(path);
2060			path = NULL;
2061		}
2062
2063		block = cbex.ec_block + cbex.ec_len;
2064	}
2065
2066	if (path) {
2067		ext4_ext_drop_refs(path);
2068		kfree(path);
2069	}
2070
2071	return err;
2072}
2073
2074static void
2075ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
2076			__u32 len, ext4_fsblk_t start)
2077{
2078	struct ext4_ext_cache *cex;
2079	BUG_ON(len == 0);
2080	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2081	trace_ext4_ext_put_in_cache(inode, block, len, start);
2082	cex = &EXT4_I(inode)->i_cached_extent;
2083	cex->ec_block = block;
2084	cex->ec_len = len;
2085	cex->ec_start = start;
2086	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2087}
2088
2089/*
2090 * ext4_ext_put_gap_in_cache:
2091 * calculate boundaries of the gap that the requested block fits into
2092 * and cache this gap
2093 */
2094static void
2095ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2096				ext4_lblk_t block)
2097{
2098	int depth = ext_depth(inode);
2099	unsigned long len;
2100	ext4_lblk_t lblock;
2101	struct ext4_extent *ex;
2102
2103	ex = path[depth].p_ext;
2104	if (ex == NULL) {
2105		/* there is no extent yet, so gap is [0;-] */
2106		lblock = 0;
2107		len = EXT_MAX_BLOCKS;
2108		ext_debug("cache gap(whole file):");
2109	} else if (block < le32_to_cpu(ex->ee_block)) {
2110		lblock = block;
2111		len = le32_to_cpu(ex->ee_block) - block;
2112		ext_debug("cache gap(before): %u [%u:%u]",
2113				block,
2114				le32_to_cpu(ex->ee_block),
2115				 ext4_ext_get_actual_len(ex));
2116	} else if (block >= le32_to_cpu(ex->ee_block)
2117			+ ext4_ext_get_actual_len(ex)) {
2118		ext4_lblk_t next;
2119		lblock = le32_to_cpu(ex->ee_block)
2120			+ ext4_ext_get_actual_len(ex);
2121
2122		next = ext4_ext_next_allocated_block(path);
2123		ext_debug("cache gap(after): [%u:%u] %u",
2124				le32_to_cpu(ex->ee_block),
2125				ext4_ext_get_actual_len(ex),
2126				block);
2127		BUG_ON(next == lblock);
2128		len = next - lblock;
2129	} else {
2130		lblock = len = 0;
2131		BUG();
2132	}
2133
2134	ext_debug(" -> %u:%lu\n", lblock, len);
2135	ext4_ext_put_in_cache(inode, lblock, len, 0);
2136}
2137
2138/*
2139 * ext4_ext_in_cache()
2140 * Checks to see if the given block is in the cache.
2141 * If it is, the cached extent is stored in the given
2142 * cache extent pointer.
2143 *
2144 * @inode: The files inode
2145 * @block: The block to look for in the cache
2146 * @ex:    Pointer where the cached extent will be stored
2147 *         if it contains block
2148 *
2149 * Return 0 if cache is invalid; 1 if the cache is valid
2150 */
2151static int
2152ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2153		  struct ext4_extent *ex)
2154{
2155	struct ext4_ext_cache *cex;
2156	struct ext4_sb_info *sbi;
2157	int ret = 0;
2158
2159	/*
2160	 * We borrow i_block_reservation_lock to protect i_cached_extent
2161	 */
2162	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2163	cex = &EXT4_I(inode)->i_cached_extent;
2164	sbi = EXT4_SB(inode->i_sb);
2165
2166	/* has cache valid data? */
2167	if (cex->ec_len == 0)
2168		goto errout;
2169
2170	if (in_range(block, cex->ec_block, cex->ec_len)) {
2171		ex->ee_block = cpu_to_le32(cex->ec_block);
2172		ext4_ext_store_pblock(ex, cex->ec_start);
2173		ex->ee_len = cpu_to_le16(cex->ec_len);
2174		ext_debug("%u cached by %u:%u:%llu\n",
2175				block,
2176				cex->ec_block, cex->ec_len, cex->ec_start);
2177		ret = 1;
2178	}
2179errout:
2180	trace_ext4_ext_in_cache(inode, block, ret);
2181	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2182	return ret;
2183}
2184
2185/*
2186 * ext4_ext_rm_idx:
2187 * removes index from the index block.
2188 */
2189static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2190			struct ext4_ext_path *path)
2191{
2192	int err;
2193	ext4_fsblk_t leaf;
2194
2195	/* free index block */
2196	path--;
2197	leaf = ext4_idx_pblock(path->p_idx);
2198	if (unlikely(path->p_hdr->eh_entries == 0)) {
2199		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2200		return -EIO;
2201	}
2202	err = ext4_ext_get_access(handle, inode, path);
2203	if (err)
2204		return err;
2205
2206	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2207		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2208		len *= sizeof(struct ext4_extent_idx);
2209		memmove(path->p_idx, path->p_idx + 1, len);
2210	}
2211
2212	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2213	err = ext4_ext_dirty(handle, inode, path);
2214	if (err)
2215		return err;
2216	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2217	trace_ext4_ext_rm_idx(inode, leaf);
2218
2219	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2220			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2221	return err;
2222}
2223
2224/*
2225 * ext4_ext_calc_credits_for_single_extent:
2226 * This routine returns max. credits that needed to insert an extent
2227 * to the extent tree.
2228 * When pass the actual path, the caller should calculate credits
2229 * under i_data_sem.
2230 */
2231int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2232						struct ext4_ext_path *path)
2233{
2234	if (path) {
2235		int depth = ext_depth(inode);
2236		int ret = 0;
2237
2238		/* probably there is space in leaf? */
2239		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2240				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2241
2242			/*
2243			 *  There are some space in the leaf tree, no
2244			 *  need to account for leaf block credit
2245			 *
2246			 *  bitmaps and block group descriptor blocks
2247			 *  and other metadata blocks still need to be
2248			 *  accounted.
2249			 */
2250			/* 1 bitmap, 1 block group descriptor */
2251			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2252			return ret;
2253		}
2254	}
2255
2256	return ext4_chunk_trans_blocks(inode, nrblocks);
2257}
2258
2259/*
2260 * How many index/leaf blocks need to change/allocate to modify nrblocks?
2261 *
2262 * if nrblocks are fit in a single extent (chunk flag is 1), then
2263 * in the worse case, each tree level index/leaf need to be changed
2264 * if the tree split due to insert a new extent, then the old tree
2265 * index/leaf need to be updated too
2266 *
2267 * If the nrblocks are discontiguous, they could cause
2268 * the whole tree split more than once, but this is really rare.
2269 */
2270int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2271{
2272	int index;
2273	int depth = ext_depth(inode);
2274
2275	if (chunk)
2276		index = depth * 2;
2277	else
2278		index = depth * 3;
2279
2280	return index;
2281}
2282
2283static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2284			      struct ext4_extent *ex,
2285			      ext4_fsblk_t *partial_cluster,
2286			      ext4_lblk_t from, ext4_lblk_t to)
2287{
2288	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2289	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2290	ext4_fsblk_t pblk;
2291	int flags = 0;
2292
2293	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2294		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2295	else if (ext4_should_journal_data(inode))
2296		flags |= EXT4_FREE_BLOCKS_FORGET;
2297
2298	/*
2299	 * For bigalloc file systems, we never free a partial cluster
2300	 * at the beginning of the extent.  Instead, we make a note
2301	 * that we tried freeing the cluster, and check to see if we
2302	 * need to free it on a subsequent call to ext4_remove_blocks,
2303	 * or at the end of the ext4_truncate() operation.
2304	 */
2305	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2306
2307	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2308	/*
2309	 * If we have a partial cluster, and it's different from the
2310	 * cluster of the last block, we need to explicitly free the
2311	 * partial cluster here.
2312	 */
2313	pblk = ext4_ext_pblock(ex) + ee_len - 1;
2314	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2315		ext4_free_blocks(handle, inode, NULL,
2316				 EXT4_C2B(sbi, *partial_cluster),
2317				 sbi->s_cluster_ratio, flags);
2318		*partial_cluster = 0;
2319	}
2320
2321#ifdef EXTENTS_STATS
2322	{
2323		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2324		spin_lock(&sbi->s_ext_stats_lock);
2325		sbi->s_ext_blocks += ee_len;
2326		sbi->s_ext_extents++;
2327		if (ee_len < sbi->s_ext_min)
2328			sbi->s_ext_min = ee_len;
2329		if (ee_len > sbi->s_ext_max)
2330			sbi->s_ext_max = ee_len;
2331		if (ext_depth(inode) > sbi->s_depth_max)
2332			sbi->s_depth_max = ext_depth(inode);
2333		spin_unlock(&sbi->s_ext_stats_lock);
2334	}
2335#endif
2336	if (from >= le32_to_cpu(ex->ee_block)
2337	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2338		/* tail removal */
2339		ext4_lblk_t num;
2340
2341		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2342		pblk = ext4_ext_pblock(ex) + ee_len - num;
2343		ext_debug("free last %u blocks starting %llu\n", num, pblk);
2344		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2345		/*
2346		 * If the block range to be freed didn't start at the
2347		 * beginning of a cluster, and we removed the entire
2348		 * extent, save the partial cluster here, since we
2349		 * might need to delete if we determine that the
2350		 * truncate operation has removed all of the blocks in
2351		 * the cluster.
2352		 */
2353		if (pblk & (sbi->s_cluster_ratio - 1) &&
2354		    (ee_len == num))
2355			*partial_cluster = EXT4_B2C(sbi, pblk);
2356		else
2357			*partial_cluster = 0;
2358	} else if (from == le32_to_cpu(ex->ee_block)
2359		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2360		/* head removal */
2361		ext4_lblk_t num;
2362		ext4_fsblk_t start;
2363
2364		num = to - from;
2365		start = ext4_ext_pblock(ex);
2366
2367		ext_debug("free first %u blocks starting %llu\n", num, start);
2368		ext4_free_blocks(handle, inode, NULL, start, num, flags);
2369
2370	} else {
2371		printk(KERN_INFO "strange request: removal(2) "
2372				"%u-%u from %u:%u\n",
2373				from, to, le32_to_cpu(ex->ee_block), ee_len);
2374	}
2375	return 0;
2376}
2377
2378
2379/*
2380 * ext4_ext_rm_leaf() Removes the extents associated with the
2381 * blocks appearing between "start" and "end", and splits the extents
2382 * if "start" and "end" appear in the same extent
2383 *
2384 * @handle: The journal handle
2385 * @inode:  The files inode
2386 * @path:   The path to the leaf
2387 * @start:  The first block to remove
2388 * @end:   The last block to remove
2389 */
2390static int
2391ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2392		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
2393		 ext4_lblk_t start, ext4_lblk_t end)
2394{
2395	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2396	int err = 0, correct_index = 0;
2397	int depth = ext_depth(inode), credits;
2398	struct ext4_extent_header *eh;
2399	ext4_lblk_t a, b;
2400	unsigned num;
2401	ext4_lblk_t ex_ee_block;
2402	unsigned short ex_ee_len;
2403	unsigned uninitialized = 0;
2404	struct ext4_extent *ex;
2405
2406	/* the header must be checked already in ext4_ext_remove_space() */
2407	ext_debug("truncate since %u in leaf to %u\n", start, end);
2408	if (!path[depth].p_hdr)
2409		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2410	eh = path[depth].p_hdr;
2411	if (unlikely(path[depth].p_hdr == NULL)) {
2412		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2413		return -EIO;
2414	}
2415	/* find where to start removing */
2416	ex = EXT_LAST_EXTENT(eh);
2417
2418	ex_ee_block = le32_to_cpu(ex->ee_block);
2419	ex_ee_len = ext4_ext_get_actual_len(ex);
2420
2421	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2422
2423	while (ex >= EXT_FIRST_EXTENT(eh) &&
2424			ex_ee_block + ex_ee_len > start) {
2425
2426		if (ext4_ext_is_uninitialized(ex))
2427			uninitialized = 1;
2428		else
2429			uninitialized = 0;
2430
2431		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2432			 uninitialized, ex_ee_len);
2433		path[depth].p_ext = ex;
2434
2435		a = ex_ee_block > start ? ex_ee_block : start;
2436		b = ex_ee_block+ex_ee_len - 1 < end ?
2437			ex_ee_block+ex_ee_len - 1 : end;
2438
2439		ext_debug("  border %u:%u\n", a, b);
2440
2441		/* If this extent is beyond the end of the hole, skip it */
2442		if (end < ex_ee_block) {
2443			ex--;
2444			ex_ee_block = le32_to_cpu(ex->ee_block);
2445			ex_ee_len = ext4_ext_get_actual_len(ex);
2446			continue;
2447		} else if (b != ex_ee_block + ex_ee_len - 1) {
2448			EXT4_ERROR_INODE(inode,
2449					 "can not handle truncate %u:%u "
2450					 "on extent %u:%u",
2451					 start, end, ex_ee_block,
2452					 ex_ee_block + ex_ee_len - 1);
2453			err = -EIO;
2454			goto out;
2455		} else if (a != ex_ee_block) {
2456			/* remove tail of the extent */
2457			num = a - ex_ee_block;
2458		} else {
2459			/* remove whole extent: excellent! */
2460			num = 0;
2461		}
2462		/*
2463		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2464		 * descriptor) for each block group; assume two block
2465		 * groups plus ex_ee_len/blocks_per_block_group for
2466		 * the worst case
2467		 */
2468		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2469		if (ex == EXT_FIRST_EXTENT(eh)) {
2470			correct_index = 1;
2471			credits += (ext_depth(inode)) + 1;
2472		}
2473		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2474
2475		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2476		if (err)
2477			goto out;
2478
2479		err = ext4_ext_get_access(handle, inode, path + depth);
2480		if (err)
2481			goto out;
2482
2483		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2484					 a, b);
2485		if (err)
2486			goto out;
2487
2488		if (num == 0)
2489			/* this extent is removed; mark slot entirely unused */
2490			ext4_ext_store_pblock(ex, 0);
2491
2492		ex->ee_len = cpu_to_le16(num);
2493		/*
2494		 * Do not mark uninitialized if all the blocks in the
2495		 * extent have been removed.
2496		 */
2497		if (uninitialized && num)
2498			ext4_ext_mark_uninitialized(ex);
2499		/*
2500		 * If the extent was completely released,
2501		 * we need to remove it from the leaf
2502		 */
2503		if (num == 0) {
2504			if (end != EXT_MAX_BLOCKS - 1) {
2505				/*
2506				 * For hole punching, we need to scoot all the
2507				 * extents up when an extent is removed so that
2508				 * we dont have blank extents in the middle
2509				 */
2510				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2511					sizeof(struct ext4_extent));
2512
2513				/* Now get rid of the one at the end */
2514				memset(EXT_LAST_EXTENT(eh), 0,
2515					sizeof(struct ext4_extent));
2516			}
2517			le16_add_cpu(&eh->eh_entries, -1);
2518		} else
2519			*partial_cluster = 0;
2520
2521		err = ext4_ext_dirty(handle, inode, path + depth);
2522		if (err)
2523			goto out;
2524
2525		ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2526				ext4_ext_pblock(ex));
2527		ex--;
2528		ex_ee_block = le32_to_cpu(ex->ee_block);
2529		ex_ee_len = ext4_ext_get_actual_len(ex);
2530	}
2531
2532	if (correct_index && eh->eh_entries)
2533		err = ext4_ext_correct_indexes(handle, inode, path);
2534
2535	/*
2536	 * If there is still a entry in the leaf node, check to see if
2537	 * it references the partial cluster.  This is the only place
2538	 * where it could; if it doesn't, we can free the cluster.
2539	 */
2540	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
2541	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2542	     *partial_cluster)) {
2543		int flags = EXT4_FREE_BLOCKS_FORGET;
2544
2545		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2546			flags |= EXT4_FREE_BLOCKS_METADATA;
2547
2548		ext4_free_blocks(handle, inode, NULL,
2549				 EXT4_C2B(sbi, *partial_cluster),
2550				 sbi->s_cluster_ratio, flags);
2551		*partial_cluster = 0;
2552	}
2553
2554	/* if this leaf is free, then we should
2555	 * remove it from index block above */
2556	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2557		err = ext4_ext_rm_idx(handle, inode, path + depth);
2558
2559out:
2560	return err;
2561}
2562
2563/*
2564 * ext4_ext_more_to_rm:
2565 * returns 1 if current index has to be freed (even partial)
2566 */
2567static int
2568ext4_ext_more_to_rm(struct ext4_ext_path *path)
2569{
2570	BUG_ON(path->p_idx == NULL);
2571
2572	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2573		return 0;
2574
2575	/*
2576	 * if truncate on deeper level happened, it wasn't partial,
2577	 * so we have to consider current index for truncation
2578	 */
2579	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2580		return 0;
2581	return 1;
2582}
2583
2584static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2585				 ext4_lblk_t end)
2586{
2587	struct super_block *sb = inode->i_sb;
2588	int depth = ext_depth(inode);
2589	struct ext4_ext_path *path = NULL;
2590	ext4_fsblk_t partial_cluster = 0;
2591	handle_t *handle;
2592	int i = 0, err;
2593
2594	ext_debug("truncate since %u to %u\n", start, end);
2595
2596	/* probably first extent we're gonna free will be last in block */
2597	handle = ext4_journal_start(inode, depth + 1);
2598	if (IS_ERR(handle))
2599		return PTR_ERR(handle);
2600
2601again:
2602	ext4_ext_invalidate_cache(inode);
2603
2604	trace_ext4_ext_remove_space(inode, start, depth);
2605
2606	/*
2607	 * Check if we are removing extents inside the extent tree. If that
2608	 * is the case, we are going to punch a hole inside the extent tree
2609	 * so we have to check whether we need to split the extent covering
2610	 * the last block to remove so we can easily remove the part of it
2611	 * in ext4_ext_rm_leaf().
2612	 */
2613	if (end < EXT_MAX_BLOCKS - 1) {
2614		struct ext4_extent *ex;
2615		ext4_lblk_t ee_block;
2616
2617		/* find extent for this block */
2618		path = ext4_ext_find_extent(inode, end, NULL);
2619		if (IS_ERR(path)) {
2620			ext4_journal_stop(handle);
2621			return PTR_ERR(path);
2622		}
2623		depth = ext_depth(inode);
2624		ex = path[depth].p_ext;
2625		if (!ex) {
2626			ext4_ext_drop_refs(path);
2627			kfree(path);
2628			path = NULL;
2629			goto cont;
2630		}
2631
2632		ee_block = le32_to_cpu(ex->ee_block);
2633
2634		/*
2635		 * See if the last block is inside the extent, if so split
2636		 * the extent at 'end' block so we can easily remove the
2637		 * tail of the first part of the split extent in
2638		 * ext4_ext_rm_leaf().
2639		 */
2640		if (end >= ee_block &&
2641		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2642			int split_flag = 0;
2643
2644			if (ext4_ext_is_uninitialized(ex))
2645				split_flag = EXT4_EXT_MARK_UNINIT1 |
2646					     EXT4_EXT_MARK_UNINIT2;
2647
2648			/*
2649			 * Split the extent in two so that 'end' is the last
2650			 * block in the first new extent
2651			 */
2652			err = ext4_split_extent_at(handle, inode, path,
2653						end + 1, split_flag,
2654						EXT4_GET_BLOCKS_PRE_IO |
2655						EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
2656
2657			if (err < 0)
2658				goto out;
2659		}
2660	}
2661cont:
2662
2663	/*
2664	 * We start scanning from right side, freeing all the blocks
2665	 * after i_size and walking into the tree depth-wise.
2666	 */
2667	depth = ext_depth(inode);
2668	if (path) {
2669		int k = i = depth;
2670		while (--k > 0)
2671			path[k].p_block =
2672				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2673	} else {
2674		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
2675			       GFP_NOFS);
2676		if (path == NULL) {
2677			ext4_journal_stop(handle);
2678			return -ENOMEM;
2679		}
2680		path[0].p_depth = depth;
2681		path[0].p_hdr = ext_inode_hdr(inode);
2682		i = 0;
2683
2684		if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2685			err = -EIO;
2686			goto out;
2687		}
2688	}
2689	err = 0;
2690
2691	while (i >= 0 && err == 0) {
2692		if (i == depth) {
2693			/* this is leaf block */
2694			err = ext4_ext_rm_leaf(handle, inode, path,
2695					       &partial_cluster, start,
2696					       end);
2697			/* root level has p_bh == NULL, brelse() eats this */
2698			brelse(path[i].p_bh);
2699			path[i].p_bh = NULL;
2700			i--;
2701			continue;
2702		}
2703
2704		/* this is index block */
2705		if (!path[i].p_hdr) {
2706			ext_debug("initialize header\n");
2707			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2708		}
2709
2710		if (!path[i].p_idx) {
2711			/* this level hasn't been touched yet */
2712			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2713			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2714			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2715				  path[i].p_hdr,
2716				  le16_to_cpu(path[i].p_hdr->eh_entries));
2717		} else {
2718			/* we were already here, see at next index */
2719			path[i].p_idx--;
2720		}
2721
2722		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2723				i, EXT_FIRST_INDEX(path[i].p_hdr),
2724				path[i].p_idx);
2725		if (ext4_ext_more_to_rm(path + i)) {
2726			struct buffer_head *bh;
2727			/* go to the next level */
2728			ext_debug("move to level %d (block %llu)\n",
2729				  i + 1, ext4_idx_pblock(path[i].p_idx));
2730			memset(path + i + 1, 0, sizeof(*path));
2731			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2732			if (!bh) {
2733				/* should we reset i_size? */
2734				err = -EIO;
2735				break;
2736			}
2737			if (WARN_ON(i + 1 > depth)) {
2738				err = -EIO;
2739				break;
2740			}
2741			if (ext4_ext_check_block(inode, ext_block_hdr(bh),
2742							depth - i - 1, bh)) {
2743				err = -EIO;
2744				break;
2745			}
2746			path[i + 1].p_bh = bh;
2747
2748			/* save actual number of indexes since this
2749			 * number is changed at the next iteration */
2750			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2751			i++;
2752		} else {
2753			/* we finished processing this index, go up */
2754			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2755				/* index is empty, remove it;
2756				 * handle must be already prepared by the
2757				 * truncatei_leaf() */
2758				err = ext4_ext_rm_idx(handle, inode, path + i);
2759			}
2760			/* root level has p_bh == NULL, brelse() eats this */
2761			brelse(path[i].p_bh);
2762			path[i].p_bh = NULL;
2763			i--;
2764			ext_debug("return to level %d\n", i);
2765		}
2766	}
2767
2768	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
2769			path->p_hdr->eh_entries);
2770
2771	/* If we still have something in the partial cluster and we have removed
2772	 * even the first extent, then we should free the blocks in the partial
2773	 * cluster as well. */
2774	if (partial_cluster && path->p_hdr->eh_entries == 0) {
2775		int flags = EXT4_FREE_BLOCKS_FORGET;
2776
2777		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2778			flags |= EXT4_FREE_BLOCKS_METADATA;
2779
2780		ext4_free_blocks(handle, inode, NULL,
2781				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2782				 EXT4_SB(sb)->s_cluster_ratio, flags);
2783		partial_cluster = 0;
2784	}
2785
2786	/* TODO: flexible tree reduction should be here */
2787	if (path->p_hdr->eh_entries == 0) {
2788		/*
2789		 * truncate to zero freed all the tree,
2790		 * so we need to correct eh_depth
2791		 */
2792		err = ext4_ext_get_access(handle, inode, path);
2793		if (err == 0) {
2794			ext_inode_hdr(inode)->eh_depth = 0;
2795			ext_inode_hdr(inode)->eh_max =
2796				cpu_to_le16(ext4_ext_space_root(inode, 0));
2797			err = ext4_ext_dirty(handle, inode, path);
2798		}
2799	}
2800out:
2801	ext4_ext_drop_refs(path);
2802	kfree(path);
2803	if (err == -EAGAIN) {
2804		path = NULL;
2805		goto again;
2806	}
2807	ext4_journal_stop(handle);
2808
2809	return err;
2810}
2811
2812/*
2813 * called at mount time
2814 */
2815void ext4_ext_init(struct super_block *sb)
2816{
2817	/*
2818	 * possible initialization would be here
2819	 */
2820
2821	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2822#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2823		printk(KERN_INFO "EXT4-fs: file extents enabled"
2824#ifdef AGGRESSIVE_TEST
2825		       ", aggressive tests"
2826#endif
2827#ifdef CHECK_BINSEARCH
2828		       ", check binsearch"
2829#endif
2830#ifdef EXTENTS_STATS
2831		       ", stats"
2832#endif
2833		       "\n");
2834#endif
2835#ifdef EXTENTS_STATS
2836		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2837		EXT4_SB(sb)->s_ext_min = 1 << 30;
2838		EXT4_SB(sb)->s_ext_max = 0;
2839#endif
2840	}
2841}
2842
2843/*
2844 * called at umount time
2845 */
2846void ext4_ext_release(struct super_block *sb)
2847{
2848	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2849		return;
2850
2851#ifdef EXTENTS_STATS
2852	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2853		struct ext4_sb_info *sbi = EXT4_SB(sb);
2854		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2855			sbi->s_ext_blocks, sbi->s_ext_extents,
2856			sbi->s_ext_blocks / sbi->s_ext_extents);
2857		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2858			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2859	}
2860#endif
2861}
2862
2863/* FIXME!! we need to try to merge to left or right after zero-out  */
2864static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2865{
2866	ext4_fsblk_t ee_pblock;
2867	unsigned int ee_len;
2868	int ret;
2869
2870	ee_len    = ext4_ext_get_actual_len(ex);
2871	ee_pblock = ext4_ext_pblock(ex);
2872
2873	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2874	if (ret > 0)
2875		ret = 0;
2876
2877	return ret;
2878}
2879
2880/*
2881 * ext4_split_extent_at() splits an extent at given block.
2882 *
2883 * @handle: the journal handle
2884 * @inode: the file inode
2885 * @path: the path to the extent
2886 * @split: the logical block where the extent is splitted.
2887 * @split_flags: indicates if the extent could be zeroout if split fails, and
2888 *		 the states(init or uninit) of new extents.
2889 * @flags: flags used to insert new extent to extent tree.
2890 *
2891 *
2892 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
2893 * of which are deterimined by split_flag.
2894 *
2895 * There are two cases:
2896 *  a> the extent are splitted into two extent.
2897 *  b> split is not needed, and just mark the extent.
2898 *
2899 * return 0 on success.
2900 */
2901static int ext4_split_extent_at(handle_t *handle,
2902			     struct inode *inode,
2903			     struct ext4_ext_path *path,
2904			     ext4_lblk_t split,
2905			     int split_flag,
2906			     int flags)
2907{
2908	ext4_fsblk_t newblock;
2909	ext4_lblk_t ee_block;
2910	struct ext4_extent *ex, newex, orig_ex;
2911	struct ext4_extent *ex2 = NULL;
2912	unsigned int ee_len, depth;
2913	int err = 0;
2914
2915	ext_debug("ext4_split_extents_at: inode %lu, logical"
2916		"block %llu\n", inode->i_ino, (unsigned long long)split);
2917
2918	ext4_ext_show_leaf(inode, path);
2919
2920	depth = ext_depth(inode);
2921	ex = path[depth].p_ext;
2922	ee_block = le32_to_cpu(ex->ee_block);
2923	ee_len = ext4_ext_get_actual_len(ex);
2924	newblock = split - ee_block + ext4_ext_pblock(ex);
2925
2926	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2927
2928	err = ext4_ext_get_access(handle, inode, path + depth);
2929	if (err)
2930		goto out;
2931
2932	if (split == ee_block) {
2933		/*
2934		 * case b: block @split is the block that the extent begins with
2935		 * then we just change the state of the extent, and splitting
2936		 * is not needed.
2937		 */
2938		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2939			ext4_ext_mark_uninitialized(ex);
2940		else
2941			ext4_ext_mark_initialized(ex);
2942
2943		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2944			ext4_ext_try_to_merge(handle, inode, path, ex);
2945
2946		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2947		goto out;
2948	}
2949
2950	/* case a */
2951	memcpy(&orig_ex, ex, sizeof(orig_ex));
2952	ex->ee_len = cpu_to_le16(split - ee_block);
2953	if (split_flag & EXT4_EXT_MARK_UNINIT1)
2954		ext4_ext_mark_uninitialized(ex);
2955
2956	/*
2957	 * path may lead to new leaf, not to original leaf any more
2958	 * after ext4_ext_insert_extent() returns,
2959	 */
2960	err = ext4_ext_dirty(handle, inode, path + depth);
2961	if (err)
2962		goto fix_extent_len;
2963
2964	ex2 = &newex;
2965	ex2->ee_block = cpu_to_le32(split);
2966	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
2967	ext4_ext_store_pblock(ex2, newblock);
2968	if (split_flag & EXT4_EXT_MARK_UNINIT2)
2969		ext4_ext_mark_uninitialized(ex2);
2970
2971	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2972	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2973		err = ext4_ext_zeroout(inode, &orig_ex);
2974		if (err)
2975			goto fix_extent_len;
2976		/* update the extent length and mark as initialized */
2977		ex->ee_len = cpu_to_le16(ee_len);
2978		ext4_ext_try_to_merge(handle, inode, path, ex);
2979		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2980		goto out;
2981	} else if (err)
2982		goto fix_extent_len;
2983
2984out:
2985	ext4_ext_show_leaf(inode, path);
2986	return err;
2987
2988fix_extent_len:
2989	ex->ee_len = orig_ex.ee_len;
2990	ext4_ext_dirty(handle, inode, path + depth);
2991	return err;
2992}
2993
2994/*
2995 * ext4_split_extents() splits an extent and mark extent which is covered
2996 * by @map as split_flags indicates
2997 *
2998 * It may result in splitting the extent into multiple extents (upto three)
2999 * There are three possibilities:
3000 *   a> There is no split required
3001 *   b> Splits in two extents: Split is happening at either end of the extent
3002 *   c> Splits in three extents: Somone is splitting in middle of the extent
3003 *
3004 */
3005static int ext4_split_extent(handle_t *handle,
3006			      struct inode *inode,
3007			      struct ext4_ext_path *path,
3008			      struct ext4_map_blocks *map,
3009			      int split_flag,
3010			      int flags)
3011{
3012	ext4_lblk_t ee_block;
3013	struct ext4_extent *ex;
3014	unsigned int ee_len, depth;
3015	int err = 0;
3016	int uninitialized;
3017	int split_flag1, flags1;
3018
3019	depth = ext_depth(inode);
3020	ex = path[depth].p_ext;
3021	ee_block = le32_to_cpu(ex->ee_block);
3022	ee_len = ext4_ext_get_actual_len(ex);
3023	uninitialized = ext4_ext_is_uninitialized(ex);
3024
3025	if (map->m_lblk + map->m_len < ee_block + ee_len) {
3026		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
3027			      EXT4_EXT_MAY_ZEROOUT : 0;
3028		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3029		if (uninitialized)
3030			split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
3031				       EXT4_EXT_MARK_UNINIT2;
3032		err = ext4_split_extent_at(handle, inode, path,
3033				map->m_lblk + map->m_len, split_flag1, flags1);
3034		if (err)
3035			goto out;
3036	}
3037
3038	ext4_ext_drop_refs(path);
3039	path = ext4_ext_find_extent(inode, map->m_lblk, path);
3040	if (IS_ERR(path))
3041		return PTR_ERR(path);
3042
3043	if (map->m_lblk >= ee_block) {
3044		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
3045			      EXT4_EXT_MAY_ZEROOUT : 0;
3046		if (uninitialized)
3047			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
3048		if (split_flag & EXT4_EXT_MARK_UNINIT2)
3049			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
3050		err = ext4_split_extent_at(handle, inode, path,
3051				map->m_lblk, split_flag1, flags);
3052		if (err)
3053			goto out;
3054	}
3055
3056	ext4_ext_show_leaf(inode, path);
3057out:
3058	return err ? err : map->m_len;
3059}
3060
3061/*
3062 * This function is called by ext4_ext_map_blocks() if someone tries to write
3063 * to an uninitialized extent. It may result in splitting the uninitialized
3064 * extent into multiple extents (up to three - one initialized and two
3065 * uninitialized).
3066 * There are three possibilities:
3067 *   a> There is no split required: Entire extent should be initialized
3068 *   b> Splits in two extents: Write is happening at either end of the extent
3069 *   c> Splits in three extents: Somone is writing in middle of the extent
3070 *
3071 * Pre-conditions:
3072 *  - The extent pointed to by 'path' is uninitialized.
3073 *  - The extent pointed to by 'path' contains a superset
3074 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3075 *
3076 * Post-conditions on success:
3077 *  - the returned value is the number of blocks beyond map->l_lblk
3078 *    that are allocated and initialized.
3079 *    It is guaranteed to be >= map->m_len.
3080 */
3081static int ext4_ext_convert_to_initialized(handle_t *handle,
3082					   struct inode *inode,
3083					   struct ext4_map_blocks *map,
3084					   struct ext4_ext_path *path)
3085{
3086	struct ext4_sb_info *sbi;
3087	struct ext4_extent_header *eh;
3088	struct ext4_map_blocks split_map;
3089	struct ext4_extent zero_ex;
3090	struct ext4_extent *ex;
3091	ext4_lblk_t ee_block, eof_block;
3092	unsigned int ee_len, depth;
3093	int allocated, max_zeroout = 0;
3094	int err = 0;
3095	int split_flag = 0;
3096
3097	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
3098		"block %llu, max_blocks %u\n", inode->i_ino,
3099		(unsigned long long)map->m_lblk, map->m_len);
3100
3101	sbi = EXT4_SB(inode->i_sb);
3102	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3103		inode->i_sb->s_blocksize_bits;
3104	if (eof_block < map->m_lblk + map->m_len)
3105		eof_block = map->m_lblk + map->m_len;
3106
3107	depth = ext_depth(inode);
3108	eh = path[depth].p_hdr;
3109	ex = path[depth].p_ext;
3110	ee_block = le32_to_cpu(ex->ee_block);
3111	ee_len = ext4_ext_get_actual_len(ex);
3112	allocated = ee_len - (map->m_lblk - ee_block);
3113
3114	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3115
3116	/* Pre-conditions */
3117	BUG_ON(!ext4_ext_is_uninitialized(ex));
3118	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3119
3120	/*
3121	 * Attempt to transfer newly initialized blocks from the currently
3122	 * uninitialized extent to its left neighbor. This is much cheaper
3123	 * than an insertion followed by a merge as those involve costly
3124	 * memmove() calls. This is the common case in steady state for
3125	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
3126	 * writes.
3127	 *
3128	 * Limitations of the current logic:
3129	 *  - L1: we only deal with writes at the start of the extent.
3130	 *    The approach could be extended to writes at the end
3131	 *    of the extent but this scenario was deemed less common.
3132	 *  - L2: we do not deal with writes covering the whole extent.
3133	 *    This would require removing the extent if the transfer
3134	 *    is possible.
3135	 *  - L3: we only attempt to merge with an extent stored in the
3136	 *    same extent tree node.
3137	 */
3138	if ((map->m_lblk == ee_block) &&	/*L1*/
3139		(map->m_len < ee_len) &&	/*L2*/
3140		(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
3141		struct ext4_extent *prev_ex;
3142		ext4_lblk_t prev_lblk;
3143		ext4_fsblk_t prev_pblk, ee_pblk;
3144		unsigned int prev_len, write_len;
3145
3146		prev_ex = ex - 1;
3147		prev_lblk = le32_to_cpu(prev_ex->ee_block);
3148		prev_len = ext4_ext_get_actual_len(prev_ex);
3149		prev_pblk = ext4_ext_pblock(prev_ex);
3150		ee_pblk = ext4_ext_pblock(ex);
3151		write_len = map->m_len;
3152
3153		/*
3154		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
3155		 * upon those conditions:
3156		 * - C1: prev_ex is initialized,
3157		 * - C2: prev_ex is logically abutting ex,
3158		 * - C3: prev_ex is physically abutting ex,
3159		 * - C4: prev_ex can receive the additional blocks without
3160		 *   overflowing the (initialized) length limit.
3161		 */
3162		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
3163			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
3164			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
3165			(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
3166			err = ext4_ext_get_access(handle, inode, path + depth);
3167			if (err)
3168				goto out;
3169
3170			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3171				map, ex, prev_ex);
3172
3173			/* Shift the start of ex by 'write_len' blocks */
3174			ex->ee_block = cpu_to_le32(ee_block + write_len);
3175			ext4_ext_store_pblock(ex, ee_pblk + write_len);
3176			ex->ee_len = cpu_to_le16(ee_len - write_len);
3177			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3178
3179			/* Extend prev_ex by 'write_len' blocks */
3180			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
3181
3182			/* Mark the block containing both extents as dirty */
3183			ext4_ext_dirty(handle, inode, path + depth);
3184
3185			/* Update path to point to the right extent */
3186			path[depth].p_ext = prev_ex;
3187
3188			/* Result: number of initialized blocks past m_lblk */
3189			allocated = write_len;
3190			goto out;
3191		}
3192	}
3193
3194	WARN_ON(map->m_lblk < ee_block);
3195	/*
3196	 * It is safe to convert extent to initialized via explicit
3197	 * zeroout only if extent is fully insde i_size or new_size.
3198	 */
3199	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3200
3201	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3202		max_zeroout = sbi->s_extent_max_zeroout_kb >>
3203			inode->i_sb->s_blocksize_bits;
3204
3205	/* If extent is less than s_max_zeroout_kb, zeroout directly */
3206	if (max_zeroout && (ee_len <= max_zeroout)) {
3207		err = ext4_ext_zeroout(inode, ex);
3208		if (err)
3209			goto out;
3210
3211		err = ext4_ext_get_access(handle, inode, path + depth);
3212		if (err)
3213			goto out;
3214		ext4_ext_mark_initialized(ex);
3215		ext4_ext_try_to_merge(handle, inode, path, ex);
3216		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3217		goto out;
3218	}
3219
3220	/*
3221	 * four cases:
3222	 * 1. split the extent into three extents.
3223	 * 2. split the extent into two extents, zeroout the first half.
3224	 * 3. split the extent into two extents, zeroout the second half.
3225	 * 4. split the extent into two extents with out zeroout.
3226	 */
3227	split_map.m_lblk = map->m_lblk;
3228	split_map.m_len = map->m_len;
3229
3230	if (max_zeroout && (allocated > map->m_len)) {
3231		if (allocated <= max_zeroout) {
3232			/* case 3 */
3233			zero_ex.ee_block =
3234					 cpu_to_le32(map->m_lblk);
3235			zero_ex.ee_len = cpu_to_le16(allocated);
3236			ext4_ext_store_pblock(&zero_ex,
3237				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3238			err = ext4_ext_zeroout(inode, &zero_ex);
3239			if (err)
3240				goto out;
3241			split_map.m_lblk = map->m_lblk;
3242			split_map.m_len = allocated;
3243		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
3244			/* case 2 */
3245			if (map->m_lblk != ee_block) {
3246				zero_ex.ee_block = ex->ee_block;
3247				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3248							ee_block);
3249				ext4_ext_store_pblock(&zero_ex,
3250						      ext4_ext_pblock(ex));
3251				err = ext4_ext_zeroout(inode, &zero_ex);
3252				if (err)
3253					goto out;
3254			}
3255
3256			split_map.m_lblk = ee_block;
3257			split_map.m_len = map->m_lblk - ee_block + map->m_len;
3258			allocated = map->m_len;
3259		}
3260	}
3261
3262	allocated = ext4_split_extent(handle, inode, path,
3263				      &split_map, split_flag, 0);
3264	if (allocated < 0)
3265		err = allocated;
3266
3267out:
3268	return err ? err : allocated;
3269}
3270
3271/*
3272 * This function is called by ext4_ext_map_blocks() from
3273 * ext4_get_blocks_dio_write() when DIO to write
3274 * to an uninitialized extent.
3275 *
3276 * Writing to an uninitialized extent may result in splitting the uninitialized
3277 * extent into multiple initialized/uninitialized extents (up to three)
3278 * There are three possibilities:
3279 *   a> There is no split required: Entire extent should be uninitialized
3280 *   b> Splits in two extents: Write is happening at either end of the extent
3281 *   c> Splits in three extents: Somone is writing in middle of the extent
3282 *
3283 * One of more index blocks maybe needed if the extent tree grow after
3284 * the uninitialized extent split. To prevent ENOSPC occur at the IO
3285 * complete, we need to split the uninitialized extent before DIO submit
3286 * the IO. The uninitialized extent called at this time will be split
3287 * into three uninitialized extent(at most). After IO complete, the part
3288 * being filled will be convert to initialized by the end_io callback function
3289 * via ext4_convert_unwritten_extents().
3290 *
3291 * Returns the size of uninitialized extent to be written on success.
3292 */
3293static int ext4_split_unwritten_extents(handle_t *handle,
3294					struct inode *inode,
3295					struct ext4_map_blocks *map,
3296					struct ext4_ext_path *path,
3297					int flags)
3298{
3299	ext4_lblk_t eof_block;
3300	ext4_lblk_t ee_block;
3301	struct ext4_extent *ex;
3302	unsigned int ee_len;
3303	int split_flag = 0, depth;
3304
3305	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
3306		"block %llu, max_blocks %u\n", inode->i_ino,
3307		(unsigned long long)map->m_lblk, map->m_len);
3308
3309	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3310		inode->i_sb->s_blocksize_bits;
3311	if (eof_block < map->m_lblk + map->m_len)
3312		eof_block = map->m_lblk + map->m_len;
3313	/*
3314	 * It is safe to convert extent to initialized via explicit
3315	 * zeroout only if extent is fully insde i_size or new_size.
3316	 */
3317	depth = ext_depth(inode);
3318	ex = path[depth].p_ext;
3319	ee_block = le32_to_cpu(ex->ee_block);
3320	ee_len = ext4_ext_get_actual_len(ex);
3321
3322	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3323	split_flag |= EXT4_EXT_MARK_UNINIT2;
3324
3325	flags |= EXT4_GET_BLOCKS_PRE_IO;
3326	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3327}
3328
3329static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3330					      struct inode *inode,
3331					      struct ext4_ext_path *path)
3332{
3333	struct ext4_extent *ex;
3334	int depth;
3335	int err = 0;
3336
3337	depth = ext_depth(inode);
3338	ex = path[depth].p_ext;
3339
3340	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3341		"block %llu, max_blocks %u\n", inode->i_ino,
3342		(unsigned long long)le32_to_cpu(ex->ee_block),
3343		ext4_ext_get_actual_len(ex));
3344
3345	err = ext4_ext_get_access(handle, inode, path + depth);
3346	if (err)
3347		goto out;
3348	/* first mark the extent as initialized */
3349	ext4_ext_mark_initialized(ex);
3350
3351	/* note: ext4_ext_correct_indexes() isn't needed here because
3352	 * borders are not changed
3353	 */
3354	ext4_ext_try_to_merge(handle, inode, path, ex);
3355
3356	/* Mark modified extent as dirty */
3357	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3358out:
3359	ext4_ext_show_leaf(inode, path);
3360	return err;
3361}
3362
3363static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3364			sector_t block, int count)
3365{
3366	int i;
3367	for (i = 0; i < count; i++)
3368                unmap_underlying_metadata(bdev, block + i);
3369}
3370
3371/*
3372 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3373 */
3374static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3375			      ext4_lblk_t lblk,
3376			      struct ext4_ext_path *path,
3377			      unsigned int len)
3378{
3379	int i, depth;
3380	struct ext4_extent_header *eh;
3381	struct ext4_extent *last_ex;
3382
3383	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3384		return 0;
3385
3386	depth = ext_depth(inode);
3387	eh = path[depth].p_hdr;
3388
3389	/*
3390	 * We're going to remove EOFBLOCKS_FL entirely in future so we
3391	 * do not care for this case anymore. Simply remove the flag
3392	 * if there are no extents.
3393	 */
3394	if (unlikely(!eh->eh_entries))
3395		goto out;
3396	last_ex = EXT_LAST_EXTENT(eh);
3397	/*
3398	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3399	 * last block in the last extent in the file.  We test this by
3400	 * first checking to see if the caller to
3401	 * ext4_ext_get_blocks() was interested in the last block (or
3402	 * a block beyond the last block) in the current extent.  If
3403	 * this turns out to be false, we can bail out from this
3404	 * function immediately.
3405	 */
3406	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3407	    ext4_ext_get_actual_len(last_ex))
3408		return 0;
3409	/*
3410	 * If the caller does appear to be planning to write at or
3411	 * beyond the end of the current extent, we then test to see
3412	 * if the current extent is the last extent in the file, by
3413	 * checking to make sure it was reached via the rightmost node
3414	 * at each level of the tree.
3415	 */
3416	for (i = depth-1; i >= 0; i--)
3417		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3418			return 0;
3419out:
3420	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3421	return ext4_mark_inode_dirty(handle, inode);
3422}
3423
3424/**
3425 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3426 *
3427 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
3428 * whether there are any buffers marked for delayed allocation. It returns '1'
3429 * on the first delalloc'ed buffer head found. If no buffer head in the given
3430 * range is marked for delalloc, it returns 0.
3431 * lblk_start should always be <= lblk_end.
3432 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3433 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3434 * block sooner). This is useful when blocks are truncated sequentially from
3435 * lblk_start towards lblk_end.
3436 */
3437static int ext4_find_delalloc_range(struct inode *inode,
3438				    ext4_lblk_t lblk_start,
3439				    ext4_lblk_t lblk_end,
3440				    int search_hint_reverse)
3441{
3442	struct address_space *mapping = inode->i_mapping;
3443	struct buffer_head *head, *bh = NULL;
3444	struct page *page;
3445	ext4_lblk_t i, pg_lblk;
3446	pgoff_t index;
3447
3448	if (!test_opt(inode->i_sb, DELALLOC))
3449		return 0;
3450
3451	/* reverse search wont work if fs block size is less than page size */
3452	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3453		search_hint_reverse = 0;
3454
3455	if (search_hint_reverse)
3456		i = lblk_end;
3457	else
3458		i = lblk_start;
3459
3460	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3461
3462	while ((i >= lblk_start) && (i <= lblk_end)) {
3463		page = find_get_page(mapping, index);
3464		if (!page)
3465			goto nextpage;
3466
3467		if (!page_has_buffers(page))
3468			goto nextpage;
3469
3470		head = page_buffers(page);
3471		if (!head)
3472			goto nextpage;
3473
3474		bh = head;
3475		pg_lblk = index << (PAGE_CACHE_SHIFT -
3476						inode->i_blkbits);
3477		do {
3478			if (unlikely(pg_lblk < lblk_start)) {
3479				/*
3480				 * This is possible when fs block size is less
3481				 * than page size and our cluster starts/ends in
3482				 * middle of the page. So we need to skip the
3483				 * initial few blocks till we reach the 'lblk'
3484				 */
3485				pg_lblk++;
3486				continue;
3487			}
3488
3489			/* Check if the buffer is delayed allocated and that it
3490			 * is not yet mapped. (when da-buffers are mapped during
3491			 * their writeout, their da_mapped bit is set.)
3492			 */
3493			if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3494				page_cache_release(page);
3495				trace_ext4_find_delalloc_range(inode,
3496						lblk_start, lblk_end,
3497						search_hint_reverse,
3498						1, i);
3499				return 1;
3500			}
3501			if (search_hint_reverse)
3502				i--;
3503			else
3504				i++;
3505		} while ((i >= lblk_start) && (i <= lblk_end) &&
3506				((bh = bh->b_this_page) != head));
3507nextpage:
3508		if (page)
3509			page_cache_release(page);
3510		/*
3511		 * Move to next page. 'i' will be the first lblk in the next
3512		 * page.
3513		 */
3514		if (search_hint_reverse)
3515			index--;
3516		else
3517			index++;
3518		i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3519	}
3520
3521	trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3522					search_hint_reverse, 0, 0);
3523	return 0;
3524}
3525
3526int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
3527			       int search_hint_reverse)
3528{
3529	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3530	ext4_lblk_t lblk_start, lblk_end;
3531	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3532	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3533
3534	return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3535					search_hint_reverse);
3536}
3537
3538/**
3539 * Determines how many complete clusters (out of those specified by the 'map')
3540 * are under delalloc and were reserved quota for.
3541 * This function is called when we are writing out the blocks that were
3542 * originally written with their allocation delayed, but then the space was
3543 * allocated using fallocate() before the delayed allocation could be resolved.
3544 * The cases to look for are:
3545 * ('=' indicated delayed allocated blocks
3546 *  '-' indicates non-delayed allocated blocks)
3547 * (a) partial clusters towards beginning and/or end outside of allocated range
3548 *     are not delalloc'ed.
3549 *	Ex:
3550 *	|----c---=|====c====|====c====|===-c----|
3551 *	         |++++++ allocated ++++++|
3552 *	==> 4 complete clusters in above example
3553 *
3554 * (b) partial cluster (outside of allocated range) towards either end is
3555 *     marked for delayed allocation. In this case, we will exclude that
3556 *     cluster.
3557 *	Ex:
3558 *	|----====c========|========c========|
3559 *	     |++++++ allocated ++++++|
3560 *	==> 1 complete clusters in above example
3561 *
3562 *	Ex:
3563 *	|================c================|
3564 *            |++++++ allocated ++++++|
3565 *	==> 0 complete clusters in above example
3566 *
3567 * The ext4_da_update_reserve_space will be called only if we
3568 * determine here that there were some "entire" clusters that span
3569 * this 'allocated' range.
3570 * In the non-bigalloc case, this function will just end up returning num_blks
3571 * without ever calling ext4_find_delalloc_range.
3572 */
3573static unsigned int
3574get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3575			   unsigned int num_blks)
3576{
3577	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3578	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3579	ext4_lblk_t lblk_from, lblk_to, c_offset;
3580	unsigned int allocated_clusters = 0;
3581
3582	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3583	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3584
3585	/* max possible clusters for this allocation */
3586	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3587
3588	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3589
3590	/* Check towards left side */
3591	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3592	if (c_offset) {
3593		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3594		lblk_to = lblk_from + c_offset - 1;
3595
3596		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3597			allocated_clusters--;
3598	}
3599
3600	/* Now check towards right. */
3601	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3602	if (allocated_clusters && c_offset) {
3603		lblk_from = lblk_start + num_blks;
3604		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3605
3606		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3607			allocated_clusters--;
3608	}
3609
3610	return allocated_clusters;
3611}
3612
3613static int
3614ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3615			struct ext4_map_blocks *map,
3616			struct ext4_ext_path *path, int flags,
3617			unsigned int allocated, ext4_fsblk_t newblock)
3618{
3619	int ret = 0;
3620	int err = 0;
3621	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3622
3623	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3624		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
3625		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3626		  flags, allocated);
3627	ext4_ext_show_leaf(inode, path);
3628
3629	trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
3630						    newblock);
3631
3632	/* get_block() before submit the IO, split the extent */
3633	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3634		ret = ext4_split_unwritten_extents(handle, inode, map,
3635						   path, flags);
3636		/*
3637		 * Flag the inode(non aio case) or end_io struct (aio case)
3638		 * that this IO needs to conversion to written when IO is
3639		 * completed
3640		 */
3641		if (io)
3642			ext4_set_io_unwritten_flag(inode, io);
3643		else
3644			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3645		if (ext4_should_dioread_nolock(inode))
3646			map->m_flags |= EXT4_MAP_UNINIT;
3647		goto out;
3648	}
3649	/* IO end_io complete, convert the filled extent to written */
3650	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3651		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3652							path);
3653		if (ret >= 0) {
3654			ext4_update_inode_fsync_trans(handle, inode, 1);
3655			err = check_eofblocks_fl(handle, inode, map->m_lblk,
3656						 path, map->m_len);
3657		} else
3658			err = ret;
3659		goto out2;
3660	}
3661	/* buffered IO case */
3662	/*
3663	 * repeat fallocate creation request
3664	 * we already have an unwritten extent
3665	 */
3666	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3667		goto map_out;
3668
3669	/* buffered READ or buffered write_begin() lookup */
3670	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3671		/*
3672		 * We have blocks reserved already.  We
3673		 * return allocated blocks so that delalloc
3674		 * won't do block reservation for us.  But
3675		 * the buffer head will be unmapped so that
3676		 * a read from the block returns 0s.
3677		 */
3678		map->m_flags |= EXT4_MAP_UNWRITTEN;
3679		goto out1;
3680	}
3681
3682	/* buffered write, writepage time, convert*/
3683	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3684	if (ret >= 0)
3685		ext4_update_inode_fsync_trans(handle, inode, 1);
3686out:
3687	if (ret <= 0) {
3688		err = ret;
3689		goto out2;
3690	} else
3691		allocated = ret;
3692	map->m_flags |= EXT4_MAP_NEW;
3693	/*
3694	 * if we allocated more blocks than requested
3695	 * we need to make sure we unmap the extra block
3696	 * allocated. The actual needed block will get
3697	 * unmapped later when we find the buffer_head marked
3698	 * new.
3699	 */
3700	if (allocated > map->m_len) {
3701		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3702					newblock + map->m_len,
3703					allocated - map->m_len);
3704		allocated = map->m_len;
3705	}
3706
3707	/*
3708	 * If we have done fallocate with the offset that is already
3709	 * delayed allocated, we would have block reservation
3710	 * and quota reservation done in the delayed write path.
3711	 * But fallocate would have already updated quota and block
3712	 * count for this offset. So cancel these reservation
3713	 */
3714	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3715		unsigned int reserved_clusters;
3716		reserved_clusters = get_reserved_cluster_alloc(inode,
3717				map->m_lblk, map->m_len);
3718		if (reserved_clusters)
3719			ext4_da_update_reserve_space(inode,
3720						     reserved_clusters,
3721						     0);
3722	}
3723
3724map_out:
3725	map->m_flags |= EXT4_MAP_MAPPED;
3726	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
3727		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3728					 map->m_len);
3729		if (err < 0)
3730			goto out2;
3731	}
3732out1:
3733	if (allocated > map->m_len)
3734		allocated = map->m_len;
3735	ext4_ext_show_leaf(inode, path);
3736	map->m_pblk = newblock;
3737	map->m_len = allocated;
3738out2:
3739	if (path) {
3740		ext4_ext_drop_refs(path);
3741		kfree(path);
3742	}
3743	return err ? err : allocated;
3744}
3745
3746/*
3747 * get_implied_cluster_alloc - check to see if the requested
3748 * allocation (in the map structure) overlaps with a cluster already
3749 * allocated in an extent.
3750 *	@sb	The filesystem superblock structure
3751 *	@map	The requested lblk->pblk mapping
3752 *	@ex	The extent structure which might contain an implied
3753 *			cluster allocation
3754 *
3755 * This function is called by ext4_ext_map_blocks() after we failed to
3756 * find blocks that were already in the inode's extent tree.  Hence,
3757 * we know that the beginning of the requested region cannot overlap
3758 * the extent from the inode's extent tree.  There are three cases we
3759 * want to catch.  The first is this case:
3760 *
3761 *		 |--- cluster # N--|
3762 *    |--- extent ---|	|---- requested region ---|
3763 *			|==========|
3764 *
3765 * The second case that we need to test for is this one:
3766 *
3767 *   |--------- cluster # N ----------------|
3768 *	   |--- requested region --|   |------- extent ----|
3769 *	   |=======================|
3770 *
3771 * The third case is when the requested region lies between two extents
3772 * within the same cluster:
3773 *          |------------- cluster # N-------------|
3774 * |----- ex -----|                  |---- ex_right ----|
3775 *                  |------ requested region ------|
3776 *                  |================|
3777 *
3778 * In each of the above cases, we need to set the map->m_pblk and
3779 * map->m_len so it corresponds to the return the extent labelled as
3780 * "|====|" from cluster #N, since it is already in use for data in
3781 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
3782 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3783 * as a new "allocated" block region.  Otherwise, we will return 0 and
3784 * ext4_ext_map_blocks() will then allocate one or more new clusters
3785 * by calling ext4_mb_new_blocks().
3786 */
3787static int get_implied_cluster_alloc(struct super_block *sb,
3788				     struct ext4_map_blocks *map,
3789				     struct ext4_extent *ex,
3790				     struct ext4_ext_path *path)
3791{
3792	struct ext4_sb_info *sbi = EXT4_SB(sb);
3793	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3794	ext4_lblk_t ex_cluster_start, ex_cluster_end;
3795	ext4_lblk_t rr_cluster_start;
3796	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3797	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3798	unsigned short ee_len = ext4_ext_get_actual_len(ex);
3799
3800	/* The extent passed in that we are trying to match */
3801	ex_cluster_start = EXT4_B2C(sbi, ee_block);
3802	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3803
3804	/* The requested region passed into ext4_map_blocks() */
3805	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3806
3807	if ((rr_cluster_start == ex_cluster_end) ||
3808	    (rr_cluster_start == ex_cluster_start)) {
3809		if (rr_cluster_start == ex_cluster_end)
3810			ee_start += ee_len - 1;
3811		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3812			c_offset;
3813		map->m_len = min(map->m_len,
3814				 (unsigned) sbi->s_cluster_ratio - c_offset);
3815		/*
3816		 * Check for and handle this case:
3817		 *
3818		 *   |--------- cluster # N-------------|
3819		 *		       |------- extent ----|
3820		 *	   |--- requested region ---|
3821		 *	   |===========|
3822		 */
3823
3824		if (map->m_lblk < ee_block)
3825			map->m_len = min(map->m_len, ee_block - map->m_lblk);
3826
3827		/*
3828		 * Check for the case where there is already another allocated
3829		 * block to the right of 'ex' but before the end of the cluster.
3830		 *
3831		 *          |------------- cluster # N-------------|
3832		 * |----- ex -----|                  |---- ex_right ----|
3833		 *                  |------ requested region ------|
3834		 *                  |================|
3835		 */
3836		if (map->m_lblk > ee_block) {
3837			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3838			map->m_len = min(map->m_len, next - map->m_lblk);
3839		}
3840
3841		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3842		return 1;
3843	}
3844
3845	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3846	return 0;
3847}
3848
3849
3850/*
3851 * Block allocation/map/preallocation routine for extents based files
3852 *
3853 *
3854 * Need to be called with
3855 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3856 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3857 *
3858 * return > 0, number of of blocks already mapped/allocated
3859 *          if create == 0 and these are pre-allocated blocks
3860 *          	buffer head is unmapped
3861 *          otherwise blocks are mapped
3862 *
3863 * return = 0, if plain look up failed (blocks have not been allocated)
3864 *          buffer head is unmapped
3865 *
3866 * return < 0, error case.
3867 */
3868int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3869			struct ext4_map_blocks *map, int flags)
3870{
3871	struct ext4_ext_path *path = NULL;
3872	struct ext4_extent newex, *ex, *ex2;
3873	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3874	ext4_fsblk_t newblock = 0;
3875	int free_on_err = 0, err = 0, depth, ret;
3876	unsigned int allocated = 0, offset = 0;
3877	unsigned int allocated_clusters = 0;
3878	struct ext4_allocation_request ar;
3879	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3880	ext4_lblk_t cluster_offset;
3881
3882	ext_debug("blocks %u/%u requested for inode %lu\n",
3883		  map->m_lblk, map->m_len, inode->i_ino);
3884	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3885
3886	/* check in cache */
3887	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3888		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3889			if ((sbi->s_cluster_ratio > 1) &&
3890			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3891				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3892
3893			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3894				/*
3895				 * block isn't allocated yet and
3896				 * user doesn't want to allocate it
3897				 */
3898				goto out2;
3899			}
3900			/* we should allocate requested block */
3901		} else {
3902			/* block is already allocated */
3903			if (sbi->s_cluster_ratio > 1)
3904				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3905			newblock = map->m_lblk
3906				   - le32_to_cpu(newex.ee_block)
3907				   + ext4_ext_pblock(&newex);
3908			/* number of remaining blocks in the extent */
3909			allocated = ext4_ext_get_actual_len(&newex) -
3910				(map->m_lblk - le32_to_cpu(newex.ee_block));
3911			goto out;
3912		}
3913	}
3914
3915	/* find extent for this block */
3916	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3917	if (IS_ERR(path)) {
3918		err = PTR_ERR(path);
3919		path = NULL;
3920		goto out2;
3921	}
3922
3923	depth = ext_depth(inode);
3924
3925	/*
3926	 * consistent leaf must not be empty;
3927	 * this situation is possible, though, _during_ tree modification;
3928	 * this is why assert can't be put in ext4_ext_find_extent()
3929	 */
3930	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3931		EXT4_ERROR_INODE(inode, "bad extent address "
3932				 "lblock: %lu, depth: %d pblock %lld",
3933				 (unsigned long) map->m_lblk, depth,
3934				 path[depth].p_block);
3935		err = -EIO;
3936		goto out2;
3937	}
3938
3939	ex = path[depth].p_ext;
3940	if (ex) {
3941		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3942		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3943		unsigned short ee_len;
3944
3945		/*
3946		 * Uninitialized extents are treated as holes, except that
3947		 * we split out initialized portions during a write.
3948		 */
3949		ee_len = ext4_ext_get_actual_len(ex);
3950
3951		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
3952
3953		/* if found extent covers block, simply return it */
3954		if (in_range(map->m_lblk, ee_block, ee_len)) {
3955			newblock = map->m_lblk - ee_block + ee_start;
3956			/* number of remaining blocks in the extent */
3957			allocated = ee_len - (map->m_lblk - ee_block);
3958			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3959				  ee_block, ee_len, newblock);
3960
3961			/*
3962			 * Do not put uninitialized extent
3963			 * in the cache
3964			 */
3965			if (!ext4_ext_is_uninitialized(ex)) {
3966				ext4_ext_put_in_cache(inode, ee_block,
3967					ee_len, ee_start);
3968				goto out;
3969			}
3970			ret = ext4_ext_handle_uninitialized_extents(
3971				handle, inode, map, path, flags,
3972				allocated, newblock);
3973			return ret;
3974		}
3975	}
3976
3977	if ((sbi->s_cluster_ratio > 1) &&
3978	    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3979		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3980
3981	/*
3982	 * requested block isn't allocated yet;
3983	 * we couldn't try to create block if create flag is zero
3984	 */
3985	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3986		/*
3987		 * put just found gap into cache to speed up
3988		 * subsequent requests
3989		 */
3990		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3991		goto out2;
3992	}
3993
3994	/*
3995	 * Okay, we need to do block allocation.
3996	 */
3997	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
3998	newex.ee_block = cpu_to_le32(map->m_lblk);
3999	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
4000
4001	/*
4002	 * If we are doing bigalloc, check to see if the extent returned
4003	 * by ext4_ext_find_extent() implies a cluster we can use.
4004	 */
4005	if (cluster_offset && ex &&
4006	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4007		ar.len = allocated = map->m_len;
4008		newblock = map->m_pblk;
4009		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4010		goto got_allocated_blocks;
4011	}
4012
4013	/* find neighbour allocated blocks */
4014	ar.lleft = map->m_lblk;
4015	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4016	if (err)
4017		goto out2;
4018	ar.lright = map->m_lblk;
4019	ex2 = NULL;
4020	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4021	if (err)
4022		goto out2;
4023
4024	/* Check if the extent after searching to the right implies a
4025	 * cluster we can use. */
4026	if ((sbi->s_cluster_ratio > 1) && ex2 &&
4027	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4028		ar.len = allocated = map->m_len;
4029		newblock = map->m_pblk;
4030		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4031		goto got_allocated_blocks;
4032	}
4033
4034	/*
4035	 * See if request is beyond maximum number of blocks we can have in
4036	 * a single extent. For an initialized extent this limit is
4037	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
4038	 * EXT_UNINIT_MAX_LEN.
4039	 */
4040	if (map->m_len > EXT_INIT_MAX_LEN &&
4041	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
4042		map->m_len = EXT_INIT_MAX_LEN;
4043	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
4044		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
4045		map->m_len = EXT_UNINIT_MAX_LEN;
4046
4047	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4048	newex.ee_len = cpu_to_le16(map->m_len);
4049	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4050	if (err)
4051		allocated = ext4_ext_get_actual_len(&newex);
4052	else
4053		allocated = map->m_len;
4054
4055	/* allocate new block */
4056	ar.inode = inode;
4057	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4058	ar.logical = map->m_lblk;
4059	/*
4060	 * We calculate the offset from the beginning of the cluster
4061	 * for the logical block number, since when we allocate a
4062	 * physical cluster, the physical block should start at the
4063	 * same offset from the beginning of the cluster.  This is
4064	 * needed so that future calls to get_implied_cluster_alloc()
4065	 * work correctly.
4066	 */
4067	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
4068	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4069	ar.goal -= offset;
4070	ar.logical -= offset;
4071	if (S_ISREG(inode->i_mode))
4072		ar.flags = EXT4_MB_HINT_DATA;
4073	else
4074		/* disable in-core preallocation for non-regular files */
4075		ar.flags = 0;
4076	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4077		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4078	newblock = ext4_mb_new_blocks(handle, &ar, &err);
4079	if (!newblock)
4080		goto out2;
4081	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4082		  ar.goal, newblock, allocated);
4083	free_on_err = 1;
4084	allocated_clusters = ar.len;
4085	ar.len = EXT4_C2B(sbi, ar.len) - offset;
4086	if (ar.len > allocated)
4087		ar.len = allocated;
4088
4089got_allocated_blocks:
4090	/* try to insert new extent into found leaf and return */
4091	ext4_ext_store_pblock(&newex, newblock + offset);
4092	newex.ee_len = cpu_to_le16(ar.len);
4093	/* Mark uninitialized */
4094	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
4095		ext4_ext_mark_uninitialized(&newex);
4096		/*
4097		 * io_end structure was created for every IO write to an
4098		 * uninitialized extent. To avoid unnecessary conversion,
4099		 * here we flag the IO that really needs the conversion.
4100		 * For non asycn direct IO case, flag the inode state
4101		 * that we need to perform conversion when IO is done.
4102		 */
4103		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
4104			if (io)
4105				ext4_set_io_unwritten_flag(inode, io);
4106			else
4107				ext4_set_inode_state(inode,
4108						     EXT4_STATE_DIO_UNWRITTEN);
4109		}
4110		if (ext4_should_dioread_nolock(inode))
4111			map->m_flags |= EXT4_MAP_UNINIT;
4112	}
4113
4114	err = 0;
4115	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4116		err = check_eofblocks_fl(handle, inode, map->m_lblk,
4117					 path, ar.len);
4118	if (!err)
4119		err = ext4_ext_insert_extent(handle, inode, path,
4120					     &newex, flags);
4121	if (err && free_on_err) {
4122		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4123			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4124		/* free data blocks we just allocated */
4125		/* not a good idea to call discard here directly,
4126		 * but otherwise we'd need to call it every free() */
4127		ext4_discard_preallocations(inode);
4128		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
4129				 ext4_ext_get_actual_len(&newex), fb_flags);
4130		goto out2;
4131	}
4132
4133	/* previous routine could use block we allocated */
4134	newblock = ext4_ext_pblock(&newex);
4135	allocated = ext4_ext_get_actual_len(&newex);
4136	if (allocated > map->m_len)
4137		allocated = map->m_len;
4138	map->m_flags |= EXT4_MAP_NEW;
4139
4140	/*
4141	 * Update reserved blocks/metadata blocks after successful
4142	 * block allocation which had been deferred till now.
4143	 */
4144	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4145		unsigned int reserved_clusters;
4146		/*
4147		 * Check how many clusters we had reserved this allocated range
4148		 */
4149		reserved_clusters = get_reserved_cluster_alloc(inode,
4150						map->m_lblk, allocated);
4151		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4152			if (reserved_clusters) {
4153				/*
4154				 * We have clusters reserved for this range.
4155				 * But since we are not doing actual allocation
4156				 * and are simply using blocks from previously
4157				 * allocated cluster, we should release the
4158				 * reservation and not claim quota.
4159				 */
4160				ext4_da_update_reserve_space(inode,
4161						reserved_clusters, 0);
4162			}
4163		} else {
4164			BUG_ON(allocated_clusters < reserved_clusters);
4165			/* We will claim quota for all newly allocated blocks.*/
4166			ext4_da_update_reserve_space(inode, allocated_clusters,
4167							1);
4168			if (reserved_clusters < allocated_clusters) {
4169				struct ext4_inode_info *ei = EXT4_I(inode);
4170				int reservation = allocated_clusters -
4171						  reserved_clusters;
4172				/*
4173				 * It seems we claimed few clusters outside of
4174				 * the range of this allocation. We should give
4175				 * it back to the reservation pool. This can
4176				 * happen in the following case:
4177				 *
4178				 * * Suppose s_cluster_ratio is 4 (i.e., each
4179				 *   cluster has 4 blocks. Thus, the clusters
4180				 *   are [0-3],[4-7],[8-11]...
4181				 * * First comes delayed allocation write for
4182				 *   logical blocks 10 & 11. Since there were no
4183				 *   previous delayed allocated blocks in the
4184				 *   range [8-11], we would reserve 1 cluster
4185				 *   for this write.
4186				 * * Next comes write for logical blocks 3 to 8.
4187				 *   In this case, we will reserve 2 clusters
4188				 *   (for [0-3] and [4-7]; and not for [8-11] as
4189				 *   that range has a delayed allocated blocks.
4190				 *   Thus total reserved clusters now becomes 3.
4191				 * * Now, during the delayed allocation writeout
4192				 *   time, we will first write blocks [3-8] and
4193				 *   allocate 3 clusters for writing these
4194				 *   blocks. Also, we would claim all these
4195				 *   three clusters above.
4196				 * * Now when we come here to writeout the
4197				 *   blocks [10-11], we would expect to claim
4198				 *   the reservation of 1 cluster we had made
4199				 *   (and we would claim it since there are no
4200				 *   more delayed allocated blocks in the range
4201				 *   [8-11]. But our reserved cluster count had
4202				 *   already gone to 0.
4203				 *
4204				 *   Thus, at the step 4 above when we determine
4205				 *   that there are still some unwritten delayed
4206				 *   allocated blocks outside of our current
4207				 *   block range, we should increment the
4208				 *   reserved clusters count so that when the
4209				 *   remaining blocks finally gets written, we
4210				 *   could claim them.
4211				 */
4212				dquot_reserve_block(inode,
4213						EXT4_C2B(sbi, reservation));
4214				spin_lock(&ei->i_block_reservation_lock);
4215				ei->i_reserved_data_blocks += reservation;
4216				spin_unlock(&ei->i_block_reservation_lock);
4217			}
4218		}
4219	}
4220
4221	/*
4222	 * Cache the extent and update transaction to commit on fdatasync only
4223	 * when it is _not_ an uninitialized extent.
4224	 */
4225	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
4226		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
4227		ext4_update_inode_fsync_trans(handle, inode, 1);
4228	} else
4229		ext4_update_inode_fsync_trans(handle, inode, 0);
4230out:
4231	if (allocated > map->m_len)
4232		allocated = map->m_len;
4233	ext4_ext_show_leaf(inode, path);
4234	map->m_flags |= EXT4_MAP_MAPPED;
4235	map->m_pblk = newblock;
4236	map->m_len = allocated;
4237out2:
4238	if (path) {
4239		ext4_ext_drop_refs(path);
4240		kfree(path);
4241	}
4242
4243	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4244		newblock, map->m_len, err ? err : allocated);
4245
4246	return err ? err : allocated;
4247}
4248
4249void ext4_ext_truncate(struct inode *inode)
4250{
4251	struct address_space *mapping = inode->i_mapping;
4252	struct super_block *sb = inode->i_sb;
4253	ext4_lblk_t last_block;
4254	handle_t *handle;
4255	loff_t page_len;
4256	int err = 0;
4257
4258	/*
4259	 * finish any pending end_io work so we won't run the risk of
4260	 * converting any truncated blocks to initialized later
4261	 */
4262	ext4_flush_completed_IO(inode);
4263
4264	/*
4265	 * probably first extent we're gonna free will be last in block
4266	 */
4267	err = ext4_writepage_trans_blocks(inode);
4268	handle = ext4_journal_start(inode, err);
4269	if (IS_ERR(handle))
4270		return;
4271
4272	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
4273		page_len = PAGE_CACHE_SIZE -
4274			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4275
4276		err = ext4_discard_partial_page_buffers(handle,
4277			mapping, inode->i_size, page_len, 0);
4278
4279		if (err)
4280			goto out_stop;
4281	}
4282
4283	if (ext4_orphan_add(handle, inode))
4284		goto out_stop;
4285
4286	down_write(&EXT4_I(inode)->i_data_sem);
4287	ext4_ext_invalidate_cache(inode);
4288
4289	ext4_discard_preallocations(inode);
4290
4291	/*
4292	 * TODO: optimization is possible here.
4293	 * Probably we need not scan at all,
4294	 * because page truncation is enough.
4295	 */
4296
4297	/* we have to know where to truncate from in crash case */
4298	EXT4_I(inode)->i_disksize = inode->i_size;
4299	ext4_mark_inode_dirty(handle, inode);
4300
4301	last_block = (inode->i_size + sb->s_blocksize - 1)
4302			>> EXT4_BLOCK_SIZE_BITS(sb);
4303	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4304
4305	/* In a multi-transaction truncate, we only make the final
4306	 * transaction synchronous.
4307	 */
4308	if (IS_SYNC(inode))
4309		ext4_handle_sync(handle);
4310
4311	up_write(&EXT4_I(inode)->i_data_sem);
4312
4313out_stop:
4314	/*
4315	 * If this was a simple ftruncate() and the file will remain alive,
4316	 * then we need to clear up the orphan record which we created above.
4317	 * However, if this was a real unlink then we were called by
4318	 * ext4_delete_inode(), and we allow that function to clean up the
4319	 * orphan info for us.
4320	 */
4321	if (inode->i_nlink)
4322		ext4_orphan_del(handle, inode);
4323
4324	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4325	ext4_mark_inode_dirty(handle, inode);
4326	ext4_journal_stop(handle);
4327}
4328
4329static void ext4_falloc_update_inode(struct inode *inode,
4330				int mode, loff_t new_size, int update_ctime)
4331{
4332	struct timespec now;
4333
4334	if (update_ctime) {
4335		now = current_fs_time(inode->i_sb);
4336		if (!timespec_equal(&inode->i_ctime, &now))
4337			inode->i_ctime = now;
4338	}
4339	/*
4340	 * Update only when preallocation was requested beyond
4341	 * the file size.
4342	 */
4343	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
4344		if (new_size > i_size_read(inode))
4345			i_size_write(inode, new_size);
4346		if (new_size > EXT4_I(inode)->i_disksize)
4347			ext4_update_i_disksize(inode, new_size);
4348	} else {
4349		/*
4350		 * Mark that we allocate beyond EOF so the subsequent truncate
4351		 * can proceed even if the new size is the same as i_size.
4352		 */
4353		if (new_size > i_size_read(inode))
4354			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4355	}
4356
4357}
4358
4359/*
4360 * preallocate space for a file. This implements ext4's fallocate file
4361 * operation, which gets called from sys_fallocate system call.
4362 * For block-mapped files, posix_fallocate should fall back to the method
4363 * of writing zeroes to the required new blocks (the same behavior which is
4364 * expected for file systems which do not support fallocate() system call).
4365 */
4366long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4367{
4368	struct inode *inode = file->f_path.dentry->d_inode;
4369	handle_t *handle;
4370	loff_t new_size;
4371	unsigned int max_blocks;
4372	int ret = 0;
4373	int ret2 = 0;
4374	int retries = 0;
4375	int flags;
4376	struct ext4_map_blocks map;
4377	unsigned int credits, blkbits = inode->i_blkbits;
4378
4379	/*
4380	 * currently supporting (pre)allocate mode for extent-based
4381	 * files _only_
4382	 */
4383	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4384		return -EOPNOTSUPP;
4385
4386	/* Return error if mode is not supported */
4387	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
4388		return -EOPNOTSUPP;
4389
4390	if (mode & FALLOC_FL_PUNCH_HOLE)
4391		return ext4_punch_hole(file, offset, len);
4392
4393	trace_ext4_fallocate_enter(inode, offset, len, mode);
4394	map.m_lblk = offset >> blkbits;
4395	/*
4396	 * We can't just convert len to max_blocks because
4397	 * If blocksize = 4096 offset = 3072 and len = 2048
4398	 */
4399	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4400		- map.m_lblk;
4401	/*
4402	 * credits to insert 1 extent into extent tree
4403	 */
4404	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4405	mutex_lock(&inode->i_mutex);
4406	ret = inode_newsize_ok(inode, (len + offset));
4407	if (ret) {
4408		mutex_unlock(&inode->i_mutex);
4409		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4410		return ret;
4411	}
4412	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4413	if (mode & FALLOC_FL_KEEP_SIZE)
4414		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4415	/*
4416	 * Don't normalize the request if it can fit in one extent so
4417	 * that it doesn't get unnecessarily split into multiple
4418	 * extents.
4419	 */
4420	if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4421		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4422retry:
4423	while (ret >= 0 && ret < max_blocks) {
4424		map.m_lblk = map.m_lblk + ret;
4425		map.m_len = max_blocks = max_blocks - ret;
4426		handle = ext4_journal_start(inode, credits);
4427		if (IS_ERR(handle)) {
4428			ret = PTR_ERR(handle);
4429			break;
4430		}
4431		ret = ext4_map_blocks(handle, inode, &map, flags);
4432		if (ret <= 0) {
4433#ifdef EXT4FS_DEBUG
4434			WARN_ON(ret <= 0);
4435			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4436				    "returned error inode#%lu, block=%u, "
4437				    "max_blocks=%u", __func__,
4438				    inode->i_ino, map.m_lblk, max_blocks);
4439#endif
4440			ext4_mark_inode_dirty(handle, inode);
4441			ret2 = ext4_journal_stop(handle);
4442			break;
4443		}
4444		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4445						blkbits) >> blkbits))
4446			new_size = offset + len;
4447		else
4448			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4449
4450		ext4_falloc_update_inode(inode, mode, new_size,
4451					 (map.m_flags & EXT4_MAP_NEW));
4452		ext4_mark_inode_dirty(handle, inode);
4453		if ((file->f_flags & O_SYNC) && ret >= max_blocks)
4454			ext4_handle_sync(handle);
4455		ret2 = ext4_journal_stop(handle);
4456		if (ret2)
4457			break;
4458	}
4459	if (ret == -ENOSPC &&
4460			ext4_should_retry_alloc(inode->i_sb, &retries)) {
4461		ret = 0;
4462		goto retry;
4463	}
4464	mutex_unlock(&inode->i_mutex);
4465	trace_ext4_fallocate_exit(inode, offset, max_blocks,
4466				ret > 0 ? ret2 : ret);
4467	return ret > 0 ? ret2 : ret;
4468}
4469
4470/*
4471 * This function convert a range of blocks to written extents
4472 * The caller of this function will pass the start offset and the size.
4473 * all unwritten extents within this range will be converted to
4474 * written extents.
4475 *
4476 * This function is called from the direct IO end io call back
4477 * function, to convert the fallocated extents after IO is completed.
4478 * Returns 0 on success.
4479 */
4480int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4481				    ssize_t len)
4482{
4483	handle_t *handle;
4484	unsigned int max_blocks;
4485	int ret = 0;
4486	int ret2 = 0;
4487	struct ext4_map_blocks map;
4488	unsigned int credits, blkbits = inode->i_blkbits;
4489
4490	map.m_lblk = offset >> blkbits;
4491	/*
4492	 * We can't just convert len to max_blocks because
4493	 * If blocksize = 4096 offset = 3072 and len = 2048
4494	 */
4495	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
4496		      map.m_lblk);
4497	/*
4498	 * credits to insert 1 extent into extent tree
4499	 */
4500	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4501	while (ret >= 0 && ret < max_blocks) {
4502		map.m_lblk += ret;
4503		map.m_len = (max_blocks -= ret);
4504		handle = ext4_journal_start(inode, credits);
4505		if (IS_ERR(handle)) {
4506			ret = PTR_ERR(handle);
4507			break;
4508		}
4509		ret = ext4_map_blocks(handle, inode, &map,
4510				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4511		if (ret <= 0) {
4512			WARN_ON(ret <= 0);
4513			ext4_msg(inode->i_sb, KERN_ERR,
4514				 "%s:%d: inode #%lu: block %u: len %u: "
4515				 "ext4_ext_map_blocks returned %d",
4516				 __func__, __LINE__, inode->i_ino, map.m_lblk,
4517				 map.m_len, ret);
4518		}
4519		ext4_mark_inode_dirty(handle, inode);
4520		ret2 = ext4_journal_stop(handle);
4521		if (ret <= 0 || ret2 )
4522			break;
4523	}
4524	return ret > 0 ? ret2 : ret;
4525}
4526
4527/*
4528 * Callback function called for each extent to gather FIEMAP information.
4529 */
4530static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
4531		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
4532		       void *data)
4533{
4534	__u64	logical;
4535	__u64	physical;
4536	__u64	length;
4537	__u32	flags = 0;
4538	int		ret = 0;
4539	struct fiemap_extent_info *fieinfo = data;
4540	unsigned char blksize_bits;
4541
4542	blksize_bits = inode->i_sb->s_blocksize_bits;
4543	logical = (__u64)newex->ec_block << blksize_bits;
4544
4545	if (newex->ec_start == 0) {
4546		/*
4547		 * No extent in extent-tree contains block @newex->ec_start,
4548		 * then the block may stay in 1)a hole or 2)delayed-extent.
4549		 *
4550		 * Holes or delayed-extents are processed as follows.
4551		 * 1. lookup dirty pages with specified range in pagecache.
4552		 *    If no page is got, then there is no delayed-extent and
4553		 *    return with EXT_CONTINUE.
4554		 * 2. find the 1st mapped buffer,
4555		 * 3. check if the mapped buffer is both in the request range
4556		 *    and a delayed buffer. If not, there is no delayed-extent,
4557		 *    then return.
4558		 * 4. a delayed-extent is found, the extent will be collected.
4559		 */
4560		ext4_lblk_t	end = 0;
4561		pgoff_t		last_offset;
4562		pgoff_t		offset;
4563		pgoff_t		index;
4564		pgoff_t		start_index = 0;
4565		struct page	**pages = NULL;
4566		struct buffer_head *bh = NULL;
4567		struct buffer_head *head = NULL;
4568		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
4569
4570		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4571		if (pages == NULL)
4572			return -ENOMEM;
4573
4574		offset = logical >> PAGE_SHIFT;
4575repeat:
4576		last_offset = offset;
4577		head = NULL;
4578		ret = find_get_pages_tag(inode->i_mapping, &offset,
4579					PAGECACHE_TAG_DIRTY, nr_pages, pages);
4580
4581		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4582			/* First time, try to find a mapped buffer. */
4583			if (ret == 0) {
4584out:
4585				for (index = 0; index < ret; index++)
4586					page_cache_release(pages[index]);
4587				/* just a hole. */
4588				kfree(pages);
4589				return EXT_CONTINUE;
4590			}
4591			index = 0;
4592
4593next_page:
4594			/* Try to find the 1st mapped buffer. */
4595			end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
4596				  blksize_bits;
4597			if (!page_has_buffers(pages[index]))
4598				goto out;
4599			head = page_buffers(pages[index]);
4600			if (!head)
4601				goto out;
4602
4603			index++;
4604			bh = head;
4605			do {
4606				if (end >= newex->ec_block +
4607					newex->ec_len)
4608					/* The buffer is out of
4609					 * the request range.
4610					 */
4611					goto out;
4612
4613				if (buffer_mapped(bh) &&
4614				    end >= newex->ec_block) {
4615					start_index = index - 1;
4616					/* get the 1st mapped buffer. */
4617					goto found_mapped_buffer;
4618				}
4619
4620				bh = bh->b_this_page;
4621				end++;
4622			} while (bh != head);
4623
4624			/* No mapped buffer in the range found in this page,
4625			 * We need to look up next page.
4626			 */
4627			if (index >= ret) {
4628				/* There is no page left, but we need to limit
4629				 * newex->ec_len.
4630				 */
4631				newex->ec_len = end - newex->ec_block;
4632				goto out;
4633			}
4634			goto next_page;
4635		} else {
4636			/*Find contiguous delayed buffers. */
4637			if (ret > 0 && pages[0]->index == last_offset)
4638				head = page_buffers(pages[0]);
4639			bh = head;
4640			index = 1;
4641			start_index = 0;
4642		}
4643
4644found_mapped_buffer:
4645		if (bh != NULL && buffer_delay(bh)) {
4646			/* 1st or contiguous delayed buffer found. */
4647			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4648				/*
4649				 * 1st delayed buffer found, record
4650				 * the start of extent.
4651				 */
4652				flags |= FIEMAP_EXTENT_DELALLOC;
4653				newex->ec_block = end;
4654				logical = (__u64)end << blksize_bits;
4655			}
4656			/* Find contiguous delayed buffers. */
4657			do {
4658				if (!buffer_delay(bh))
4659					goto found_delayed_extent;
4660				bh = bh->b_this_page;
4661				end++;
4662			} while (bh != head);
4663
4664			for (; index < ret; index++) {
4665				if (!page_has_buffers(pages[index])) {
4666					bh = NULL;
4667					break;
4668				}
4669				head = page_buffers(pages[index]);
4670				if (!head) {
4671					bh = NULL;
4672					break;
4673				}
4674
4675				if (pages[index]->index !=
4676				    pages[start_index]->index + index
4677				    - start_index) {
4678					/* Blocks are not contiguous. */
4679					bh = NULL;
4680					break;
4681				}
4682				bh = head;
4683				do {
4684					if (!buffer_delay(bh))
4685						/* Delayed-extent ends. */
4686						goto found_delayed_extent;
4687					bh = bh->b_this_page;
4688					end++;
4689				} while (bh != head);
4690			}
4691		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4692			/* a hole found. */
4693			goto out;
4694
4695found_delayed_extent:
4696		newex->ec_len = min(end - newex->ec_block,
4697						(ext4_lblk_t)EXT_INIT_MAX_LEN);
4698		if (ret == nr_pages && bh != NULL &&
4699			newex->ec_len < EXT_INIT_MAX_LEN &&
4700			buffer_delay(bh)) {
4701			/* Have not collected an extent and continue. */
4702			for (index = 0; index < ret; index++)
4703				page_cache_release(pages[index]);
4704			goto repeat;
4705		}
4706
4707		for (index = 0; index < ret; index++)
4708			page_cache_release(pages[index]);
4709		kfree(pages);
4710	}
4711
4712	physical = (__u64)newex->ec_start << blksize_bits;
4713	length =   (__u64)newex->ec_len << blksize_bits;
4714
4715	if (ex && ext4_ext_is_uninitialized(ex))
4716		flags |= FIEMAP_EXTENT_UNWRITTEN;
4717
4718	if (next == EXT_MAX_BLOCKS)
4719		flags |= FIEMAP_EXTENT_LAST;
4720
4721	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4722					length, flags);
4723	if (ret < 0)
4724		return ret;
4725	if (ret == 1)
4726		return EXT_BREAK;
4727	return EXT_CONTINUE;
4728}
4729/* fiemap flags we can handle specified here */
4730#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4731
4732static int ext4_xattr_fiemap(struct inode *inode,
4733				struct fiemap_extent_info *fieinfo)
4734{
4735	__u64 physical = 0;
4736	__u64 length;
4737	__u32 flags = FIEMAP_EXTENT_LAST;
4738	int blockbits = inode->i_sb->s_blocksize_bits;
4739	int error = 0;
4740
4741	/* in-inode? */
4742	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4743		struct ext4_iloc iloc;
4744		int offset;	/* offset of xattr in inode */
4745
4746		error = ext4_get_inode_loc(inode, &iloc);
4747		if (error)
4748			return error;
4749		physical = iloc.bh->b_blocknr << blockbits;
4750		offset = EXT4_GOOD_OLD_INODE_SIZE +
4751				EXT4_I(inode)->i_extra_isize;
4752		physical += offset;
4753		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4754		flags |= FIEMAP_EXTENT_DATA_INLINE;
4755		brelse(iloc.bh);
4756	} else { /* external block */
4757		physical = EXT4_I(inode)->i_file_acl << blockbits;
4758		length = inode->i_sb->s_blocksize;
4759	}
4760
4761	if (physical)
4762		error = fiemap_fill_next_extent(fieinfo, 0, physical,
4763						length, flags);
4764	return (error < 0 ? error : 0);
4765}
4766
4767/*
4768 * ext4_ext_punch_hole
4769 *
4770 * Punches a hole of "length" bytes in a file starting
4771 * at byte "offset"
4772 *
4773 * @inode:  The inode of the file to punch a hole in
4774 * @offset: The starting byte offset of the hole
4775 * @length: The length of the hole
4776 *
4777 * Returns the number of blocks removed or negative on err
4778 */
4779int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4780{
4781	struct inode *inode = file->f_path.dentry->d_inode;
4782	struct super_block *sb = inode->i_sb;
4783	ext4_lblk_t first_block, stop_block;
4784	struct address_space *mapping = inode->i_mapping;
4785	handle_t *handle;
4786	loff_t first_page, last_page, page_len;
4787	loff_t first_page_offset, last_page_offset;
4788	int credits, err = 0;
4789
4790	/* No need to punch hole beyond i_size */
4791	if (offset >= inode->i_size)
4792		return 0;
4793
4794	/*
4795	 * If the hole extends beyond i_size, set the hole
4796	 * to end after the page that contains i_size
4797	 */
4798	if (offset + length > inode->i_size) {
4799		length = inode->i_size +
4800		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4801		   offset;
4802	}
4803
4804	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4805	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4806
4807	first_page_offset = first_page << PAGE_CACHE_SHIFT;
4808	last_page_offset = last_page << PAGE_CACHE_SHIFT;
4809
4810	/*
4811	 * Write out all dirty pages to avoid race conditions
4812	 * Then release them.
4813	 */
4814	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4815		err = filemap_write_and_wait_range(mapping,
4816			offset, offset + length - 1);
4817
4818		if (err)
4819			return err;
4820	}
4821
4822	/* Now release the pages */
4823	if (last_page_offset > first_page_offset) {
4824		truncate_pagecache_range(inode, first_page_offset,
4825					 last_page_offset - 1);
4826	}
4827
4828	/* finish any pending end_io work */
4829	ext4_flush_completed_IO(inode);
4830
4831	credits = ext4_writepage_trans_blocks(inode);
4832	handle = ext4_journal_start(inode, credits);
4833	if (IS_ERR(handle))
4834		return PTR_ERR(handle);
4835
4836
4837	/*
4838	 * Now we need to zero out the non-page-aligned data in the
4839	 * pages at the start and tail of the hole, and unmap the buffer
4840	 * heads for the block aligned regions of the page that were
4841	 * completely zeroed.
4842	 */
4843	if (first_page > last_page) {
4844		/*
4845		 * If the file space being truncated is contained within a page
4846		 * just zero out and unmap the middle of that page
4847		 */
4848		err = ext4_discard_partial_page_buffers(handle,
4849			mapping, offset, length, 0);
4850
4851		if (err)
4852			goto out;
4853	} else {
4854		/*
4855		 * zero out and unmap the partial page that contains
4856		 * the start of the hole
4857		 */
4858		page_len  = first_page_offset - offset;
4859		if (page_len > 0) {
4860			err = ext4_discard_partial_page_buffers(handle, mapping,
4861						   offset, page_len, 0);
4862			if (err)
4863				goto out;
4864		}
4865
4866		/*
4867		 * zero out and unmap the partial page that contains
4868		 * the end of the hole
4869		 */
4870		page_len = offset + length - last_page_offset;
4871		if (page_len > 0) {
4872			err = ext4_discard_partial_page_buffers(handle, mapping,
4873					last_page_offset, page_len, 0);
4874			if (err)
4875				goto out;
4876		}
4877	}
4878
4879	/*
4880	 * If i_size is contained in the last page, we need to
4881	 * unmap and zero the partial page after i_size
4882	 */
4883	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4884	   inode->i_size % PAGE_CACHE_SIZE != 0) {
4885
4886		page_len = PAGE_CACHE_SIZE -
4887			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4888
4889		if (page_len > 0) {
4890			err = ext4_discard_partial_page_buffers(handle,
4891			  mapping, inode->i_size, page_len, 0);
4892
4893			if (err)
4894				goto out;
4895		}
4896	}
4897
4898	first_block = (offset + sb->s_blocksize - 1) >>
4899		EXT4_BLOCK_SIZE_BITS(sb);
4900	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4901
4902	/* If there are no blocks to remove, return now */
4903	if (first_block >= stop_block)
4904		goto out;
4905
4906	down_write(&EXT4_I(inode)->i_data_sem);
4907	ext4_ext_invalidate_cache(inode);
4908	ext4_discard_preallocations(inode);
4909
4910	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
4911
4912	ext4_ext_invalidate_cache(inode);
4913	ext4_discard_preallocations(inode);
4914
4915	if (IS_SYNC(inode))
4916		ext4_handle_sync(handle);
4917
4918	up_write(&EXT4_I(inode)->i_data_sem);
4919
4920out:
4921	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4922	ext4_mark_inode_dirty(handle, inode);
4923	ext4_journal_stop(handle);
4924	return err;
4925}
4926int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4927		__u64 start, __u64 len)
4928{
4929	ext4_lblk_t start_blk;
4930	int error = 0;
4931
4932	/* fallback to generic here if not in extents fmt */
4933	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4934		return generic_block_fiemap(inode, fieinfo, start, len,
4935			ext4_get_block);
4936
4937	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
4938		return -EBADR;
4939
4940	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4941		error = ext4_xattr_fiemap(inode, fieinfo);
4942	} else {
4943		ext4_lblk_t len_blks;
4944		__u64 last_blk;
4945
4946		start_blk = start >> inode->i_sb->s_blocksize_bits;
4947		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4948		if (last_blk >= EXT_MAX_BLOCKS)
4949			last_blk = EXT_MAX_BLOCKS-1;
4950		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4951
4952		/*
4953		 * Walk the extent tree gathering extent information.
4954		 * ext4_ext_fiemap_cb will push extents back to user.
4955		 */
4956		error = ext4_ext_walk_space(inode, start_blk, len_blks,
4957					  ext4_ext_fiemap_cb, fieinfo);
4958	}
4959
4960	return error;
4961}
4962