extents.c revision ecb94f5fdf4b72547fca022421a9dca1672bddd4
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/fs.h>
33#include <linux/time.h>
34#include <linux/jbd2.h>
35#include <linux/highuid.h>
36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
38#include <linux/string.h>
39#include <linux/slab.h>
40#include <linux/falloc.h>
41#include <asm/uaccess.h>
42#include <linux/fiemap.h>
43#include "ext4_jbd2.h"
44
45#include <trace/events/ext4.h>
46
47/*
48 * used by extent splitting.
49 */
50#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
51					due to ENOSPC */
52#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
53#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
54
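/*
 * The checksum covers the extent block from the start of the header up
 * to (but not including) the struct ext4_extent_tail located at
 * EXT4_EXTENT_TAIL_OFFSET(eh), seeded with the inode's checksum seed.
 */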
55static __le32 ext4_extent_block_csum(struct inode *inode,
56				     struct ext4_extent_header *eh)
57{
58	struct ext4_inode_info *ei = EXT4_I(inode);
59	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
60	__u32 csum;
61
62	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
63			   EXT4_EXTENT_TAIL_OFFSET(eh));
64	return cpu_to_le32(csum);
65}
66
67static int ext4_extent_block_csum_verify(struct inode *inode,
68					 struct ext4_extent_header *eh)
69{
70	struct ext4_extent_tail *et;
71
72	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
73		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
74		return 1;
75
76	et = find_ext4_extent_tail(eh);
77	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
78		return 0;
79	return 1;
80}
81
82static void ext4_extent_block_csum_set(struct inode *inode,
83				       struct ext4_extent_header *eh)
84{
85	struct ext4_extent_tail *et;
86
87	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
88		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
89		return;
90
91	et = find_ext4_extent_tail(eh);
92	et->et_checksum = ext4_extent_block_csum(inode, eh);
93}
94
95static int ext4_split_extent(handle_t *handle,
96				struct inode *inode,
97				struct ext4_ext_path *path,
98				struct ext4_map_blocks *map,
99				int split_flag,
100				int flags);
101
102static int ext4_split_extent_at(handle_t *handle,
103			     struct inode *inode,
104			     struct ext4_ext_path *path,
105			     ext4_lblk_t split,
106			     int split_flag,
107			     int flags);
108
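/*
 * Make sure the running handle has at least @needed credits: return 0 if
 * it already does (or when no journal is in use), try to extend it
 * otherwise, and restart the transaction when extending fails.  Returns
 * -EAGAIN after a restart, so the caller knows to re-read its path, or a
 * negative error code on failure.
 */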
109static int ext4_ext_truncate_extend_restart(handle_t *handle,
110					    struct inode *inode,
111					    int needed)
112{
113	int err;
114
115	if (!ext4_handle_valid(handle))
116		return 0;
117	if (handle->h_buffer_credits > needed)
118		return 0;
119	err = ext4_journal_extend(handle, needed);
120	if (err <= 0)
121		return err;
122	err = ext4_truncate_restart_trans(handle, inode, needed);
123	if (err == 0)
124		err = -EAGAIN;
125
126	return err;
127}
128
129/*
130 * could return:
131 *  - EROFS
132 *  - ENOMEM
133 */
134static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
135				struct ext4_ext_path *path)
136{
137	if (path->p_bh) {
138		/* path points to block */
139		return ext4_journal_get_write_access(handle, path->p_bh);
140	}
141	/* path points to leaf/index in inode body */
142	/* we use in-core data, no need to protect them */
143	return 0;
144}
145
146/*
147 * could return:
148 *  - EROFS
149 *  - ENOMEM
150 *  - EIO
151 */
152#define ext4_ext_dirty(handle, inode, path) \
153		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
154static int __ext4_ext_dirty(const char *where, unsigned int line,
155			    handle_t *handle, struct inode *inode,
156			    struct ext4_ext_path *path)
157{
158	int err;
159	if (path->p_bh) {
160		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
161		/* path points to block */
162		err = __ext4_handle_dirty_metadata(where, line, handle,
163						   inode, path->p_bh);
164	} else {
165		/* path points to leaf/index in inode body */
166		err = ext4_mark_inode_dirty(handle, inode);
167	}
168	return err;
169}
170
171static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
172			      struct ext4_ext_path *path,
173			      ext4_lblk_t block)
174{
175	if (path) {
176		int depth = path->p_depth;
177		struct ext4_extent *ex;
178
179		/*
180		 * Try to predict block placement assuming that we are
181		 * filling in a file which will eventually be
182		 * non-sparse --- i.e., in the case of libbfd writing
183		 * an ELF object's sections out-of-order but in a way
184		 * that eventually results in a contiguous object or
185		 * executable file, or some database extending a table
186		 * space file.  However, this is actually somewhat
187		 * non-ideal if we are writing a sparse file such as
188		 * qemu or KVM writing a raw image file that is going
189		 * to stay fairly sparse, since it will end up
190		 * fragmenting the file system's free space.  Maybe we
191		 * should have some heuristics or some way to allow
192		 * userspace to pass a hint to the file system,
193		 * especially if the latter case turns out to be
194		 * common.
195		 */
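		/*
		 * For example, if the nearest extent maps logical block
		 * 100 to physical block 5000 and we are placing logical
		 * block 120, the goal computed below is 5000 + 20 = 5020.
		 */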
196		ex = path[depth].p_ext;
197		if (ex) {
198			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
199			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
200
201			if (block > ext_block)
202				return ext_pblk + (block - ext_block);
203			else
204				return ext_pblk - (ext_block - block);
205		}
206
207		/* it looks like the index is empty;
208		 * try to find the starting block from the index itself */
209		if (path[depth].p_bh)
210			return path[depth].p_bh->b_blocknr;
211	}
212
213	/* OK. use inode's group */
214	return ext4_inode_to_goal_block(inode);
215}
216
217/*
218 * Allocation for a meta data block
219 */
220static ext4_fsblk_t
221ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
222			struct ext4_ext_path *path,
223			struct ext4_extent *ex, int *err, unsigned int flags)
224{
225	ext4_fsblk_t goal, newblock;
226
227	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
228	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
229					NULL, err);
230	return newblock;
231}
232
233static inline int ext4_ext_space_block(struct inode *inode, int check)
234{
235	int size;
236
237	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
238			/ sizeof(struct ext4_extent);
239#ifdef AGGRESSIVE_TEST
240	if (!check && size > 6)
241		size = 6;
242#endif
243	return size;
244}
245
246static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
247{
248	int size;
249
250	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
251			/ sizeof(struct ext4_extent_idx);
252#ifdef AGGRESSIVE_TEST
253	if (!check && size > 5)
254		size = 5;
255#endif
256	return size;
257}
258
259static inline int ext4_ext_space_root(struct inode *inode, int check)
260{
261	int size;
262
263	size = sizeof(EXT4_I(inode)->i_data);
264	size -= sizeof(struct ext4_extent_header);
265	size /= sizeof(struct ext4_extent);
266#ifdef AGGRESSIVE_TEST
267	if (!check && size > 3)
268		size = 3;
269#endif
270	return size;
271}
272
273static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
274{
275	int size;
276
277	size = sizeof(EXT4_I(inode)->i_data);
278	size -= sizeof(struct ext4_extent_header);
279	size /= sizeof(struct ext4_extent_idx);
280#ifdef AGGRESSIVE_TEST
281	if (!check && size > 4)
282		size = 4;
283#endif
284	return size;
285}
286
287/*
288 * Calculate the number of metadata blocks needed
289 * to allocate @blocks
290 * Worst case is one block per extent
291 */
292int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
293{
294	struct ext4_inode_info *ei = EXT4_I(inode);
295	int idxs;
296
297	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
298		/ sizeof(struct ext4_extent_idx));
299
300	/*
301	 * If the new delayed allocation block is contiguous with the
302	 * previous da block, it can share index blocks with the
303	 * previous block, so we only need to allocate a new index
304	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
305	 * an additional index block, and at idxs**3 blocks, yet
306	 * another index block.
307	 */
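	/*
	 * For example, with a 4 KiB block size idxs is
	 * (4096 - 12) / 12 = 340, so roughly one new index block is
	 * charged for every 340 contiguous delayed-allocation blocks.
	 */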
308	if (ei->i_da_metadata_calc_len &&
309	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
310		int num = 0;
311
312		if ((ei->i_da_metadata_calc_len % idxs) == 0)
313			num++;
314		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
315			num++;
316		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
317			num++;
318			ei->i_da_metadata_calc_len = 0;
319		} else
320			ei->i_da_metadata_calc_len++;
321		ei->i_da_metadata_calc_last_lblock++;
322		return num;
323	}
324
325	/*
326	 * In the worst case we need a new set of index blocks at
327	 * every level of the inode's extent tree.
328	 */
329	ei->i_da_metadata_calc_len = 1;
330	ei->i_da_metadata_calc_last_lblock = lblock;
331	return ext_depth(inode) + 1;
332}
333
334static int
335ext4_ext_max_entries(struct inode *inode, int depth)
336{
337	int max;
338
339	if (depth == ext_depth(inode)) {
340		if (depth == 0)
341			max = ext4_ext_space_root(inode, 1);
342		else
343			max = ext4_ext_space_root_idx(inode, 1);
344	} else {
345		if (depth == 0)
346			max = ext4_ext_space_block(inode, 1);
347		else
348			max = ext4_ext_space_block_idx(inode, 1);
349	}
350
351	return max;
352}
353
354static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
355{
356	ext4_fsblk_t block = ext4_ext_pblock(ext);
357	int len = ext4_ext_get_actual_len(ext);
358
359	if (len == 0)
360		return 0;
361	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
362}
363
364static int ext4_valid_extent_idx(struct inode *inode,
365				struct ext4_extent_idx *ext_idx)
366{
367	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
368
369	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
370}
371
372static int ext4_valid_extent_entries(struct inode *inode,
373				struct ext4_extent_header *eh,
374				int depth)
375{
376	unsigned short entries;
377	if (eh->eh_entries == 0)
378		return 1;
379
380	entries = le16_to_cpu(eh->eh_entries);
381
382	if (depth == 0) {
383		/* leaf entries */
384		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
385		while (entries) {
386			if (!ext4_valid_extent(inode, ext))
387				return 0;
388			ext++;
389			entries--;
390		}
391	} else {
392		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
393		while (entries) {
394			if (!ext4_valid_extent_idx(inode, ext_idx))
395				return 0;
396			ext_idx++;
397			entries--;
398		}
399	}
400	return 1;
401}
402
403static int __ext4_ext_check(const char *function, unsigned int line,
404			    struct inode *inode, struct ext4_extent_header *eh,
405			    int depth)
406{
407	const char *error_msg;
408	int max = 0;
409
410	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
411		error_msg = "invalid magic";
412		goto corrupted;
413	}
414	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
415		error_msg = "unexpected eh_depth";
416		goto corrupted;
417	}
418	if (unlikely(eh->eh_max == 0)) {
419		error_msg = "invalid eh_max";
420		goto corrupted;
421	}
422	max = ext4_ext_max_entries(inode, depth);
423	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
424		error_msg = "too large eh_max";
425		goto corrupted;
426	}
427	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
428		error_msg = "invalid eh_entries";
429		goto corrupted;
430	}
431	if (!ext4_valid_extent_entries(inode, eh, depth)) {
432		error_msg = "invalid extent entries";
433		goto corrupted;
434	}
435	/* Verify checksum on non-root extent tree nodes */
436	if (ext_depth(inode) != depth &&
437	    !ext4_extent_block_csum_verify(inode, eh)) {
438		error_msg = "extent tree corrupted";
439		goto corrupted;
440	}
441	return 0;
442
443corrupted:
444	ext4_error_inode(inode, function, line, 0,
445			"bad header/extent: %s - magic %x, "
446			"entries %u, max %u(%u), depth %u(%u)",
447			error_msg, le16_to_cpu(eh->eh_magic),
448			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
449			max, le16_to_cpu(eh->eh_depth), depth);
450
451	return -EIO;
452}
453
454#define ext4_ext_check(inode, eh, depth)	\
455	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
456
457int ext4_ext_check_inode(struct inode *inode)
458{
459	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
460}
461
462static int __ext4_ext_check_block(const char *function, unsigned int line,
463				  struct inode *inode,
464				  struct ext4_extent_header *eh,
465				  int depth,
466				  struct buffer_head *bh)
467{
468	int ret;
469
470	if (buffer_verified(bh))
471		return 0;
472	ret = ext4_ext_check(inode, eh, depth);
473	if (ret)
474		return ret;
475	set_buffer_verified(bh);
476	return ret;
477}
478
479#define ext4_ext_check_block(inode, eh, depth, bh)	\
480	__ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
481
482#ifdef EXT_DEBUG
483static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
484{
485	int k, l = path->p_depth;
486
487	ext_debug("path:");
488	for (k = 0; k <= l; k++, path++) {
489		if (path->p_idx) {
490		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
491			    ext4_idx_pblock(path->p_idx));
492		} else if (path->p_ext) {
493			ext_debug("  %d:[%d]%d:%llu ",
494				  le32_to_cpu(path->p_ext->ee_block),
495				  ext4_ext_is_uninitialized(path->p_ext),
496				  ext4_ext_get_actual_len(path->p_ext),
497				  ext4_ext_pblock(path->p_ext));
498		} else
499			ext_debug("  []");
500	}
501	ext_debug("\n");
502}
503
504static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
505{
506	int depth = ext_depth(inode);
507	struct ext4_extent_header *eh;
508	struct ext4_extent *ex;
509	int i;
510
511	if (!path)
512		return;
513
514	eh = path[depth].p_hdr;
515	ex = EXT_FIRST_EXTENT(eh);
516
517	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
518
519	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
520		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
521			  ext4_ext_is_uninitialized(ex),
522			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
523	}
524	ext_debug("\n");
525}
526
527static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
528			ext4_fsblk_t newblock, int level)
529{
530	int depth = ext_depth(inode);
531	struct ext4_extent *ex;
532
533	if (depth != level) {
534		struct ext4_extent_idx *idx;
535		idx = path[level].p_idx;
536		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
537			ext_debug("%d: move %d:%llu in new index %llu\n", level,
538					le32_to_cpu(idx->ei_block),
539					ext4_idx_pblock(idx),
540					newblock);
541			idx++;
542		}
543
544		return;
545	}
546
547	ex = path[depth].p_ext;
548	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
549		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
550				le32_to_cpu(ex->ee_block),
551				ext4_ext_pblock(ex),
552				ext4_ext_is_uninitialized(ex),
553				ext4_ext_get_actual_len(ex),
554				newblock);
555		ex++;
556	}
557}
558
559#else
560#define ext4_ext_show_path(inode, path)
561#define ext4_ext_show_leaf(inode, path)
562#define ext4_ext_show_move(inode, path, newblock, level)
563#endif
564
565void ext4_ext_drop_refs(struct ext4_ext_path *path)
566{
567	int depth = path->p_depth;
568	int i;
569
570	for (i = 0; i <= depth; i++, path++)
571		if (path->p_bh) {
572			brelse(path->p_bh);
573			path->p_bh = NULL;
574		}
575}
576
577/*
578 * ext4_ext_binsearch_idx:
579 * binary search for the closest index of the given block
580 * the header must be checked before calling this
581 */
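/*
 * On return path->p_idx points at the last index whose ei_block is less
 * than or equal to @block (or at the first index when @block lies before
 * all of them).
 */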
582static void
583ext4_ext_binsearch_idx(struct inode *inode,
584			struct ext4_ext_path *path, ext4_lblk_t block)
585{
586	struct ext4_extent_header *eh = path->p_hdr;
587	struct ext4_extent_idx *r, *l, *m;
588
589
590	ext_debug("binsearch for %u(idx):  ", block);
591
592	l = EXT_FIRST_INDEX(eh) + 1;
593	r = EXT_LAST_INDEX(eh);
594	while (l <= r) {
595		m = l + (r - l) / 2;
596		if (block < le32_to_cpu(m->ei_block))
597			r = m - 1;
598		else
599			l = m + 1;
600		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
601				m, le32_to_cpu(m->ei_block),
602				r, le32_to_cpu(r->ei_block));
603	}
604
605	path->p_idx = l - 1;
606	ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
607		  ext4_idx_pblock(path->p_idx));
608
609#ifdef CHECK_BINSEARCH
610	{
611		struct ext4_extent_idx *chix, *ix;
612		int k;
613
614		chix = ix = EXT_FIRST_INDEX(eh);
615		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
616		  if (k != 0 &&
617		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
618				printk(KERN_DEBUG "k=%d, ix=0x%p, "
619				       "first=0x%p\n", k,
620				       ix, EXT_FIRST_INDEX(eh));
621				printk(KERN_DEBUG "%u <= %u\n",
622				       le32_to_cpu(ix->ei_block),
623				       le32_to_cpu(ix[-1].ei_block));
624			}
625			BUG_ON(k && le32_to_cpu(ix->ei_block)
626					   <= le32_to_cpu(ix[-1].ei_block));
627			if (block < le32_to_cpu(ix->ei_block))
628				break;
629			chix = ix;
630		}
631		BUG_ON(chix != path->p_idx);
632	}
633#endif
634
635}
636
637/*
638 * ext4_ext_binsearch:
639 * binary search for closest extent of the given block
640 * the header must be checked before calling this
641 */
642static void
643ext4_ext_binsearch(struct inode *inode,
644		struct ext4_ext_path *path, ext4_lblk_t block)
645{
646	struct ext4_extent_header *eh = path->p_hdr;
647	struct ext4_extent *r, *l, *m;
648
649	if (eh->eh_entries == 0) {
650		/*
651		 * this leaf is empty:
652		 * we get such a leaf in split/add case
653		 */
654		return;
655	}
656
657	ext_debug("binsearch for %u:  ", block);
658
659	l = EXT_FIRST_EXTENT(eh) + 1;
660	r = EXT_LAST_EXTENT(eh);
661
662	while (l <= r) {
663		m = l + (r - l) / 2;
664		if (block < le32_to_cpu(m->ee_block))
665			r = m - 1;
666		else
667			l = m + 1;
668		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
669				m, le32_to_cpu(m->ee_block),
670				r, le32_to_cpu(r->ee_block));
671	}
672
673	path->p_ext = l - 1;
674	ext_debug("  -> %d:%llu:[%d]%d ",
675			le32_to_cpu(path->p_ext->ee_block),
676			ext4_ext_pblock(path->p_ext),
677			ext4_ext_is_uninitialized(path->p_ext),
678			ext4_ext_get_actual_len(path->p_ext));
679
680#ifdef CHECK_BINSEARCH
681	{
682		struct ext4_extent *chex, *ex;
683		int k;
684
685		chex = ex = EXT_FIRST_EXTENT(eh);
686		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
687			BUG_ON(k && le32_to_cpu(ex->ee_block)
688					  <= le32_to_cpu(ex[-1].ee_block));
689			if (block < le32_to_cpu(ex->ee_block))
690				break;
691			chex = ex;
692		}
693		BUG_ON(chex != path->p_ext);
694	}
695#endif
696
697}
698
699int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
700{
701	struct ext4_extent_header *eh;
702
703	eh = ext_inode_hdr(inode);
704	eh->eh_depth = 0;
705	eh->eh_entries = 0;
706	eh->eh_magic = EXT4_EXT_MAGIC;
707	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
708	ext4_mark_inode_dirty(handle, inode);
709	ext4_ext_invalidate_cache(inode);
710	return 0;
711}
712
713struct ext4_ext_path *
714ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
715					struct ext4_ext_path *path)
716{
717	struct ext4_extent_header *eh;
718	struct buffer_head *bh;
719	short int depth, i, ppos = 0, alloc = 0;
720
721	eh = ext_inode_hdr(inode);
722	depth = ext_depth(inode);
723
724	/* account possible depth increase */
725	if (!path) {
726		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
727				GFP_NOFS);
728		if (!path)
729			return ERR_PTR(-ENOMEM);
730		alloc = 1;
731	}
732	path[0].p_hdr = eh;
733	path[0].p_bh = NULL;
734
735	i = depth;
736	/* walk through the tree */
737	while (i) {
738		ext_debug("depth %d: num %d, max %d\n",
739			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
740
741		ext4_ext_binsearch_idx(inode, path + ppos, block);
742		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
743		path[ppos].p_depth = i;
744		path[ppos].p_ext = NULL;
745
746		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
747		if (unlikely(!bh))
748			goto err;
749		if (!bh_uptodate_or_lock(bh)) {
750			trace_ext4_ext_load_extent(inode, block,
751						path[ppos].p_block);
752			if (bh_submit_read(bh) < 0) {
753				put_bh(bh);
754				goto err;
755			}
756		}
757		eh = ext_block_hdr(bh);
758		ppos++;
759		if (unlikely(ppos > depth)) {
760			put_bh(bh);
761			EXT4_ERROR_INODE(inode,
762					 "ppos %d > depth %d", ppos, depth);
763			goto err;
764		}
765		path[ppos].p_bh = bh;
766		path[ppos].p_hdr = eh;
767		i--;
768
769		if (ext4_ext_check_block(inode, eh, i, bh))
770			goto err;
771	}
772
773	path[ppos].p_depth = i;
774	path[ppos].p_ext = NULL;
775	path[ppos].p_idx = NULL;
776
777	/* find extent */
778	ext4_ext_binsearch(inode, path + ppos, block);
779	/* if not an empty leaf */
780	if (path[ppos].p_ext)
781		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
782
783	ext4_ext_show_path(inode, path);
784
785	return path;
786
787err:
788	ext4_ext_drop_refs(path);
789	if (alloc)
790		kfree(path);
791	return ERR_PTR(-EIO);
792}
793
794/*
795 * ext4_ext_insert_index:
796 * insert new index [@logical;@ptr] into the block at @curp;
797 * check where to insert: before @curp or after @curp
798 */
799static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
800				 struct ext4_ext_path *curp,
801				 int logical, ext4_fsblk_t ptr)
802{
803	struct ext4_extent_idx *ix;
804	int len, err;
805
806	err = ext4_ext_get_access(handle, inode, curp);
807	if (err)
808		return err;
809
810	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
811		EXT4_ERROR_INODE(inode,
812				 "logical %d == ei_block %d!",
813				 logical, le32_to_cpu(curp->p_idx->ei_block));
814		return -EIO;
815	}
816
817	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
818			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
819		EXT4_ERROR_INODE(inode,
820				 "eh_entries %d >= eh_max %d!",
821				 le16_to_cpu(curp->p_hdr->eh_entries),
822				 le16_to_cpu(curp->p_hdr->eh_max));
823		return -EIO;
824	}
825
826	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
827		/* insert after */
828		ext_debug("insert new index %d after: %llu\n", logical, ptr);
829		ix = curp->p_idx + 1;
830	} else {
831		/* insert before */
832		ext_debug("insert new index %d before: %llu\n", logical, ptr);
833		ix = curp->p_idx;
834	}
835
836	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
837	BUG_ON(len < 0);
838	if (len > 0) {
839		ext_debug("insert new index %d: "
840				"move %d indices from 0x%p to 0x%p\n",
841				logical, len, ix, ix + 1);
842		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
843	}
844
845	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
846		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
847		return -EIO;
848	}
849
850	ix->ei_block = cpu_to_le32(logical);
851	ext4_idx_store_pblock(ix, ptr);
852	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
853
854	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
855		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
856		return -EIO;
857	}
858
859	err = ext4_ext_dirty(handle, inode, curp);
860	ext4_std_error(inode->i_sb, err);
861
862	return err;
863}
864
865/*
866 * ext4_ext_split:
867 * inserts new subtree into the path, using free index entry
868 * at depth @at:
869 * - allocates all needed blocks (new leaf and all intermediate index blocks)
870 * - makes decision where to split
871 * - moves remaining extents and index entries (right to the split point)
872 *   into the newly allocated blocks
873 * - initializes subtree
874 */
875static int ext4_ext_split(handle_t *handle, struct inode *inode,
876			  unsigned int flags,
877			  struct ext4_ext_path *path,
878			  struct ext4_extent *newext, int at)
879{
880	struct buffer_head *bh = NULL;
881	int depth = ext_depth(inode);
882	struct ext4_extent_header *neh;
883	struct ext4_extent_idx *fidx;
884	int i = at, k, m, a;
885	ext4_fsblk_t newblock, oldblock;
886	__le32 border;
887	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
888	int err = 0;
889
890	/* make decision: where to split? */
891	/* FIXME: now decision is simplest: at current extent */
892
893	/* if current leaf will be split, then we should use
894	 * border from split point */
895	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
896		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
897		return -EIO;
898	}
899	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
900		border = path[depth].p_ext[1].ee_block;
901		ext_debug("leaf will be split."
902				" next leaf starts at %d\n",
903				  le32_to_cpu(border));
904	} else {
905		border = newext->ee_block;
906		ext_debug("leaf will be added."
907				" next leaf starts at %d\n",
908				le32_to_cpu(border));
909	}
910
911	/*
912	 * If an error occurs, we break processing
913	 * and mark the filesystem read-only. The index won't
914	 * be inserted and the tree will remain in a consistent
915	 * state. The next mount will repair the buffers too.
916	 */
917
918	/*
919	 * Get an array to track all allocated blocks.
920	 * We need this to handle errors and free the
921	 * blocks on failure.
922	 */
923	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
924	if (!ablocks)
925		return -ENOMEM;
926
927	/* allocate all needed blocks */
928	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
929	for (a = 0; a < depth - at; a++) {
930		newblock = ext4_ext_new_meta_block(handle, inode, path,
931						   newext, &err, flags);
932		if (newblock == 0)
933			goto cleanup;
934		ablocks[a] = newblock;
935	}
936
937	/* initialize new leaf */
938	newblock = ablocks[--a];
939	if (unlikely(newblock == 0)) {
940		EXT4_ERROR_INODE(inode, "newblock == 0!");
941		err = -EIO;
942		goto cleanup;
943	}
944	bh = sb_getblk(inode->i_sb, newblock);
945	if (!bh) {
946		err = -EIO;
947		goto cleanup;
948	}
949	lock_buffer(bh);
950
951	err = ext4_journal_get_create_access(handle, bh);
952	if (err)
953		goto cleanup;
954
955	neh = ext_block_hdr(bh);
956	neh->eh_entries = 0;
957	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
958	neh->eh_magic = EXT4_EXT_MAGIC;
959	neh->eh_depth = 0;
960
961	/* move remainder of path[depth] to the new leaf */
962	if (unlikely(path[depth].p_hdr->eh_entries !=
963		     path[depth].p_hdr->eh_max)) {
964		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
965				 path[depth].p_hdr->eh_entries,
966				 path[depth].p_hdr->eh_max);
967		err = -EIO;
968		goto cleanup;
969	}
970	/* start copy from next extent */
971	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
972	ext4_ext_show_move(inode, path, newblock, depth);
973	if (m) {
974		struct ext4_extent *ex;
975		ex = EXT_FIRST_EXTENT(neh);
976		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
977		le16_add_cpu(&neh->eh_entries, m);
978	}
979
980	ext4_extent_block_csum_set(inode, neh);
981	set_buffer_uptodate(bh);
982	unlock_buffer(bh);
983
984	err = ext4_handle_dirty_metadata(handle, inode, bh);
985	if (err)
986		goto cleanup;
987	brelse(bh);
988	bh = NULL;
989
990	/* correct old leaf */
991	if (m) {
992		err = ext4_ext_get_access(handle, inode, path + depth);
993		if (err)
994			goto cleanup;
995		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
996		err = ext4_ext_dirty(handle, inode, path + depth);
997		if (err)
998			goto cleanup;
999
1000	}
1001
1002	/* create intermediate indexes */
1003	k = depth - at - 1;
1004	if (unlikely(k < 0)) {
1005		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1006		err = -EIO;
1007		goto cleanup;
1008	}
1009	if (k)
1010		ext_debug("create %d intermediate indices\n", k);
1011	/* insert new index into current index block */
1012	/* current depth stored in i var */
1013	i = depth - 1;
1014	while (k--) {
1015		oldblock = newblock;
1016		newblock = ablocks[--a];
1017		bh = sb_getblk(inode->i_sb, newblock);
1018		if (!bh) {
1019			err = -EIO;
1020			goto cleanup;
1021		}
1022		lock_buffer(bh);
1023
1024		err = ext4_journal_get_create_access(handle, bh);
1025		if (err)
1026			goto cleanup;
1027
1028		neh = ext_block_hdr(bh);
1029		neh->eh_entries = cpu_to_le16(1);
1030		neh->eh_magic = EXT4_EXT_MAGIC;
1031		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1032		neh->eh_depth = cpu_to_le16(depth - i);
1033		fidx = EXT_FIRST_INDEX(neh);
1034		fidx->ei_block = border;
1035		ext4_idx_store_pblock(fidx, oldblock);
1036
1037		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1038				i, newblock, le32_to_cpu(border), oldblock);
1039
1040		/* move remainder of path[i] to the new index block */
1041		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1042					EXT_LAST_INDEX(path[i].p_hdr))) {
1043			EXT4_ERROR_INODE(inode,
1044					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1045					 le32_to_cpu(path[i].p_ext->ee_block));
1046			err = -EIO;
1047			goto cleanup;
1048		}
1049		/* start copy indexes */
1050		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1051		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1052				EXT_MAX_INDEX(path[i].p_hdr));
1053		ext4_ext_show_move(inode, path, newblock, i);
1054		if (m) {
1055			memmove(++fidx, path[i].p_idx,
1056				sizeof(struct ext4_extent_idx) * m);
1057			le16_add_cpu(&neh->eh_entries, m);
1058		}
1059		ext4_extent_block_csum_set(inode, neh);
1060		set_buffer_uptodate(bh);
1061		unlock_buffer(bh);
1062
1063		err = ext4_handle_dirty_metadata(handle, inode, bh);
1064		if (err)
1065			goto cleanup;
1066		brelse(bh);
1067		bh = NULL;
1068
1069		/* correct old index */
1070		if (m) {
1071			err = ext4_ext_get_access(handle, inode, path + i);
1072			if (err)
1073				goto cleanup;
1074			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1075			err = ext4_ext_dirty(handle, inode, path + i);
1076			if (err)
1077				goto cleanup;
1078		}
1079
1080		i--;
1081	}
1082
1083	/* insert new index */
1084	err = ext4_ext_insert_index(handle, inode, path + at,
1085				    le32_to_cpu(border), newblock);
1086
1087cleanup:
1088	if (bh) {
1089		if (buffer_locked(bh))
1090			unlock_buffer(bh);
1091		brelse(bh);
1092	}
1093
1094	if (err) {
1095		/* free all allocated blocks in error case */
1096		for (i = 0; i < depth; i++) {
1097			if (!ablocks[i])
1098				continue;
1099			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1100					 EXT4_FREE_BLOCKS_METADATA);
1101		}
1102	}
1103	kfree(ablocks);
1104
1105	return err;
1106}
1107
1108/*
1109 * ext4_ext_grow_indepth:
1110 * implements tree growing procedure:
1111 * - allocates new block
1112 * - moves top-level data (index block or leaf) into the new block
1113 * - initializes new top-level, creating index that points to the
1114 *   just created block
1115 */
1116static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1117				 unsigned int flags,
1118				 struct ext4_extent *newext)
1119{
1120	struct ext4_extent_header *neh;
1121	struct buffer_head *bh;
1122	ext4_fsblk_t newblock;
1123	int err = 0;
1124
1125	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1126		newext, &err, flags);
1127	if (newblock == 0)
1128		return err;
1129
1130	bh = sb_getblk(inode->i_sb, newblock);
1131	if (!bh) {
1132		err = -EIO;
1133		ext4_std_error(inode->i_sb, err);
1134		return err;
1135	}
1136	lock_buffer(bh);
1137
1138	err = ext4_journal_get_create_access(handle, bh);
1139	if (err) {
1140		unlock_buffer(bh);
1141		goto out;
1142	}
1143
1144	/* move top-level index/leaf into new block */
1145	memmove(bh->b_data, EXT4_I(inode)->i_data,
1146		sizeof(EXT4_I(inode)->i_data));
1147
1148	/* set size of new block */
1149	neh = ext_block_hdr(bh);
1150	/* old root could have indexes or leaves
1151	 * so calculate eh_max the right way */
1152	if (ext_depth(inode))
1153		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1154	else
1155		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1156	neh->eh_magic = EXT4_EXT_MAGIC;
1157	ext4_extent_block_csum_set(inode, neh);
1158	set_buffer_uptodate(bh);
1159	unlock_buffer(bh);
1160
1161	err = ext4_handle_dirty_metadata(handle, inode, bh);
1162	if (err)
1163		goto out;
1164
1165	/* Update top-level index: num,max,pointer */
1166	neh = ext_inode_hdr(inode);
1167	neh->eh_entries = cpu_to_le16(1);
1168	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1169	if (neh->eh_depth == 0) {
1170		/* Root extent block becomes index block */
1171		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1172		EXT_FIRST_INDEX(neh)->ei_block =
1173			EXT_FIRST_EXTENT(neh)->ee_block;
1174	}
1175	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1176		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1177		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1178		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1179
1180	neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
1181	ext4_mark_inode_dirty(handle, inode);
1182out:
1183	brelse(bh);
1184
1185	return err;
1186}
1187
1188/*
1189 * ext4_ext_create_new_leaf:
1190 * finds empty index and adds new leaf.
1191 * if no free index is found, then it requests in-depth growing.
1192 */
1193static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1194				    unsigned int flags,
1195				    struct ext4_ext_path *path,
1196				    struct ext4_extent *newext)
1197{
1198	struct ext4_ext_path *curp;
1199	int depth, i, err = 0;
1200
1201repeat:
1202	i = depth = ext_depth(inode);
1203
1204	/* walk up the tree and look for a free index entry */
1205	curp = path + depth;
1206	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1207		i--;
1208		curp--;
1209	}
1210
1211	/* we use already allocated block for index block,
1212	 * so subsequent data blocks should be contiguous */
1213	if (EXT_HAS_FREE_INDEX(curp)) {
1214		/* if we found index with free entry, then use that
1215		 * entry: create all needed subtree and add new leaf */
1216		err = ext4_ext_split(handle, inode, flags, path, newext, i);
1217		if (err)
1218			goto out;
1219
1220		/* refill path */
1221		ext4_ext_drop_refs(path);
1222		path = ext4_ext_find_extent(inode,
1223				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1224				    path);
1225		if (IS_ERR(path))
1226			err = PTR_ERR(path);
1227	} else {
1228		/* tree is full, time to grow in depth */
1229		err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1230		if (err)
1231			goto out;
1232
1233		/* refill path */
1234		ext4_ext_drop_refs(path);
1235		path = ext4_ext_find_extent(inode,
1236				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1237				    path);
1238		if (IS_ERR(path)) {
1239			err = PTR_ERR(path);
1240			goto out;
1241		}
1242
1243		/*
1244		 * only first (depth 0 -> 1) produces free space;
1245		 * in all other cases we have to split the grown tree
1246		 */
1247		depth = ext_depth(inode);
1248		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1249			/* now we need to split */
1250			goto repeat;
1251		}
1252	}
1253
1254out:
1255	return err;
1256}
1257
1258/*
1259 * search for the closest allocated block to the left of *logical
1260 * and return it at @logical, with its physical address at @phys;
1261 * if *logical is the smallest allocated block, the function
1262 * returns 0 at @phys
1263 * return value contains 0 (success) or an error code
1264 */
1265static int ext4_ext_search_left(struct inode *inode,
1266				struct ext4_ext_path *path,
1267				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1268{
1269	struct ext4_extent_idx *ix;
1270	struct ext4_extent *ex;
1271	int depth, ee_len;
1272
1273	if (unlikely(path == NULL)) {
1274		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1275		return -EIO;
1276	}
1277	depth = path->p_depth;
1278	*phys = 0;
1279
1280	if (depth == 0 && path->p_ext == NULL)
1281		return 0;
1282
1283	/* usually the extent in the path covers blocks smaller
1284	 * than *logical, but it can be that the extent is the
1285	 * first one in the file */
1286
1287	ex = path[depth].p_ext;
1288	ee_len = ext4_ext_get_actual_len(ex);
1289	if (*logical < le32_to_cpu(ex->ee_block)) {
1290		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1291			EXT4_ERROR_INODE(inode,
1292					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1293					 *logical, le32_to_cpu(ex->ee_block));
1294			return -EIO;
1295		}
1296		while (--depth >= 0) {
1297			ix = path[depth].p_idx;
1298			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1299				EXT4_ERROR_INODE(inode,
1300				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1301				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1302				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1303		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1304				  depth);
1305				return -EIO;
1306			}
1307		}
1308		return 0;
1309	}
1310
1311	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1312		EXT4_ERROR_INODE(inode,
1313				 "logical %d < ee_block %d + ee_len %d!",
1314				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1315		return -EIO;
1316	}
1317
1318	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1319	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1320	return 0;
1321}
1322
1323/*
1324 * search for the closest allocated block to the right of *logical
1325 * and return it at @logical, with its physical address at @phys;
1326 * if *logical is the largest allocated block, the function
1327 * returns 0 at @phys
1328 * return value contains 0 (success) or an error code
1329 */
1330static int ext4_ext_search_right(struct inode *inode,
1331				 struct ext4_ext_path *path,
1332				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1333				 struct ext4_extent **ret_ex)
1334{
1335	struct buffer_head *bh = NULL;
1336	struct ext4_extent_header *eh;
1337	struct ext4_extent_idx *ix;
1338	struct ext4_extent *ex;
1339	ext4_fsblk_t block;
1340	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1341	int ee_len;
1342
1343	if (unlikely(path == NULL)) {
1344		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1345		return -EIO;
1346	}
1347	depth = path->p_depth;
1348	*phys = 0;
1349
1350	if (depth == 0 && path->p_ext == NULL)
1351		return 0;
1352
1353	/* usually the extent in the path covers blocks smaller
1354	 * than *logical, but it can be that the extent is the
1355	 * first one in the file */
1356
1357	ex = path[depth].p_ext;
1358	ee_len = ext4_ext_get_actual_len(ex);
1359	if (*logical < le32_to_cpu(ex->ee_block)) {
1360		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1361			EXT4_ERROR_INODE(inode,
1362					 "first_extent(path[%d].p_hdr) != ex",
1363					 depth);
1364			return -EIO;
1365		}
1366		while (--depth >= 0) {
1367			ix = path[depth].p_idx;
1368			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1369				EXT4_ERROR_INODE(inode,
1370						 "ix != EXT_FIRST_INDEX *logical %d!",
1371						 *logical);
1372				return -EIO;
1373			}
1374		}
1375		goto found_extent;
1376	}
1377
1378	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1379		EXT4_ERROR_INODE(inode,
1380				 "logical %d < ee_block %d + ee_len %d!",
1381				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1382		return -EIO;
1383	}
1384
1385	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1386		/* next allocated block in this leaf */
1387		ex++;
1388		goto found_extent;
1389	}
1390
1391	/* go up and search for index to the right */
1392	while (--depth >= 0) {
1393		ix = path[depth].p_idx;
1394		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1395			goto got_index;
1396	}
1397
1398	/* we've gone up to the root and found no index to the right */
1399	return 0;
1400
1401got_index:
1402	/* we've found index to the right, let's
1403	 * follow it and find the closest allocated
1404	 * block to the right */
1405	ix++;
1406	block = ext4_idx_pblock(ix);
1407	while (++depth < path->p_depth) {
1408		bh = sb_bread(inode->i_sb, block);
1409		if (bh == NULL)
1410			return -EIO;
1411		eh = ext_block_hdr(bh);
1412		/* subtract from p_depth to get proper eh_depth */
1413		if (ext4_ext_check_block(inode, eh,
1414					 path->p_depth - depth, bh)) {
1415			put_bh(bh);
1416			return -EIO;
1417		}
1418		ix = EXT_FIRST_INDEX(eh);
1419		block = ext4_idx_pblock(ix);
1420		put_bh(bh);
1421	}
1422
1423	bh = sb_bread(inode->i_sb, block);
1424	if (bh == NULL)
1425		return -EIO;
1426	eh = ext_block_hdr(bh);
1427	if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
1428		put_bh(bh);
1429		return -EIO;
1430	}
1431	ex = EXT_FIRST_EXTENT(eh);
1432found_extent:
1433	*logical = le32_to_cpu(ex->ee_block);
1434	*phys = ext4_ext_pblock(ex);
1435	*ret_ex = ex;
1436	if (bh)
1437		put_bh(bh);
1438	return 0;
1439}
1440
1441/*
1442 * ext4_ext_next_allocated_block:
1443 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1444 * NOTE: it considers block number from index entry as
1445 * allocated block. Thus, index entries have to be consistent
1446 * with leaves.
1447 */
1448static ext4_lblk_t
1449ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1450{
1451	int depth;
1452
1453	BUG_ON(path == NULL);
1454	depth = path->p_depth;
1455
1456	if (depth == 0 && path->p_ext == NULL)
1457		return EXT_MAX_BLOCKS;
1458
1459	while (depth >= 0) {
1460		if (depth == path->p_depth) {
1461			/* leaf */
1462			if (path[depth].p_ext &&
1463				path[depth].p_ext !=
1464					EXT_LAST_EXTENT(path[depth].p_hdr))
1465			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1466		} else {
1467			/* index */
1468			if (path[depth].p_idx !=
1469					EXT_LAST_INDEX(path[depth].p_hdr))
1470			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1471		}
1472		depth--;
1473	}
1474
1475	return EXT_MAX_BLOCKS;
1476}
1477
1478/*
1479 * ext4_ext_next_leaf_block:
1480 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1481 */
1482static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1483{
1484	int depth;
1485
1486	BUG_ON(path == NULL);
1487	depth = path->p_depth;
1488
1489	/* zero-tree has no leaf blocks at all */
1490	if (depth == 0)
1491		return EXT_MAX_BLOCKS;
1492
1493	/* go to index block */
1494	depth--;
1495
1496	while (depth >= 0) {
1497		if (path[depth].p_idx !=
1498				EXT_LAST_INDEX(path[depth].p_hdr))
1499			return (ext4_lblk_t)
1500				le32_to_cpu(path[depth].p_idx[1].ei_block);
1501		depth--;
1502	}
1503
1504	return EXT_MAX_BLOCKS;
1505}
1506
1507/*
1508 * ext4_ext_correct_indexes:
1509 * if leaf gets modified and modified extent is first in the leaf,
1510 * then we have to correct all indexes above.
1511 * TODO: do we need to correct tree in all cases?
1512 */
1513static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1514				struct ext4_ext_path *path)
1515{
1516	struct ext4_extent_header *eh;
1517	int depth = ext_depth(inode);
1518	struct ext4_extent *ex;
1519	__le32 border;
1520	int k, err = 0;
1521
1522	eh = path[depth].p_hdr;
1523	ex = path[depth].p_ext;
1524
1525	if (unlikely(ex == NULL || eh == NULL)) {
1526		EXT4_ERROR_INODE(inode,
1527				 "ex %p == NULL or eh %p == NULL", ex, eh);
1528		return -EIO;
1529	}
1530
1531	if (depth == 0) {
1532		/* there is no tree at all */
1533		return 0;
1534	}
1535
1536	if (ex != EXT_FIRST_EXTENT(eh)) {
1537		/* we correct the tree only if the first extent in the leaf was modified */
1538		return 0;
1539	}
1540
1541	/*
1542	 * TODO: we need correction if border is smaller than current one
1543	 */
1544	k = depth - 1;
1545	border = path[depth].p_ext->ee_block;
1546	err = ext4_ext_get_access(handle, inode, path + k);
1547	if (err)
1548		return err;
1549	path[k].p_idx->ei_block = border;
1550	err = ext4_ext_dirty(handle, inode, path + k);
1551	if (err)
1552		return err;
1553
1554	while (k--) {
1555		/* change all left-side indexes */
1556		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1557			break;
1558		err = ext4_ext_get_access(handle, inode, path + k);
1559		if (err)
1560			break;
1561		path[k].p_idx->ei_block = border;
1562		err = ext4_ext_dirty(handle, inode, path + k);
1563		if (err)
1564			break;
1565	}
1566
1567	return err;
1568}
1569
1570int
1571ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1572				struct ext4_extent *ex2)
1573{
1574	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1575
1576	/*
1577	 * Make sure that either both extents are uninitialized, or
1578	 * both are _not_.
1579	 */
1580	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1581		return 0;
1582
1583	if (ext4_ext_is_uninitialized(ex1))
1584		max_len = EXT_UNINIT_MAX_LEN;
1585	else
1586		max_len = EXT_INIT_MAX_LEN;
1587
1588	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1589	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1590
1591	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1592			le32_to_cpu(ex2->ee_block))
1593		return 0;
1594
1595	/*
1596	 * To allow future support for preallocated extents to be added
1597	 * as an RO_COMPAT feature, refuse to merge two extents if
1598	 * this can result in the top bit of ee_len being set.
1599	 */
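	/*
	 * (EXT_INIT_MAX_LEN is 1 << 15 = 32768 blocks, while
	 * EXT_UNINIT_MAX_LEN is 32767, because the top bit of ee_len
	 * flags an uninitialized extent.)
	 */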
1600	if (ext1_ee_len + ext2_ee_len > max_len)
1601		return 0;
1602#ifdef AGGRESSIVE_TEST
1603	if (ext1_ee_len >= 4)
1604		return 0;
1605#endif
1606
1607	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1608		return 1;
1609	return 0;
1610}
1611
1612/*
1613 * This function tries to merge the "ex" extent to the next extent in the tree.
1614 * It always tries to merge towards right. If you want to merge towards
1615 * left, pass "ex - 1" as argument instead of "ex".
1616 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1617 * 1 if they got merged.
1618 */
1619static int ext4_ext_try_to_merge_right(struct inode *inode,
1620				 struct ext4_ext_path *path,
1621				 struct ext4_extent *ex)
1622{
1623	struct ext4_extent_header *eh;
1624	unsigned int depth, len;
1625	int merge_done = 0;
1626	int uninitialized = 0;
1627
1628	depth = ext_depth(inode);
1629	BUG_ON(path[depth].p_hdr == NULL);
1630	eh = path[depth].p_hdr;
1631
1632	while (ex < EXT_LAST_EXTENT(eh)) {
1633		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1634			break;
1635		/* merge with next extent! */
1636		if (ext4_ext_is_uninitialized(ex))
1637			uninitialized = 1;
1638		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1639				+ ext4_ext_get_actual_len(ex + 1));
1640		if (uninitialized)
1641			ext4_ext_mark_uninitialized(ex);
1642
1643		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1644			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1645				* sizeof(struct ext4_extent);
1646			memmove(ex + 1, ex + 2, len);
1647		}
1648		le16_add_cpu(&eh->eh_entries, -1);
1649		merge_done = 1;
1650		WARN_ON(eh->eh_entries == 0);
1651		if (!eh->eh_entries)
1652			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1653	}
1654
1655	return merge_done;
1656}
1657
1658/*
1659 * This function does a very simple check to see if we can collapse
1660 * an extent tree with a single extent tree leaf block into the inode.
1661 */
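/*
 * Note: the in-inode i_data area is 60 bytes, so the collapsed root can
 * hold at most (60 - 12) / 12 = 4 extents.
 */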
1662static void ext4_ext_try_to_merge_up(handle_t *handle,
1663				     struct inode *inode,
1664				     struct ext4_ext_path *path)
1665{
1666	size_t s;
1667	unsigned max_root = ext4_ext_space_root(inode, 0);
1668	ext4_fsblk_t blk;
1669
1670	if ((path[0].p_depth != 1) ||
1671	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1672	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1673		return;
1674
1675	/*
1676	 * We need to modify the block allocation bitmap and the block
1677	 * group descriptor to release the extent tree block.  If we
1678	 * can't get the journal credits, give up.
1679	 */
1680	if (ext4_journal_extend(handle, 2))
1681		return;
1682
1683	/*
1684	 * Copy the extent data up to the inode
1685	 */
1686	blk = ext4_idx_pblock(path[0].p_idx);
1687	s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1688		sizeof(struct ext4_extent_idx);
1689	s += sizeof(struct ext4_extent_header);
1690
1691	memcpy(path[0].p_hdr, path[1].p_hdr, s);
1692	path[0].p_depth = 0;
1693	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1694		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1695	path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1696
1697	brelse(path[1].p_bh);
1698	ext4_free_blocks(handle, inode, NULL, blk, 1,
1699			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1700}
1701
1702/*
1703 * This function tries to merge the @ex extent to its neighbours in the
1704 * tree (left, then right), and then tries to collapse the tree into the inode.
1705 */
1706static void ext4_ext_try_to_merge(handle_t *handle,
1707				  struct inode *inode,
1708				  struct ext4_ext_path *path,
1709				  struct ext4_extent *ex) {
1710	struct ext4_extent_header *eh;
1711	unsigned int depth;
1712	int merge_done = 0;
1713
1714	depth = ext_depth(inode);
1715	BUG_ON(path[depth].p_hdr == NULL);
1716	eh = path[depth].p_hdr;
1717
1718	if (ex > EXT_FIRST_EXTENT(eh))
1719		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1720
1721	if (!merge_done)
1722		(void) ext4_ext_try_to_merge_right(inode, path, ex);
1723
1724	ext4_ext_try_to_merge_up(handle, inode, path);
1725}
1726
1727/*
1728 * check if a portion of the "newext" extent overlaps with an
1729 * existing extent.
1730 *
1731 * If there is an overlap discovered, it updates the length of the newext
1732 * such that there will be no overlap, and then returns 1.
1733 * If there is no overlap found, it returns 0.
1734 */
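/*
 * For example (with a cluster ratio of 1), if newext covers logical
 * blocks 100..119 and an existing extent starts at block 110, ee_len is
 * trimmed to 10 so that only blocks 100..109 get allocated.
 */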
1735static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1736					   struct inode *inode,
1737					   struct ext4_extent *newext,
1738					   struct ext4_ext_path *path)
1739{
1740	ext4_lblk_t b1, b2;
1741	unsigned int depth, len1;
1742	unsigned int ret = 0;
1743
1744	b1 = le32_to_cpu(newext->ee_block);
1745	len1 = ext4_ext_get_actual_len(newext);
1746	depth = ext_depth(inode);
1747	if (!path[depth].p_ext)
1748		goto out;
1749	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1750	b2 &= ~(sbi->s_cluster_ratio - 1);
1751
1752	/*
1753	 * get the next allocated block if the extent in the path
1754	 * is before the requested block(s)
1755	 */
1756	if (b2 < b1) {
1757		b2 = ext4_ext_next_allocated_block(path);
1758		if (b2 == EXT_MAX_BLOCKS)
1759			goto out;
1760		b2 &= ~(sbi->s_cluster_ratio - 1);
1761	}
1762
1763	/* check for wrap through zero on extent logical start block */
1764	if (b1 + len1 < b1) {
1765		len1 = EXT_MAX_BLOCKS - b1;
1766		newext->ee_len = cpu_to_le16(len1);
1767		ret = 1;
1768	}
1769
1770	/* check for overlap */
1771	if (b1 + len1 > b2) {
1772		newext->ee_len = cpu_to_le16(b2 - b1);
1773		ret = 1;
1774	}
1775out:
1776	return ret;
1777}
1778
1779/*
1780 * ext4_ext_insert_extent:
1781 * tries to merge the requested extent into the existing extent or
1782 * inserts requested extent as new one into the tree,
1783 * creating new leaf in the no-space case.
1784 */
1785int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1786				struct ext4_ext_path *path,
1787				struct ext4_extent *newext, int flag)
1788{
1789	struct ext4_extent_header *eh;
1790	struct ext4_extent *ex, *fex;
1791	struct ext4_extent *nearex; /* nearest extent */
1792	struct ext4_ext_path *npath = NULL;
1793	int depth, len, err;
1794	ext4_lblk_t next;
1795	unsigned uninitialized = 0;
1796	int flags = 0;
1797
1798	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1799		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1800		return -EIO;
1801	}
1802	depth = ext_depth(inode);
1803	ex = path[depth].p_ext;
1804	if (unlikely(path[depth].p_hdr == NULL)) {
1805		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1806		return -EIO;
1807	}
1808
1809	/* try to insert block into found extent and return */
1810	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1811		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1812		ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
1813			  ext4_ext_is_uninitialized(newext),
1814			  ext4_ext_get_actual_len(newext),
1815			  le32_to_cpu(ex->ee_block),
1816			  ext4_ext_is_uninitialized(ex),
1817			  ext4_ext_get_actual_len(ex),
1818			  ext4_ext_pblock(ex));
1819		err = ext4_ext_get_access(handle, inode, path + depth);
1820		if (err)
1821			return err;
1822
1823		/*
1824		 * ext4_can_extents_be_merged should have checked that either
1825		 * both extents are uninitialized, or both aren't. Thus we
1826		 * need to check only one of them here.
1827		 */
1828		if (ext4_ext_is_uninitialized(ex))
1829			uninitialized = 1;
1830		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1831					+ ext4_ext_get_actual_len(newext));
1832		if (uninitialized)
1833			ext4_ext_mark_uninitialized(ex);
1834		eh = path[depth].p_hdr;
1835		nearex = ex;
1836		goto merge;
1837	}
1838
1839	depth = ext_depth(inode);
1840	eh = path[depth].p_hdr;
1841	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1842		goto has_space;
1843
1844	/* probably next leaf has space for us? */
1845	fex = EXT_LAST_EXTENT(eh);
1846	next = EXT_MAX_BLOCKS;
1847	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1848		next = ext4_ext_next_leaf_block(path);
1849	if (next != EXT_MAX_BLOCKS) {
1850		ext_debug("next leaf block - %u\n", next);
1851		BUG_ON(npath != NULL);
1852		npath = ext4_ext_find_extent(inode, next, NULL);
1853		if (IS_ERR(npath))
1854			return PTR_ERR(npath);
1855		BUG_ON(npath->p_depth != path->p_depth);
1856		eh = npath[depth].p_hdr;
1857		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1858			ext_debug("next leaf isn't full(%d)\n",
1859				  le16_to_cpu(eh->eh_entries));
1860			path = npath;
1861			goto has_space;
1862		}
1863		ext_debug("next leaf has no free space(%d,%d)\n",
1864			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1865	}
1866
1867	/*
1868	 * There is no free space in the found leaf.
1869	 * We're gonna add a new leaf in the tree.
1870	 */
1871	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1872		flags = EXT4_MB_USE_ROOT_BLOCKS;
1873	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1874	if (err)
1875		goto cleanup;
1876	depth = ext_depth(inode);
1877	eh = path[depth].p_hdr;
1878
1879has_space:
1880	nearex = path[depth].p_ext;
1881
1882	err = ext4_ext_get_access(handle, inode, path + depth);
1883	if (err)
1884		goto cleanup;
1885
1886	if (!nearex) {
1887		/* there is no extent in this leaf, create first one */
1888		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
1889				le32_to_cpu(newext->ee_block),
1890				ext4_ext_pblock(newext),
1891				ext4_ext_is_uninitialized(newext),
1892				ext4_ext_get_actual_len(newext));
1893		nearex = EXT_FIRST_EXTENT(eh);
1894	} else {
1895		if (le32_to_cpu(newext->ee_block)
1896			   > le32_to_cpu(nearex->ee_block)) {
1897			/* Insert after */
1898			ext_debug("insert %u:%llu:[%d]%d after: "
1899					"nearest %p\n",
1900					le32_to_cpu(newext->ee_block),
1901					ext4_ext_pblock(newext),
1902					ext4_ext_is_uninitialized(newext),
1903					ext4_ext_get_actual_len(newext),
1904					nearex);
1905			nearex++;
1906		} else {
1907			/* Insert before */
1908			BUG_ON(newext->ee_block == nearex->ee_block);
1909			ext_debug("insert %u:%llu:[%d]%d before: "
1910					"nearest %p\n",
1911					le32_to_cpu(newext->ee_block),
1912					ext4_ext_pblock(newext),
1913					ext4_ext_is_uninitialized(newext),
1914					ext4_ext_get_actual_len(newext),
1915					nearex);
1916		}
1917		len = EXT_LAST_EXTENT(eh) - nearex + 1;
1918		if (len > 0) {
1919			ext_debug("insert %u:%llu:[%d]%d: "
1920					"move %d extents from 0x%p to 0x%p\n",
1921					le32_to_cpu(newext->ee_block),
1922					ext4_ext_pblock(newext),
1923					ext4_ext_is_uninitialized(newext),
1924					ext4_ext_get_actual_len(newext),
1925					len, nearex, nearex + 1);
1926			memmove(nearex + 1, nearex,
1927				len * sizeof(struct ext4_extent));
1928		}
1929	}
1930
1931	le16_add_cpu(&eh->eh_entries, 1);
1932	path[depth].p_ext = nearex;
1933	nearex->ee_block = newext->ee_block;
1934	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1935	nearex->ee_len = newext->ee_len;
1936
1937merge:
1938	/* try to merge extents */
1939	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1940		ext4_ext_try_to_merge(handle, inode, path, nearex);
1941
1942
1943	/* time to correct all indexes above */
1944	err = ext4_ext_correct_indexes(handle, inode, path);
1945	if (err)
1946		goto cleanup;
1947
1948	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
1949
1950cleanup:
1951	if (npath) {
1952		ext4_ext_drop_refs(npath);
1953		kfree(npath);
1954	}
1955	ext4_ext_invalidate_cache(inode);
1956	return err;
1957}
1958
1959static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1960			       ext4_lblk_t num, ext_prepare_callback func,
1961			       void *cbdata)
1962{
1963	struct ext4_ext_path *path = NULL;
1964	struct ext4_ext_cache cbex;
1965	struct ext4_extent *ex;
1966	ext4_lblk_t next, start = 0, end = 0;
1967	ext4_lblk_t last = block + num;
1968	int depth, exists, err = 0;
1969
1970	BUG_ON(func == NULL);
1971	BUG_ON(inode == NULL);
1972
1973	while (block < last && block != EXT_MAX_BLOCKS) {
1974		num = last - block;
1975		/* find extent for this block */
1976		down_read(&EXT4_I(inode)->i_data_sem);
1977		path = ext4_ext_find_extent(inode, block, path);
1978		up_read(&EXT4_I(inode)->i_data_sem);
1979		if (IS_ERR(path)) {
1980			err = PTR_ERR(path);
1981			path = NULL;
1982			break;
1983		}
1984
1985		depth = ext_depth(inode);
1986		if (unlikely(path[depth].p_hdr == NULL)) {
1987			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1988			err = -EIO;
1989			break;
1990		}
1991		ex = path[depth].p_ext;
1992		next = ext4_ext_next_allocated_block(path);
1993
1994		exists = 0;
1995		if (!ex) {
1996			/* there is no extent yet, so try to allocate
1997			 * all requested space */
1998			start = block;
1999			end = block + num;
2000		} else if (le32_to_cpu(ex->ee_block) > block) {
2001			/* need to allocate space before found extent */
2002			start = block;
2003			end = le32_to_cpu(ex->ee_block);
2004			if (block + num < end)
2005				end = block + num;
2006		} else if (block >= le32_to_cpu(ex->ee_block)
2007					+ ext4_ext_get_actual_len(ex)) {
2008			/* need to allocate space after found extent */
2009			start = block;
2010			end = block + num;
2011			if (end >= next)
2012				end = next;
2013		} else if (block >= le32_to_cpu(ex->ee_block)) {
2014			/*
2015			 * some part of requested space is covered
2016			 * by found extent
2017			 */
2018			start = block;
2019			end = le32_to_cpu(ex->ee_block)
2020				+ ext4_ext_get_actual_len(ex);
2021			if (block + num < end)
2022				end = block + num;
2023			exists = 1;
2024		} else {
2025			BUG();
2026		}
2027		BUG_ON(end <= start);
2028
2029		if (!exists) {
2030			cbex.ec_block = start;
2031			cbex.ec_len = end - start;
2032			cbex.ec_start = 0;
2033		} else {
2034			cbex.ec_block = le32_to_cpu(ex->ee_block);
2035			cbex.ec_len = ext4_ext_get_actual_len(ex);
2036			cbex.ec_start = ext4_ext_pblock(ex);
2037		}
2038
2039		if (unlikely(cbex.ec_len == 0)) {
2040			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
2041			err = -EIO;
2042			break;
2043		}
2044		err = func(inode, next, &cbex, ex, cbdata);
2045		ext4_ext_drop_refs(path);
2046
2047		if (err < 0)
2048			break;
2049
2050		if (err == EXT_REPEAT)
2051			continue;
2052		else if (err == EXT_BREAK) {
2053			err = 0;
2054			break;
2055		}
2056
2057		if (ext_depth(inode) != depth) {
2058			/* depth was changed. we have to realloc path */
2059			kfree(path);
2060			path = NULL;
2061		}
2062
2063		block = cbex.ec_block + cbex.ec_len;
2064	}
2065
2066	if (path) {
2067		ext4_ext_drop_refs(path);
2068		kfree(path);
2069	}
2070
2071	return err;
2072}
2073
2074static void
2075ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
2076			__u32 len, ext4_fsblk_t start)
2077{
2078	struct ext4_ext_cache *cex;
2079	BUG_ON(len == 0);
2080	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2081	trace_ext4_ext_put_in_cache(inode, block, len, start);
2082	cex = &EXT4_I(inode)->i_cached_extent;
2083	cex->ec_block = block;
2084	cex->ec_len = len;
2085	cex->ec_start = start;
2086	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2087}
2088
2089/*
2090 * ext4_ext_put_gap_in_cache:
2091 * calculate boundaries of the gap that the requested block fits into
2092 * and cache this gap
2093 */
2094static void
2095ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2096				ext4_lblk_t block)
2097{
2098	int depth = ext_depth(inode);
2099	unsigned long len;
2100	ext4_lblk_t lblock;
2101	struct ext4_extent *ex;
2102
2103	ex = path[depth].p_ext;
2104	if (ex == NULL) {
2105		/* there is no extent yet, so gap is [0;-] */
2106		lblock = 0;
2107		len = EXT_MAX_BLOCKS;
2108		ext_debug("cache gap(whole file):");
2109	} else if (block < le32_to_cpu(ex->ee_block)) {
2110		lblock = block;
2111		len = le32_to_cpu(ex->ee_block) - block;
2112		ext_debug("cache gap(before): %u [%u:%u]",
2113				block,
2114				le32_to_cpu(ex->ee_block),
2115				 ext4_ext_get_actual_len(ex));
2116	} else if (block >= le32_to_cpu(ex->ee_block)
2117			+ ext4_ext_get_actual_len(ex)) {
2118		ext4_lblk_t next;
2119		lblock = le32_to_cpu(ex->ee_block)
2120			+ ext4_ext_get_actual_len(ex);
2121
2122		next = ext4_ext_next_allocated_block(path);
2123		ext_debug("cache gap(after): [%u:%u] %u",
2124				le32_to_cpu(ex->ee_block),
2125				ext4_ext_get_actual_len(ex),
2126				block);
2127		BUG_ON(next == lblock);
2128		len = next - lblock;
2129	} else {
2130		lblock = len = 0;
2131		BUG();
2132	}
2133
2134	ext_debug(" -> %u:%lu\n", lblock, len);
2135	ext4_ext_put_in_cache(inode, lblock, len, 0);
2136}
2137
2138/*
2139 * ext4_ext_check_cache()
2140 * Checks to see if the given block is in the cache.
2141 * If it is, the cached extent is stored in the given
2142 * cache extent pointer.  If the cached extent is a hole,
2143 * this routine should be used instead of
2144 * ext4_ext_in_cache if the calling function needs to
2145 * know the size of the hole.
2146 *
2147 * @inode: The file's inode
2148 * @block: The block to look for in the cache
2149 * @ex:    Pointer where the cached extent will be stored
2150 *         if it contains block
2151 *
2152 * Return 0 if cache is invalid; 1 if the cache is valid
2153 */
2154static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2155	struct ext4_ext_cache *ex)
{
2156	struct ext4_ext_cache *cex;
2157	struct ext4_sb_info *sbi;
2158	int ret = 0;
2159
2160	/*
2161	 * We borrow i_block_reservation_lock to protect i_cached_extent
2162	 */
2163	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2164	cex = &EXT4_I(inode)->i_cached_extent;
2165	sbi = EXT4_SB(inode->i_sb);
2166
2167	/* has cache valid data? */
2168	if (cex->ec_len == 0)
2169		goto errout;
2170
2171	if (in_range(block, cex->ec_block, cex->ec_len)) {
2172		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2173		ext_debug("%u cached by %u:%u:%llu\n",
2174				block,
2175				cex->ec_block, cex->ec_len, cex->ec_start);
2176		ret = 1;
2177	}
2178errout:
2179	trace_ext4_ext_in_cache(inode, block, ret);
2180	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2181	return ret;
2182}
2183
2184/*
2185 * ext4_ext_in_cache()
2186 * Checks to see if the given block is in the cache.
2187 * If it is, the cached extent is stored in the given
2188 * extent pointer.
2189 *
2190 * @inode: The file's inode
2191 * @block: The block to look for in the cache
2192 * @ex:    Pointer where the cached extent will be stored
2193 *         if it contains block
2194 *
2195 * Return 0 if cache is invalid; 1 if the cache is valid
2196 */
2197static int
2198ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2199			struct ext4_extent *ex)
2200{
2201	struct ext4_ext_cache cex;
2202	int ret = 0;
2203
2204	if (ext4_ext_check_cache(inode, block, &cex)) {
2205		ex->ee_block = cpu_to_le32(cex.ec_block);
2206		ext4_ext_store_pblock(ex, cex.ec_start);
2207		ex->ee_len = cpu_to_le16(cex.ec_len);
2208		ret = 1;
2209	}
2210
2211	return ret;
2212}
2213
2214
2215/*
2216 * ext4_ext_rm_idx:
2217 * removes index from the index block.
2218 */
2219static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2220			struct ext4_ext_path *path)
2221{
2222	int err;
2223	ext4_fsblk_t leaf;
2224
2225	/* free index block */
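	/*
	 * The caller passes in the path element for the now-empty block;
	 * step up one level to its parent, whose current index entry
	 * points at the block being removed.
	 */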
2226	path--;
2227	leaf = ext4_idx_pblock(path->p_idx);
2228	if (unlikely(path->p_hdr->eh_entries == 0)) {
2229		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2230		return -EIO;
2231	}
2232	err = ext4_ext_get_access(handle, inode, path);
2233	if (err)
2234		return err;
2235
2236	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2237		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2238		len *= sizeof(struct ext4_extent_idx);
2239		memmove(path->p_idx, path->p_idx + 1, len);
2240	}
2241
2242	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2243	err = ext4_ext_dirty(handle, inode, path);
2244	if (err)
2245		return err;
2246	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2247	trace_ext4_ext_rm_idx(inode, leaf);
2248
2249	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2250			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2251	return err;
2252}
2253
2254/*
2255 * ext4_ext_calc_credits_for_single_extent:
2256 * This routine returns the maximum number of credits needed to insert
2257 * an extent into the extent tree.
2258 * When passing the actual path, the caller should calculate the credits
2259 * under i_data_sem.
2260 */
2261int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2262						struct ext4_ext_path *path)
2263{
2264	if (path) {
2265		int depth = ext_depth(inode);
2266		int ret = 0;
2267
2268		/* probably there is space in leaf? */
2269		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2270				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2271
2272			/*
2273			 *  There is some space in the leaf, so there is no
2274			 *  need to account for the leaf block credit
2275			 *
2276			 *  bitmaps and block group descriptor blocks
2277			 *  and other metadata blocks still need to be
2278			 *  accounted.
2279			 */
2280			/* 1 bitmap, 1 block group descriptor */
2281			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2282			return ret;
2283		}
2284	}
2285
2286	return ext4_chunk_trans_blocks(inode, nrblocks);
2287}
2288
2289/*
2290 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
2291 *
2292 * If nrblocks fit in a single extent (chunk flag is 1), then in the worst
2293 * case each index/leaf at every tree level needs to be changed; if the
2294 * tree splits due to inserting a new extent, then the old tree's
2295 * index/leaf blocks need to be updated too.
2296 *
2297 * If the nrblocks are discontiguous, they could cause
2298 * the whole tree to split more than once, but this is really rare.
2299 */
2300int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2301{
2302	int index;
2303	int depth = ext_depth(inode);
2304
2305	if (chunk)
2306		index = depth * 2;
2307	else
2308		index = depth * 3;
2309
2310	return index;
2311}
2312
2313static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2314			      struct ext4_extent *ex,
2315			      ext4_fsblk_t *partial_cluster,
2316			      ext4_lblk_t from, ext4_lblk_t to)
2317{
2318	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2319	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2320	ext4_fsblk_t pblk;
2321	int flags = EXT4_FREE_BLOCKS_FORGET;
2322
2323	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2324		flags |= EXT4_FREE_BLOCKS_METADATA;
2325	/*
2326	 * For bigalloc file systems, we never free a partial cluster
2327	 * at the beginning of the extent.  Instead, we make a note
2328	 * that we tried freeing the cluster, and check to see if we
2329	 * need to free it on a subsequent call to ext4_remove_blocks,
2330	 * or at the end of the ext4_truncate() operation.
2331	 */
2332	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2333
2334	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2335	/*
2336	 * If we have a partial cluster, and it's different from the
2337	 * cluster of the last block, we need to explicitly free the
2338	 * partial cluster here.
2339	 */
2340	pblk = ext4_ext_pblock(ex) + ee_len - 1;
2341	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2342		ext4_free_blocks(handle, inode, NULL,
2343				 EXT4_C2B(sbi, *partial_cluster),
2344				 sbi->s_cluster_ratio, flags);
2345		*partial_cluster = 0;
2346	}
2347
2348#ifdef EXTENTS_STATS
2349	{
2350		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2351		spin_lock(&sbi->s_ext_stats_lock);
2352		sbi->s_ext_blocks += ee_len;
2353		sbi->s_ext_extents++;
2354		if (ee_len < sbi->s_ext_min)
2355			sbi->s_ext_min = ee_len;
2356		if (ee_len > sbi->s_ext_max)
2357			sbi->s_ext_max = ee_len;
2358		if (ext_depth(inode) > sbi->s_depth_max)
2359			sbi->s_depth_max = ext_depth(inode);
2360		spin_unlock(&sbi->s_ext_stats_lock);
2361	}
2362#endif
2363	if (from >= le32_to_cpu(ex->ee_block)
2364	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2365		/* tail removal */
2366		ext4_lblk_t num;
2367
2368		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2369		pblk = ext4_ext_pblock(ex) + ee_len - num;
2370		ext_debug("free last %u blocks starting %llu\n", num, pblk);
2371		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2372		/*
2373		 * If the block range to be freed didn't start at the
2374		 * beginning of a cluster, and we removed the entire
2375		 * extent, save the partial cluster here, since we
2376		 * might need to delete it if we determine that the
2377		 * truncate operation has removed all of the blocks in
2378		 * the cluster.
2379		 */
2380		if (pblk & (sbi->s_cluster_ratio - 1) &&
2381		    (ee_len == num))
2382			*partial_cluster = EXT4_B2C(sbi, pblk);
2383		else
2384			*partial_cluster = 0;
2385	} else if (from == le32_to_cpu(ex->ee_block)
2386		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2387		/* head removal */
2388		ext4_lblk_t num;
2389		ext4_fsblk_t start;
2390
2391		num = to - from;
2392		start = ext4_ext_pblock(ex);
2393
2394		ext_debug("free first %u blocks starting %llu\n", num, start);
2395		ext4_free_blocks(handle, inode, NULL, start, num, flags);
2396
2397	} else {
2398		printk(KERN_INFO "strange request: removal(2) "
2399				"%u-%u from %u:%u\n",
2400				from, to, le32_to_cpu(ex->ee_block), ee_len);
2401	}
2402	return 0;
2403}
2404
2405
2406/*
2407 * ext4_ext_rm_leaf() removes the extents associated with the
2408 * blocks appearing between "start" and "end", and splits the extents
2409 * if "start" and "end" appear in the same extent.
2410 *
2411 * @handle: The journal handle
2412 * @inode:  The file's inode
2413 * @path:   The path to the leaf
2414 * @start:  The first block to remove
2415 * @end:    The last block to remove
2416 */
2417static int
2418ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2419		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
2420		 ext4_lblk_t start, ext4_lblk_t end)
2421{
2422	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2423	int err = 0, correct_index = 0;
2424	int depth = ext_depth(inode), credits;
2425	struct ext4_extent_header *eh;
2426	ext4_lblk_t a, b;
2427	unsigned num;
2428	ext4_lblk_t ex_ee_block;
2429	unsigned short ex_ee_len;
2430	unsigned uninitialized = 0;
2431	struct ext4_extent *ex;
2432
2433	/* the header must have been checked already in ext4_ext_remove_space() */
2434	ext_debug("truncate since %u in leaf to %u\n", start, end);
2435	if (!path[depth].p_hdr)
2436		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2437	eh = path[depth].p_hdr;
2438	if (unlikely(path[depth].p_hdr == NULL)) {
2439		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2440		return -EIO;
2441	}
2442	/* find where to start removing */
2443	ex = EXT_LAST_EXTENT(eh);
2444
2445	ex_ee_block = le32_to_cpu(ex->ee_block);
2446	ex_ee_len = ext4_ext_get_actual_len(ex);
2447
2448	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2449
2450	while (ex >= EXT_FIRST_EXTENT(eh) &&
2451			ex_ee_block + ex_ee_len > start) {
2452
2453		if (ext4_ext_is_uninitialized(ex))
2454			uninitialized = 1;
2455		else
2456			uninitialized = 0;
2457
2458		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2459			 uninitialized, ex_ee_len);
2460		path[depth].p_ext = ex;
2461
2462		a = ex_ee_block > start ? ex_ee_block : start;
2463		b = ex_ee_block+ex_ee_len - 1 < end ?
2464			ex_ee_block+ex_ee_len - 1 : end;
2465
2466		ext_debug("  border %u:%u\n", a, b);
2467
2468		/* If this extent is beyond the end of the hole, skip it */
2469		if (end < ex_ee_block) {
2470			ex--;
2471			ex_ee_block = le32_to_cpu(ex->ee_block);
2472			ex_ee_len = ext4_ext_get_actual_len(ex);
2473			continue;
2474		} else if (b != ex_ee_block + ex_ee_len - 1) {
2475			EXT4_ERROR_INODE(inode,
2476					 "can not handle truncate %u:%u "
2477					 "on extent %u:%u",
2478					 start, end, ex_ee_block,
2479					 ex_ee_block + ex_ee_len - 1);
2480			err = -EIO;
2481			goto out;
2482		} else if (a != ex_ee_block) {
2483			/* remove tail of the extent */
2484			num = a - ex_ee_block;
2485		} else {
2486			/* remove whole extent: excellent! */
2487			num = 0;
2488		}
2489		/*
2490		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2491		 * descriptor) for each block group; assume two block
2492		 * groups plus ex_ee_len/blocks_per_block_group for
2493		 * the worst case
2494		 */
2495		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2496		if (ex == EXT_FIRST_EXTENT(eh)) {
2497			correct_index = 1;
2498			credits += (ext_depth(inode)) + 1;
2499		}
2500		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2501
2502		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2503		if (err)
2504			goto out;
2505
2506		err = ext4_ext_get_access(handle, inode, path + depth);
2507		if (err)
2508			goto out;
2509
2510		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2511					 a, b);
2512		if (err)
2513			goto out;
2514
2515		if (num == 0)
2516			/* this extent is removed; mark slot entirely unused */
2517			ext4_ext_store_pblock(ex, 0);
2518
2519		ex->ee_len = cpu_to_le16(num);
2520		/*
2521		 * Do not mark uninitialized if all the blocks in the
2522		 * extent have been removed.
2523		 */
2524		if (uninitialized && num)
2525			ext4_ext_mark_uninitialized(ex);
2526		/*
2527		 * If the extent was completely released,
2528		 * we need to remove it from the leaf
2529		 */
2530		if (num == 0) {
2531			if (end != EXT_MAX_BLOCKS - 1) {
2532				/*
2533				 * For hole punching, we need to scoot all the
2534				 * extents up when an extent is removed so that
2535				 * we don't have blank extents in the middle
2536				 */
2537				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2538					sizeof(struct ext4_extent));
2539
2540				/* Now get rid of the one at the end */
2541				memset(EXT_LAST_EXTENT(eh), 0,
2542					sizeof(struct ext4_extent));
2543			}
2544			le16_add_cpu(&eh->eh_entries, -1);
2545		} else
2546			*partial_cluster = 0;
2547
2548		err = ext4_ext_dirty(handle, inode, path + depth);
2549		if (err)
2550			goto out;
2551
2552		ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2553				ext4_ext_pblock(ex));
2554		ex--;
2555		ex_ee_block = le32_to_cpu(ex->ee_block);
2556		ex_ee_len = ext4_ext_get_actual_len(ex);
2557	}
2558
2559	if (correct_index && eh->eh_entries)
2560		err = ext4_ext_correct_indexes(handle, inode, path);
2561
2562	/*
2563	 * If there is still an entry in the leaf node, check to see if
2564	 * it references the partial cluster.  This is the only place
2565	 * where it could; if it doesn't, we can free the cluster.
2566	 */
2567	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
2568	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2569	     *partial_cluster)) {
2570		int flags = EXT4_FREE_BLOCKS_FORGET;
2571
2572		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2573			flags |= EXT4_FREE_BLOCKS_METADATA;
2574
2575		ext4_free_blocks(handle, inode, NULL,
2576				 EXT4_C2B(sbi, *partial_cluster),
2577				 sbi->s_cluster_ratio, flags);
2578		*partial_cluster = 0;
2579	}
2580
2581	/* if this leaf is free, then we should
2582	 * remove it from the index block above */
2583	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2584		err = ext4_ext_rm_idx(handle, inode, path + depth);
2585
2586out:
2587	return err;
2588}
2589
2590/*
2591 * ext4_ext_more_to_rm:
2592 * returns 1 if current index has to be freed (even partial)
2593 */
2594static int
2595ext4_ext_more_to_rm(struct ext4_ext_path *path)
2596{
2597	BUG_ON(path->p_idx == NULL);
2598
2599	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2600		return 0;
2601
2602	/*
2603	 * if truncate on a deeper level happened, it wasn't partial,
2604	 * so we have to consider the current index for truncation
2605	 */
2606	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2607		return 0;
2608	return 1;
2609}
2610
2611static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2612				 ext4_lblk_t end)
2613{
2614	struct super_block *sb = inode->i_sb;
2615	int depth = ext_depth(inode);
2616	struct ext4_ext_path *path = NULL;
2617	ext4_fsblk_t partial_cluster = 0;
2618	handle_t *handle;
2619	int i = 0, err;
2620
2621	ext_debug("truncate since %u to %u\n", start, end);
2622
2623	/* probably the first extent we free will be the last one in its block */
2624	handle = ext4_journal_start(inode, depth + 1);
2625	if (IS_ERR(handle))
2626		return PTR_ERR(handle);
2627
2628again:
2629	ext4_ext_invalidate_cache(inode);
2630
2631	trace_ext4_ext_remove_space(inode, start, depth);
2632
2633	/*
2634	 * Check if we are removing extents inside the extent tree. If that
2635	 * is the case, we are going to punch a hole inside the extent tree
2636	 * so we have to check whether we need to split the extent covering
2637	 * the last block to remove so we can easily remove the part of it
2638	 * in ext4_ext_rm_leaf().
2639	 */
2640	if (end < EXT_MAX_BLOCKS - 1) {
2641		struct ext4_extent *ex;
2642		ext4_lblk_t ee_block;
2643
2644		/* find extent for this block */
2645		path = ext4_ext_find_extent(inode, end, NULL);
2646		if (IS_ERR(path)) {
2647			ext4_journal_stop(handle);
2648			return PTR_ERR(path);
2649		}
2650		depth = ext_depth(inode);
2651		ex = path[depth].p_ext;
2652		if (!ex) {
2653			ext4_ext_drop_refs(path);
2654			kfree(path);
2655			path = NULL;
2656			goto cont;
2657		}
2658
2659		ee_block = le32_to_cpu(ex->ee_block);
2660
2661		/*
2662		 * See if the last block is inside the extent; if so, split
2663		 * the extent at the 'end' block so we can easily remove the
2664		 * tail of the first part of the split extent in
2665		 * ext4_ext_rm_leaf().
2666		 */
2667		if (end >= ee_block &&
2668		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2669			int split_flag = 0;
2670
2671			if (ext4_ext_is_uninitialized(ex))
2672				split_flag = EXT4_EXT_MARK_UNINIT1 |
2673					     EXT4_EXT_MARK_UNINIT2;
2674
2675			/*
2676			 * Split the extent in two so that 'end' is the last
2677			 * block in the first new extent
2678			 */
2679			err = ext4_split_extent_at(handle, inode, path,
2680						end + 1, split_flag,
2681						EXT4_GET_BLOCKS_PRE_IO |
2682						EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
2683
2684			if (err < 0)
2685				goto out;
2686		}
2687	}
2688cont:
2689
2690	/*
2691	 * We start scanning from the right side, freeing all the blocks
2692	 * after i_size and walking into the tree depth-wise.
2693	 */
2694	depth = ext_depth(inode);
2695	if (path) {
2696		int k = i = depth;
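		/*
		 * p_block is set to eh_entries + 1 on the intermediate
		 * levels so that ext4_ext_more_to_rm(), which returns 0
		 * when eh_entries == p_block, still treats each of these
		 * levels as having entries left to process.
		 */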
2697		while (--k > 0)
2698			path[k].p_block =
2699				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2700	} else {
2701		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
2702			       GFP_NOFS);
2703		if (path == NULL) {
2704			ext4_journal_stop(handle);
2705			return -ENOMEM;
2706		}
2707		path[0].p_depth = depth;
2708		path[0].p_hdr = ext_inode_hdr(inode);
2709		i = 0;
2710
2711		if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2712			err = -EIO;
2713			goto out;
2714		}
2715	}
2716	err = 0;
2717
2718	while (i >= 0 && err == 0) {
2719		if (i == depth) {
2720			/* this is leaf block */
2721			err = ext4_ext_rm_leaf(handle, inode, path,
2722					       &partial_cluster, start,
2723					       end);
2724			/* root level has p_bh == NULL, brelse() eats this */
2725			brelse(path[i].p_bh);
2726			path[i].p_bh = NULL;
2727			i--;
2728			continue;
2729		}
2730
2731		/* this is index block */
2732		if (!path[i].p_hdr) {
2733			ext_debug("initialize header\n");
2734			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2735		}
2736
2737		if (!path[i].p_idx) {
2738			/* this level hasn't been touched yet */
2739			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2740			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2741			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2742				  path[i].p_hdr,
2743				  le16_to_cpu(path[i].p_hdr->eh_entries));
2744		} else {
2745			/* we were already here, look at the next index */
2746			path[i].p_idx--;
2747		}
2748
2749		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2750				i, EXT_FIRST_INDEX(path[i].p_hdr),
2751				path[i].p_idx);
2752		if (ext4_ext_more_to_rm(path + i)) {
2753			struct buffer_head *bh;
2754			/* go to the next level */
2755			ext_debug("move to level %d (block %llu)\n",
2756				  i + 1, ext4_idx_pblock(path[i].p_idx));
2757			memset(path + i + 1, 0, sizeof(*path));
2758			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2759			if (!bh) {
2760				/* should we reset i_size? */
2761				err = -EIO;
2762				break;
2763			}
2764			if (WARN_ON(i + 1 > depth)) {
2765				err = -EIO;
2766				break;
2767			}
2768			if (ext4_ext_check_block(inode, ext_block_hdr(bh),
2769							depth - i - 1, bh)) {
2770				err = -EIO;
2771				break;
2772			}
2773			path[i + 1].p_bh = bh;
2774
2775			/* save actual number of indexes since this
2776			 * number is changed at the next iteration */
2777			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2778			i++;
2779		} else {
2780			/* we finished processing this index, go up */
2781			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2782				/* index is empty, remove it;
2783				 * handle must already be prepared by
2784				 * ext4_ext_rm_leaf() */
2785				err = ext4_ext_rm_idx(handle, inode, path + i);
2786			}
2787			/* root level has p_bh == NULL, brelse() eats this */
2788			brelse(path[i].p_bh);
2789			path[i].p_bh = NULL;
2790			i--;
2791			ext_debug("return to level %d\n", i);
2792		}
2793	}
2794
2795	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
2796			path->p_hdr->eh_entries);
2797
2798	/* If we still have something in the partial cluster and we have removed
2799	 * even the first extent, then we should free the blocks in the partial
2800	 * cluster as well. */
2801	if (partial_cluster && path->p_hdr->eh_entries == 0) {
2802		int flags = EXT4_FREE_BLOCKS_FORGET;
2803
2804		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2805			flags |= EXT4_FREE_BLOCKS_METADATA;
2806
2807		ext4_free_blocks(handle, inode, NULL,
2808				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2809				 EXT4_SB(sb)->s_cluster_ratio, flags);
2810		partial_cluster = 0;
2811	}
2812
2813	/* TODO: flexible tree reduction should be here */
2814	if (path->p_hdr->eh_entries == 0) {
2815		/*
2816		 * truncate to zero freed the whole tree,
2817		 * so we need to correct eh_depth
2818		 */
2819		err = ext4_ext_get_access(handle, inode, path);
2820		if (err == 0) {
2821			ext_inode_hdr(inode)->eh_depth = 0;
2822			ext_inode_hdr(inode)->eh_max =
2823				cpu_to_le16(ext4_ext_space_root(inode, 0));
2824			err = ext4_ext_dirty(handle, inode, path);
2825		}
2826	}
2827out:
2828	ext4_ext_drop_refs(path);
2829	kfree(path);
2830	if (err == -EAGAIN) {
2831		path = NULL;
2832		goto again;
2833	}
2834	ext4_journal_stop(handle);
2835
2836	return err;
2837}
2838
2839/*
2840 * called at mount time
2841 */
2842void ext4_ext_init(struct super_block *sb)
2843{
2844	/*
2845	 * possible initialization would be here
2846	 */
2847
2848	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2849#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2850		printk(KERN_INFO "EXT4-fs: file extents enabled"
2851#ifdef AGGRESSIVE_TEST
2852		       ", aggressive tests"
2853#endif
2854#ifdef CHECK_BINSEARCH
2855		       ", check binsearch"
2856#endif
2857#ifdef EXTENTS_STATS
2858		       ", stats"
2859#endif
2860		       "\n");
2861#endif
2862#ifdef EXTENTS_STATS
2863		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2864		EXT4_SB(sb)->s_ext_min = 1 << 30;
2865		EXT4_SB(sb)->s_ext_max = 0;
2866#endif
2867	}
2868}
2869
2870/*
2871 * called at umount time
2872 */
2873void ext4_ext_release(struct super_block *sb)
2874{
2875	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2876		return;
2877
2878#ifdef EXTENTS_STATS
2879	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2880		struct ext4_sb_info *sbi = EXT4_SB(sb);
2881		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2882			sbi->s_ext_blocks, sbi->s_ext_extents,
2883			sbi->s_ext_blocks / sbi->s_ext_extents);
2884		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2885			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2886	}
2887#endif
2888}
2889
2890/* FIXME!! we need to try to merge to left or right after zero-out  */
2891static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2892{
2893	ext4_fsblk_t ee_pblock;
2894	unsigned int ee_len;
2895	int ret;
2896
2897	ee_len    = ext4_ext_get_actual_len(ex);
2898	ee_pblock = ext4_ext_pblock(ex);
2899
2900	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2901	if (ret > 0)
2902		ret = 0;
2903
2904	return ret;
2905}
2906
2907/*
2908 * ext4_split_extent_at() splits an extent at a given block.
2909 *
2910 * @handle: the journal handle
2911 * @inode: the file inode
2912 * @path: the path to the extent
2913 * @split: the logical block where the extent is split.
2914 * @split_flag: indicates whether the extent can be zeroed out if the split
2915 *		fails, and the states (init or uninit) of the new extents.
2916 * @flags: flags used to insert the new extent into the extent tree.
2917 *
2918 *
2919 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
2920 * states of which are determined by split_flag.
2921 *
2922 * There are two cases:
2923 *  a> the extent is split into two extents.
2924 *  b> no split is needed, just mark the extent.
2925 *
2926 * return 0 on success.
2927 */
2928static int ext4_split_extent_at(handle_t *handle,
2929			     struct inode *inode,
2930			     struct ext4_ext_path *path,
2931			     ext4_lblk_t split,
2932			     int split_flag,
2933			     int flags)
2934{
2935	ext4_fsblk_t newblock;
2936	ext4_lblk_t ee_block;
2937	struct ext4_extent *ex, newex, orig_ex;
2938	struct ext4_extent *ex2 = NULL;
2939	unsigned int ee_len, depth;
2940	int err = 0;
2941
2942	ext_debug("ext4_split_extent_at: inode %lu, logical "
2943		"block %llu\n", inode->i_ino, (unsigned long long)split);
2944
2945	ext4_ext_show_leaf(inode, path);
2946
2947	depth = ext_depth(inode);
2948	ex = path[depth].p_ext;
2949	ee_block = le32_to_cpu(ex->ee_block);
2950	ee_len = ext4_ext_get_actual_len(ex);
2951	newblock = split - ee_block + ext4_ext_pblock(ex);
2952
2953	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2954
2955	err = ext4_ext_get_access(handle, inode, path + depth);
2956	if (err)
2957		goto out;
2958
2959	if (split == ee_block) {
2960		/*
2961		 * case b: block @split is the block that the extent begins with,
2962		 * so we just change the state of the extent; splitting
2963		 * is not needed.
2964		 */
2965		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2966			ext4_ext_mark_uninitialized(ex);
2967		else
2968			ext4_ext_mark_initialized(ex);
2969
2970		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2971			ext4_ext_try_to_merge(handle, inode, path, ex);
2972
2973		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2974		goto out;
2975	}
2976
2977	/* case a */
2978	memcpy(&orig_ex, ex, sizeof(orig_ex));
2979	ex->ee_len = cpu_to_le16(split - ee_block);
2980	if (split_flag & EXT4_EXT_MARK_UNINIT1)
2981		ext4_ext_mark_uninitialized(ex);
2982
2983	/*
2984	 * path may lead to a new leaf, not to the original leaf any more,
2985	 * after ext4_ext_insert_extent() returns.
2986	 */
2987	err = ext4_ext_dirty(handle, inode, path + depth);
2988	if (err)
2989		goto fix_extent_len;
2990
2991	ex2 = &newex;
2992	ex2->ee_block = cpu_to_le32(split);
2993	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
2994	ext4_ext_store_pblock(ex2, newblock);
2995	if (split_flag & EXT4_EXT_MARK_UNINIT2)
2996		ext4_ext_mark_uninitialized(ex2);
2997
2998	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2999	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3000		err = ext4_ext_zeroout(inode, &orig_ex);
3001		if (err)
3002			goto fix_extent_len;
3003		/* update the extent length and mark as initialized */
3004		ex->ee_len = cpu_to_le16(ee_len);
3005		ext4_ext_try_to_merge(handle, inode, path, ex);
3006		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3007		goto out;
3008	} else if (err)
3009		goto fix_extent_len;
3010
3011out:
3012	ext4_ext_show_leaf(inode, path);
3013	return err;
3014
3015fix_extent_len:
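	/*
	 * Splitting failed (and zeroout was not possible or also failed):
	 * put back the original ee_len so the extent is left as it was.
	 */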
3016	ex->ee_len = orig_ex.ee_len;
3017	ext4_ext_dirty(handle, inode, path + depth);
3018	return err;
3019}
3020
3021/*
3022 * ext4_split_extent() splits an extent and marks the extent covered
3023 * by @map as split_flag indicates.
3024 *
3025 * It may result in splitting the extent into multiple extents (up to three).
3026 * There are three possibilities:
3027 *   a> There is no split required
3028 *   b> Splits into two extents: the split happens at either end of the extent
3029 *   c> Splits into three extents: someone is splitting in the middle of the extent
3030 *
3031 */
3032static int ext4_split_extent(handle_t *handle,
3033			      struct inode *inode,
3034			      struct ext4_ext_path *path,
3035			      struct ext4_map_blocks *map,
3036			      int split_flag,
3037			      int flags)
3038{
3039	ext4_lblk_t ee_block;
3040	struct ext4_extent *ex;
3041	unsigned int ee_len, depth;
3042	int err = 0;
3043	int uninitialized;
3044	int split_flag1, flags1;
3045
3046	depth = ext_depth(inode);
3047	ex = path[depth].p_ext;
3048	ee_block = le32_to_cpu(ex->ee_block);
3049	ee_len = ext4_ext_get_actual_len(ex);
3050	uninitialized = ext4_ext_is_uninitialized(ex);
3051
3052	if (map->m_lblk + map->m_len < ee_block + ee_len) {
3053		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
3054			      EXT4_EXT_MAY_ZEROOUT : 0;
3055		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3056		if (uninitialized)
3057			split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
3058				       EXT4_EXT_MARK_UNINIT2;
3059		err = ext4_split_extent_at(handle, inode, path,
3060				map->m_lblk + map->m_len, split_flag1, flags1);
3061		if (err)
3062			goto out;
3063	}
3064
3065	ext4_ext_drop_refs(path);
3066	path = ext4_ext_find_extent(inode, map->m_lblk, path);
3067	if (IS_ERR(path))
3068		return PTR_ERR(path);
3069
3070	if (map->m_lblk >= ee_block) {
3071		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
3072			      EXT4_EXT_MAY_ZEROOUT : 0;
3073		if (uninitialized)
3074			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
3075		if (split_flag & EXT4_EXT_MARK_UNINIT2)
3076			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
3077		err = ext4_split_extent_at(handle, inode, path,
3078				map->m_lblk, split_flag1, flags);
3079		if (err)
3080			goto out;
3081	}
3082
3083	ext4_ext_show_leaf(inode, path);
3084out:
3085	return err ? err : map->m_len;
3086}
3087
3088#define EXT4_EXT_ZERO_LEN 7
3089/*
3090 * This function is called by ext4_ext_map_blocks() if someone tries to write
3091 * to an uninitialized extent. It may result in splitting the uninitialized
3092 * extent into multiple extents (up to three - one initialized and two
3093 * uninitialized).
3094 * There are three possibilities:
3095 *   a> There is no split required: Entire extent should be initialized
3096 *   b> Splits into two extents: the write happens at either end of the extent
3097 *   c> Splits into three extents: someone is writing in the middle of the extent
3098 *
3099 * Pre-conditions:
3100 *  - The extent pointed to by 'path' is uninitialized.
3101 *  - The extent pointed to by 'path' contains a superset
3102 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3103 *
3104 * Post-conditions on success:
3105 *  - the returned value is the number of blocks beyond map->m_lblk
3106 *    that are allocated and initialized.
3107 *    It is guaranteed to be >= map->m_len.
3108 */
3109static int ext4_ext_convert_to_initialized(handle_t *handle,
3110					   struct inode *inode,
3111					   struct ext4_map_blocks *map,
3112					   struct ext4_ext_path *path)
3113{
3114	struct ext4_extent_header *eh;
3115	struct ext4_map_blocks split_map;
3116	struct ext4_extent zero_ex;
3117	struct ext4_extent *ex;
3118	ext4_lblk_t ee_block, eof_block;
3119	unsigned int ee_len, depth;
3120	int allocated;
3121	int err = 0;
3122	int split_flag = 0;
3123
3124	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
3125		"block %llu, max_blocks %u\n", inode->i_ino,
3126		(unsigned long long)map->m_lblk, map->m_len);
3127
3128	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3129		inode->i_sb->s_blocksize_bits;
3130	if (eof_block < map->m_lblk + map->m_len)
3131		eof_block = map->m_lblk + map->m_len;
3132
3133	depth = ext_depth(inode);
3134	eh = path[depth].p_hdr;
3135	ex = path[depth].p_ext;
3136	ee_block = le32_to_cpu(ex->ee_block);
3137	ee_len = ext4_ext_get_actual_len(ex);
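	/* blocks in this extent from map->m_lblk to the end of the extent */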
3138	allocated = ee_len - (map->m_lblk - ee_block);
3139
3140	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3141
3142	/* Pre-conditions */
3143	BUG_ON(!ext4_ext_is_uninitialized(ex));
3144	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3145
3146	/*
3147	 * Attempt to transfer newly initialized blocks from the currently
3148	 * uninitialized extent to its left neighbor. This is much cheaper
3149	 * than an insertion followed by a merge as those involve costly
3150	 * memmove() calls. This is the common case in steady state for
3151	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
3152	 * writes.
3153	 *
3154	 * Limitations of the current logic:
3155	 *  - L1: we only deal with writes at the start of the extent.
3156	 *    The approach could be extended to writes at the end
3157	 *    of the extent but this scenario was deemed less common.
3158	 *  - L2: we do not deal with writes covering the whole extent.
3159	 *    This would require removing the extent if the transfer
3160	 *    is possible.
3161	 *  - L3: we only attempt to merge with an extent stored in the
3162	 *    same extent tree node.
3163	 */
3164	if ((map->m_lblk == ee_block) &&	/*L1*/
3165		(map->m_len < ee_len) &&	/*L2*/
3166		(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
3167		struct ext4_extent *prev_ex;
3168		ext4_lblk_t prev_lblk;
3169		ext4_fsblk_t prev_pblk, ee_pblk;
3170		unsigned int prev_len, write_len;
3171
3172		prev_ex = ex - 1;
3173		prev_lblk = le32_to_cpu(prev_ex->ee_block);
3174		prev_len = ext4_ext_get_actual_len(prev_ex);
3175		prev_pblk = ext4_ext_pblock(prev_ex);
3176		ee_pblk = ext4_ext_pblock(ex);
3177		write_len = map->m_len;
3178
3179		/*
3180		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
3181		 * upon those conditions:
3182		 * - C1: prev_ex is initialized,
3183		 * - C2: prev_ex is logically abutting ex,
3184		 * - C3: prev_ex is physically abutting ex,
3185		 * - C4: prev_ex can receive the additional blocks without
3186		 *   overflowing the (initialized) length limit.
3187		 */
3188		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
3189			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
3190			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
3191			(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
3192			err = ext4_ext_get_access(handle, inode, path + depth);
3193			if (err)
3194				goto out;
3195
3196			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3197				map, ex, prev_ex);
3198
3199			/* Shift the start of ex by 'write_len' blocks */
3200			ex->ee_block = cpu_to_le32(ee_block + write_len);
3201			ext4_ext_store_pblock(ex, ee_pblk + write_len);
3202			ex->ee_len = cpu_to_le16(ee_len - write_len);
3203			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3204
3205			/* Extend prev_ex by 'write_len' blocks */
3206			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
3207
3208			/* Mark the block containing both extents as dirty */
3209			ext4_ext_dirty(handle, inode, path + depth);
3210
3211			/* Update path to point to the right extent */
3212			path[depth].p_ext = prev_ex;
3213
3214			/* Result: number of initialized blocks past m_lblk */
3215			allocated = write_len;
3216			goto out;
3217		}
3218	}
3219
3220	WARN_ON(map->m_lblk < ee_block);
3221	/*
3222	 * It is safe to convert extent to initialized via explicit
3223	 * zeroout only if the extent is fully inside i_size or new_size.
3224	 */
3225	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3226
3227	/* If the extent has at most 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
3228	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
3229	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3230		err = ext4_ext_zeroout(inode, ex);
3231		if (err)
3232			goto out;
3233
3234		err = ext4_ext_get_access(handle, inode, path + depth);
3235		if (err)
3236			goto out;
3237		ext4_ext_mark_initialized(ex);
3238		ext4_ext_try_to_merge(handle, inode, path, ex);
3239		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3240		goto out;
3241	}
3242
3243	/*
3244	 * four cases:
3245	 * 1. split the extent into three extents.
3246	 * 2. split the extent into two extents, zeroout the first half.
3247	 * 3. split the extent into two extents, zeroout the second half.
3248	 * 4. split the extent into two extents without zeroout.
3249	 */
3250	split_map.m_lblk = map->m_lblk;
3251	split_map.m_len = map->m_len;
3252
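	/*
	 * By default we split exactly around the requested range; the
	 * checks below may widen split_map over a region that has been
	 * zeroed out, so that fewer extents are needed after the split.
	 */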
3253	if (allocated > map->m_len) {
3254		if (allocated <= EXT4_EXT_ZERO_LEN &&
3255		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3256			/* case 3 */
3257			zero_ex.ee_block =
3258					 cpu_to_le32(map->m_lblk);
3259			zero_ex.ee_len = cpu_to_le16(allocated);
3260			ext4_ext_store_pblock(&zero_ex,
3261				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3262			err = ext4_ext_zeroout(inode, &zero_ex);
3263			if (err)
3264				goto out;
3265			split_map.m_lblk = map->m_lblk;
3266			split_map.m_len = allocated;
3267		} else if ((map->m_lblk - ee_block + map->m_len <
3268			   EXT4_EXT_ZERO_LEN) &&
3269			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3270			/* case 2 */
3271			if (map->m_lblk != ee_block) {
3272				zero_ex.ee_block = ex->ee_block;
3273				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3274							ee_block);
3275				ext4_ext_store_pblock(&zero_ex,
3276						      ext4_ext_pblock(ex));
3277				err = ext4_ext_zeroout(inode, &zero_ex);
3278				if (err)
3279					goto out;
3280			}
3281
3282			split_map.m_lblk = ee_block;
3283			split_map.m_len = map->m_lblk - ee_block + map->m_len;
3284			allocated = map->m_len;
3285		}
3286	}
3287
3288	allocated = ext4_split_extent(handle, inode, path,
3289				       &split_map, split_flag, 0);
3290	if (allocated < 0)
3291		err = allocated;
3292
3293out:
3294	return err ? err : allocated;
3295}
3296
3297/*
3298 * This function is called by ext4_ext_map_blocks() from
3299 * ext4_get_blocks_dio_write() when DIO is used to write
3300 * to an uninitialized extent.
3301 *
3302 * Writing to an uninitialized extent may result in splitting the uninitialized
3303 * extent into multiple initialized/uninitialized extents (up to three).
3304 * There are three possibilities:
3305 *   a> There is no split required: the entire extent should stay uninitialized
3306 *   b> Splits into two extents: the write happens at either end of the extent
3307 *   c> Splits into three extents: someone is writing in the middle of the extent
3308 *
3309 * One or more index blocks may be needed if the extent tree grows after
3310 * the uninitialized extent is split. To prevent ENOSPC from occurring at IO
3311 * completion, we need to split the uninitialized extent before DIO submits
3312 * the IO. The uninitialized extent handled at this time will be split
3313 * into three uninitialized extents (at most). After the IO completes, the part
3314 * being filled will be converted to initialized by the end_io callback
3315 * via ext4_convert_unwritten_extents().
3316 *
3317 * Returns the size of uninitialized extent to be written on success.
3318 */
3319static int ext4_split_unwritten_extents(handle_t *handle,
3320					struct inode *inode,
3321					struct ext4_map_blocks *map,
3322					struct ext4_ext_path *path,
3323					int flags)
3324{
3325	ext4_lblk_t eof_block;
3326	ext4_lblk_t ee_block;
3327	struct ext4_extent *ex;
3328	unsigned int ee_len;
3329	int split_flag = 0, depth;
3330
3331	ext_debug("ext4_split_unwritten_extents: inode %lu, logical "
3332		"block %llu, max_blocks %u\n", inode->i_ino,
3333		(unsigned long long)map->m_lblk, map->m_len);
3334
3335	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3336		inode->i_sb->s_blocksize_bits;
3337	if (eof_block < map->m_lblk + map->m_len)
3338		eof_block = map->m_lblk + map->m_len;
3339	/*
3340	 * It is safe to convert extent to initialized via explicit
3341	 * zeroout only if the extent is fully inside i_size or new_size.
3342	 */
3343	depth = ext_depth(inode);
3344	ex = path[depth].p_ext;
3345	ee_block = le32_to_cpu(ex->ee_block);
3346	ee_len = ext4_ext_get_actual_len(ex);
3347
3348	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3349	split_flag |= EXT4_EXT_MARK_UNINIT2;
3350
3351	flags |= EXT4_GET_BLOCKS_PRE_IO;
3352	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3353}
3354
3355static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3356					      struct inode *inode,
3357					      struct ext4_ext_path *path)
3358{
3359	struct ext4_extent *ex;
3360	int depth;
3361	int err = 0;
3362
3363	depth = ext_depth(inode);
3364	ex = path[depth].p_ext;
3365
3366	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3367		"block %llu, max_blocks %u\n", inode->i_ino,
3368		(unsigned long long)le32_to_cpu(ex->ee_block),
3369		ext4_ext_get_actual_len(ex));
3370
3371	err = ext4_ext_get_access(handle, inode, path + depth);
3372	if (err)
3373		goto out;
3374	/* first mark the extent as initialized */
3375	ext4_ext_mark_initialized(ex);
3376
3377	/* note: ext4_ext_correct_indexes() isn't needed here because
3378	 * borders are not changed
3379	 */
3380	ext4_ext_try_to_merge(handle, inode, path, ex);
3381
3382	/* Mark modified extent as dirty */
3383	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3384out:
3385	ext4_ext_show_leaf(inode, path);
3386	return err;
3387}
3388
3389static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3390			sector_t block, int count)
3391{
3392	int i;
3393	for (i = 0; i < count; i++)
3394		unmap_underlying_metadata(bdev, block + i);
3395}
3396
3397/*
3398 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3399 */
3400static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3401			      ext4_lblk_t lblk,
3402			      struct ext4_ext_path *path,
3403			      unsigned int len)
3404{
3405	int i, depth;
3406	struct ext4_extent_header *eh;
3407	struct ext4_extent *last_ex;
3408
3409	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3410		return 0;
3411
3412	depth = ext_depth(inode);
3413	eh = path[depth].p_hdr;
3414
3415	/*
3416	 * We're going to remove EOFBLOCKS_FL entirely in the future so we
3417	 * do not care about this case anymore. Simply remove the flag
3418	 * if there are no extents.
3419	 */
3420	if (unlikely(!eh->eh_entries))
3421		goto out;
3422	last_ex = EXT_LAST_EXTENT(eh);
3423	/*
3424	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3425	 * last block in the last extent in the file.  We test this by
3426	 * first checking to see if the caller to
3427	 * ext4_ext_get_blocks() was interested in the last block (or
3428	 * a block beyond the last block) in the current extent.  If
3429	 * this turns out to be false, we can bail out from this
3430	 * function immediately.
3431	 */
3432	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3433	    ext4_ext_get_actual_len(last_ex))
3434		return 0;
3435	/*
3436	 * If the caller does appear to be planning to write at or
3437	 * beyond the end of the current extent, we then test to see
3438	 * if the current extent is the last extent in the file, by
3439	 * checking to make sure it was reached via the rightmost node
3440	 * at each level of the tree.
3441	 */
3442	for (i = depth-1; i >= 0; i--)
3443		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3444			return 0;
3445out:
3446	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3447	return ext4_mark_inode_dirty(handle, inode);
3448}
3449
3450/**
3451 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3452 *
3453 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
3454 * whether there are any buffers marked for delayed allocation. It returns '1'
3455 * on the first delalloc'ed buffer head found. If no buffer head in the given
3456 * range is marked for delalloc, it returns 0.
3457 * lblk_start should always be <= lblk_end.
3458 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3459 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3460 * block sooner). This is useful when blocks are truncated sequentially from
3461 * lblk_start towards lblk_end.
3462 */
3463static int ext4_find_delalloc_range(struct inode *inode,
3464				    ext4_lblk_t lblk_start,
3465				    ext4_lblk_t lblk_end,
3466				    int search_hint_reverse)
3467{
3468	struct address_space *mapping = inode->i_mapping;
3469	struct buffer_head *head, *bh = NULL;
3470	struct page *page;
3471	ext4_lblk_t i, pg_lblk;
3472	pgoff_t index;
3473
3474	if (!test_opt(inode->i_sb, DELALLOC))
3475		return 0;
3476
3477	/* reverse search won't work if the fs block size is less than the page size */
3478	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3479		search_hint_reverse = 0;
3480
3481	if (search_hint_reverse)
3482		i = lblk_end;
3483	else
3484		i = lblk_start;
3485
3486	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3487
3488	while ((i >= lblk_start) && (i <= lblk_end)) {
3489		page = find_get_page(mapping, index);
3490		if (!page)
3491			goto nextpage;
3492
3493		if (!page_has_buffers(page))
3494			goto nextpage;
3495
3496		head = page_buffers(page);
3497		if (!head)
3498			goto nextpage;
3499
3500		bh = head;
3501		pg_lblk = index << (PAGE_CACHE_SHIFT -
3502						inode->i_blkbits);
3503		do {
3504			if (unlikely(pg_lblk < lblk_start)) {
3505				/*
3506				 * This is possible when the fs block size is less
3507				 * than the page size and our cluster starts/ends in
3508				 * the middle of the page. So we need to skip the
3509				 * initial few blocks till we reach 'lblk'
3510				 */
3511				pg_lblk++;
3512				continue;
3513			}
3514
3515			/* Check if the buffer is delayed allocated and that it
3516			 * is not yet mapped. (when da-buffers are mapped during
3517			 * their writeout, their da_mapped bit is set.)
3518			 */
3519			if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3520				page_cache_release(page);
3521				trace_ext4_find_delalloc_range(inode,
3522						lblk_start, lblk_end,
3523						search_hint_reverse,
3524						1, i);
3525				return 1;
3526			}
3527			if (search_hint_reverse)
3528				i--;
3529			else
3530				i++;
3531		} while ((i >= lblk_start) && (i <= lblk_end) &&
3532				((bh = bh->b_this_page) != head));
3533nextpage:
3534		if (page)
3535			page_cache_release(page);
3536		/*
3537		 * Move to next page. 'i' will be the first lblk in the next
3538		 * page.
3539		 */
3540		if (search_hint_reverse)
3541			index--;
3542		else
3543			index++;
3544		i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3545	}
3546
3547	trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3548					search_hint_reverse, 0, 0);
3549	return 0;
3550}
3551
3552int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
3553			       int search_hint_reverse)
3554{
3555	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3556	ext4_lblk_t lblk_start, lblk_end;
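	/* search the whole cluster that contains lblk */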
3557	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3558	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3559
3560	return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3561					search_hint_reverse);
3562}
3563
3564/**
3565 * Determines how many complete clusters (out of those specified by the 'map')
3566 * are under delalloc and have had quota reserved for them.
3567 * This function is called when we are writing out the blocks that were
3568 * originally written with their allocation delayed, but then the space was
3569 * allocated using fallocate() before the delayed allocation could be resolved.
3570 * The cases to look for are:
3571 * ('=' indicates delayed allocated blocks
3572 *  '-' indicates non-delayed allocated blocks)
3573 * (a) partial clusters towards beginning and/or end outside of allocated range
3574 *     are not delalloc'ed.
3575 *	Ex:
3576 *	|----c---=|====c====|====c====|===-c----|
3577 *	         |++++++ allocated ++++++|
3578 *	==> 4 complete clusters in above example
3579 *
3580 * (b) partial cluster (outside of allocated range) towards either end is
3581 *     marked for delayed allocation. In this case, we will exclude that
3582 *     cluster.
3583 *	Ex:
3584 *	|----====c========|========c========|
3585 *	     |++++++ allocated ++++++|
3586 *	==> 1 complete cluster in above example
3587 *
3588 *	Ex:
3589 *	|================c================|
3590 *            |++++++ allocated ++++++|
3591 *	==> 0 complete clusters in above example
3592 *
3593 * ext4_da_update_reserve_space() will be called only if we
3594 * determine here that there were some "entire" clusters that span
3595 * this 'allocated' range.
3596 * In the non-bigalloc case, this function will just end up returning num_blks
3597 * without ever calling ext4_find_delalloc_range.
3598 */
3599static unsigned int
3600get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3601			   unsigned int num_blks)
3602{
3603	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3604	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3605	ext4_lblk_t lblk_from, lblk_to, c_offset;
3606	unsigned int allocated_clusters = 0;
3607
3608	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3609	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3610
3611	/* max possible clusters for this allocation */
3612	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3613
3614	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3615
3616	/* Check towards left side */
3617	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3618	if (c_offset) {
3619		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3620		lblk_to = lblk_from + c_offset - 1;
3621
3622		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3623			allocated_clusters--;
3624	}
3625
3626	/* Now check towards right. */
3627	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3628	if (allocated_clusters && c_offset) {
3629		lblk_from = lblk_start + num_blks;
3630		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3631
3632		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3633			allocated_clusters--;
3634	}
3635
3636	return allocated_clusters;
3637}
3638
3639static int
3640ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3641			struct ext4_map_blocks *map,
3642			struct ext4_ext_path *path, int flags,
3643			unsigned int allocated, ext4_fsblk_t newblock)
3644{
3645	int ret = 0;
3646	int err = 0;
3647	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3648
3649	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3650		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
3651		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3652		  flags, allocated);
3653	ext4_ext_show_leaf(inode, path);
3654
3655	trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
3656						    newblock);
3657
3658	/* get_block() called before submitting the IO: split the extent */
3659	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3660		ret = ext4_split_unwritten_extents(handle, inode, map,
3661						   path, flags);
3662		/*
3663		 * Flag the inode (non-AIO case) or the end_io struct (AIO case)
3664		 * to indicate that this IO needs conversion to written when the
3665		 * IO is completed
3666		 */
3667		if (io)
3668			ext4_set_io_unwritten_flag(inode, io);
3669		else
3670			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3671		if (ext4_should_dioread_nolock(inode))
3672			map->m_flags |= EXT4_MAP_UNINIT;
3673		goto out;
3674	}
3675	/* IO end_io complete, convert the filled extent to written */
3676	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3677		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3678							path);
3679		if (ret >= 0) {
3680			ext4_update_inode_fsync_trans(handle, inode, 1);
3681			err = check_eofblocks_fl(handle, inode, map->m_lblk,
3682						 path, map->m_len);
3683		} else
3684			err = ret;
3685		goto out2;
3686	}
3687	/* buffered IO case */
3688	/*
3689	 * repeat fallocate creation request
3690	 * we already have an unwritten extent
3691	 */
3692	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3693		goto map_out;
3694
3695	/* buffered READ or buffered write_begin() lookup */
3696	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3697		/*
3698		 * We have blocks reserved already.  We
3699		 * return allocated blocks so that delalloc
3700		 * won't do block reservation for us.  But
3701		 * the buffer head will be unmapped so that
3702		 * a read from the block returns 0s.
3703		 */
3704		map->m_flags |= EXT4_MAP_UNWRITTEN;
3705		goto out1;
3706	}
3707
3708	/* buffered write, writepage time, convert */
3709	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3710	if (ret >= 0)
3711		ext4_update_inode_fsync_trans(handle, inode, 1);
3712out:
3713	if (ret <= 0) {
3714		err = ret;
3715		goto out2;
3716	} else
3717		allocated = ret;
3718	map->m_flags |= EXT4_MAP_NEW;
3719	/*
3720	 * if we allocated more blocks than requested
3721	 * we need to make sure we unmap the extra block
3722	 * allocated. The actual needed block will get
3723	 * unmapped later when we find the buffer_head marked
3724	 * new.
3725	 */
3726	if (allocated > map->m_len) {
3727		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3728					newblock + map->m_len,
3729					allocated - map->m_len);
3730		allocated = map->m_len;
3731	}
3732
3733	/*
3734	 * If we have done fallocate at an offset that is already
3735	 * delayed allocated, we would have block reservation
3736	 * and quota reservation done in the delayed write path.
3737	 * But fallocate would have already updated quota and block
3738	 * count for this offset. So cancel these reservations.
3739	 */
3740	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3741		unsigned int reserved_clusters;
3742		reserved_clusters = get_reserved_cluster_alloc(inode,
3743				map->m_lblk, map->m_len);
3744		if (reserved_clusters)
3745			ext4_da_update_reserve_space(inode,
3746						     reserved_clusters,
3747						     0);
3748	}
3749
3750map_out:
3751	map->m_flags |= EXT4_MAP_MAPPED;
3752	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
3753		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3754					 map->m_len);
3755		if (err < 0)
3756			goto out2;
3757	}
3758out1:
3759	if (allocated > map->m_len)
3760		allocated = map->m_len;
3761	ext4_ext_show_leaf(inode, path);
3762	map->m_pblk = newblock;
3763	map->m_len = allocated;
3764out2:
3765	if (path) {
3766		ext4_ext_drop_refs(path);
3767		kfree(path);
3768	}
3769	return err ? err : allocated;
3770}
3771
3772/*
3773 * get_implied_cluster_alloc - check to see if the requested
3774 * allocation (in the map structure) overlaps with a cluster already
3775 * allocated in an extent.
3776 *	@sb	The filesystem superblock structure
3777 *	@map	The requested lblk->pblk mapping
3778 *	@ex	The extent structure which might contain an implied
3779 *			cluster allocation
3780 *
3781 * This function is called by ext4_ext_map_blocks() after we failed to
3782 * find blocks that were already in the inode's extent tree.  Hence,
3783 * we know that the beginning of the requested region cannot overlap
3784 * the extent from the inode's extent tree.  There are three cases we
3785 * want to catch.  The first is this case:
3786 *
3787 *		 |--- cluster # N--|
3788 *    |--- extent ---|	|---- requested region ---|
3789 *			|==========|
3790 *
3791 * The second case that we need to test for is this one:
3792 *
3793 *   |--------- cluster # N ----------------|
3794 *	   |--- requested region --|   |------- extent ----|
3795 *	   |=======================|
3796 *
3797 * The third case is when the requested region lies between two extents
3798 * within the same cluster:
3799 *          |------------- cluster # N-------------|
3800 * |----- ex -----|                  |---- ex_right ----|
3801 *                  |------ requested region ------|
3802 *                  |================|
3803 *
3804 * In each of the above cases, we need to set map->m_pblk and
3805 * map->m_len so that they correspond to the region labelled "|====|"
3806 * within cluster #N, since that cluster (EXT4_B2C(sbi, map->m_lblk))
3807 * is already in use for data.  We will then return 1 to
3808 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3809 * as a new "allocated" block region.  Otherwise, we will return 0 and
3810 * ext4_ext_map_blocks() will then allocate one or more new clusters
3811 * by calling ext4_mb_new_blocks().
3812 */
3813static int get_implied_cluster_alloc(struct super_block *sb,
3814				     struct ext4_map_blocks *map,
3815				     struct ext4_extent *ex,
3816				     struct ext4_ext_path *path)
3817{
3818	struct ext4_sb_info *sbi = EXT4_SB(sb);
3819	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3820	ext4_lblk_t ex_cluster_start, ex_cluster_end;
3821	ext4_lblk_t rr_cluster_start;
3822	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3823	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3824	unsigned short ee_len = ext4_ext_get_actual_len(ex);
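	/*
	 * Note: s_cluster_ratio is a power of two, so masking with
	 * (s_cluster_ratio - 1), as above, yields the block's offset
	 * within its cluster.
	 */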
3825
3826	/* The extent passed in that we are trying to match */
3827	ex_cluster_start = EXT4_B2C(sbi, ee_block);
3828	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3829
3830	/* The requested region passed into ext4_map_blocks() */
3831	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3832
3833	if ((rr_cluster_start == ex_cluster_end) ||
3834	    (rr_cluster_start == ex_cluster_start)) {
3835		if (rr_cluster_start == ex_cluster_end)
3836			ee_start += ee_len - 1;
3837		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3838			c_offset;
3839		map->m_len = min(map->m_len,
3840				 (unsigned) sbi->s_cluster_ratio - c_offset);
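		/*
		 * e.g. with a cluster ratio of 16 and c_offset of 5, at most
		 * 11 blocks (the remainder of cluster N) can be mapped here.
		 */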
3841		/*
3842		 * Check for and handle this case:
3843		 *
3844		 *   |--------- cluster # N-------------|
3845		 *		       |------- extent ----|
3846		 *	   |--- requested region ---|
3847		 *	   |===========|
3848		 */
3849
3850		if (map->m_lblk < ee_block)
3851			map->m_len = min(map->m_len, ee_block - map->m_lblk);
3852
3853		/*
3854		 * Check for the case where there is already another allocated
3855		 * block to the right of 'ex' but before the end of the cluster.
3856		 *
3857		 *          |------------- cluster # N-------------|
3858		 * |----- ex -----|                  |---- ex_right ----|
3859		 *                  |------ requested region ------|
3860		 *                  |================|
3861		 */
3862		if (map->m_lblk > ee_block) {
3863			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3864			map->m_len = min(map->m_len, next - map->m_lblk);
3865		}
3866
3867		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3868		return 1;
3869	}
3870
3871	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3872	return 0;
3873}
3874
3875
3876/*
3877 * Block allocation/map/preallocation routine for extent-based files
3878 *
3879 *
3880 * Need to be called with
3881 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3882 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3883 *
3884 * return > 0, number of blocks already mapped/allocated
3885 *          if create == 0 and these are pre-allocated blocks
3886 *          	buffer head is unmapped
3887 *          otherwise blocks are mapped
3888 *
3889 * return = 0, if plain look up failed (blocks have not been allocated)
3890 *          buffer head is unmapped
3891 *
3892 * return < 0, error case.
3893 */
3894int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3895			struct ext4_map_blocks *map, int flags)
3896{
3897	struct ext4_ext_path *path = NULL;
3898	struct ext4_extent newex, *ex, *ex2;
3899	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3900	ext4_fsblk_t newblock = 0;
3901	int free_on_err = 0, err = 0, depth, ret;
3902	unsigned int allocated = 0, offset = 0;
3903	unsigned int allocated_clusters = 0;
3904	struct ext4_allocation_request ar;
3905	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3906	ext4_lblk_t cluster_offset;
3907
3908	ext_debug("blocks %u/%u requested for inode %lu\n",
3909		  map->m_lblk, map->m_len, inode->i_ino);
3910	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3911
3912	/* check in cache */
3913	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3914		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3915			if ((sbi->s_cluster_ratio > 1) &&
3916			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3917				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3918
3919			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3920				/*
3921				 * block isn't allocated yet and
3922				 * user doesn't want to allocate it
3923				 */
3924				goto out2;
3925			}
3926			/* we should allocate requested block */
3927		} else {
3928			/* block is already allocated */
3929			if (sbi->s_cluster_ratio > 1)
3930				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3931			newblock = map->m_lblk
3932				   - le32_to_cpu(newex.ee_block)
3933				   + ext4_ext_pblock(&newex);
3934			/* number of remaining blocks in the extent */
3935			allocated = ext4_ext_get_actual_len(&newex) -
3936				(map->m_lblk - le32_to_cpu(newex.ee_block));
3937			goto out;
3938		}
3939	}
3940
3941	/* find extent for this block */
3942	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3943	if (IS_ERR(path)) {
3944		err = PTR_ERR(path);
3945		path = NULL;
3946		goto out2;
3947	}
3948
3949	depth = ext_depth(inode);
3950
3951	/*
3952	 * consistent leaf must not be empty;
3953	 * this situation is possible, though, _during_ tree modification;
3954	 * this is why assert can't be put in ext4_ext_find_extent()
3955	 */
3956	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3957		EXT4_ERROR_INODE(inode, "bad extent address "
3958				 "lblock: %lu, depth: %d pblock %lld",
3959				 (unsigned long) map->m_lblk, depth,
3960				 path[depth].p_block);
3961		err = -EIO;
3962		goto out2;
3963	}
3964
3965	ex = path[depth].p_ext;
3966	if (ex) {
3967		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3968		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3969		unsigned short ee_len;
3970
3971		/*
3972		 * Uninitialized extents are treated as holes, except that
3973		 * we split out initialized portions during a write.
3974		 */
3975		ee_len = ext4_ext_get_actual_len(ex);
3976
3977		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
3978
3979		/* if found extent covers block, simply return it */
3980		if (in_range(map->m_lblk, ee_block, ee_len)) {
3981			newblock = map->m_lblk - ee_block + ee_start;
3982			/* number of remaining blocks in the extent */
3983			allocated = ee_len - (map->m_lblk - ee_block);
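			/*
			 * Example with assumed values: if ex maps logical
			 * blocks 100..107 to physical blocks 5000..5007 and
			 * m_lblk is 103, then newblock is 5003 and allocated
			 * is 5 (blocks 103..107).
			 */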
3984			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3985				  ee_block, ee_len, newblock);
3986
3987			/*
3988			 * Do not put uninitialized extent
3989			 * in the cache
3990			 */
3991			if (!ext4_ext_is_uninitialized(ex)) {
3992				ext4_ext_put_in_cache(inode, ee_block,
3993					ee_len, ee_start);
3994				goto out;
3995			}
3996			ret = ext4_ext_handle_uninitialized_extents(
3997				handle, inode, map, path, flags,
3998				allocated, newblock);
3999			return ret;
4000		}
4001	}
4002
4003	if ((sbi->s_cluster_ratio > 1) &&
4004	    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
4005		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4006
4007	/*
4008	 * requested block isn't allocated yet;
4009	 * don't try to allocate it if the create flag is zero
4010	 */
4011	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4012		/*
4013		 * put the gap we just found into the cache to speed up
4014		 * subsequent requests
4015		 */
4016		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4017		goto out2;
4018	}
4019
4020	/*
4021	 * Okay, we need to do block allocation.
4022	 */
4023	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
4024	newex.ee_block = cpu_to_le32(map->m_lblk);
4025	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
4026
4027	/*
4028	 * If we are doing bigalloc, check to see if the extent returned
4029	 * by ext4_ext_find_extent() implies a cluster we can use.
4030	 */
4031	if (cluster_offset && ex &&
4032	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4033		ar.len = allocated = map->m_len;
4034		newblock = map->m_pblk;
4035		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4036		goto got_allocated_blocks;
4037	}
4038
4039	/* find neighbour allocated blocks */
4040	ar.lleft = map->m_lblk;
4041	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4042	if (err)
4043		goto out2;
4044	ar.lright = map->m_lblk;
4045	ex2 = NULL;
4046	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4047	if (err)
4048		goto out2;
4049
4050	/* Check if the extent after searching to the right implies a
4051	 * cluster we can use. */
4052	if ((sbi->s_cluster_ratio > 1) && ex2 &&
4053	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4054		ar.len = allocated = map->m_len;
4055		newblock = map->m_pblk;
4056		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4057		goto got_allocated_blocks;
4058	}
4059
4060	/*
4061	 * See if request is beyond maximum number of blocks we can have in
4062	 * a single extent. For an initialized extent this limit is
4063	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
4064	 * EXT_UNINIT_MAX_LEN.
4065	 */
4066	if (map->m_len > EXT_INIT_MAX_LEN &&
4067	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
4068		map->m_len = EXT_INIT_MAX_LEN;
4069	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
4070		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
4071		map->m_len = EXT_UNINIT_MAX_LEN;
4072
4073	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4074	newex.ee_len = cpu_to_le16(map->m_len);
4075	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4076	if (err)
4077		allocated = ext4_ext_get_actual_len(&newex);
4078	else
4079		allocated = map->m_len;
4080
4081	/* allocate new block */
4082	ar.inode = inode;
4083	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4084	ar.logical = map->m_lblk;
4085	/*
4086	 * We calculate the offset from the beginning of the cluster
4087	 * for the logical block number, since when we allocate a
4088	 * physical cluster, the physical block should start at the
4089	 * same offset from the beginning of the cluster.  This is
4090	 * needed so that future calls to get_implied_cluster_alloc()
4091	 * work correctly.
4092	 */
4093	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
4094	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4095	ar.goal -= offset;
4096	ar.logical -= offset;
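	/*
	 * Example with assumed values: with a cluster ratio of 4 and
	 * m_lblk = 10 (offset 2 within its cluster), a request for 3 blocks
	 * becomes a request for EXT4_NUM_B2C(sbi, 2 + 3) = 2 clusters, with
	 * ar.goal and ar.logical pulled back by that intra-cluster offset.
	 */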
4097	if (S_ISREG(inode->i_mode))
4098		ar.flags = EXT4_MB_HINT_DATA;
4099	else
4100		/* disable in-core preallocation for non-regular files */
4101		ar.flags = 0;
4102	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4103		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4104	newblock = ext4_mb_new_blocks(handle, &ar, &err);
4105	if (!newblock)
4106		goto out2;
4107	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4108		  ar.goal, newblock, allocated);
4109	free_on_err = 1;
4110	allocated_clusters = ar.len;
4111	ar.len = EXT4_C2B(sbi, ar.len) - offset;
4112	if (ar.len > allocated)
4113		ar.len = allocated;
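	/*
	 * ar.len is now back in blocks: the portion of the allocated
	 * clusters at or after the requested logical block, clamped so we
	 * never report more blocks than we set out to map.
	 */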
4114
4115got_allocated_blocks:
4116	/* try to insert new extent into found leaf and return */
4117	ext4_ext_store_pblock(&newex, newblock + offset);
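	/*
	 * For a freshly allocated cluster, "+ offset" keeps the mapped
	 * physical block at the same offset within its cluster as the
	 * logical block; on the implied-cluster paths above, offset is
	 * still 0 and newblock already points at the right block.
	 */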
4118	newex.ee_len = cpu_to_le16(ar.len);
4119	/* Mark uninitialized */
4120	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
4121		ext4_ext_mark_uninitialized(&newex);
4122		/*
4123		 * An io_end structure is created for every IO write to an
4124		 * uninitialized extent. To avoid unnecessary conversion,
4125		 * here we flag only the IO that really needs the conversion.
4126		 * For the non-async direct IO case, flag the inode state
4127		 * instead, so the conversion is performed when the IO is done.
4128		 */
4129		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
4130			if (io)
4131				ext4_set_io_unwritten_flag(inode, io);
4132			else
4133				ext4_set_inode_state(inode,
4134						     EXT4_STATE_DIO_UNWRITTEN);
4135		}
4136		if (ext4_should_dioread_nolock(inode))
4137			map->m_flags |= EXT4_MAP_UNINIT;
4138	}
4139
4140	err = 0;
4141	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4142		err = check_eofblocks_fl(handle, inode, map->m_lblk,
4143					 path, ar.len);
4144	if (!err)
4145		err = ext4_ext_insert_extent(handle, inode, path,
4146					     &newex, flags);
4147	if (err && free_on_err) {
4148		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4149			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4150		/* free data blocks we just allocated */
4151		/* not a good idea to call discard here directly,
4152		 * but otherwise we'd need to call it on every free() */
4153		ext4_discard_preallocations(inode);
4154		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
4155				 ext4_ext_get_actual_len(&newex), fb_flags);
4156		goto out2;
4157	}
4158
4159	/* previous routine could use block we allocated */
4160	newblock = ext4_ext_pblock(&newex);
4161	allocated = ext4_ext_get_actual_len(&newex);
4162	if (allocated > map->m_len)
4163		allocated = map->m_len;
4164	map->m_flags |= EXT4_MAP_NEW;
4165
4166	/*
4167	 * Update reserved blocks/metadata blocks after successful
4168	 * block allocation which had been deferred till now.
4169	 */
4170	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4171		unsigned int reserved_clusters;
4172		/*
4173		 * Check how many clusters we had reserved for this allocated range
4174		 */
4175		reserved_clusters = get_reserved_cluster_alloc(inode,
4176						map->m_lblk, allocated);
4177		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4178			if (reserved_clusters) {
4179				/*
4180				 * We have clusters reserved for this range.
4181				 * But since we are not doing actual allocation
4182				 * and are simply using blocks from previously
4183				 * allocated cluster, we should release the
4184				 * reservation and not claim quota.
4185				 */
4186				ext4_da_update_reserve_space(inode,
4187						reserved_clusters, 0);
4188			}
4189		} else {
4190			BUG_ON(allocated_clusters < reserved_clusters);
4191			/* We will claim quota for all newly allocated blocks. */
4192			ext4_da_update_reserve_space(inode, allocated_clusters,
4193							1);
4194			if (reserved_clusters < allocated_clusters) {
4195				struct ext4_inode_info *ei = EXT4_I(inode);
4196				int reservation = allocated_clusters -
4197						  reserved_clusters;
4198				/*
4199				 * It seems we claimed a few clusters outside of
4200				 * the range of this allocation. We should give
4201				 * them back to the reservation pool. This can
4202				 * happen in the following case:
4203				 *
4204				 * * Suppose s_cluster_ratio is 4 (i.e., each
4205				 *   cluster has 4 blocks). Thus, the clusters
4206				 *   are [0-3],[4-7],[8-11]...
4207				 * * First comes delayed allocation write for
4208				 *   logical blocks 10 & 11. Since there were no
4209				 *   previous delayed allocated blocks in the
4210				 *   range [8-11], we would reserve 1 cluster
4211				 *   for this write.
4212				 * * Next comes write for logical blocks 3 to 8.
4213				 *   In this case, we will reserve 2 clusters
4214				 *   (for [0-3] and [4-7]; and not for [8-11] as
4215				 *   that range already has delayed allocated blocks).
4216				 *   Thus total reserved clusters now becomes 3.
4217				 * * Now, during the delayed allocation writeout
4218				 *   time, we will first write blocks [3-8] and
4219				 *   allocate 3 clusters for writing these
4220				 *   blocks. Also, we would claim all these
4221				 *   three clusters above.
4222				 * * Now when we come here to writeout the
4223				 *   blocks [10-11], we would expect to claim
4224				 *   the reservation of 1 cluster we had made
4225				 *   (and we would claim it since there are no
4226				 *   more delayed allocated blocks in the range
4227				 *   [8-11]). But our reserved cluster count had
4228				 *   already gone to 0.
4229				 *
4230				 *   Thus, at the writeout step above, when we determine
4231				 *   that there are still some unwritten delayed
4232				 *   allocated blocks outside of our current
4233				 *   block range, we should increment the
4234				 *   reserved clusters count so that when the
4235				 *   remaining blocks finally get written, we
4236				 *   could claim them.
4237				 */
4238				dquot_reserve_block(inode,
4239						EXT4_C2B(sbi, reservation));
4240				spin_lock(&ei->i_block_reservation_lock);
4241				ei->i_reserved_data_blocks += reservation;
4242				spin_unlock(&ei->i_block_reservation_lock);
4243			}
4244		}
4245	}
4246
4247	/*
4248	 * Cache the extent and update transaction to commit on fdatasync only
4249	 * when it is _not_ an uninitialized extent.
4250	 */
4251	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
4252		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
4253		ext4_update_inode_fsync_trans(handle, inode, 1);
4254	} else
4255		ext4_update_inode_fsync_trans(handle, inode, 0);
4256out:
4257	if (allocated > map->m_len)
4258		allocated = map->m_len;
4259	ext4_ext_show_leaf(inode, path);
4260	map->m_flags |= EXT4_MAP_MAPPED;
4261	map->m_pblk = newblock;
4262	map->m_len = allocated;
4263out2:
4264	if (path) {
4265		ext4_ext_drop_refs(path);
4266		kfree(path);
4267	}
4268
4269	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4270		newblock, map->m_len, err ? err : allocated);
4271
4272	return err ? err : allocated;
4273}
4274
4275void ext4_ext_truncate(struct inode *inode)
4276{
4277	struct address_space *mapping = inode->i_mapping;
4278	struct super_block *sb = inode->i_sb;
4279	ext4_lblk_t last_block;
4280	handle_t *handle;
4281	loff_t page_len;
4282	int err = 0;
4283
4284	/*
4285	 * finish any pending end_io work so we won't run the risk of
4286	 * converting any truncated blocks to initialized later
4287	 */
4288	ext4_flush_completed_IO(inode);
4289
4290	/*
4291	 * probably the first extent we're going to free will be the last in the block
4292	 */
4293	err = ext4_writepage_trans_blocks(inode);
4294	handle = ext4_journal_start(inode, err);
4295	if (IS_ERR(handle))
4296		return;
4297
4298	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
4299		page_len = PAGE_CACHE_SIZE -
4300			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4301
4302		err = ext4_discard_partial_page_buffers(handle,
4303			mapping, inode->i_size, page_len, 0);
4304
4305		if (err)
4306			goto out_stop;
4307	}
4308
4309	if (ext4_orphan_add(handle, inode))
4310		goto out_stop;
4311
4312	down_write(&EXT4_I(inode)->i_data_sem);
4313	ext4_ext_invalidate_cache(inode);
4314
4315	ext4_discard_preallocations(inode);
4316
4317	/*
4318	 * TODO: optimization is possible here.
4319	 * Probably we need not scan at all,
4320	 * because page truncation is enough.
4321	 */
4322
4323	/* we have to know where to truncate from in crash case */
4324	EXT4_I(inode)->i_disksize = inode->i_size;
4325	ext4_mark_inode_dirty(handle, inode);
4326
4327	last_block = (inode->i_size + sb->s_blocksize - 1)
4328			>> EXT4_BLOCK_SIZE_BITS(sb);
4329	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4330
4331	/* In a multi-transaction truncate, we only make the final
4332	 * transaction synchronous.
4333	 */
4334	if (IS_SYNC(inode))
4335		ext4_handle_sync(handle);
4336
4337	up_write(&EXT4_I(inode)->i_data_sem);
4338
4339out_stop:
4340	/*
4341	 * If this was a simple ftruncate() and the file will remain alive,
4342	 * then we need to clear up the orphan record which we created above.
4343	 * However, if this was a real unlink then we were called by
4344	 * ext4_delete_inode(), and we allow that function to clean up the
4345	 * orphan info for us.
4346	 */
4347	if (inode->i_nlink)
4348		ext4_orphan_del(handle, inode);
4349
4350	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4351	ext4_mark_inode_dirty(handle, inode);
4352	ext4_journal_stop(handle);
4353}
4354
4355static void ext4_falloc_update_inode(struct inode *inode,
4356				int mode, loff_t new_size, int update_ctime)
4357{
4358	struct timespec now;
4359
4360	if (update_ctime) {
4361		now = current_fs_time(inode->i_sb);
4362		if (!timespec_equal(&inode->i_ctime, &now))
4363			inode->i_ctime = now;
4364	}
4365	/*
4366	 * Update only when preallocation was requested beyond
4367	 * the file size.
4368	 */
4369	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
4370		if (new_size > i_size_read(inode))
4371			i_size_write(inode, new_size);
4372		if (new_size > EXT4_I(inode)->i_disksize)
4373			ext4_update_i_disksize(inode, new_size);
4374	} else {
4375		/*
4376		 * Mark that we allocate beyond EOF so the subsequent truncate
4377		 * can proceed even if the new size is the same as i_size.
4378		 */
4379		if (new_size > i_size_read(inode))
4380			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4381	}
4382
4383}
4384
4385/*
4386 * preallocate space for a file. This implements ext4's fallocate file
4387 * operation, which gets called from the sys_fallocate system call.
4388 * For block-mapped files, posix_fallocate should fall back to the method
4389 * of writing zeroes to the required new blocks (the same behavior that is
4390 * expected of file systems which do not support the fallocate() system call).
4391 */
4392long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4393{
4394	struct inode *inode = file->f_path.dentry->d_inode;
4395	handle_t *handle;
4396	loff_t new_size;
4397	unsigned int max_blocks;
4398	int ret = 0;
4399	int ret2 = 0;
4400	int retries = 0;
4401	int flags;
4402	struct ext4_map_blocks map;
4403	unsigned int credits, blkbits = inode->i_blkbits;
4404
4405	/*
4406	 * currently supporting (pre)allocate mode for extent-based
4407	 * files _only_
4408	 */
4409	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4410		return -EOPNOTSUPP;
4411
4412	/* Return error if mode is not supported */
4413	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
4414		return -EOPNOTSUPP;
4415
4416	if (mode & FALLOC_FL_PUNCH_HOLE)
4417		return ext4_punch_hole(file, offset, len);
4418
4419	trace_ext4_fallocate_enter(inode, offset, len, mode);
4420	map.m_lblk = offset >> blkbits;
4421	/*
4422	 * We can't just convert len to max_blocks: consider, for instance,
4423	 * blocksize = 4096, offset = 3072 and len = 2048
4424	 */
4425	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4426		- map.m_lblk;
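	/*
	 * With the example values above, len >> blkbits would be 0 even
	 * though the byte range [3072, 5120) touches two blocks; aligning
	 * offset + len up to a block and subtracting m_lblk gives
	 * max_blocks = 2.
	 */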
4427	/*
4428	 * credits to insert 1 extent into extent tree
4429	 */
4430	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4431	mutex_lock(&inode->i_mutex);
4432	ret = inode_newsize_ok(inode, (len + offset));
4433	if (ret) {
4434		mutex_unlock(&inode->i_mutex);
4435		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4436		return ret;
4437	}
4438	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4439	if (mode & FALLOC_FL_KEEP_SIZE)
4440		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4441	/*
4442	 * Don't normalize the request if it can fit in one extent so
4443	 * that it doesn't get unnecessarily split into multiple
4444	 * extents.
4445	 */
4446	if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4447		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4448retry:
4449	while (ret >= 0 && ret < max_blocks) {
4450		map.m_lblk = map.m_lblk + ret;
4451		map.m_len = max_blocks = max_blocks - ret;
4452		handle = ext4_journal_start(inode, credits);
4453		if (IS_ERR(handle)) {
4454			ret = PTR_ERR(handle);
4455			break;
4456		}
4457		ret = ext4_map_blocks(handle, inode, &map, flags);
4458		if (ret <= 0) {
4459#ifdef EXT4FS_DEBUG
4460			WARN_ON(ret <= 0);
4461			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4462				    "returned error inode#%lu, block=%u, "
4463				    "max_blocks=%u", __func__,
4464				    inode->i_ino, map.m_lblk, max_blocks);
4465#endif
4466			ext4_mark_inode_dirty(handle, inode);
4467			ret2 = ext4_journal_stop(handle);
4468			break;
4469		}
4470		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4471						blkbits) >> blkbits))
4472			new_size = offset + len;
4473		else
4474			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4475
4476		ext4_falloc_update_inode(inode, mode, new_size,
4477					 (map.m_flags & EXT4_MAP_NEW));
4478		ext4_mark_inode_dirty(handle, inode);
4479		if ((file->f_flags & O_SYNC) && ret >= max_blocks)
4480			ext4_handle_sync(handle);
4481		ret2 = ext4_journal_stop(handle);
4482		if (ret2)
4483			break;
4484	}
4485	if (ret == -ENOSPC &&
4486			ext4_should_retry_alloc(inode->i_sb, &retries)) {
4487		ret = 0;
4488		goto retry;
4489	}
4490	mutex_unlock(&inode->i_mutex);
4491	trace_ext4_fallocate_exit(inode, offset, max_blocks,
4492				ret > 0 ? ret2 : ret);
4493	return ret > 0 ? ret2 : ret;
4494}
4495
4496/*
4497 * This function converts a range of blocks to written extents.
4498 * The caller of this function will pass the start offset and the size;
4499 * all unwritten extents within this range will be converted to
4500 * written extents.
4501 *
4502 * This function is called from the direct IO end_io callback
4503 * function, to convert the fallocated extents after IO is completed.
4504 * Returns 0 on success.
4505 */
4506int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4507				    ssize_t len)
4508{
4509	handle_t *handle;
4510	unsigned int max_blocks;
4511	int ret = 0;
4512	int ret2 = 0;
4513	struct ext4_map_blocks map;
4514	unsigned int credits, blkbits = inode->i_blkbits;
4515
4516	map.m_lblk = offset >> blkbits;
4517	/*
4518	 * We can't just convert len to max_blocks: consider, for instance,
4519	 * blocksize = 4096, offset = 3072 and len = 2048
4520	 */
4521	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
4522		      map.m_lblk);
4523	/*
4524	 * credits to insert 1 extent into extent tree
4525	 */
4526	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4527	while (ret >= 0 && ret < max_blocks) {
4528		map.m_lblk += ret;
4529		map.m_len = (max_blocks -= ret);
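		/*
		 * Each pass converts as many blocks as ext4_map_blocks()
		 * managed in one go; advance the logical block and shrink
		 * the remaining window by that amount.
		 */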
4530		handle = ext4_journal_start(inode, credits);
4531		if (IS_ERR(handle)) {
4532			ret = PTR_ERR(handle);
4533			break;
4534		}
4535		ret = ext4_map_blocks(handle, inode, &map,
4536				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4537		if (ret <= 0) {
4538			WARN_ON(ret <= 0);
4539			ext4_msg(inode->i_sb, KERN_ERR,
4540				 "%s:%d: inode #%lu: block %u: len %u: "
4541				 "ext4_ext_map_blocks returned %d",
4542				 __func__, __LINE__, inode->i_ino, map.m_lblk,
4543				 map.m_len, ret);
4544		}
4545		ext4_mark_inode_dirty(handle, inode);
4546		ret2 = ext4_journal_stop(handle);
4547		if (ret <= 0 || ret2)
4548			break;
4549	}
4550	return ret > 0 ? ret2 : ret;
4551}
4552
4553/*
4554 * Callback function called for each extent to gather FIEMAP information.
4555 */
4556static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
4557		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
4558		       void *data)
4559{
4560	__u64	logical;
4561	__u64	physical;
4562	__u64	length;
4563	__u32	flags = 0;
4564	int		ret = 0;
4565	struct fiemap_extent_info *fieinfo = data;
4566	unsigned char blksize_bits;
4567
4568	blksize_bits = inode->i_sb->s_blocksize_bits;
4569	logical = (__u64)newex->ec_block << blksize_bits;
4570
4571	if (newex->ec_start == 0) {
4572		/*
4573		 * No extent in the extent tree contains block @newex->ec_start,
4574		 * so the block may lie in 1) a hole or 2) a delayed extent.
4575		 *
4576		 * Holes or delayed-extents are processed as follows.
4577		 * 1. lookup dirty pages with specified range in pagecache.
4578		 *    If no page is found, there is no delayed extent and we
4579		 *    return with EXT_CONTINUE.
4580		 * 2. find the 1st mapped buffer,
4581		 * 3. check if the mapped buffer is both in the request range
4582		 *    and a delayed buffer. If not, there is no delayed-extent,
4583		 *    then return.
4584		 * 4. a delayed-extent is found, the extent will be collected.
4585		 */
4586		ext4_lblk_t	end = 0;
4587		pgoff_t		last_offset;
4588		pgoff_t		offset;
4589		pgoff_t		index;
4590		pgoff_t		start_index = 0;
4591		struct page	**pages = NULL;
4592		struct buffer_head *bh = NULL;
4593		struct buffer_head *head = NULL;
4594		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
4595
4596		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4597		if (pages == NULL)
4598			return -ENOMEM;
4599
4600		offset = logical >> PAGE_SHIFT;
4601repeat:
4602		last_offset = offset;
4603		head = NULL;
4604		ret = find_get_pages_tag(inode->i_mapping, &offset,
4605					PAGECACHE_TAG_DIRTY, nr_pages, pages);
4606
4607		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4608			/* First time, try to find a mapped buffer. */
4609			if (ret == 0) {
4610out:
4611				for (index = 0; index < ret; index++)
4612					page_cache_release(pages[index]);
4613				/* just a hole. */
4614				kfree(pages);
4615				return EXT_CONTINUE;
4616			}
4617			index = 0;
4618
4619next_page:
4620			/* Try to find the 1st mapped buffer. */
4621			end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
4622				  blksize_bits;
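			/*
			 * 'end' is now the logical block number of the first
			 * block of this page (the page index converted from
			 * page-size units to filesystem-block units).
			 */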
4623			if (!page_has_buffers(pages[index]))
4624				goto out;
4625			head = page_buffers(pages[index]);
4626			if (!head)
4627				goto out;
4628
4629			index++;
4630			bh = head;
4631			do {
4632				if (end >= newex->ec_block +
4633					newex->ec_len)
4634					/* The buffer is out of
4635					 * the request range.
4636					 */
4637					goto out;
4638
4639				if (buffer_mapped(bh) &&
4640				    end >= newex->ec_block) {
4641					start_index = index - 1;
4642					/* get the 1st mapped buffer. */
4643					goto found_mapped_buffer;
4644				}
4645
4646				bh = bh->b_this_page;
4647				end++;
4648			} while (bh != head);
4649
4650			/* No mapped buffer in the range found in this page,
4651			 * we need to look at the next page.
4652			 */
4653			if (index >= ret) {
4654				/* There is no page left, but we need to limit
4655				 * newex->ec_len.
4656				 */
4657				newex->ec_len = end - newex->ec_block;
4658				goto out;
4659			}
4660			goto next_page;
4661		} else {
4662			/* Find contiguous delayed buffers. */
4663			if (ret > 0 && pages[0]->index == last_offset)
4664				head = page_buffers(pages[0]);
4665			bh = head;
4666			index = 1;
4667			start_index = 0;
4668		}
4669
4670found_mapped_buffer:
4671		if (bh != NULL && buffer_delay(bh)) {
4672			/* 1st or contiguous delayed buffer found. */
4673			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4674				/*
4675				 * 1st delayed buffer found, record
4676				 * the start of extent.
4677				 */
4678				flags |= FIEMAP_EXTENT_DELALLOC;
4679				newex->ec_block = end;
4680				logical = (__u64)end << blksize_bits;
4681			}
4682			/* Find contiguous delayed buffers. */
4683			do {
4684				if (!buffer_delay(bh))
4685					goto found_delayed_extent;
4686				bh = bh->b_this_page;
4687				end++;
4688			} while (bh != head);
4689
4690			for (; index < ret; index++) {
4691				if (!page_has_buffers(pages[index])) {
4692					bh = NULL;
4693					break;
4694				}
4695				head = page_buffers(pages[index]);
4696				if (!head) {
4697					bh = NULL;
4698					break;
4699				}
4700
4701				if (pages[index]->index !=
4702				    pages[start_index]->index + index
4703				    - start_index) {
4704					/* Blocks are not contiguous. */
4705					bh = NULL;
4706					break;
4707				}
4708				bh = head;
4709				do {
4710					if (!buffer_delay(bh))
4711						/* Delayed-extent ends. */
4712						goto found_delayed_extent;
4713					bh = bh->b_this_page;
4714					end++;
4715				} while (bh != head);
4716			}
4717		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4718			/* a hole found. */
4719			goto out;
4720
4721found_delayed_extent:
4722		newex->ec_len = min(end - newex->ec_block,
4723						(ext4_lblk_t)EXT_INIT_MAX_LEN);
4724		if (ret == nr_pages && bh != NULL &&
4725			newex->ec_len < EXT_INIT_MAX_LEN &&
4726			buffer_delay(bh)) {
4727			/* The delayed extent may extend into more pages; keep scanning. */
4728			for (index = 0; index < ret; index++)
4729				page_cache_release(pages[index]);
4730			goto repeat;
4731		}
4732
4733		for (index = 0; index < ret; index++)
4734			page_cache_release(pages[index]);
4735		kfree(pages);
4736	}
4737
4738	physical = (__u64)newex->ec_start << blksize_bits;
4739	length =   (__u64)newex->ec_len << blksize_bits;
4740
4741	if (ex && ext4_ext_is_uninitialized(ex))
4742		flags |= FIEMAP_EXTENT_UNWRITTEN;
4743
4744	if (next == EXT_MAX_BLOCKS)
4745		flags |= FIEMAP_EXTENT_LAST;
4746
4747	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4748					length, flags);
4749	if (ret < 0)
4750		return ret;
4751	if (ret == 1)
4752		return EXT_BREAK;
4753	return EXT_CONTINUE;
4754}
4755/* fiemap flags we can handle are specified here */
4756#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4757
4758static int ext4_xattr_fiemap(struct inode *inode,
4759				struct fiemap_extent_info *fieinfo)
4760{
4761	__u64 physical = 0;
4762	__u64 length;
4763	__u32 flags = FIEMAP_EXTENT_LAST;
4764	int blockbits = inode->i_sb->s_blocksize_bits;
4765	int error = 0;
4766
4767	/* in-inode? */
4768	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4769		struct ext4_iloc iloc;
4770		int offset;	/* offset of xattr in inode */
4771
4772		error = ext4_get_inode_loc(inode, &iloc);
4773		if (error)
4774			return error;
4775		physical = iloc.bh->b_blocknr << blockbits;
4776		offset = EXT4_GOOD_OLD_INODE_SIZE +
4777				EXT4_I(inode)->i_extra_isize;
4778		physical += offset;
4779		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4780		flags |= FIEMAP_EXTENT_DATA_INLINE;
4781		brelse(iloc.bh);
4782	} else { /* external block */
4783		physical = EXT4_I(inode)->i_file_acl << blockbits;
4784		length = inode->i_sb->s_blocksize;
4785	}
4786
4787	if (physical)
4788		error = fiemap_fill_next_extent(fieinfo, 0, physical,
4789						length, flags);
4790	return (error < 0 ? error : 0);
4791}
4792
4793/*
4794 * ext4_ext_punch_hole
4795 *
4796 * Punches a hole of "length" bytes in a file starting
4797 * at byte "offset"
4798 *
4799 * @inode:  The inode of the file to punch a hole in
4800 * @offset: The starting byte offset of the hole
4801 * @length: The length of the hole
4802 *
4803 * Returns 0 on success or a negative error code on failure
4804 */
4805int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4806{
4807	struct inode *inode = file->f_path.dentry->d_inode;
4808	struct super_block *sb = inode->i_sb;
4809	ext4_lblk_t first_block, stop_block;
4810	struct address_space *mapping = inode->i_mapping;
4811	handle_t *handle;
4812	loff_t first_page, last_page, page_len;
4813	loff_t first_page_offset, last_page_offset;
4814	int credits, err = 0;
4815
4816	/* No need to punch hole beyond i_size */
4817	if (offset >= inode->i_size)
4818		return 0;
4819
4820	/*
4821	 * If the hole extends beyond i_size, set the hole
4822	 * to end after the page that contains i_size
4823	 */
4824	if (offset + length > inode->i_size) {
4825		length = inode->i_size +
4826		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4827		   offset;
4828	}
4829
4830	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4831	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4832
4833	first_page_offset = first_page << PAGE_CACHE_SHIFT;
4834	last_page_offset = last_page << PAGE_CACHE_SHIFT;
4835
4836	/*
4837	 * Write out all dirty pages to avoid race conditions,
4838	 * then release them.
4839	 */
4840	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4841		err = filemap_write_and_wait_range(mapping,
4842			offset, offset + length - 1);
4843
4844		if (err)
4845			return err;
4846	}
4847
4848	/* Now release the pages */
4849	if (last_page_offset > first_page_offset) {
4850		truncate_pagecache_range(inode, first_page_offset,
4851					 last_page_offset - 1);
4852	}
4853
4854	/* finish any pending end_io work */
4855	ext4_flush_completed_IO(inode);
4856
4857	credits = ext4_writepage_trans_blocks(inode);
4858	handle = ext4_journal_start(inode, credits);
4859	if (IS_ERR(handle))
4860		return PTR_ERR(handle);
4861
4862	err = ext4_orphan_add(handle, inode);
4863	if (err)
4864		goto out;
4865
4866	/*
4867	 * Now we need to zero out the non-page-aligned data in the
4868	 * pages at the start and tail of the hole, and unmap the buffer
4869	 * heads for the block aligned regions of the page that were
4870	 * completely zeroed.
4871	 */
4872	if (first_page > last_page) {
4873		/*
4874		 * If the file space being truncated is contained within a page
4875		 * just zero out and unmap the middle of that page
4876		 */
4877		err = ext4_discard_partial_page_buffers(handle,
4878			mapping, offset, length, 0);
4879
4880		if (err)
4881			goto out;
4882	} else {
4883		/*
4884		 * zero out and unmap the partial page that contains
4885		 * the start of the hole
4886		 */
4887		page_len  = first_page_offset - offset;
4888		if (page_len > 0) {
4889			err = ext4_discard_partial_page_buffers(handle, mapping,
4890						   offset, page_len, 0);
4891			if (err)
4892				goto out;
4893		}
4894
4895		/*
4896		 * zero out and unmap the partial page that contains
4897		 * the end of the hole
4898		 */
4899		page_len = offset + length - last_page_offset;
4900		if (page_len > 0) {
4901			err = ext4_discard_partial_page_buffers(handle, mapping,
4902					last_page_offset, page_len, 0);
4903			if (err)
4904				goto out;
4905		}
4906	}
4907
4908	/*
4909	 * If i_size is contained in the last page, we need to
4910	 * unmap and zero the partial page after i_size
4911	 */
4912	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4913	   inode->i_size % PAGE_CACHE_SIZE != 0) {
4914
4915		page_len = PAGE_CACHE_SIZE -
4916			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4917
4918		if (page_len > 0) {
4919			err = ext4_discard_partial_page_buffers(handle,
4920			  mapping, inode->i_size, page_len, 0);
4921
4922			if (err)
4923				goto out;
4924		}
4925	}
4926
4927	first_block = (offset + sb->s_blocksize - 1) >>
4928		EXT4_BLOCK_SIZE_BITS(sb);
4929	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
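	/*
	 * Only blocks lying entirely inside the hole are removed: e.g.
	 * with 4k blocks, offset = 1000 and length = 10000 give
	 * first_block = 1 and stop_block = 2, so just block 1 is freed;
	 * the partial pages at either end were zeroed above instead.
	 */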
4930
4931	/* If there are no blocks to remove, return now */
4932	if (first_block >= stop_block)
4933		goto out;
4934
4935	down_write(&EXT4_I(inode)->i_data_sem);
4936	ext4_ext_invalidate_cache(inode);
4937	ext4_discard_preallocations(inode);
4938
4939	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
4940
4941	ext4_ext_invalidate_cache(inode);
4942	ext4_discard_preallocations(inode);
4943
4944	if (IS_SYNC(inode))
4945		ext4_handle_sync(handle);
4946
4947	up_write(&EXT4_I(inode)->i_data_sem);
4948
4949out:
4950	ext4_orphan_del(handle, inode);
4951	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4952	ext4_mark_inode_dirty(handle, inode);
4953	ext4_journal_stop(handle);
4954	return err;
4955}
4956int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4957		__u64 start, __u64 len)
4958{
4959	ext4_lblk_t start_blk;
4960	int error = 0;
4961
4962	/* fallback to generic here if not in extents fmt */
4963	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4964		return generic_block_fiemap(inode, fieinfo, start, len,
4965			ext4_get_block);
4966
4967	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
4968		return -EBADR;
4969
4970	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4971		error = ext4_xattr_fiemap(inode, fieinfo);
4972	} else {
4973		ext4_lblk_t len_blks;
4974		__u64 last_blk;
4975
4976		start_blk = start >> inode->i_sb->s_blocksize_bits;
4977		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4978		if (last_blk >= EXT_MAX_BLOCKS)
4979			last_blk = EXT_MAX_BLOCKS-1;
4980		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4981
4982		/*
4983		 * Walk the extent tree gathering extent information.
4984		 * ext4_ext_fiemap_cb will push extents back to user.
4985		 */
4986		error = ext4_ext_walk_space(inode, start_blk, len_blks,
4987					  ext4_ext_fiemap_cb, fieinfo);
4988	}
4989
4990	return error;
4991}
4992