extents.c revision 1939dd84b3f52e9c8d1b46dffae2058f16a3ff6a
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
44#include "ext4_jbd2.h"
45
46#include <trace/events/ext4.h>
47
48static int ext4_split_extent(handle_t *handle,
49				struct inode *inode,
50				struct ext4_ext_path *path,
51				struct ext4_map_blocks *map,
52				int split_flag,
53				int flags);
54
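/*
 * ext4_ext_truncate_extend_restart:
 * Make sure the journal handle has at least @needed credits.  If the
 * handle cannot be extended, restart the transaction and return
 * -EAGAIN so the caller knows it has to retry.
 */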
55static int ext4_ext_truncate_extend_restart(handle_t *handle,
56					    struct inode *inode,
57					    int needed)
58{
59	int err;
60
61	if (!ext4_handle_valid(handle))
62		return 0;
63	if (handle->h_buffer_credits > needed)
64		return 0;
65	err = ext4_journal_extend(handle, needed);
66	if (err <= 0)
67		return err;
68	err = ext4_truncate_restart_trans(handle, inode, needed);
69	if (err == 0)
70		err = -EAGAIN;
71
72	return err;
73}
74
75/*
76 * could return:
77 *  - EROFS
78 *  - ENOMEM
79 */
80static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
81				struct ext4_ext_path *path)
82{
83	if (path->p_bh) {
84		/* path points to block */
85		return ext4_journal_get_write_access(handle, path->p_bh);
86	}
87	/* path points to leaf/index in inode body */
88	/* we use in-core data, no need to protect them */
89	return 0;
90}
91
92/*
93 * could return:
94 *  - EROFS
95 *  - ENOMEM
96 *  - EIO
97 */
98#define ext4_ext_dirty(handle, inode, path) \
99		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
100static int __ext4_ext_dirty(const char *where, unsigned int line,
101			    handle_t *handle, struct inode *inode,
102			    struct ext4_ext_path *path)
103{
104	int err;
105	if (path->p_bh) {
106		/* path points to block */
107		err = __ext4_handle_dirty_metadata(where, line, handle,
108						   inode, path->p_bh);
109	} else {
110		/* path points to leaf/index in inode body */
111		err = ext4_mark_inode_dirty(handle, inode);
112	}
113	return err;
114}
115
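/*
 * ext4_ext_find_goal:
 * Pick a physical block to use as the allocation goal for logical
 * block @block: prefer a block next to the nearest extent found in
 * @path, then the index block itself, and finally fall back to the
 * inode's goal block group.
 */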
116static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
117			      struct ext4_ext_path *path,
118			      ext4_lblk_t block)
119{
120	int depth;
121
122	if (path) {
123		struct ext4_extent *ex;
124		depth = path->p_depth;
125
126		/*
127		 * Try to predict block placement assuming that we are
128		 * filling in a file which will eventually be
129		 * non-sparse --- i.e., in the case of libbfd writing
130		 * an ELF object's sections out of order but in a way
131		 * that eventually results in a contiguous object or
132		 * executable file, or some database extending a table
133		 * space file.  However, this is actually somewhat
134		 * non-ideal if we are writing a sparse file such as
135		 * qemu or KVM writing a raw image file that is going
136		 * to stay fairly sparse, since it will end up
137		 * fragmenting the file system's free space.  Maybe we
138		 * should have some heuristics or some way to allow
139		 * userspace to pass a hint to the file system,
140		 * especially if the latter case turns out to be
141		 * common.
142		 */
143		ex = path[depth].p_ext;
144		if (ex) {
145			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
146			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
147
148			if (block > ext_block)
149				return ext_pblk + (block - ext_block);
150			else
151				return ext_pblk - (ext_block - block);
152		}
153
154		/* it looks like the index is empty;
155		 * try to find the starting block from the index itself */
156		if (path[depth].p_bh)
157			return path[depth].p_bh->b_blocknr;
158	}
159
160	/* OK. use inode's group */
161	return ext4_inode_to_goal_block(inode);
162}
163
164/*
165 * Allocation for a metadata block
166 */
167static ext4_fsblk_t
168ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
169			struct ext4_ext_path *path,
170			struct ext4_extent *ex, int *err, unsigned int flags)
171{
172	ext4_fsblk_t goal, newblock;
173
174	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
175	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
176					NULL, err);
177	return newblock;
178}
179
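/*
 * The ext4_ext_space_*() helpers below return how many extent or index
 * entries fit in an on-disk tree block or in the in-inode root.  When
 * AGGRESSIVE_TEST is defined the capacities are capped to a handful of
 * entries so that the split/grow paths are exercised early.
 */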
180static inline int ext4_ext_space_block(struct inode *inode, int check)
181{
182	int size;
183
184	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
185			/ sizeof(struct ext4_extent);
186	if (!check) {
187#ifdef AGGRESSIVE_TEST
188		if (size > 6)
189			size = 6;
190#endif
191	}
192	return size;
193}
194
195static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
196{
197	int size;
198
199	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
200			/ sizeof(struct ext4_extent_idx);
201	if (!check) {
202#ifdef AGGRESSIVE_TEST
203		if (size > 5)
204			size = 5;
205#endif
206	}
207	return size;
208}
209
210static inline int ext4_ext_space_root(struct inode *inode, int check)
211{
212	int size;
213
214	size = sizeof(EXT4_I(inode)->i_data);
215	size -= sizeof(struct ext4_extent_header);
216	size /= sizeof(struct ext4_extent);
217	if (!check) {
218#ifdef AGGRESSIVE_TEST
219		if (size > 3)
220			size = 3;
221#endif
222	}
223	return size;
224}
225
226static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
227{
228	int size;
229
230	size = sizeof(EXT4_I(inode)->i_data);
231	size -= sizeof(struct ext4_extent_header);
232	size /= sizeof(struct ext4_extent_idx);
233	if (!check) {
234#ifdef AGGRESSIVE_TEST
235		if (size > 4)
236			size = 4;
237#endif
238	}
239	return size;
240}
241
242/*
243 * Calculate the number of metadata blocks needed
244 * to allocate a block at @lblock
245 * Worst case is one block per extent
246 */
247int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
248{
249	struct ext4_inode_info *ei = EXT4_I(inode);
250	int idxs, num = 0;
251
252	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
253		/ sizeof(struct ext4_extent_idx));
254
255	/*
256	 * If the new delayed allocation block is contiguous with the
257	 * previous da block, it can share index blocks with the
258	 * previous block, so we only need to allocate a new index
259	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
260	 * an additional index block, and at idxs**3 blocks, yet
261	 * another index block.
262	 */
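	/*
	 * Illustrative numbers (assuming a 4k block size and the usual
	 * 12-byte extent header and 12-byte index entry): idxs is
	 * (4096 - 12) / 12 = 340, so a contiguous delayed-allocation
	 * run is charged one extra metadata block roughly every 340
	 * blocks, another every 340**2 blocks, and so on.
	 */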
263	if (ei->i_da_metadata_calc_len &&
264	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
265		if ((ei->i_da_metadata_calc_len % idxs) == 0)
266			num++;
267		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
268			num++;
269		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
270			num++;
271			ei->i_da_metadata_calc_len = 0;
272		} else
273			ei->i_da_metadata_calc_len++;
274		ei->i_da_metadata_calc_last_lblock++;
275		return num;
276	}
277
278	/*
279	 * In the worst case we need a new set of index blocks at
280	 * every level of the inode's extent tree.
281	 */
282	ei->i_da_metadata_calc_len = 1;
283	ei->i_da_metadata_calc_last_lblock = lblock;
284	return ext_depth(inode) + 1;
285}
286
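/*
 * ext4_ext_max_entries:
 * Maximum legal value of eh_max for a node at @depth, used by the
 * extent header checks below.
 */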
287static int
288ext4_ext_max_entries(struct inode *inode, int depth)
289{
290	int max;
291
292	if (depth == ext_depth(inode)) {
293		if (depth == 0)
294			max = ext4_ext_space_root(inode, 1);
295		else
296			max = ext4_ext_space_root_idx(inode, 1);
297	} else {
298		if (depth == 0)
299			max = ext4_ext_space_block(inode, 1);
300		else
301			max = ext4_ext_space_block_idx(inode, 1);
302	}
303
304	return max;
305}
306
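/*
 * The ext4_valid_extent*() helpers below verify that the physical
 * block ranges referenced by extents and index entries lie within
 * the filesystem.
 */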
307static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
308{
309	ext4_fsblk_t block = ext4_ext_pblock(ext);
310	int len = ext4_ext_get_actual_len(ext);
311
312	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
313}
314
315static int ext4_valid_extent_idx(struct inode *inode,
316				struct ext4_extent_idx *ext_idx)
317{
318	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
319
320	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
321}
322
323static int ext4_valid_extent_entries(struct inode *inode,
324				struct ext4_extent_header *eh,
325				int depth)
326{
327	struct ext4_extent *ext;
328	struct ext4_extent_idx *ext_idx;
329	unsigned short entries;
330	if (eh->eh_entries == 0)
331		return 1;
332
333	entries = le16_to_cpu(eh->eh_entries);
334
335	if (depth == 0) {
336		/* leaf entries */
337		ext = EXT_FIRST_EXTENT(eh);
338		while (entries) {
339			if (!ext4_valid_extent(inode, ext))
340				return 0;
341			ext++;
342			entries--;
343		}
344	} else {
345		ext_idx = EXT_FIRST_INDEX(eh);
346		while (entries) {
347			if (!ext4_valid_extent_idx(inode, ext_idx))
348				return 0;
349			ext_idx++;
350			entries--;
351		}
352	}
353	return 1;
354}
355
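/*
 * __ext4_ext_check:
 * Validate an extent tree node: magic, depth, eh_max against the
 * node's capacity, eh_entries against eh_max, and the entries
 * themselves.  On corruption, ext4_error_inode() is called and -EIO
 * is returned.
 */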
356static int __ext4_ext_check(const char *function, unsigned int line,
357			    struct inode *inode, struct ext4_extent_header *eh,
358			    int depth)
359{
360	const char *error_msg;
361	int max = 0;
362
363	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
364		error_msg = "invalid magic";
365		goto corrupted;
366	}
367	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
368		error_msg = "unexpected eh_depth";
369		goto corrupted;
370	}
371	if (unlikely(eh->eh_max == 0)) {
372		error_msg = "invalid eh_max";
373		goto corrupted;
374	}
375	max = ext4_ext_max_entries(inode, depth);
376	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
377		error_msg = "too large eh_max";
378		goto corrupted;
379	}
380	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
381		error_msg = "invalid eh_entries";
382		goto corrupted;
383	}
384	if (!ext4_valid_extent_entries(inode, eh, depth)) {
385		error_msg = "invalid extent entries";
386		goto corrupted;
387	}
388	return 0;
389
390corrupted:
391	ext4_error_inode(inode, function, line, 0,
392			"bad header/extent: %s - magic %x, "
393			"entries %u, max %u(%u), depth %u(%u)",
394			error_msg, le16_to_cpu(eh->eh_magic),
395			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
396			max, le16_to_cpu(eh->eh_depth), depth);
397
398	return -EIO;
399}
400
401#define ext4_ext_check(inode, eh, depth)	\
402	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
403
404int ext4_ext_check_inode(struct inode *inode)
405{
406	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
407}
408
409#ifdef EXT_DEBUG
410static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
411{
412	int k, l = path->p_depth;
413
414	ext_debug("path:");
415	for (k = 0; k <= l; k++, path++) {
416		if (path->p_idx) {
417		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
418			    ext4_idx_pblock(path->p_idx));
419		} else if (path->p_ext) {
420			ext_debug("  %d:[%d]%d:%llu ",
421				  le32_to_cpu(path->p_ext->ee_block),
422				  ext4_ext_is_uninitialized(path->p_ext),
423				  ext4_ext_get_actual_len(path->p_ext),
424				  ext4_ext_pblock(path->p_ext));
425		} else
426			ext_debug("  []");
427	}
428	ext_debug("\n");
429}
430
431static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
432{
433	int depth = ext_depth(inode);
434	struct ext4_extent_header *eh;
435	struct ext4_extent *ex;
436	int i;
437
438	if (!path)
439		return;
440
441	eh = path[depth].p_hdr;
442	ex = EXT_FIRST_EXTENT(eh);
443
444	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
445
446	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
447		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
448			  ext4_ext_is_uninitialized(ex),
449			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
450	}
451	ext_debug("\n");
452}
453
454static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
455			ext4_fsblk_t newblock, int level)
456{
457	int depth = ext_depth(inode);
458	struct ext4_extent *ex;
459
460	if (depth != level) {
461		struct ext4_extent_idx *idx;
462		idx = path[level].p_idx;
463		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
464			ext_debug("%d: move %d:%llu in new index %llu\n", level,
465					le32_to_cpu(idx->ei_block),
466					ext4_idx_pblock(idx),
467					newblock);
468			idx++;
469		}
470
471		return;
472	}
473
474	ex = path[depth].p_ext;
475	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
476		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
477				le32_to_cpu(ex->ee_block),
478				ext4_ext_pblock(ex),
479				ext4_ext_is_uninitialized(ex),
480				ext4_ext_get_actual_len(ex),
481				newblock);
482		ex++;
483	}
484}
485
486#else
487#define ext4_ext_show_path(inode, path)
488#define ext4_ext_show_leaf(inode, path)
489#define ext4_ext_show_move(inode, path, newblock, level)
490#endif
491
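/*
 * ext4_ext_drop_refs:
 * Release the buffer heads held by every level of @path.  The path
 * array itself is not freed here.
 */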
492void ext4_ext_drop_refs(struct ext4_ext_path *path)
493{
494	int depth = path->p_depth;
495	int i;
496
497	for (i = 0; i <= depth; i++, path++)
498		if (path->p_bh) {
499			brelse(path->p_bh);
500			path->p_bh = NULL;
501		}
502}
503
504/*
505 * ext4_ext_binsearch_idx:
506 * binary search for the closest index of the given block
507 * the header must be checked before calling this
508 */
509static void
510ext4_ext_binsearch_idx(struct inode *inode,
511			struct ext4_ext_path *path, ext4_lblk_t block)
512{
513	struct ext4_extent_header *eh = path->p_hdr;
514	struct ext4_extent_idx *r, *l, *m;
515
516
517	ext_debug("binsearch for %u(idx):  ", block);
518
519	l = EXT_FIRST_INDEX(eh) + 1;
520	r = EXT_LAST_INDEX(eh);
521	while (l <= r) {
522		m = l + (r - l) / 2;
523		if (block < le32_to_cpu(m->ei_block))
524			r = m - 1;
525		else
526			l = m + 1;
527		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
528				m, le32_to_cpu(m->ei_block),
529				r, le32_to_cpu(r->ei_block));
530	}
531
532	path->p_idx = l - 1;
533	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
534		  ext4_idx_pblock(path->p_idx));
535
536#ifdef CHECK_BINSEARCH
537	{
538		struct ext4_extent_idx *chix, *ix;
539		int k;
540
541		chix = ix = EXT_FIRST_INDEX(eh);
542		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
543		  if (k != 0 &&
544		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
545				printk(KERN_DEBUG "k=%d, ix=0x%p, "
546				       "first=0x%p\n", k,
547				       ix, EXT_FIRST_INDEX(eh));
548				printk(KERN_DEBUG "%u <= %u\n",
549				       le32_to_cpu(ix->ei_block),
550				       le32_to_cpu(ix[-1].ei_block));
551			}
552			BUG_ON(k && le32_to_cpu(ix->ei_block)
553					   <= le32_to_cpu(ix[-1].ei_block));
554			if (block < le32_to_cpu(ix->ei_block))
555				break;
556			chix = ix;
557		}
558		BUG_ON(chix != path->p_idx);
559	}
560#endif
561
562}
563
564/*
565 * ext4_ext_binsearch:
566 * binary search for closest extent of the given block
567 * the header must be checked before calling this
568 */
569static void
570ext4_ext_binsearch(struct inode *inode,
571		struct ext4_ext_path *path, ext4_lblk_t block)
572{
573	struct ext4_extent_header *eh = path->p_hdr;
574	struct ext4_extent *r, *l, *m;
575
576	if (eh->eh_entries == 0) {
577		/*
578		 * this leaf is empty:
579		 * we get such a leaf in split/add case
580		 */
581		return;
582	}
583
584	ext_debug("binsearch for %u:  ", block);
585
586	l = EXT_FIRST_EXTENT(eh) + 1;
587	r = EXT_LAST_EXTENT(eh);
588
589	while (l <= r) {
590		m = l + (r - l) / 2;
591		if (block < le32_to_cpu(m->ee_block))
592			r = m - 1;
593		else
594			l = m + 1;
595		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
596				m, le32_to_cpu(m->ee_block),
597				r, le32_to_cpu(r->ee_block));
598	}
599
600	path->p_ext = l - 1;
601	ext_debug("  -> %d:%llu:[%d]%d ",
602			le32_to_cpu(path->p_ext->ee_block),
603			ext4_ext_pblock(path->p_ext),
604			ext4_ext_is_uninitialized(path->p_ext),
605			ext4_ext_get_actual_len(path->p_ext));
606
607#ifdef CHECK_BINSEARCH
608	{
609		struct ext4_extent *chex, *ex;
610		int k;
611
612		chex = ex = EXT_FIRST_EXTENT(eh);
613		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
614			BUG_ON(k && le32_to_cpu(ex->ee_block)
615					  <= le32_to_cpu(ex[-1].ee_block));
616			if (block < le32_to_cpu(ex->ee_block))
617				break;
618			chex = ex;
619		}
620		BUG_ON(chex != path->p_ext);
621	}
622#endif
623
624}
625
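/*
 * ext4_ext_tree_init:
 * Initialize an empty extent tree rooted in the inode body and mark
 * the inode dirty.
 */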
626int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
627{
628	struct ext4_extent_header *eh;
629
630	eh = ext_inode_hdr(inode);
631	eh->eh_depth = 0;
632	eh->eh_entries = 0;
633	eh->eh_magic = EXT4_EXT_MAGIC;
634	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
635	ext4_mark_inode_dirty(handle, inode);
636	ext4_ext_invalidate_cache(inode);
637	return 0;
638}
639
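/*
 * ext4_ext_find_extent:
 * Walk the tree from the root down to the leaf covering logical block
 * @block, recording the index/extent chosen at each level in @path.
 * If @path is NULL a path array is allocated (sized for a possible
 * depth increase); the caller must drop it with ext4_ext_drop_refs()
 * and then kfree() it.
 */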
640struct ext4_ext_path *
641ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
642					struct ext4_ext_path *path)
643{
644	struct ext4_extent_header *eh;
645	struct buffer_head *bh;
646	short int depth, i, ppos = 0, alloc = 0;
647
648	eh = ext_inode_hdr(inode);
649	depth = ext_depth(inode);
650
651	/* account possible depth increase */
652	if (!path) {
653		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
654				GFP_NOFS);
655		if (!path)
656			return ERR_PTR(-ENOMEM);
657		alloc = 1;
658	}
659	path[0].p_hdr = eh;
660	path[0].p_bh = NULL;
661
662	i = depth;
663	/* walk through the tree */
664	while (i) {
665		int need_to_validate = 0;
666
667		ext_debug("depth %d: num %d, max %d\n",
668			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
669
670		ext4_ext_binsearch_idx(inode, path + ppos, block);
671		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
672		path[ppos].p_depth = i;
673		path[ppos].p_ext = NULL;
674
675		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
676		if (unlikely(!bh))
677			goto err;
678		if (!bh_uptodate_or_lock(bh)) {
679			trace_ext4_ext_load_extent(inode, block,
680						path[ppos].p_block);
681			if (bh_submit_read(bh) < 0) {
682				put_bh(bh);
683				goto err;
684			}
685			/* validate the extent entries */
686			need_to_validate = 1;
687		}
688		eh = ext_block_hdr(bh);
689		ppos++;
690		if (unlikely(ppos > depth)) {
691			put_bh(bh);
692			EXT4_ERROR_INODE(inode,
693					 "ppos %d > depth %d", ppos, depth);
694			goto err;
695		}
696		path[ppos].p_bh = bh;
697		path[ppos].p_hdr = eh;
698		i--;
699
700		if (need_to_validate && ext4_ext_check(inode, eh, i))
701			goto err;
702	}
703
704	path[ppos].p_depth = i;
705	path[ppos].p_ext = NULL;
706	path[ppos].p_idx = NULL;
707
708	/* find extent */
709	ext4_ext_binsearch(inode, path + ppos, block);
710	/* if not an empty leaf */
711	if (path[ppos].p_ext)
712		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
713
714	ext4_ext_show_path(inode, path);
715
716	return path;
717
718err:
719	ext4_ext_drop_refs(path);
720	if (alloc)
721		kfree(path);
722	return ERR_PTR(-EIO);
723}
724
725/*
726 * ext4_ext_insert_index:
727 * insert new index [@logical;@ptr] into the block at @curp;
728 * check where to insert: before @curp or after @curp
729 */
730static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
731				 struct ext4_ext_path *curp,
732				 int logical, ext4_fsblk_t ptr)
733{
734	struct ext4_extent_idx *ix;
735	int len, err;
736
737	err = ext4_ext_get_access(handle, inode, curp);
738	if (err)
739		return err;
740
741	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
742		EXT4_ERROR_INODE(inode,
743				 "logical %d == ei_block %d!",
744				 logical, le32_to_cpu(curp->p_idx->ei_block));
745		return -EIO;
746	}
747
748	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
749			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
750		EXT4_ERROR_INODE(inode,
751				 "eh_entries %d >= eh_max %d!",
752				 le16_to_cpu(curp->p_hdr->eh_entries),
753				 le16_to_cpu(curp->p_hdr->eh_max));
754		return -EIO;
755	}
756
757	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
758	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
759		/* insert after */
760		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
761			len = (len - 1) * sizeof(struct ext4_extent_idx);
762			len = len < 0 ? 0 : len;
763			ext_debug("insert new index %d after: %llu. "
764					"move %d from 0x%p to 0x%p\n",
765					logical, ptr, len,
766					(curp->p_idx + 1), (curp->p_idx + 2));
767			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
768		}
769		ix = curp->p_idx + 1;
770	} else {
771		/* insert before */
772		len = len * sizeof(struct ext4_extent_idx);
773		len = len < 0 ? 0 : len;
774		ext_debug("insert new index %d before: %llu. "
775				"move %d from 0x%p to 0x%p\n",
776				logical, ptr, len,
777				curp->p_idx, (curp->p_idx + 1));
778		memmove(curp->p_idx + 1, curp->p_idx, len);
779		ix = curp->p_idx;
780	}
781
782	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
783		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
784		return -EIO;
785	}
786
787	ix->ei_block = cpu_to_le32(logical);
788	ext4_idx_store_pblock(ix, ptr);
789	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
790
791	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
792		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
793		return -EIO;
794	}
795
796	err = ext4_ext_dirty(handle, inode, curp);
797	ext4_std_error(inode->i_sb, err);
798
799	return err;
800}
801
802/*
803 * ext4_ext_split:
804 * inserts new subtree into the path, using free index entry
805 * at depth @at:
806 * - allocates all needed blocks (new leaf and all intermediate index blocks)
807 * - makes decision where to split
808 * - moves remaining extents and index entries (right to the split point)
809 *   into the newly allocated blocks
810 * - initializes subtree
811 */
812static int ext4_ext_split(handle_t *handle, struct inode *inode,
813			  unsigned int flags,
814			  struct ext4_ext_path *path,
815			  struct ext4_extent *newext, int at)
816{
817	struct buffer_head *bh = NULL;
818	int depth = ext_depth(inode);
819	struct ext4_extent_header *neh;
820	struct ext4_extent_idx *fidx;
821	int i = at, k, m, a;
822	ext4_fsblk_t newblock, oldblock;
823	__le32 border;
824	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
825	int err = 0;
826
827	/* make decision: where to split? */
828	/* FIXME: now decision is simplest: at current extent */
829
830	/* if current leaf will be split, then we should use
831	 * border from split point */
832	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
833		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
834		return -EIO;
835	}
836	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
837		border = path[depth].p_ext[1].ee_block;
838		ext_debug("leaf will be split."
839				" next leaf starts at %d\n",
840				  le32_to_cpu(border));
841	} else {
842		border = newext->ee_block;
843		ext_debug("leaf will be added."
844				" next leaf starts at %d\n",
845				le32_to_cpu(border));
846	}
847
848	/*
849	 * If an error occurs, we stop processing and mark the
850	 * filesystem read-only.  The new index won't be inserted
851	 * and the tree will be left in a consistent state.  The
852	 * next mount will repair the buffers too.
853	 */
854
855	/*
856	 * Get an array to track all allocated blocks.
857	 * We need it so that on error we can free the blocks
858	 * we have already allocated.
859	 */
860	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
861	if (!ablocks)
862		return -ENOMEM;
863
864	/* allocate all needed blocks */
865	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
866	for (a = 0; a < depth - at; a++) {
867		newblock = ext4_ext_new_meta_block(handle, inode, path,
868						   newext, &err, flags);
869		if (newblock == 0)
870			goto cleanup;
871		ablocks[a] = newblock;
872	}
873
874	/* initialize new leaf */
875	newblock = ablocks[--a];
876	if (unlikely(newblock == 0)) {
877		EXT4_ERROR_INODE(inode, "newblock == 0!");
878		err = -EIO;
879		goto cleanup;
880	}
881	bh = sb_getblk(inode->i_sb, newblock);
882	if (!bh) {
883		err = -EIO;
884		goto cleanup;
885	}
886	lock_buffer(bh);
887
888	err = ext4_journal_get_create_access(handle, bh);
889	if (err)
890		goto cleanup;
891
892	neh = ext_block_hdr(bh);
893	neh->eh_entries = 0;
894	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
895	neh->eh_magic = EXT4_EXT_MAGIC;
896	neh->eh_depth = 0;
897
898	/* move remainder of path[depth] to the new leaf */
899	if (unlikely(path[depth].p_hdr->eh_entries !=
900		     path[depth].p_hdr->eh_max)) {
901		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
902				 path[depth].p_hdr->eh_entries,
903				 path[depth].p_hdr->eh_max);
904		err = -EIO;
905		goto cleanup;
906	}
907	/* start copy from next extent */
908	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
909	ext4_ext_show_move(inode, path, newblock, depth);
910	if (m) {
911		struct ext4_extent *ex;
912		ex = EXT_FIRST_EXTENT(neh);
913		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
914		le16_add_cpu(&neh->eh_entries, m);
915	}
916
917	set_buffer_uptodate(bh);
918	unlock_buffer(bh);
919
920	err = ext4_handle_dirty_metadata(handle, inode, bh);
921	if (err)
922		goto cleanup;
923	brelse(bh);
924	bh = NULL;
925
926	/* correct old leaf */
927	if (m) {
928		err = ext4_ext_get_access(handle, inode, path + depth);
929		if (err)
930			goto cleanup;
931		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
932		err = ext4_ext_dirty(handle, inode, path + depth);
933		if (err)
934			goto cleanup;
935
936	}
937
938	/* create intermediate indexes */
939	k = depth - at - 1;
940	if (unlikely(k < 0)) {
941		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
942		err = -EIO;
943		goto cleanup;
944	}
945	if (k)
946		ext_debug("create %d intermediate indices\n", k);
947	/* insert new index into current index block */
948	/* current depth stored in i var */
949	i = depth - 1;
950	while (k--) {
951		oldblock = newblock;
952		newblock = ablocks[--a];
953		bh = sb_getblk(inode->i_sb, newblock);
954		if (!bh) {
955			err = -EIO;
956			goto cleanup;
957		}
958		lock_buffer(bh);
959
960		err = ext4_journal_get_create_access(handle, bh);
961		if (err)
962			goto cleanup;
963
964		neh = ext_block_hdr(bh);
965		neh->eh_entries = cpu_to_le16(1);
966		neh->eh_magic = EXT4_EXT_MAGIC;
967		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
968		neh->eh_depth = cpu_to_le16(depth - i);
969		fidx = EXT_FIRST_INDEX(neh);
970		fidx->ei_block = border;
971		ext4_idx_store_pblock(fidx, oldblock);
972
973		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
974				i, newblock, le32_to_cpu(border), oldblock);
975
976		/* move remainder of path[i] to the new index block */
977		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
978					EXT_LAST_INDEX(path[i].p_hdr))) {
979			EXT4_ERROR_INODE(inode,
980					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
981					 le32_to_cpu(path[i].p_ext->ee_block));
982			err = -EIO;
983			goto cleanup;
984		}
985		/* start copy indexes */
986		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
987		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
988				EXT_MAX_INDEX(path[i].p_hdr));
989		ext4_ext_show_move(inode, path, newblock, i);
990		if (m) {
991			memmove(++fidx, path[i].p_idx,
992				sizeof(struct ext4_extent_idx) * m);
993			le16_add_cpu(&neh->eh_entries, m);
994		}
995		set_buffer_uptodate(bh);
996		unlock_buffer(bh);
997
998		err = ext4_handle_dirty_metadata(handle, inode, bh);
999		if (err)
1000			goto cleanup;
1001		brelse(bh);
1002		bh = NULL;
1003
1004		/* correct old index */
1005		if (m) {
1006			err = ext4_ext_get_access(handle, inode, path + i);
1007			if (err)
1008				goto cleanup;
1009			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1010			err = ext4_ext_dirty(handle, inode, path + i);
1011			if (err)
1012				goto cleanup;
1013		}
1014
1015		i--;
1016	}
1017
1018	/* insert new index */
1019	err = ext4_ext_insert_index(handle, inode, path + at,
1020				    le32_to_cpu(border), newblock);
1021
1022cleanup:
1023	if (bh) {
1024		if (buffer_locked(bh))
1025			unlock_buffer(bh);
1026		brelse(bh);
1027	}
1028
1029	if (err) {
1030		/* free all allocated blocks in error case */
1031		for (i = 0; i < depth; i++) {
1032			if (!ablocks[i])
1033				continue;
1034			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1035					 EXT4_FREE_BLOCKS_METADATA);
1036		}
1037	}
1038	kfree(ablocks);
1039
1040	return err;
1041}
1042
1043/*
1044 * ext4_ext_grow_indepth:
1045 * implements tree growing procedure:
1046 * - allocates new block
1047 * - moves top-level data (index block or leaf) into the new block
1048 * - initializes new top-level, creating index that points to the
1049 *   just created block
1050 */
1051static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1052				 unsigned int flags,
1053				 struct ext4_extent *newext)
1054{
1055	struct ext4_extent_header *neh;
1056	struct buffer_head *bh;
1057	ext4_fsblk_t newblock;
1058	int err = 0;
1059
1060	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1061		newext, &err, flags);
1062	if (newblock == 0)
1063		return err;
1064
1065	bh = sb_getblk(inode->i_sb, newblock);
1066	if (!bh) {
1067		err = -EIO;
1068		ext4_std_error(inode->i_sb, err);
1069		return err;
1070	}
1071	lock_buffer(bh);
1072
1073	err = ext4_journal_get_create_access(handle, bh);
1074	if (err) {
1075		unlock_buffer(bh);
1076		goto out;
1077	}
1078
1079	/* move top-level index/leaf into new block */
1080	memmove(bh->b_data, EXT4_I(inode)->i_data,
1081		sizeof(EXT4_I(inode)->i_data));
1082
1083	/* set size of new block */
1084	neh = ext_block_hdr(bh);
1085	/* the old root could have indexes or leaves,
1086	 * so calculate eh_max the right way */
1087	if (ext_depth(inode))
1088		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1089	else
1090		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1091	neh->eh_magic = EXT4_EXT_MAGIC;
1092	set_buffer_uptodate(bh);
1093	unlock_buffer(bh);
1094
1095	err = ext4_handle_dirty_metadata(handle, inode, bh);
1096	if (err)
1097		goto out;
1098
1099	/* Update top-level index: num,max,pointer */
1100	neh = ext_inode_hdr(inode);
1101	neh->eh_entries = cpu_to_le16(1);
1102	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1103	if (neh->eh_depth == 0) {
1104		/* Root extent block becomes index block */
1105		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1106		EXT_FIRST_INDEX(neh)->ei_block =
1107			EXT_FIRST_EXTENT(neh)->ee_block;
1108	}
1109	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1110		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1111		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1112		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1113
1114	neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);
1115	ext4_mark_inode_dirty(handle, inode);
1116out:
1117	brelse(bh);
1118
1119	return err;
1120}
1121
1122/*
1123 * ext4_ext_create_new_leaf:
1124 * finds an empty index slot and adds a new leaf.
1125 * if no free index is found, it grows the tree in depth instead.
1126 */
1127static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1128				    unsigned int flags,
1129				    struct ext4_ext_path *path,
1130				    struct ext4_extent *newext)
1131{
1132	struct ext4_ext_path *curp;
1133	int depth, i, err = 0;
1134
1135repeat:
1136	i = depth = ext_depth(inode);
1137
1138	/* walk up the tree and look for a free index entry */
1139	curp = path + depth;
1140	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1141		i--;
1142		curp--;
1143	}
1144
1145	/* we use already allocated block for index block,
1146	 * so subsequent data blocks should be contiguous */
1147	if (EXT_HAS_FREE_INDEX(curp)) {
1148		/* if we found index with free entry, then use that
1149		 * entry: create all needed subtree and add new leaf */
1150		err = ext4_ext_split(handle, inode, flags, path, newext, i);
1151		if (err)
1152			goto out;
1153
1154		/* refill path */
1155		ext4_ext_drop_refs(path);
1156		path = ext4_ext_find_extent(inode,
1157				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1158				    path);
1159		if (IS_ERR(path))
1160			err = PTR_ERR(path);
1161	} else {
1162		/* tree is full, time to grow in depth */
1163		err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1164		if (err)
1165			goto out;
1166
1167		/* refill path */
1168		ext4_ext_drop_refs(path);
1169		path = ext4_ext_find_extent(inode,
1170				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1171				    path);
1172		if (IS_ERR(path)) {
1173			err = PTR_ERR(path);
1174			goto out;
1175		}
1176
1177		/*
1178		 * only the first grow (depth 0 -> 1) produces free space;
1179		 * in all other cases we have to split the grown tree
1180		 */
1181		depth = ext_depth(inode);
1182		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1183			/* now we need to split */
1184			goto repeat;
1185		}
1186	}
1187
1188out:
1189	return err;
1190}
1191
1192/*
1193 * Search for the closest allocated block to the left of *logical
1194 * and return it at @logical, with its physical address at @phys.
1195 * If *logical is the smallest allocated block, the function
1196 * returns 0 at @phys.
1197 * The return value is 0 (success) or an error code.
1198 */
1199static int ext4_ext_search_left(struct inode *inode,
1200				struct ext4_ext_path *path,
1201				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1202{
1203	struct ext4_extent_idx *ix;
1204	struct ext4_extent *ex;
1205	int depth, ee_len;
1206
1207	if (unlikely(path == NULL)) {
1208		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1209		return -EIO;
1210	}
1211	depth = path->p_depth;
1212	*phys = 0;
1213
1214	if (depth == 0 && path->p_ext == NULL)
1215		return 0;
1216
1217	/* usually the extent in the path covers blocks smaller
1218	 * than *logical, but it can be that the extent is the
1219	 * first one in the file */
1220
1221	ex = path[depth].p_ext;
1222	ee_len = ext4_ext_get_actual_len(ex);
1223	if (*logical < le32_to_cpu(ex->ee_block)) {
1224		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1225			EXT4_ERROR_INODE(inode,
1226					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1227					 *logical, le32_to_cpu(ex->ee_block));
1228			return -EIO;
1229		}
1230		while (--depth >= 0) {
1231			ix = path[depth].p_idx;
1232			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1233				EXT4_ERROR_INODE(inode,
1234				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1235				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1236				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1237		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1238				  depth);
1239				return -EIO;
1240			}
1241		}
1242		return 0;
1243	}
1244
1245	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1246		EXT4_ERROR_INODE(inode,
1247				 "logical %d < ee_block %d + ee_len %d!",
1248				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1249		return -EIO;
1250	}
1251
1252	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1253	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1254	return 0;
1255}
1256
1257/*
1258 * Search for the closest allocated block to the right of *logical
1259 * and return it at @logical, with its physical address at @phys.
1260 * If *logical is the largest allocated block, the function
1261 * returns 0 at @phys.
1262 * The return value is 0 (success) or an error code.
1263 */
1264static int ext4_ext_search_right(struct inode *inode,
1265				 struct ext4_ext_path *path,
1266				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1267				 struct ext4_extent **ret_ex)
1268{
1269	struct buffer_head *bh = NULL;
1270	struct ext4_extent_header *eh;
1271	struct ext4_extent_idx *ix;
1272	struct ext4_extent *ex;
1273	ext4_fsblk_t block;
1274	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1275	int ee_len;
1276
1277	if (unlikely(path == NULL)) {
1278		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1279		return -EIO;
1280	}
1281	depth = path->p_depth;
1282	*phys = 0;
1283
1284	if (depth == 0 && path->p_ext == NULL)
1285		return 0;
1286
1287	/* usually the extent in the path covers blocks smaller
1288	 * than *logical, but it can be that the extent is the
1289	 * first one in the file */
1290
1291	ex = path[depth].p_ext;
1292	ee_len = ext4_ext_get_actual_len(ex);
1293	if (*logical < le32_to_cpu(ex->ee_block)) {
1294		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1295			EXT4_ERROR_INODE(inode,
1296					 "first_extent(path[%d].p_hdr) != ex",
1297					 depth);
1298			return -EIO;
1299		}
1300		while (--depth >= 0) {
1301			ix = path[depth].p_idx;
1302			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1303				EXT4_ERROR_INODE(inode,
1304						 "ix != EXT_FIRST_INDEX *logical %d!",
1305						 *logical);
1306				return -EIO;
1307			}
1308		}
1309		goto found_extent;
1310	}
1311
1312	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1313		EXT4_ERROR_INODE(inode,
1314				 "logical %d < ee_block %d + ee_len %d!",
1315				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1316		return -EIO;
1317	}
1318
1319	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1320		/* next allocated block in this leaf */
1321		ex++;
1322		goto found_extent;
1323	}
1324
1325	/* go up and search for index to the right */
1326	while (--depth >= 0) {
1327		ix = path[depth].p_idx;
1328		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1329			goto got_index;
1330	}
1331
1332	/* we've gone up to the root and found no index to the right */
1333	return 0;
1334
1335got_index:
1336	/* we've found index to the right, let's
1337	 * follow it and find the closest allocated
1338	 * block to the right */
1339	ix++;
1340	block = ext4_idx_pblock(ix);
1341	while (++depth < path->p_depth) {
1342		bh = sb_bread(inode->i_sb, block);
1343		if (bh == NULL)
1344			return -EIO;
1345		eh = ext_block_hdr(bh);
1346		/* subtract from p_depth to get proper eh_depth */
1347		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1348			put_bh(bh);
1349			return -EIO;
1350		}
1351		ix = EXT_FIRST_INDEX(eh);
1352		block = ext4_idx_pblock(ix);
1353		put_bh(bh);
1354	}
1355
1356	bh = sb_bread(inode->i_sb, block);
1357	if (bh == NULL)
1358		return -EIO;
1359	eh = ext_block_hdr(bh);
1360	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1361		put_bh(bh);
1362		return -EIO;
1363	}
1364	ex = EXT_FIRST_EXTENT(eh);
1365found_extent:
1366	*logical = le32_to_cpu(ex->ee_block);
1367	*phys = ext4_ext_pblock(ex);
1368	*ret_ex = ex;
1369	if (bh)
1370		put_bh(bh);
1371	return 0;
1372}
1373
1374/*
1375 * ext4_ext_next_allocated_block:
1376 * returns the first allocated block in the subsequent extent or EXT_MAX_BLOCKS.
1377 * NOTE: it treats the block number from an index entry as an
1378 * allocated block. Thus, index entries have to be consistent
1379 * with the leaves.
1380 */
1381static ext4_lblk_t
1382ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1383{
1384	int depth;
1385
1386	BUG_ON(path == NULL);
1387	depth = path->p_depth;
1388
1389	if (depth == 0 && path->p_ext == NULL)
1390		return EXT_MAX_BLOCKS;
1391
1392	while (depth >= 0) {
1393		if (depth == path->p_depth) {
1394			/* leaf */
1395			if (path[depth].p_ext !=
1396					EXT_LAST_EXTENT(path[depth].p_hdr))
1397			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1398		} else {
1399			/* index */
1400			if (path[depth].p_idx !=
1401					EXT_LAST_INDEX(path[depth].p_hdr))
1402			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1403		}
1404		depth--;
1405	}
1406
1407	return EXT_MAX_BLOCKS;
1408}
1409
1410/*
1411 * ext4_ext_next_leaf_block:
1412 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1413 */
1414static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1415{
1416	int depth;
1417
1418	BUG_ON(path == NULL);
1419	depth = path->p_depth;
1420
1421	/* zero-tree has no leaf blocks at all */
1422	if (depth == 0)
1423		return EXT_MAX_BLOCKS;
1424
1425	/* go to index block */
1426	depth--;
1427
1428	while (depth >= 0) {
1429		if (path[depth].p_idx !=
1430				EXT_LAST_INDEX(path[depth].p_hdr))
1431			return (ext4_lblk_t)
1432				le32_to_cpu(path[depth].p_idx[1].ei_block);
1433		depth--;
1434	}
1435
1436	return EXT_MAX_BLOCKS;
1437}
1438
1439/*
1440 * ext4_ext_correct_indexes:
1441 * if a leaf gets modified and the modified extent is first in the leaf,
1442 * then we have to correct all the indexes above.
1443 * TODO: do we need to correct tree in all cases?
1444 */
1445static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1446				struct ext4_ext_path *path)
1447{
1448	struct ext4_extent_header *eh;
1449	int depth = ext_depth(inode);
1450	struct ext4_extent *ex;
1451	__le32 border;
1452	int k, err = 0;
1453
1454	eh = path[depth].p_hdr;
1455	ex = path[depth].p_ext;
1456
1457	if (unlikely(ex == NULL || eh == NULL)) {
1458		EXT4_ERROR_INODE(inode,
1459				 "ex %p == NULL or eh %p == NULL", ex, eh);
1460		return -EIO;
1461	}
1462
1463	if (depth == 0) {
1464		/* there is no tree at all */
1465		return 0;
1466	}
1467
1468	if (ex != EXT_FIRST_EXTENT(eh)) {
1469		/* we correct tree if first leaf got modified only */
1470		return 0;
1471	}
1472
1473	/*
1474	 * TODO: we need correction if border is smaller than current one
1475	 */
1476	k = depth - 1;
1477	border = path[depth].p_ext->ee_block;
1478	err = ext4_ext_get_access(handle, inode, path + k);
1479	if (err)
1480		return err;
1481	path[k].p_idx->ei_block = border;
1482	err = ext4_ext_dirty(handle, inode, path + k);
1483	if (err)
1484		return err;
1485
1486	while (k--) {
1487		/* change all left-side indexes */
1488		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1489			break;
1490		err = ext4_ext_get_access(handle, inode, path + k);
1491		if (err)
1492			break;
1493		path[k].p_idx->ei_block = border;
1494		err = ext4_ext_dirty(handle, inode, path + k);
1495		if (err)
1496			break;
1497	}
1498
1499	return err;
1500}
1501
1502int
1503ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1504				struct ext4_extent *ex2)
1505{
1506	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1507
1508	/*
1509	 * Make sure that either both extents are uninitialized, or
1510	 * both are _not_.
1511	 */
1512	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1513		return 0;
1514
1515	if (ext4_ext_is_uninitialized(ex1))
1516		max_len = EXT_UNINIT_MAX_LEN;
1517	else
1518		max_len = EXT_INIT_MAX_LEN;
1519
1520	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1521	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1522
1523	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1524			le32_to_cpu(ex2->ee_block))
1525		return 0;
1526
1527	/*
1528	 * To allow future support for preallocated extents to be added
1529	 * as an RO_COMPAT feature, refuse to merge two extents if
1530	 * this can result in the top bit of ee_len being set.
1531	 */
1532	if (ext1_ee_len + ext2_ee_len > max_len)
1533		return 0;
1534#ifdef AGGRESSIVE_TEST
1535	if (ext1_ee_len >= 4)
1536		return 0;
1537#endif
1538
1539	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1540		return 1;
1541	return 0;
1542}
1543
1544/*
1545 * This function tries to merge the "ex" extent with the next extent in the tree.
1546 * It always tries to merge towards the right. If you want to merge towards
1547 * the left, pass "ex - 1" as the argument instead of "ex".
1548 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1549 * 1 if they got merged.
1550 */
1551static int ext4_ext_try_to_merge_right(struct inode *inode,
1552				 struct ext4_ext_path *path,
1553				 struct ext4_extent *ex)
1554{
1555	struct ext4_extent_header *eh;
1556	unsigned int depth, len;
1557	int merge_done = 0;
1558	int uninitialized = 0;
1559
1560	depth = ext_depth(inode);
1561	BUG_ON(path[depth].p_hdr == NULL);
1562	eh = path[depth].p_hdr;
1563
1564	while (ex < EXT_LAST_EXTENT(eh)) {
1565		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1566			break;
1567		/* merge with next extent! */
1568		if (ext4_ext_is_uninitialized(ex))
1569			uninitialized = 1;
1570		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1571				+ ext4_ext_get_actual_len(ex + 1));
1572		if (uninitialized)
1573			ext4_ext_mark_uninitialized(ex);
1574
1575		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1576			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1577				* sizeof(struct ext4_extent);
1578			memmove(ex + 1, ex + 2, len);
1579		}
1580		le16_add_cpu(&eh->eh_entries, -1);
1581		merge_done = 1;
1582		WARN_ON(eh->eh_entries == 0);
1583		if (!eh->eh_entries)
1584			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1585	}
1586
1587	return merge_done;
1588}
1589
1590/*
1591 * This function tries to merge the @ex extent with its neighbours in the tree.
1592 * Returns 1 if @ex was merged with the extent on its right, 0 otherwise.
1593 */
1594static int ext4_ext_try_to_merge(struct inode *inode,
1595				  struct ext4_ext_path *path,
1596				  struct ext4_extent *ex) {
1597	struct ext4_extent_header *eh;
1598	unsigned int depth;
1599	int merge_done = 0;
1600	int ret = 0;
1601
1602	depth = ext_depth(inode);
1603	BUG_ON(path[depth].p_hdr == NULL);
1604	eh = path[depth].p_hdr;
1605
1606	if (ex > EXT_FIRST_EXTENT(eh))
1607		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1608
1609	if (!merge_done)
1610		ret = ext4_ext_try_to_merge_right(inode, path, ex);
1611
1612	return ret;
1613}
1614
1615/*
1616 * check if a portion of the "newext" extent overlaps with an
1617 * existing extent.
1618 *
1619 * If there is an overlap discovered, it updates the length of the newext
1620 * such that there will be no overlap, and then returns 1.
1621 * If there is no overlap found, it returns 0.
1622 */
1623static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1624					   struct inode *inode,
1625					   struct ext4_extent *newext,
1626					   struct ext4_ext_path *path)
1627{
1628	ext4_lblk_t b1, b2;
1629	unsigned int depth, len1;
1630	unsigned int ret = 0;
1631
1632	b1 = le32_to_cpu(newext->ee_block);
1633	len1 = ext4_ext_get_actual_len(newext);
1634	depth = ext_depth(inode);
1635	if (!path[depth].p_ext)
1636		goto out;
1637	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1638	b2 &= ~(sbi->s_cluster_ratio - 1);
1639
1640	/*
1641	 * get the next allocated block if the extent in the path
1642	 * is before the requested block(s)
1643	 */
1644	if (b2 < b1) {
1645		b2 = ext4_ext_next_allocated_block(path);
1646		if (b2 == EXT_MAX_BLOCKS)
1647			goto out;
1648		b2 &= ~(sbi->s_cluster_ratio - 1);
1649	}
1650
1651	/* check for wrap through zero on extent logical start block*/
1652	if (b1 + len1 < b1) {
1653		len1 = EXT_MAX_BLOCKS - b1;
1654		newext->ee_len = cpu_to_le16(len1);
1655		ret = 1;
1656	}
1657
1658	/* check for overlap */
1659	if (b1 + len1 > b2) {
1660		newext->ee_len = cpu_to_le16(b2 - b1);
1661		ret = 1;
1662	}
1663out:
1664	return ret;
1665}
1666
1667/*
1668 * ext4_ext_insert_extent:
1669 * tries to merge the requested extent into the existing extent or
1670 * inserts requested extent as new one into the tree,
1671 * creating new leaf in the no-space case.
1672 */
1673int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1674				struct ext4_ext_path *path,
1675				struct ext4_extent *newext, int flag)
1676{
1677	struct ext4_extent_header *eh;
1678	struct ext4_extent *ex, *fex;
1679	struct ext4_extent *nearex; /* nearest extent */
1680	struct ext4_ext_path *npath = NULL;
1681	int depth, len, err;
1682	ext4_lblk_t next;
1683	unsigned uninitialized = 0;
1684	int flags = 0;
1685
1686	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1687		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1688		return -EIO;
1689	}
1690	depth = ext_depth(inode);
1691	ex = path[depth].p_ext;
1692	if (unlikely(path[depth].p_hdr == NULL)) {
1693		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1694		return -EIO;
1695	}
1696
1697	/* try to insert block into found extent and return */
1698	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1699		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1700		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1701			  ext4_ext_is_uninitialized(newext),
1702			  ext4_ext_get_actual_len(newext),
1703			  le32_to_cpu(ex->ee_block),
1704			  ext4_ext_is_uninitialized(ex),
1705			  ext4_ext_get_actual_len(ex),
1706			  ext4_ext_pblock(ex));
1707		err = ext4_ext_get_access(handle, inode, path + depth);
1708		if (err)
1709			return err;
1710
1711		/*
1712		 * ext4_can_extents_be_merged should have checked that either
1713		 * both extents are uninitialized, or both aren't. Thus we
1714		 * need to check only one of them here.
1715		 */
1716		if (ext4_ext_is_uninitialized(ex))
1717			uninitialized = 1;
1718		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1719					+ ext4_ext_get_actual_len(newext));
1720		if (uninitialized)
1721			ext4_ext_mark_uninitialized(ex);
1722		eh = path[depth].p_hdr;
1723		nearex = ex;
1724		goto merge;
1725	}
1726
1727	depth = ext_depth(inode);
1728	eh = path[depth].p_hdr;
1729	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1730		goto has_space;
1731
1732	/* probably next leaf has space for us? */
1733	fex = EXT_LAST_EXTENT(eh);
1734	next = EXT_MAX_BLOCKS;
1735	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1736		next = ext4_ext_next_leaf_block(path);
1737	if (next != EXT_MAX_BLOCKS) {
1738		ext_debug("next leaf block - %d\n", next);
1739		BUG_ON(npath != NULL);
1740		npath = ext4_ext_find_extent(inode, next, NULL);
1741		if (IS_ERR(npath))
1742			return PTR_ERR(npath);
1743		BUG_ON(npath->p_depth != path->p_depth);
1744		eh = npath[depth].p_hdr;
1745		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1746			ext_debug("next leaf isn't full(%d)\n",
1747				  le16_to_cpu(eh->eh_entries));
1748			path = npath;
1749			goto has_space;
1750		}
1751		ext_debug("next leaf has no free space(%d,%d)\n",
1752			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1753	}
1754
1755	/*
1756	 * There is no free space in the found leaf.
1757	 * We're going to add a new leaf to the tree.
1758	 */
1759	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1760		flags = EXT4_MB_USE_ROOT_BLOCKS;
1761	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1762	if (err)
1763		goto cleanup;
1764	depth = ext_depth(inode);
1765	eh = path[depth].p_hdr;
1766
1767has_space:
1768	nearex = path[depth].p_ext;
1769
1770	err = ext4_ext_get_access(handle, inode, path + depth);
1771	if (err)
1772		goto cleanup;
1773
1774	if (!nearex) {
1775		/* there is no extent in this leaf, create first one */
1776		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1777				le32_to_cpu(newext->ee_block),
1778				ext4_ext_pblock(newext),
1779				ext4_ext_is_uninitialized(newext),
1780				ext4_ext_get_actual_len(newext));
1781		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1782	} else if (le32_to_cpu(newext->ee_block)
1783			   > le32_to_cpu(nearex->ee_block)) {
1784/*		BUG_ON(newext->ee_block == nearex->ee_block); */
1785		if (nearex != EXT_LAST_EXTENT(eh)) {
1786			len = EXT_MAX_EXTENT(eh) - nearex;
1787			len = (len - 1) * sizeof(struct ext4_extent);
1788			len = len < 0 ? 0 : len;
1789			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1790					"move %d from 0x%p to 0x%p\n",
1791					le32_to_cpu(newext->ee_block),
1792					ext4_ext_pblock(newext),
1793					ext4_ext_is_uninitialized(newext),
1794					ext4_ext_get_actual_len(newext),
1795					nearex, len, nearex + 1, nearex + 2);
1796			memmove(nearex + 2, nearex + 1, len);
1797		}
1798		path[depth].p_ext = nearex + 1;
1799	} else {
1800		BUG_ON(newext->ee_block == nearex->ee_block);
1801		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1802		len = len < 0 ? 0 : len;
1803		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1804				"move %d from 0x%p to 0x%p\n",
1805				le32_to_cpu(newext->ee_block),
1806				ext4_ext_pblock(newext),
1807				ext4_ext_is_uninitialized(newext),
1808				ext4_ext_get_actual_len(newext),
1809				nearex, len, nearex, nearex + 1);
1810		memmove(nearex + 1, nearex, len);
1811		path[depth].p_ext = nearex;
1812	}
1813
1814	le16_add_cpu(&eh->eh_entries, 1);
1815	nearex = path[depth].p_ext;
1816	nearex->ee_block = newext->ee_block;
1817	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1818	nearex->ee_len = newext->ee_len;
1819
1820merge:
1821	/* try to merge extents to the right */
1822	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1823		ext4_ext_try_to_merge(inode, path, nearex);
1824
1825	/* try to merge extents to the left */
1826
1827	/* time to correct all indexes above */
1828	err = ext4_ext_correct_indexes(handle, inode, path);
1829	if (err)
1830		goto cleanup;
1831
1832	err = ext4_ext_dirty(handle, inode, path + depth);
1833
1834cleanup:
1835	if (npath) {
1836		ext4_ext_drop_refs(npath);
1837		kfree(npath);
1838	}
1839	ext4_ext_invalidate_cache(inode);
1840	return err;
1841}
1842
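/*
 * ext4_ext_walk_space:
 * Walk the logical block range [@block, @block + @num) and call @func
 * for each region, describing it either as an existing extent or as a
 * hole.  The callback may return EXT_BREAK to stop the walk or
 * EXT_REPEAT to revisit the current region.
 */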
1843static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1844			       ext4_lblk_t num, ext_prepare_callback func,
1845			       void *cbdata)
1846{
1847	struct ext4_ext_path *path = NULL;
1848	struct ext4_ext_cache cbex;
1849	struct ext4_extent *ex;
1850	ext4_lblk_t next, start = 0, end = 0;
1851	ext4_lblk_t last = block + num;
1852	int depth, exists, err = 0;
1853
1854	BUG_ON(func == NULL);
1855	BUG_ON(inode == NULL);
1856
1857	while (block < last && block != EXT_MAX_BLOCKS) {
1858		num = last - block;
1859		/* find extent for this block */
1860		down_read(&EXT4_I(inode)->i_data_sem);
1861		path = ext4_ext_find_extent(inode, block, path);
1862		up_read(&EXT4_I(inode)->i_data_sem);
1863		if (IS_ERR(path)) {
1864			err = PTR_ERR(path);
1865			path = NULL;
1866			break;
1867		}
1868
1869		depth = ext_depth(inode);
1870		if (unlikely(path[depth].p_hdr == NULL)) {
1871			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1872			err = -EIO;
1873			break;
1874		}
1875		ex = path[depth].p_ext;
1876		next = ext4_ext_next_allocated_block(path);
1877
1878		exists = 0;
1879		if (!ex) {
1880			/* there is no extent yet, so try to allocate
1881			 * all requested space */
1882			start = block;
1883			end = block + num;
1884		} else if (le32_to_cpu(ex->ee_block) > block) {
1885			/* need to allocate space before found extent */
1886			start = block;
1887			end = le32_to_cpu(ex->ee_block);
1888			if (block + num < end)
1889				end = block + num;
1890		} else if (block >= le32_to_cpu(ex->ee_block)
1891					+ ext4_ext_get_actual_len(ex)) {
1892			/* need to allocate space after found extent */
1893			start = block;
1894			end = block + num;
1895			if (end >= next)
1896				end = next;
1897		} else if (block >= le32_to_cpu(ex->ee_block)) {
1898			/*
1899			 * some part of requested space is covered
1900			 * by found extent
1901			 */
1902			start = block;
1903			end = le32_to_cpu(ex->ee_block)
1904				+ ext4_ext_get_actual_len(ex);
1905			if (block + num < end)
1906				end = block + num;
1907			exists = 1;
1908		} else {
1909			BUG();
1910		}
1911		BUG_ON(end <= start);
1912
1913		if (!exists) {
1914			cbex.ec_block = start;
1915			cbex.ec_len = end - start;
1916			cbex.ec_start = 0;
1917		} else {
1918			cbex.ec_block = le32_to_cpu(ex->ee_block);
1919			cbex.ec_len = ext4_ext_get_actual_len(ex);
1920			cbex.ec_start = ext4_ext_pblock(ex);
1921		}
1922
1923		if (unlikely(cbex.ec_len == 0)) {
1924			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1925			err = -EIO;
1926			break;
1927		}
1928		err = func(inode, next, &cbex, ex, cbdata);
1929		ext4_ext_drop_refs(path);
1930
1931		if (err < 0)
1932			break;
1933
1934		if (err == EXT_REPEAT)
1935			continue;
1936		else if (err == EXT_BREAK) {
1937			err = 0;
1938			break;
1939		}
1940
1941		if (ext_depth(inode) != depth) {
1942			/* depth was changed. we have to realloc path */
1943			kfree(path);
1944			path = NULL;
1945		}
1946
1947		block = cbex.ec_block + cbex.ec_len;
1948	}
1949
1950	if (path) {
1951		ext4_ext_drop_refs(path);
1952		kfree(path);
1953	}
1954
1955	return err;
1956}
1957
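/*
 * ext4_ext_put_in_cache:
 * Store the mapping (@block, @len, @start) in the inode's single-slot
 * extent cache under i_block_reservation_lock.
 */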
1958static void
1959ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1960			__u32 len, ext4_fsblk_t start)
1961{
1962	struct ext4_ext_cache *cex;
1963	BUG_ON(len == 0);
1964	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1965	trace_ext4_ext_put_in_cache(inode, block, len, start);
1966	cex = &EXT4_I(inode)->i_cached_extent;
1967	cex->ec_block = block;
1968	cex->ec_len = len;
1969	cex->ec_start = start;
1970	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1971}
1972
1973/*
1974 * ext4_ext_put_gap_in_cache:
1975 * calculate boundaries of the gap that the requested block fits into
1976 * and cache this gap
1977 */
1978static void
1979ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1980				ext4_lblk_t block)
1981{
1982	int depth = ext_depth(inode);
1983	unsigned long len;
1984	ext4_lblk_t lblock;
1985	struct ext4_extent *ex;
1986
1987	ex = path[depth].p_ext;
1988	if (ex == NULL) {
1989		/* there is no extent yet, so gap is [0;-] */
1990		lblock = 0;
1991		len = EXT_MAX_BLOCKS;
1992		ext_debug("cache gap(whole file):");
1993	} else if (block < le32_to_cpu(ex->ee_block)) {
1994		lblock = block;
1995		len = le32_to_cpu(ex->ee_block) - block;
1996		ext_debug("cache gap(before): %u [%u:%u]",
1997				block,
1998				le32_to_cpu(ex->ee_block),
1999				 ext4_ext_get_actual_len(ex));
2000	} else if (block >= le32_to_cpu(ex->ee_block)
2001			+ ext4_ext_get_actual_len(ex)) {
2002		ext4_lblk_t next;
2003		lblock = le32_to_cpu(ex->ee_block)
2004			+ ext4_ext_get_actual_len(ex);
2005
2006		next = ext4_ext_next_allocated_block(path);
2007		ext_debug("cache gap(after): [%u:%u] %u",
2008				le32_to_cpu(ex->ee_block),
2009				ext4_ext_get_actual_len(ex),
2010				block);
2011		BUG_ON(next == lblock);
2012		len = next - lblock;
2013	} else {
2014		lblock = len = 0;
2015		BUG();
2016	}
2017
2018	ext_debug(" -> %u:%lu\n", lblock, len);
2019	ext4_ext_put_in_cache(inode, lblock, len, 0);
2020}
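
/*
 * Worked example for the gap caching above (numbers are hypothetical): if
 * the nearest extent starts at logical block 100 and a lookup for block 40
 * finds no extent covering it, the gap [40, 100) is cached with ec_start
 * set to 0, so repeated reads of that hole can be answered from the cache
 * without walking the extent tree again.
 */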
2021
2022/*
2023 * ext4_ext_check_cache()
2024 * Checks to see if the given block is in the cache.
2025 * If it is, the cached extent is stored in the given
2026 * cache extent pointer.  If the cached extent is a hole,
2027 * this routine should be used instead of
2028 * ext4_ext_in_cache if the calling function needs to
2029 * know the size of the hole.
2030 *
2031 * @inode: The file's inode
2032 * @block: The block to look for in the cache
2033 * @ex:    Pointer where the cached extent will be stored
2034 *         if it contains block
2035 *
2036 * Return 0 if cache is invalid; 1 if the cache is valid
2037 */
2038static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2039	struct ext4_ext_cache *ex)
{
2040	struct ext4_ext_cache *cex;
2041	struct ext4_sb_info *sbi;
2042	int ret = 0;
2043
2044	/*
2045	 * We borrow i_block_reservation_lock to protect i_cached_extent
2046	 */
2047	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2048	cex = &EXT4_I(inode)->i_cached_extent;
2049	sbi = EXT4_SB(inode->i_sb);
2050
2051	/* has cache valid data? */
2052	if (cex->ec_len == 0)
2053		goto errout;
2054
2055	if (in_range(block, cex->ec_block, cex->ec_len)) {
2056		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2057		ext_debug("%u cached by %u:%u:%llu\n",
2058				block,
2059				cex->ec_block, cex->ec_len, cex->ec_start);
2060		ret = 1;
2061	}
2062errout:
2063	if (!ret)
2064		sbi->extent_cache_misses++;
2065	else
2066		sbi->extent_cache_hits++;
2067	trace_ext4_ext_in_cache(inode, block, ret);
2068	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2069	return ret;
2070}
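
/*
 * Hypothetical caller sketch, not from the original code, showing how a
 * cached hole is told apart from a cached mapping (a hole is stored with
 * ec_start == 0):
 *
 *	struct ext4_ext_cache cache;
 *
 *	if (ext4_ext_check_cache(inode, lblk, &cache)) {
 *		if (cache.ec_start == 0)
 *			hole_len = cache.ec_len;	(cached hole)
 *		else
 *			pblk = cache.ec_start + (lblk - cache.ec_block);
 *	}
 *
 * Here hole_len and pblk are illustrative local variables only.
 */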
2071
2072/*
2073 * ext4_ext_in_cache()
2074 * Checks to see if the given block is in the cache.
2075 * If it is, the cached extent is stored in the given
2076 * extent pointer.
2077 *
2078 * @inode: The file's inode
2079 * @block: The block to look for in the cache
2080 * @ex:    Pointer where the cached extent will be stored
2081 *         if it contains block
2082 *
2083 * Return 0 if cache is invalid; 1 if the cache is valid
2084 */
2085static int
2086ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2087			struct ext4_extent *ex)
2088{
2089	struct ext4_ext_cache cex;
2090	int ret = 0;
2091
2092	if (ext4_ext_check_cache(inode, block, &cex)) {
2093		ex->ee_block = cpu_to_le32(cex.ec_block);
2094		ext4_ext_store_pblock(ex, cex.ec_start);
2095		ex->ee_len = cpu_to_le16(cex.ec_len);
2096		ret = 1;
2097	}
2098
2099	return ret;
2100}
2101
2102
2103/*
2104 * ext4_ext_rm_idx:
2105 * removes the index from the index block.
2106 */
2107static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2108			struct ext4_ext_path *path)
2109{
2110	int err;
2111	ext4_fsblk_t leaf;
2112
2113	/* free index block */
2114	path--;
2115	leaf = ext4_idx_pblock(path->p_idx);
2116	if (unlikely(path->p_hdr->eh_entries == 0)) {
2117		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2118		return -EIO;
2119	}
2120	err = ext4_ext_get_access(handle, inode, path);
2121	if (err)
2122		return err;
2123
2124	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2125		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2126		len *= sizeof(struct ext4_extent_idx);
2127		memmove(path->p_idx, path->p_idx + 1, len);
2128	}
2129
2130	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2131	err = ext4_ext_dirty(handle, inode, path);
2132	if (err)
2133		return err;
2134	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2135	trace_ext4_ext_rm_idx(inode, leaf);
2136
2137	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2138			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2139	return err;
2140}
2141
2142/*
2143 * ext4_ext_calc_credits_for_single_extent:
2144 * This routine returns the maximum credits needed to insert an extent
2145 * into the extent tree.
2146 * When the actual path is passed, the caller should calculate the
2147 * credits under i_data_sem.
2148 */
2149int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2150						struct ext4_ext_path *path)
2151{
2152	if (path) {
2153		int depth = ext_depth(inode);
2154		int ret = 0;
2155
2156		/* probably there is space in leaf? */
2157		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2158				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2159
2160			/*
2161			 *  There is some space left in the leaf, so there is
2162			 *  no need to account for a leaf block credit;
2163			 *
2164			 *  bitmaps and block group descriptor blocks
2165			 *  and other metadata blocks still need to be
2166			 *  accounted.
2167			 */
2168			/* 1 bitmap, 1 block group descriptor */
2169			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2170			return ret;
2171		}
2172	}
2173
2174	return ext4_chunk_trans_blocks(inode, nrblocks);
2175}
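
/*
 * Worked example (values are illustrative): while the leaf still has free
 * slots, the fast path above asks for 2 + EXT4_META_TRANS_BLOCKS(sb)
 * credits, i.e. one block bitmap, one group descriptor, plus the shared
 * metadata estimate.  If that macro evaluated to, say, 8 on a given
 * configuration, a single-extent insert would reserve 10 credits; a full
 * leaf falls back to ext4_chunk_trans_blocks(), which also budgets for
 * index block splits.
 */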
2176
2177/*
2178 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
2179 *
2180 * If nrblocks fit in a single extent (chunk flag is 1), then in the worst
2181 * case each index/leaf block at every tree level needs to be changed; if
2182 * the tree splits due to inserting a new extent, then the old index/leaf
2183 * blocks need to be updated too.
2184 *
2185 * If the nrblocks are discontiguous, they could cause the whole tree to
2186 * split more than once, but this is really rare.
2187 */
2188int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2189{
2190	int index;
2191	int depth = ext_depth(inode);
2192
2193	if (chunk)
2194		index = depth * 2;
2195	else
2196		index = depth * 3;
2197
2198	return index;
2199}
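
/*
 * Worked example (hypothetical depth): for a tree of depth 3, a contiguous
 * chunk (chunk != 0) is charged 3 * 2 = 6 index/leaf blocks, roughly the
 * new and the old index/leaf per level, while discontiguous blocks
 * (chunk == 0) are charged 3 * 3 = 9 to leave headroom for more than one
 * split path.
 */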
2200
2201static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2202			      struct ext4_extent *ex,
2203			      ext4_fsblk_t *partial_cluster,
2204			      ext4_lblk_t from, ext4_lblk_t to)
2205{
2206	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2207	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2208	ext4_fsblk_t pblk;
2209	int flags = EXT4_FREE_BLOCKS_FORGET;
2210
2211	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2212		flags |= EXT4_FREE_BLOCKS_METADATA;
2213	/*
2214	 * For bigalloc file systems, we never free a partial cluster
2215	 * at the beginning of the extent.  Instead, we make a note
2216	 * that we tried freeing the cluster, and check to see if we
2217	 * need to free it on a subsequent call to ext4_remove_blocks,
2218	 * or at the end of the ext4_truncate() operation.
2219	 */
2220	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2221
2222	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2223	/*
2224	 * If we have a partial cluster, and it's different from the
2225	 * cluster of the last block, we need to explicitly free the
2226	 * partial cluster here.
2227	 */
2228	pblk = ext4_ext_pblock(ex) + ee_len - 1;
2229	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2230		ext4_free_blocks(handle, inode, NULL,
2231				 EXT4_C2B(sbi, *partial_cluster),
2232				 sbi->s_cluster_ratio, flags);
2233		*partial_cluster = 0;
2234	}
2235
2236#ifdef EXTENTS_STATS
2237	{
2238		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2239		spin_lock(&sbi->s_ext_stats_lock);
2240		sbi->s_ext_blocks += ee_len;
2241		sbi->s_ext_extents++;
2242		if (ee_len < sbi->s_ext_min)
2243			sbi->s_ext_min = ee_len;
2244		if (ee_len > sbi->s_ext_max)
2245			sbi->s_ext_max = ee_len;
2246		if (ext_depth(inode) > sbi->s_depth_max)
2247			sbi->s_depth_max = ext_depth(inode);
2248		spin_unlock(&sbi->s_ext_stats_lock);
2249	}
2250#endif
2251	if (from >= le32_to_cpu(ex->ee_block)
2252	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2253		/* tail removal */
2254		ext4_lblk_t num;
2255
2256		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2257		pblk = ext4_ext_pblock(ex) + ee_len - num;
2258		ext_debug("free last %u blocks starting %llu\n", num, pblk);
2259		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2260		/*
2261		 * If the block range to be freed didn't start at the
2262		 * beginning of a cluster, and we removed the entire
2263		 * extent, save the partial cluster here, since we
2264		 * might need to delete it if we determine that the
2265		 * truncate operation has removed all of the blocks in
2266		 * the cluster.
2267		 */
2268		if (pblk & (sbi->s_cluster_ratio - 1) &&
2269		    (ee_len == num))
2270			*partial_cluster = EXT4_B2C(sbi, pblk);
2271		else
2272			*partial_cluster = 0;
2273	} else if (from == le32_to_cpu(ex->ee_block)
2274		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2275		/* head removal */
2276		ext4_lblk_t num;
2277		ext4_fsblk_t start;
2278
2279		num = to - from;
2280		start = ext4_ext_pblock(ex);
2281
2282		ext_debug("free first %u blocks starting %llu\n", num, start);
2283		ext4_free_blocks(handle, inode, NULL, start, num, flags);
2284
2285	} else {
2286		printk(KERN_INFO "strange request: removal(2) "
2287				"%u-%u from %u:%u\n",
2288				from, to, le32_to_cpu(ex->ee_block), ee_len);
2289	}
2290	return 0;
2291}
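
/*
 * Illustrative example (numbers are hypothetical): for an extent covering
 * logical blocks 100..119, a call with from == 110 and to == 119 hits the
 * "tail removal" case and frees the last 10 blocks; from == 100 with "to"
 * inside the extent hits the "head removal" case and frees blocks from the
 * start of the extent; a range touching neither edge falls through to the
 * "strange request" message above.
 */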
2292
2293
2294/*
2295 * ext4_ext_rm_leaf() removes the extents associated with the
2296 * blocks appearing between "start" and "end", and splits the extents
2297 * if "start" and "end" appear in the same extent
2298 *
2299 * @handle: The journal handle
2300 * @inode:  The file's inode
2301 * @path:   The path to the leaf
2302 * @start:  The first block to remove
2303 * @end:   The last block to remove
2304 */
2305static int
2306ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2307		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
2308		 ext4_lblk_t start, ext4_lblk_t end)
2309{
2310	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2311	int err = 0, correct_index = 0;
2312	int depth = ext_depth(inode), credits;
2313	struct ext4_extent_header *eh;
2314	ext4_lblk_t a, b, block;
2315	unsigned num;
2316	ext4_lblk_t ex_ee_block;
2317	unsigned short ex_ee_len;
2318	unsigned uninitialized = 0;
2319	struct ext4_extent *ex;
2320	struct ext4_map_blocks map;
2321
2322	/* the header must be checked already in ext4_ext_remove_space() */
2323	ext_debug("truncate since %u in leaf\n", start);
2324	if (!path[depth].p_hdr)
2325		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2326	eh = path[depth].p_hdr;
2327	if (unlikely(path[depth].p_hdr == NULL)) {
2328		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2329		return -EIO;
2330	}
2331	/* find where to start removing */
2332	ex = EXT_LAST_EXTENT(eh);
2333
2334	ex_ee_block = le32_to_cpu(ex->ee_block);
2335	ex_ee_len = ext4_ext_get_actual_len(ex);
2336
2337	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2338
2339	while (ex >= EXT_FIRST_EXTENT(eh) &&
2340			ex_ee_block + ex_ee_len > start) {
2341
2342		if (ext4_ext_is_uninitialized(ex))
2343			uninitialized = 1;
2344		else
2345			uninitialized = 0;
2346
2347		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2348			 uninitialized, ex_ee_len);
2349		path[depth].p_ext = ex;
2350
2351		a = ex_ee_block > start ? ex_ee_block : start;
2352		b = ex_ee_block+ex_ee_len - 1 < end ?
2353			ex_ee_block+ex_ee_len - 1 : end;
2354
2355		ext_debug("  border %u:%u\n", a, b);
2356
2357		/* If this extent is beyond the end of the hole, skip it */
2358		if (end <= ex_ee_block) {
2359			ex--;
2360			ex_ee_block = le32_to_cpu(ex->ee_block);
2361			ex_ee_len = ext4_ext_get_actual_len(ex);
2362			continue;
2363		} else if (a != ex_ee_block &&
2364			b != ex_ee_block + ex_ee_len - 1) {
2365			/*
2366			 * If this is a truncate, then this condition should
2367			 * never happen because at least one of the end points
2368			 * needs to be on the edge of the extent.
2369			 */
2370			if (end == EXT_MAX_BLOCKS - 1) {
2371				ext_debug("  bad truncate %u:%u\n",
2372						start, end);
2373				block = 0;
2374				num = 0;
2375				err = -EIO;
2376				goto out;
2377			}
2378			/*
2379			 * else this is a hole punch, so the extent needs to
2380			 * be split since neither edge of the hole is on the
2381			 * extent edge
2382			 */
2383			else {
2384				map.m_pblk = ext4_ext_pblock(ex);
2385				map.m_lblk = ex_ee_block;
2386				map.m_len = b - ex_ee_block;
2387
2388				err = ext4_split_extent(handle,
2389					inode, path, &map, 0,
2390					EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
2391					EXT4_GET_BLOCKS_PRE_IO);
2392
2393				if (err < 0)
2394					goto out;
2395
2396				ex_ee_len = ext4_ext_get_actual_len(ex);
2397
2398				b = ex_ee_block+ex_ee_len - 1 < end ?
2399					ex_ee_block+ex_ee_len - 1 : end;
2400
2401				/* Then remove tail of this extent */
2402				block = ex_ee_block;
2403				num = a - block;
2404			}
2405		} else if (a != ex_ee_block) {
2406			/* remove tail of the extent */
2407			block = ex_ee_block;
2408			num = a - block;
2409		} else if (b != ex_ee_block + ex_ee_len - 1) {
2410			/* remove head of the extent */
2411			block = b;
2412			num =  ex_ee_block + ex_ee_len - b;
2413
2414			/*
2415			 * If this is a truncate, this condition
2416			 * should never happen
2417			 */
2418			if (end == EXT_MAX_BLOCKS - 1) {
2419				ext_debug("  bad truncate %u:%u\n",
2420					start, end);
2421				err = -EIO;
2422				goto out;
2423			}
2424		} else {
2425			/* remove whole extent: excellent! */
2426			block = ex_ee_block;
2427			num = 0;
2428			if (a != ex_ee_block) {
2429				ext_debug("  bad truncate %u:%u\n",
2430					start, end);
2431				err = -EIO;
2432				goto out;
2433			}
2434
2435			if (b != ex_ee_block + ex_ee_len - 1) {
2436				ext_debug("  bad truncate %u:%u\n",
2437					start, end);
2438				err = -EIO;
2439				goto out;
2440			}
2441		}
2442
2443		/*
2444		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2445		 * descriptor) for each block group; assume two block
2446		 * groups plus ex_ee_len/blocks_per_block_group for
2447		 * the worst case
2448		 */
2449		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2450		if (ex == EXT_FIRST_EXTENT(eh)) {
2451			correct_index = 1;
2452			credits += (ext_depth(inode)) + 1;
2453		}
2454		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2455
2456		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2457		if (err)
2458			goto out;
2459
2460		err = ext4_ext_get_access(handle, inode, path + depth);
2461		if (err)
2462			goto out;
2463
2464		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2465					 a, b);
2466		if (err)
2467			goto out;
2468
2469		if (num == 0) {
2470			/* this extent is removed; mark slot entirely unused */
2471			ext4_ext_store_pblock(ex, 0);
2472		} else if (block != ex_ee_block) {
2473			/*
2474			 * If this was a head removal, then we need to update
2475			 * the physical block since it is now at a different
2476			 * location
2477			 */
2478			ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
2479		}
2480
2481		ex->ee_block = cpu_to_le32(block);
2482		ex->ee_len = cpu_to_le16(num);
2483		/*
2484		 * Do not mark uninitialized if all the blocks in the
2485		 * extent have been removed.
2486		 */
2487		if (uninitialized && num)
2488			ext4_ext_mark_uninitialized(ex);
2489
2490		err = ext4_ext_dirty(handle, inode, path + depth);
2491		if (err)
2492			goto out;
2493
2494		/*
2495		 * If the extent was completely released,
2496		 * we need to remove it from the leaf
2497		 */
2498		if (num == 0) {
2499			if (end != EXT_MAX_BLOCKS - 1) {
2500				/*
2501				 * For hole punching, we need to scoot all the
2502				 * extents up when an extent is removed so that
2503				 * we don't have blank extents in the middle
2504				 */
2505				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2506					sizeof(struct ext4_extent));
2507
2508				/* Now get rid of the one at the end */
2509				memset(EXT_LAST_EXTENT(eh), 0,
2510					sizeof(struct ext4_extent));
2511			}
2512			le16_add_cpu(&eh->eh_entries, -1);
2513		} else
2514			*partial_cluster = 0;
2515
2516		ext_debug("new extent: %u:%u:%llu\n", block, num,
2517				ext4_ext_pblock(ex));
2518		ex--;
2519		ex_ee_block = le32_to_cpu(ex->ee_block);
2520		ex_ee_len = ext4_ext_get_actual_len(ex);
2521	}
2522
2523	if (correct_index && eh->eh_entries)
2524		err = ext4_ext_correct_indexes(handle, inode, path);
2525
2526	/*
2527	 * If there is still an entry in the leaf node, check to see if
2528	 * it references the partial cluster.  This is the only place
2529	 * where it could; if it doesn't, we can free the cluster.
2530	 */
2531	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
2532	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2533	     *partial_cluster)) {
2534		int flags = EXT4_FREE_BLOCKS_FORGET;
2535
2536		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2537			flags |= EXT4_FREE_BLOCKS_METADATA;
2538
2539		ext4_free_blocks(handle, inode, NULL,
2540				 EXT4_C2B(sbi, *partial_cluster),
2541				 sbi->s_cluster_ratio, flags);
2542		*partial_cluster = 0;
2543	}
2544
2545	/* if this leaf is free, then we should
2546	 * remove it from the index block above */
2547	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2548		err = ext4_ext_rm_idx(handle, inode, path + depth);
2549
2550out:
2551	return err;
2552}
2553
2554/*
2555 * ext4_ext_more_to_rm:
2556 * returns 1 if current index has to be freed (even partial)
2557 */
2558static int
2559ext4_ext_more_to_rm(struct ext4_ext_path *path)
2560{
2561	BUG_ON(path->p_idx == NULL);
2562
2563	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2564		return 0;
2565
2566	/*
2567	 * if a truncate on a deeper level happened, it wasn't partial,
2568	 * so we have to consider the current index for truncation
2569	 */
2570	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2571		return 0;
2572	return 1;
2573}
2574
2575static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2576{
2577	struct super_block *sb = inode->i_sb;
2578	int depth = ext_depth(inode);
2579	struct ext4_ext_path *path;
2580	ext4_fsblk_t partial_cluster = 0;
2581	handle_t *handle;
2582	int i, err;
2583
2584	ext_debug("truncate since %u\n", start);
2585
2586	/* probably the first extent we're going to free will be the last in the block */
2587	handle = ext4_journal_start(inode, depth + 1);
2588	if (IS_ERR(handle))
2589		return PTR_ERR(handle);
2590
2591again:
2592	ext4_ext_invalidate_cache(inode);
2593
2594	trace_ext4_ext_remove_space(inode, start, depth);
2595
2596	/*
2597	 * We start scanning from right side, freeing all the blocks
2598	 * after i_size and walking into the tree depth-wise.
2599	 */
2600	depth = ext_depth(inode);
2601	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2602	if (path == NULL) {
2603		ext4_journal_stop(handle);
2604		return -ENOMEM;
2605	}
2606	path[0].p_depth = depth;
2607	path[0].p_hdr = ext_inode_hdr(inode);
2608	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2609		err = -EIO;
2610		goto out;
2611	}
2612	i = err = 0;
2613
2614	while (i >= 0 && err == 0) {
2615		if (i == depth) {
2616			/* this is leaf block */
2617			err = ext4_ext_rm_leaf(handle, inode, path,
2618					       &partial_cluster, start,
2619					       EXT_MAX_BLOCKS - 1);
2620			/* root level has p_bh == NULL, brelse() eats this */
2621			brelse(path[i].p_bh);
2622			path[i].p_bh = NULL;
2623			i--;
2624			continue;
2625		}
2626
2627		/* this is index block */
2628		if (!path[i].p_hdr) {
2629			ext_debug("initialize header\n");
2630			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2631		}
2632
2633		if (!path[i].p_idx) {
2634			/* this level hasn't been touched yet */
2635			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2636			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2637			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2638				  path[i].p_hdr,
2639				  le16_to_cpu(path[i].p_hdr->eh_entries));
2640		} else {
2641			/* we were already here, see at next index */
2642			/* we were already here, so look at the next index */
2643		}
2644
2645		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2646				i, EXT_FIRST_INDEX(path[i].p_hdr),
2647				path[i].p_idx);
2648		if (ext4_ext_more_to_rm(path + i)) {
2649			struct buffer_head *bh;
2650			/* go to the next level */
2651			ext_debug("move to level %d (block %llu)\n",
2652				  i + 1, ext4_idx_pblock(path[i].p_idx));
2653			memset(path + i + 1, 0, sizeof(*path));
2654			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2655			if (!bh) {
2656				/* should we reset i_size? */
2657				err = -EIO;
2658				break;
2659			}
2660			if (WARN_ON(i + 1 > depth)) {
2661				err = -EIO;
2662				break;
2663			}
2664			if (ext4_ext_check(inode, ext_block_hdr(bh),
2665							depth - i - 1)) {
2666				err = -EIO;
2667				break;
2668			}
2669			path[i + 1].p_bh = bh;
2670
2671			/* save actual number of indexes since this
2672			 * number is changed at the next iteration */
2673			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2674			i++;
2675		} else {
2676			/* we finished processing this index, go up */
2677			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2678				/* index is empty, remove it;
2679				/* index is empty, remove it;
2680				 * handle must already be prepared by
2681				 * ext4_ext_rm_leaf() */
2681				err = ext4_ext_rm_idx(handle, inode, path + i);
2682			}
2683			/* root level has p_bh == NULL, brelse() eats this */
2684			brelse(path[i].p_bh);
2685			path[i].p_bh = NULL;
2686			i--;
2687			ext_debug("return to level %d\n", i);
2688		}
2689	}
2690
2691	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
2692			path->p_hdr->eh_entries);
2693
2694	/* If we still have something in the partial cluster and we have removed
2695	 * even the first extent, then we should free the blocks in the partial
2696	 * cluster as well. */
2697	if (partial_cluster && path->p_hdr->eh_entries == 0) {
2698		int flags = EXT4_FREE_BLOCKS_FORGET;
2699
2700		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2701			flags |= EXT4_FREE_BLOCKS_METADATA;
2702
2703		ext4_free_blocks(handle, inode, NULL,
2704				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2705				 EXT4_SB(sb)->s_cluster_ratio, flags);
2706		partial_cluster = 0;
2707	}
2708
2709	/* TODO: flexible tree reduction should be here */
2710	if (path->p_hdr->eh_entries == 0) {
2711		/*
2712		 * truncate to zero freed the whole tree,
2713		 * so we need to correct eh_depth
2714		 */
2715		err = ext4_ext_get_access(handle, inode, path);
2716		if (err == 0) {
2717			ext_inode_hdr(inode)->eh_depth = 0;
2718			ext_inode_hdr(inode)->eh_max =
2719				cpu_to_le16(ext4_ext_space_root(inode, 0));
2720			err = ext4_ext_dirty(handle, inode, path);
2721		}
2722	}
2723out:
2724	ext4_ext_drop_refs(path);
2725	kfree(path);
2726	if (err == -EAGAIN)
2727		goto again;
2728	ext4_journal_stop(handle);
2729
2730	return err;
2731}
2732
2733/*
2734 * called at mount time
2735 */
2736void ext4_ext_init(struct super_block *sb)
2737{
2738	/*
2739	 * possible initialization would be here
2740	 */
2741
2742	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2743#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2744		printk(KERN_INFO "EXT4-fs: file extents enabled");
2745#ifdef AGGRESSIVE_TEST
2746		printk(", aggressive tests");
2747#endif
2748#ifdef CHECK_BINSEARCH
2749		printk(", check binsearch");
2750#endif
2751#ifdef EXTENTS_STATS
2752		printk(", stats");
2753#endif
2754		printk("\n");
2755#endif
2756#ifdef EXTENTS_STATS
2757		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2758		EXT4_SB(sb)->s_ext_min = 1 << 30;
2759		EXT4_SB(sb)->s_ext_max = 0;
2760#endif
2761	}
2762}
2763
2764/*
2765 * called at umount time
2766 */
2767void ext4_ext_release(struct super_block *sb)
2768{
2769	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2770		return;
2771
2772#ifdef EXTENTS_STATS
2773	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2774		struct ext4_sb_info *sbi = EXT4_SB(sb);
2775		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2776			sbi->s_ext_blocks, sbi->s_ext_extents,
2777			sbi->s_ext_blocks / sbi->s_ext_extents);
2778		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2779			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2780	}
2781#endif
2782}
2783
2784/* FIXME!! we need to try to merge to left or right after zero-out  */
2785static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2786{
2787	ext4_fsblk_t ee_pblock;
2788	unsigned int ee_len;
2789	int ret;
2790
2791	ee_len    = ext4_ext_get_actual_len(ex);
2792	ee_pblock = ext4_ext_pblock(ex);
2793
2794	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2795	if (ret > 0)
2796		ret = 0;
2797
2798	return ret;
2799}
2800
2801/*
2802 * used by extent splitting.
2803 */
2804#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
2805					due to ENOSPC */
2806#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
2807#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
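
/*
 * Example flag combinations used below (descriptive only): splitting an
 * uninitialized extent ahead of direct IO passes EXT4_EXT_MARK_UNINIT1 |
 * EXT4_EXT_MARK_UNINIT2 so both halves stay uninitialized, and adds
 * EXT4_EXT_MAY_ZEROOUT only when the extent lies entirely inside i_size,
 * so that an ENOSPC during the split can be handled by zeroing instead.
 */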
2808
2809/*
2810 * ext4_split_extent_at() splits an extent at given block.
2811 *
2812 * @handle: the journal handle
2813 * @inode: the file inode
2814 * @path: the path to the extent
2815 * @split: the logical block where the extent is split.
2816 * @split_flag: indicates if the extent can be zeroed out if the split fails,
2817 *		 and the states (init or uninit) of the new extents.
2818 * @flags: flags used to insert new extent to extent tree.
2819 *
2820 *
2821 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
2822 * states of which are determined by split_flag.
2823 *
2824 * There are two cases:
2825 *  a> the extent is split into two extents.
2826 *  b> a split is not needed, and we just mark the extent.
2827 *
2828 * return 0 on success.
2829 */
2830static int ext4_split_extent_at(handle_t *handle,
2831			     struct inode *inode,
2832			     struct ext4_ext_path *path,
2833			     ext4_lblk_t split,
2834			     int split_flag,
2835			     int flags)
2836{
2837	ext4_fsblk_t newblock;
2838	ext4_lblk_t ee_block;
2839	struct ext4_extent *ex, newex, orig_ex;
2840	struct ext4_extent *ex2 = NULL;
2841	unsigned int ee_len, depth;
2842	int err = 0;
2843
2844	ext_debug("ext4_split_extent_at: inode %lu, logical"
2845		"block %llu\n", inode->i_ino, (unsigned long long)split);
2846
2847	ext4_ext_show_leaf(inode, path);
2848
2849	depth = ext_depth(inode);
2850	ex = path[depth].p_ext;
2851	ee_block = le32_to_cpu(ex->ee_block);
2852	ee_len = ext4_ext_get_actual_len(ex);
2853	newblock = split - ee_block + ext4_ext_pblock(ex);
2854
2855	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2856
2857	err = ext4_ext_get_access(handle, inode, path + depth);
2858	if (err)
2859		goto out;
2860
2861	if (split == ee_block) {
2862		/*
2863		 * case b: block @split is the block that the extent begins with,
2864		 * so we just change the state of the extent; splitting
2865		 * is not needed.
2866		 */
2867		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2868			ext4_ext_mark_uninitialized(ex);
2869		else
2870			ext4_ext_mark_initialized(ex);
2871
2872		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2873			ext4_ext_try_to_merge(inode, path, ex);
2874
2875		err = ext4_ext_dirty(handle, inode, path + depth);
2876		goto out;
2877	}
2878
2879	/* case a */
2880	memcpy(&orig_ex, ex, sizeof(orig_ex));
2881	ex->ee_len = cpu_to_le16(split - ee_block);
2882	if (split_flag & EXT4_EXT_MARK_UNINIT1)
2883		ext4_ext_mark_uninitialized(ex);
2884
2885	/*
2886	 * the path may lead to a new leaf, not to the original leaf any more,
2887	 * after ext4_ext_insert_extent() returns.
2888	 */
2889	err = ext4_ext_dirty(handle, inode, path + depth);
2890	if (err)
2891		goto fix_extent_len;
2892
2893	ex2 = &newex;
2894	ex2->ee_block = cpu_to_le32(split);
2895	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
2896	ext4_ext_store_pblock(ex2, newblock);
2897	if (split_flag & EXT4_EXT_MARK_UNINIT2)
2898		ext4_ext_mark_uninitialized(ex2);
2899
2900	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2901	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2902		err = ext4_ext_zeroout(inode, &orig_ex);
2903		if (err)
2904			goto fix_extent_len;
2905		/* update the extent length and mark as initialized */
2906		ex->ee_len = cpu_to_le16(ee_len);
2907		ext4_ext_try_to_merge(inode, path, ex);
2908		err = ext4_ext_dirty(handle, inode, path + depth);
2909		goto out;
2910	} else if (err)
2911		goto fix_extent_len;
2912
2913out:
2914	ext4_ext_show_leaf(inode, path);
2915	return err;
2916
2917fix_extent_len:
2918	ex->ee_len = orig_ex.ee_len;
2919	ext4_ext_dirty(handle, inode, path + depth);
2920	return err;
2921}
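
/*
 * Worked example (numbers are hypothetical): splitting an uninitialized
 * extent covering logical blocks [100, 120) at split == 108 with
 * EXT4_EXT_MARK_UNINIT1 | EXT4_EXT_MARK_UNINIT2 shrinks the original
 * extent to [100, 108) and inserts a second extent [108, 120) whose first
 * physical block is ext4_ext_pblock(ex) + 8; both halves stay
 * uninitialized.  With split == 100 the "case b" path is taken: nothing is
 * split and only the state of the existing extent is changed.
 */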
2922
2923/*
2924 * ext4_split_extent() splits an extent and marks the extent covered
2925 * by @map as @split_flag indicates.
2926 *
2927 * It may result in splitting the extent into multiple extents (up to three).
2928 * There are three possibilities:
2929 *   a> There is no split required
2930 *   b> Splits into two extents: the split happens at either end of the extent
2931 *   c> Splits into three extents: someone is splitting in the middle of the extent
2932 *
2933 */
2934static int ext4_split_extent(handle_t *handle,
2935			      struct inode *inode,
2936			      struct ext4_ext_path *path,
2937			      struct ext4_map_blocks *map,
2938			      int split_flag,
2939			      int flags)
2940{
2941	ext4_lblk_t ee_block;
2942	struct ext4_extent *ex;
2943	unsigned int ee_len, depth;
2944	int err = 0;
2945	int uninitialized;
2946	int split_flag1, flags1;
2947
2948	depth = ext_depth(inode);
2949	ex = path[depth].p_ext;
2950	ee_block = le32_to_cpu(ex->ee_block);
2951	ee_len = ext4_ext_get_actual_len(ex);
2952	uninitialized = ext4_ext_is_uninitialized(ex);
2953
2954	if (map->m_lblk + map->m_len < ee_block + ee_len) {
2955		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2956			      EXT4_EXT_MAY_ZEROOUT : 0;
2957		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2958		if (uninitialized)
2959			split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2960				       EXT4_EXT_MARK_UNINIT2;
2961		err = ext4_split_extent_at(handle, inode, path,
2962				map->m_lblk + map->m_len, split_flag1, flags1);
2963		if (err)
2964			goto out;
2965	}
2966
2967	ext4_ext_drop_refs(path);
2968	path = ext4_ext_find_extent(inode, map->m_lblk, path);
2969	if (IS_ERR(path))
2970		return PTR_ERR(path);
2971
2972	if (map->m_lblk >= ee_block) {
2973		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2974			      EXT4_EXT_MAY_ZEROOUT : 0;
2975		if (uninitialized)
2976			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2977		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2978			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2979		err = ext4_split_extent_at(handle, inode, path,
2980				map->m_lblk, split_flag1, flags);
2981		if (err)
2982			goto out;
2983	}
2984
2985	ext4_ext_show_leaf(inode, path);
2986out:
2987	return err ? err : map->m_len;
2988}
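
/*
 * Worked example (numbers are hypothetical): with an extent covering
 * [100, 120) and a map of m_lblk == 105, m_len == 5, the first call to
 * ext4_split_extent_at() splits at 110 (m_lblk + m_len), the path is
 * looked up again, and the second call splits at 105, leaving three
 * extents: [100, 105), [105, 110) and [110, 120).  If the map starts right
 * at the extent boundary, the second call takes "case b" and only changes
 * the state of the first piece.
 */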
2989
2990#define EXT4_EXT_ZERO_LEN 7
2991/*
2992 * This function is called by ext4_ext_map_blocks() if someone tries to write
2993 * to an uninitialized extent. It may result in splitting the uninitialized
2994 * extent into multiple extents (up to three - one initialized and two
2995 * uninitialized).
2996 * There are three possibilities:
2997 *   a> There is no split required: Entire extent should be initialized
2998 *   b> Splits in two extents: Write is happening at either end of the extent
2999 *   c> Splits in three extents: Someone is writing in the middle of the extent
3000 */
3001static int ext4_ext_convert_to_initialized(handle_t *handle,
3002					   struct inode *inode,
3003					   struct ext4_map_blocks *map,
3004					   struct ext4_ext_path *path)
3005{
3006	struct ext4_map_blocks split_map;
3007	struct ext4_extent zero_ex;
3008	struct ext4_extent *ex;
3009	ext4_lblk_t ee_block, eof_block;
3010	unsigned int allocated, ee_len, depth;
3011	int err = 0;
3012	int split_flag = 0;
3013
3014	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
3015		"block %llu, max_blocks %u\n", inode->i_ino,
3016		(unsigned long long)map->m_lblk, map->m_len);
3017
3018	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3019		inode->i_sb->s_blocksize_bits;
3020	if (eof_block < map->m_lblk + map->m_len)
3021		eof_block = map->m_lblk + map->m_len;
3022
3023	depth = ext_depth(inode);
3024	ex = path[depth].p_ext;
3025	ee_block = le32_to_cpu(ex->ee_block);
3026	ee_len = ext4_ext_get_actual_len(ex);
3027	allocated = ee_len - (map->m_lblk - ee_block);
3028
3029	WARN_ON(map->m_lblk < ee_block);
3030	/*
3031	 * It is safe to convert extent to initialized via explicit
3032	 * zeroout only if extent is fully inside i_size or new_size.
3033	 */
3034	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3035
3036	/* If the extent has no more than 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
3037	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
3038	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3039		err = ext4_ext_zeroout(inode, ex);
3040		if (err)
3041			goto out;
3042
3043		err = ext4_ext_get_access(handle, inode, path + depth);
3044		if (err)
3045			goto out;
3046		ext4_ext_mark_initialized(ex);
3047		ext4_ext_try_to_merge(inode, path, ex);
3048		err = ext4_ext_dirty(handle, inode, path + depth);
3049		goto out;
3050	}
3051
3052	/*
3053	 * four cases:
3054	 * 1. split the extent into three extents.
3055	 * 2. split the extent into two extents, zeroout the first half.
3056	 * 3. split the extent into two extents, zeroout the second half.
3057	 * 4. split the extent into two extents without zeroout.
3058	 */
3059	split_map.m_lblk = map->m_lblk;
3060	split_map.m_len = map->m_len;
3061
3062	if (allocated > map->m_len) {
3063		if (allocated <= EXT4_EXT_ZERO_LEN &&
3064		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3065			/* case 3 */
3066			zero_ex.ee_block =
3067					 cpu_to_le32(map->m_lblk);
3068			zero_ex.ee_len = cpu_to_le16(allocated);
3069			ext4_ext_store_pblock(&zero_ex,
3070				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3071			err = ext4_ext_zeroout(inode, &zero_ex);
3072			if (err)
3073				goto out;
3074			split_map.m_lblk = map->m_lblk;
3075			split_map.m_len = allocated;
3076		} else if ((map->m_lblk - ee_block + map->m_len <
3077			   EXT4_EXT_ZERO_LEN) &&
3078			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3079			/* case 2 */
3080			if (map->m_lblk != ee_block) {
3081				zero_ex.ee_block = ex->ee_block;
3082				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3083							ee_block);
3084				ext4_ext_store_pblock(&zero_ex,
3085						      ext4_ext_pblock(ex));
3086				err = ext4_ext_zeroout(inode, &zero_ex);
3087				if (err)
3088					goto out;
3089			}
3090
3091			split_map.m_lblk = ee_block;
3092			split_map.m_len = map->m_lblk - ee_block + map->m_len;
3093			allocated = map->m_len;
3094		}
3095	}
3096
3097	allocated = ext4_split_extent(handle, inode, path,
3098				       &split_map, split_flag, 0);
3099	if (allocated < 0)
3100		err = allocated;
3101
3102out:
3103	return err ? err : allocated;
3104}
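
/*
 * Worked example of the zeroout heuristics above (numbers are
 * hypothetical, EXT4_EXT_ZERO_LEN == 7): for an uninitialized extent
 * [100, 130) lying fully inside i_size and a write of 4 blocks at
 * m_lblk == 124, allocated == 6 <= 7, so case 3 zeroes blocks 124..129 and
 * the extent is split into an uninitialized [100, 124) and an initialized
 * [124, 130).  An extent of 2 * EXT4_EXT_ZERO_LEN blocks or fewer is
 * simply zeroed and marked initialized as a whole, with no split at all.
 */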
3105
3106/*
3107 * This function is called by ext4_ext_map_blocks() from
3108 * ext4_get_blocks_dio_write() when DIO to write
3109 * to an uninitialized extent.
3110 *
3111 * Writing to an uninitialized extent may result in splitting the uninitialized
3112 * extent into multiple initialized/uninitialized extents (up to three).
3113 * There are three possibilities:
3114 *   a> There is no split required: Entire extent should be uninitialized
3115 *   b> Splits in two extents: Write is happening at either end of the extent
3116 *   c> Splits in three extents: Someone is writing in the middle of the extent
3117 *
3118 * One or more index blocks may be needed if the extent tree grows after
3119 * the uninitialized extent is split. To prevent ENOSPC from occurring when
3120 * the IO is completed, we split the uninitialized extent before the DIO
3121 * submits the IO. The uninitialized extent will be split into at most
3122 * three uninitialized extents. After the IO completes, the part being
3123 * filled will be converted to initialized by the end_io callback
3124 * via ext4_convert_unwritten_extents().
3125 *
3126 * Returns the size of the uninitialized extent to be written on success.
3127 */
3128static int ext4_split_unwritten_extents(handle_t *handle,
3129					struct inode *inode,
3130					struct ext4_map_blocks *map,
3131					struct ext4_ext_path *path,
3132					int flags)
3133{
3134	ext4_lblk_t eof_block;
3135	ext4_lblk_t ee_block;
3136	struct ext4_extent *ex;
3137	unsigned int ee_len;
3138	int split_flag = 0, depth;
3139
3140	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
3141		"block %llu, max_blocks %u\n", inode->i_ino,
3142		(unsigned long long)map->m_lblk, map->m_len);
3143
3144	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3145		inode->i_sb->s_blocksize_bits;
3146	if (eof_block < map->m_lblk + map->m_len)
3147		eof_block = map->m_lblk + map->m_len;
3148	/*
3149	 * It is safe to convert extent to initialized via explicit
3150	 * zeroout only if extent is fully inside i_size or new_size.
3151	 */
3152	depth = ext_depth(inode);
3153	ex = path[depth].p_ext;
3154	ee_block = le32_to_cpu(ex->ee_block);
3155	ee_len = ext4_ext_get_actual_len(ex);
3156
3157	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3158	split_flag |= EXT4_EXT_MARK_UNINIT2;
3159
3160	flags |= EXT4_GET_BLOCKS_PRE_IO;
3161	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3162}
3163
3164static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3165					      struct inode *inode,
3166					      struct ext4_ext_path *path)
3167{
3168	struct ext4_extent *ex;
3169	int depth;
3170	int err = 0;
3171
3172	depth = ext_depth(inode);
3173	ex = path[depth].p_ext;
3174
3175	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3176		"block %llu, max_blocks %u\n", inode->i_ino,
3177		(unsigned long long)le32_to_cpu(ex->ee_block),
3178		ext4_ext_get_actual_len(ex));
3179
3180	err = ext4_ext_get_access(handle, inode, path + depth);
3181	if (err)
3182		goto out;
3183	/* first mark the extent as initialized */
3184	ext4_ext_mark_initialized(ex);
3185
3186	/* note: ext4_ext_correct_indexes() isn't needed here because
3187	 * borders are not changed
3188	 */
3189	ext4_ext_try_to_merge(inode, path, ex);
3190
3191	/* Mark modified extent as dirty */
3192	err = ext4_ext_dirty(handle, inode, path + depth);
3193out:
3194	ext4_ext_show_leaf(inode, path);
3195	return err;
3196}
3197
3198static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3199			sector_t block, int count)
3200{
3201	int i;
3202	for (i = 0; i < count; i++)
3203		unmap_underlying_metadata(bdev, block + i);
3204}
3205
3206/*
3207 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3208 */
3209static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3210			      ext4_lblk_t lblk,
3211			      struct ext4_ext_path *path,
3212			      unsigned int len)
3213{
3214	int i, depth;
3215	struct ext4_extent_header *eh;
3216	struct ext4_extent *last_ex;
3217
3218	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3219		return 0;
3220
3221	depth = ext_depth(inode);
3222	eh = path[depth].p_hdr;
3223
3224	if (unlikely(!eh->eh_entries)) {
3225		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3226				 "EOFBLOCKS_FL set");
3227		return -EIO;
3228	}
3229	last_ex = EXT_LAST_EXTENT(eh);
3230	/*
3231	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3232	 * last block in the last extent in the file.  We test this by
3233	 * first checking to see if the caller to
3234	 * ext4_ext_get_blocks() was interested in the last block (or
3235	 * a block beyond the last block) in the current extent.  If
3236	 * this turns out to be false, we can bail out from this
3237	 * function immediately.
3238	 */
3239	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3240	    ext4_ext_get_actual_len(last_ex))
3241		return 0;
3242	/*
3243	 * If the caller does appear to be planning to write at or
3244	 * beyond the end of the current extent, we then test to see
3245	 * if the current extent is the last extent in the file, by
3246	 * checking to make sure it was reached via the rightmost node
3247	 * at each level of the tree.
3248	 */
3249	for (i = depth-1; i >= 0; i--)
3250		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3251			return 0;
3252	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3253	return ext4_mark_inode_dirty(handle, inode);
3254}
3255
3256/**
3257 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3258 *
3259 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
3260 * whether there are any buffers marked for delayed allocation. It returns '1'
3261 * on the first delalloc'ed buffer head found. If no buffer head in the given
3262 * range is marked for delalloc, it returns 0.
3263 * lblk_start should always be <= lblk_end.
3264 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3265 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3266 * block sooner). This is useful when blocks are truncated sequentially from
3267 * lblk_start towards lblk_end.
3268 */
3269static int ext4_find_delalloc_range(struct inode *inode,
3270				    ext4_lblk_t lblk_start,
3271				    ext4_lblk_t lblk_end,
3272				    int search_hint_reverse)
3273{
3274	struct address_space *mapping = inode->i_mapping;
3275	struct buffer_head *head, *bh = NULL;
3276	struct page *page;
3277	ext4_lblk_t i, pg_lblk;
3278	pgoff_t index;
3279
3280	/* reverse search won't work if fs block size is less than page size */
3281	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3282		search_hint_reverse = 0;
3283
3284	if (search_hint_reverse)
3285		i = lblk_end;
3286	else
3287		i = lblk_start;
3288
3289	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3290
3291	while ((i >= lblk_start) && (i <= lblk_end)) {
3292		page = find_get_page(mapping, index);
3293		if (!page)
3294			goto nextpage;
3295
3296		if (!page_has_buffers(page))
3297			goto nextpage;
3298
3299		head = page_buffers(page);
3300		if (!head)
3301			goto nextpage;
3302
3303		bh = head;
3304		pg_lblk = index << (PAGE_CACHE_SHIFT -
3305						inode->i_blkbits);
3306		do {
3307			if (unlikely(pg_lblk < lblk_start)) {
3308				/*
3309				 * This is possible when fs block size is less
3310				 * than page size and our cluster starts/ends in
3311				 * middle of the page. So we need to skip the
3312				 * initial few blocks till we reach the 'lblk'
3313				 */
3314				pg_lblk++;
3315				continue;
3316			}
3317
3318			/* Check if the buffer is delayed allocated and that it
3319			 * is not yet mapped. (when da-buffers are mapped during
3320			 * their writeout, their da_mapped bit is set.)
3321			 */
3322			if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3323				page_cache_release(page);
3324				trace_ext4_find_delalloc_range(inode,
3325						lblk_start, lblk_end,
3326						search_hint_reverse,
3327						1, i);
3328				return 1;
3329			}
3330			if (search_hint_reverse)
3331				i--;
3332			else
3333				i++;
3334		} while ((i >= lblk_start) && (i <= lblk_end) &&
3335				((bh = bh->b_this_page) != head));
3336nextpage:
3337		if (page)
3338			page_cache_release(page);
3339		/*
3340		 * Move to next page. 'i' will be the first lblk in the next
3341		 * page.
3342		 */
3343		if (search_hint_reverse)
3344			index--;
3345		else
3346			index++;
3347		i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3348	}
3349
3350	trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3351					search_hint_reverse, 0, 0);
3352	return 0;
3353}
3354
3355int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
3356			       int search_hint_reverse)
3357{
3358	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3359	ext4_lblk_t lblk_start, lblk_end;
3360	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3361	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3362
3363	return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3364					search_hint_reverse);
3365}
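
/*
 * Illustrative example (hypothetical geometry): with a cluster ratio of
 * 16, a query for lblk 37 checks the whole cluster [32, 47] for delayed
 * buffers, since delalloc reservations are tracked per cluster on bigalloc
 * file systems.
 */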
3366
3367/**
3368 * Determines how many complete clusters (out of those specified by the 'map')
3369 * are under delalloc and had quota reserved for them.
3370 * This function is called when we are writing out the blocks that were
3371 * originally written with their allocation delayed, but then the space was
3372 * allocated using fallocate() before the delayed allocation could be resolved.
3373 * The cases to look for are:
3374 * ('=' indicates delayed allocated blocks
3375 *  '-' indicates non-delayed allocated blocks)
3376 * (a) partial clusters towards beginning and/or end outside of allocated range
3377 *     are not delalloc'ed.
3378 *	Ex:
3379 *	|----c---=|====c====|====c====|===-c----|
3380 *	         |++++++ allocated ++++++|
3381 *	==> 4 complete clusters in above example
3382 *
3383 * (b) partial cluster (outside of allocated range) towards either end is
3384 *     marked for delayed allocation. In this case, we will exclude that
3385 *     cluster.
3386 *	Ex:
3387 *	|----====c========|========c========|
3388 *	     |++++++ allocated ++++++|
3389 *	==> 1 complete cluster in above example
3390 *
3391 *	Ex:
3392 *	|================c================|
3393 *            |++++++ allocated ++++++|
3394 *	==> 0 complete clusters in above example
3395 *
3396 * The ext4_da_update_reserve_space will be called only if we
3397 * determine here that there were some "entire" clusters that span
3398 * this 'allocated' range.
3399 * In the non-bigalloc case, this function will just end up returning num_blks
3400 * without ever calling ext4_find_delalloc_range.
3401 */
3402static unsigned int
3403get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3404			   unsigned int num_blks)
3405{
3406	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3407	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3408	ext4_lblk_t lblk_from, lblk_to, c_offset;
3409	unsigned int allocated_clusters = 0;
3410
3411	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3412	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3413
3414	/* max possible clusters for this allocation */
3415	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3416
3417	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3418
3419	/* Check towards left side */
3420	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3421	if (c_offset) {
3422		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3423		lblk_to = lblk_from + c_offset - 1;
3424
3425		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3426			allocated_clusters--;
3427	}
3428
3429	/* Now check towards right. */
3430	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3431	if (allocated_clusters && c_offset) {
3432		lblk_from = lblk_start + num_blks;
3433		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3434
3435		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3436			allocated_clusters--;
3437	}
3438
3439	return allocated_clusters;
3440}
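
/*
 * Worked example (numbers are hypothetical, cluster ratio 4): an
 * allocation of blocks 5..14 spans clusters 1..3, so allocated_clusters
 * starts at 3.  The left check then looks at block 4 (the rest of cluster
 * 1) and the right check at block 15 (the rest of cluster 3); each of
 * those found to be delalloc'ed knocks one cluster off the count, matching
 * case (b) in the comment above.
 */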
3441
3442static int
3443ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3444			struct ext4_map_blocks *map,
3445			struct ext4_ext_path *path, int flags,
3446			unsigned int allocated, ext4_fsblk_t newblock)
3447{
3448	int ret = 0;
3449	int err = 0;
3450	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3451
3452	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3453		  "block %llu, max_blocks %u, flags %d, allocated %u",
3454		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3455		  flags, allocated);
3456	ext4_ext_show_leaf(inode, path);
3457
3458	trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
3459						    newblock);
3460
3461	/* get_block() before submit the IO, split the extent */
3462	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3463		ret = ext4_split_unwritten_extents(handle, inode, map,
3464						   path, flags);
3465		/*
3466		 * Flag the inode (non-AIO case) or the end_io struct (AIO case)
3467		 * that this IO needs conversion to written when the IO is
3468		 * completed
3469		 */
3470		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3471			io->flag = EXT4_IO_END_UNWRITTEN;
3472			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3473		} else
3474			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3475		if (ext4_should_dioread_nolock(inode))
3476			map->m_flags |= EXT4_MAP_UNINIT;
3477		goto out;
3478	}
3479	/* IO end_io complete, convert the filled extent to written */
3480	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3481		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3482							path);
3483		if (ret >= 0) {
3484			ext4_update_inode_fsync_trans(handle, inode, 1);
3485			err = check_eofblocks_fl(handle, inode, map->m_lblk,
3486						 path, map->m_len);
3487		} else
3488			err = ret;
3489		goto out2;
3490	}
3491	/* buffered IO case */
3492	/*
3493	 * repeated fallocate creation request:
3494	 * we already have an unwritten extent
3495	 */
3496	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3497		goto map_out;
3498
3499	/* buffered READ or buffered write_begin() lookup */
3500	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3501		/*
3502		 * We have blocks reserved already.  We
3503		 * return allocated blocks so that delalloc
3504		 * won't do block reservation for us.  But
3505		 * the buffer head will be unmapped so that
3506		 * a read from the block returns 0s.
3507		 */
3508		map->m_flags |= EXT4_MAP_UNWRITTEN;
3509		goto out1;
3510	}
3511
3512	/* buffered write, writepage time, convert*/
3513	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3514	if (ret >= 0) {
3515		ext4_update_inode_fsync_trans(handle, inode, 1);
3516		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3517					 map->m_len);
3518		if (err < 0)
3519			goto out2;
3520	}
3521
3522out:
3523	if (ret <= 0) {
3524		err = ret;
3525		goto out2;
3526	} else
3527		allocated = ret;
3528	map->m_flags |= EXT4_MAP_NEW;
3529	/*
3530	 * if we allocated more blocks than requested
3531	 * we need to make sure we unmap the extra blocks
3532	 * allocated. The blocks actually needed will get
3533	 * unmapped later when we find the buffer_head marked
3534	 * new.
3535	 */
3536	if (allocated > map->m_len) {
3537		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3538					newblock + map->m_len,
3539					allocated - map->m_len);
3540		allocated = map->m_len;
3541	}
3542
3543	/*
3544	 * If we have done fallocate with the offset that is already
3545	 * delayed allocated, we would have block reservation
3546	 * and quota reservation done in the delayed write path.
3547	 * But fallocate would have already updated quota and block
3548	 * count for this offset. So cancel these reservations.
3549	 */
3550	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3551		unsigned int reserved_clusters;
3552		reserved_clusters = get_reserved_cluster_alloc(inode,
3553				map->m_lblk, map->m_len);
3554		if (reserved_clusters)
3555			ext4_da_update_reserve_space(inode,
3556						     reserved_clusters,
3557						     0);
3558	}
3559
3560map_out:
3561	map->m_flags |= EXT4_MAP_MAPPED;
3562out1:
3563	if (allocated > map->m_len)
3564		allocated = map->m_len;
3565	ext4_ext_show_leaf(inode, path);
3566	map->m_pblk = newblock;
3567	map->m_len = allocated;
3568out2:
3569	if (path) {
3570		ext4_ext_drop_refs(path);
3571		kfree(path);
3572	}
3573	return err ? err : allocated;
3574}
3575
3576/*
3577 * get_implied_cluster_alloc - check to see if the requested
3578 * allocation (in the map structure) overlaps with a cluster already
3579 * allocated in an extent.
3580 *	@sb	The filesystem superblock structure
3581 *	@map	The requested lblk->pblk mapping
3582 *	@ex	The extent structure which might contain an implied
3583 *			cluster allocation
3584 *
3585 * This function is called by ext4_ext_map_blocks() after we failed to
3586 * find blocks that were already in the inode's extent tree.  Hence,
3587 * we know that the beginning of the requested region cannot overlap
3588 * the extent from the inode's extent tree.  There are three cases we
3589 * want to catch.  The first is this case:
3590 *
3591 *		 |--- cluster # N--|
3592 *    |--- extent ---|	|---- requested region ---|
3593 *			|==========|
3594 *
3595 * The second case that we need to test for is this one:
3596 *
3597 *   |--------- cluster # N ----------------|
3598 *	   |--- requested region --|   |------- extent ----|
3599 *	   |=======================|
3600 *
3601 * The third case is when the requested region lies between two extents
3602 * within the same cluster:
3603 *          |------------- cluster # N-------------|
3604 * |----- ex -----|                  |---- ex_right ----|
3605 *                  |------ requested region ------|
3606 *                  |================|
3607 *
3608 * In each of the above cases, we need to set the map->m_pblk and
3609 * map->m_len so that they correspond to the extent labelled as
3610 * "|====|" from cluster #N, since it is already in use for data in
3611 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
3612 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3613 * as a new "allocated" block region.  Otherwise, we will return 0 and
3614 * ext4_ext_map_blocks() will then allocate one or more new clusters
3615 * by calling ext4_mb_new_blocks().
3616 */
3617static int get_implied_cluster_alloc(struct super_block *sb,
3618				     struct ext4_map_blocks *map,
3619				     struct ext4_extent *ex,
3620				     struct ext4_ext_path *path)
3621{
3622	struct ext4_sb_info *sbi = EXT4_SB(sb);
3623	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3624	ext4_lblk_t ex_cluster_start, ex_cluster_end;
3625	ext4_lblk_t rr_cluster_start, rr_cluster_end;
3626	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3627	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3628	unsigned short ee_len = ext4_ext_get_actual_len(ex);
3629
3630	/* The extent passed in that we are trying to match */
3631	ex_cluster_start = EXT4_B2C(sbi, ee_block);
3632	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3633
3634	/* The requested region passed into ext4_map_blocks() */
3635	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3636	rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
3637
3638	if ((rr_cluster_start == ex_cluster_end) ||
3639	    (rr_cluster_start == ex_cluster_start)) {
3640		if (rr_cluster_start == ex_cluster_end)
3641			ee_start += ee_len - 1;
3642		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3643			c_offset;
3644		map->m_len = min(map->m_len,
3645				 (unsigned) sbi->s_cluster_ratio - c_offset);
3646		/*
3647		 * Check for and handle this case:
3648		 *
3649		 *   |--------- cluster # N-------------|
3650		 *		       |------- extent ----|
3651		 *	   |--- requested region ---|
3652		 *	   |===========|
3653		 */
3654
3655		if (map->m_lblk < ee_block)
3656			map->m_len = min(map->m_len, ee_block - map->m_lblk);
3657
3658		/*
3659		 * Check for the case where there is already another allocated
3660		 * block to the right of 'ex' but before the end of the cluster.
3661		 *
3662		 *          |------------- cluster # N-------------|
3663		 * |----- ex -----|                  |---- ex_right ----|
3664		 *                  |------ requested region ------|
3665		 *                  |================|
3666		 */
3667		if (map->m_lblk > ee_block) {
3668			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3669			map->m_len = min(map->m_len, next - map->m_lblk);
3670		}
3671
3672		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3673		return 1;
3674	}
3675
3676	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3677	return 0;
3678}
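
/*
 * Worked example (numbers are hypothetical, cluster ratio 16): an extent
 * maps logical blocks 30..32 to physical blocks 1030..1032 and the
 * requested region starts at logical block 34.  Logical blocks 32 and 34
 * both live in cluster #2, so the request is served from that cluster:
 * c_offset == 2, map->m_pblk == (1032 & ~15) + 2 == 1026, and map->m_len
 * is clipped to the remainder of the cluster and to the next allocated
 * block.
 */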
3679
3680
3681/*
3682 * Block allocation/map/preallocation routine for extents based files
3683 *
3684 *
3685 * Need to be called with
3686 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3687 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3688 *
3689 * return > 0, number of blocks already mapped/allocated
3690 *          if create == 0 and these are pre-allocated blocks
3691 *          	buffer head is unmapped
3692 *          otherwise blocks are mapped
3693 *
3694 * return = 0, if plain lookup failed (blocks have not been allocated)
3695 *          buffer head is unmapped
3696 *
3697 * return < 0, error case.
3698 */
3699int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3700			struct ext4_map_blocks *map, int flags)
3701{
3702	struct ext4_ext_path *path = NULL;
3703	struct ext4_extent newex, *ex, *ex2;
3704	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3705	ext4_fsblk_t newblock = 0;
3706	int free_on_err = 0, err = 0, depth, ret;
3707	unsigned int allocated = 0, offset = 0;
3708	unsigned int allocated_clusters = 0, reserved_clusters = 0;
3709	unsigned int punched_out = 0;
3710	unsigned int result = 0;
3711	struct ext4_allocation_request ar;
3712	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3713	ext4_lblk_t cluster_offset;
3714	struct ext4_map_blocks punch_map;
3715
3716	ext_debug("blocks %u/%u requested for inode %lu\n",
3717		  map->m_lblk, map->m_len, inode->i_ino);
3718	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3719
3720	/* check in cache */
3721	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3722		ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3723		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3724			if ((sbi->s_cluster_ratio > 1) &&
3725			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3726				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3727
3728			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3729				/*
3730				 * block isn't allocated yet and
3731				 * user doesn't want to allocate it
3732				 */
3733				goto out2;
3734			}
3735			/* we should allocate requested block */
3736		} else {
3737			/* block is already allocated */
3738			if (sbi->s_cluster_ratio > 1)
3739				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3740			newblock = map->m_lblk
3741				   - le32_to_cpu(newex.ee_block)
3742				   + ext4_ext_pblock(&newex);
3743			/* number of remaining blocks in the extent */
3744			allocated = ext4_ext_get_actual_len(&newex) -
3745				(map->m_lblk - le32_to_cpu(newex.ee_block));
3746			goto out;
3747		}
3748	}
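	/*
	 * Worked example for the cache hit above (illustrative; numbers are
	 * assumed): if the cached extent starts at logical block 100 with
	 * physical start 5000 and an actual length of 50, then for
	 * map->m_lblk = 120:
	 *
	 *	newblock  = 120 - 100 + 5000 = 5020
	 *	allocated = 50 - (120 - 100) = 30 remaining blocks
	 */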
3749
3750	/* find extent for this block */
3751	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3752	if (IS_ERR(path)) {
3753		err = PTR_ERR(path);
3754		path = NULL;
3755		goto out2;
3756	}
3757
3758	depth = ext_depth(inode);
3759
3760	/*
3761	 * consistent leaf must not be empty;
3762	 * this situation is possible, though, _during_ tree modification;
3763	 * this is why assert can't be put in ext4_ext_find_extent()
3764	 */
3765	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3766		EXT4_ERROR_INODE(inode, "bad extent address "
3767				 "lblock: %lu, depth: %d pblock %lld",
3768				 (unsigned long) map->m_lblk, depth,
3769				 path[depth].p_block);
3770		err = -EIO;
3771		goto out2;
3772	}
3773
3774	ex = path[depth].p_ext;
3775	if (ex) {
3776		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3777		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3778		unsigned short ee_len;
3779
3780		/*
3781		 * Uninitialized extents are treated as holes, except that
3782		 * we split out initialized portions during a write.
3783		 */
3784		ee_len = ext4_ext_get_actual_len(ex);
3785
3786		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
3787
3788		/* if found extent covers block, simply return it */
3789		if (in_range(map->m_lblk, ee_block, ee_len)) {
3790			ext4_fsblk_t partial_cluster = 0;
3791
3792			newblock = map->m_lblk - ee_block + ee_start;
3793			/* number of remaining blocks in the extent */
3794			allocated = ee_len - (map->m_lblk - ee_block);
3795			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3796				  ee_block, ee_len, newblock);
3797
3798			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3799				/*
3800				 * Do not put uninitialized extent
3801				 * in the cache
3802				 */
3803				if (!ext4_ext_is_uninitialized(ex)) {
3804					ext4_ext_put_in_cache(inode, ee_block,
3805						ee_len, ee_start);
3806					goto out;
3807				}
3808				ret = ext4_ext_handle_uninitialized_extents(
3809					handle, inode, map, path, flags,
3810					allocated, newblock);
3811				return ret;
3812			}
3813
3814			/*
3815			 * Punch out the map length, but only to the
3816			 * end of the extent
3817			 */
3818			punched_out = allocated < map->m_len ?
3819				allocated : map->m_len;
3820
3821			/*
3822			 * Sense extents need to be converted to
3823			 * Since extents need to be converted to
3824			 * uninitialized extent
3825			 */
3826			if (punched_out > EXT_UNINIT_MAX_LEN)
3827				punched_out = EXT_UNINIT_MAX_LEN;
3828
3829			punch_map.m_lblk = map->m_lblk;
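			/*
			 * Worked example (illustrative; numbers are assumed):
			 * if the found extent leaves allocated = 500 blocks
			 * from map->m_lblk, a punch request of m_len = 800 is
			 * clamped to punched_out = 500, while m_len = 100
			 * yields punched_out = 100.  Only a result larger
			 * than EXT_UNINIT_MAX_LEN (32767) would be clamped
			 * further.
			 */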
3830			punch_map.m_pblk = newblock;
3831			punch_map.m_len = punched_out;
3832			punch_map.m_flags = 0;
3833
3834			/* Check to see if the extent needs to be split */
3835			if (punch_map.m_len != ee_len ||
3836				punch_map.m_lblk != ee_block) {
3837
3838				ret = ext4_split_extent(handle, inode,
3839				path, &punch_map, 0,
3840				EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3841				EXT4_GET_BLOCKS_PRE_IO);
3842
3843				if (ret < 0) {
3844					err = ret;
3845					goto out2;
3846				}
3847				/*
3848				 * find extent for the block at
3849				 * the start of the hole
3850				 */
3851				ext4_ext_drop_refs(path);
3852				kfree(path);
3853
3854				path = ext4_ext_find_extent(inode,
3855				map->m_lblk, NULL);
3856				if (IS_ERR(path)) {
3857					err = PTR_ERR(path);
3858					path = NULL;
3859					goto out2;
3860				}
3861
3862				depth = ext_depth(inode);
3863				ex = path[depth].p_ext;
3864				ee_len = ext4_ext_get_actual_len(ex);
3865				ee_block = le32_to_cpu(ex->ee_block);
3866				ee_start = ext4_ext_pblock(ex);
3867
3868			}
3869
3870			ext4_ext_mark_uninitialized(ex);
3871
3872			ext4_ext_invalidate_cache(inode);
3873
3874			err = ext4_ext_rm_leaf(handle, inode, path,
3875					       &partial_cluster, map->m_lblk,
3876					       map->m_lblk + punched_out);
3877
3878			if (!err && path->p_hdr->eh_entries == 0) {
3879				/*
3880				 * Punch hole freed all of this sub tree,
3881				 * so we need to correct eh_depth
3882				 */
3883				err = ext4_ext_get_access(handle, inode, path);
3884				if (err == 0) {
3885					ext_inode_hdr(inode)->eh_depth = 0;
3886					ext_inode_hdr(inode)->eh_max =
3887					cpu_to_le16(ext4_ext_space_root(
3888						inode, 0));
3889
3890					err = ext4_ext_dirty(
3891						handle, inode, path);
3892				}
3893			}
3894
3895			goto out2;
3896		}
3897	}
3898
3899	if ((sbi->s_cluster_ratio > 1) &&
3900	    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3901		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3902
3903	/*
3904	 * requested block isn't allocated yet;
3905	 * we can't create blocks if the create flag is zero
3906	 */
3907	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3908		/*
3909		 * put the just-found gap into the cache to speed up
3910		 * subsequent requests
3911		 */
3912		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3913		goto out2;
3914	}
3915
3916	/*
3917	 * Okay, we need to do block allocation.
3918	 */
3919	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
3920	newex.ee_block = cpu_to_le32(map->m_lblk);
3921	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3922
3923	/*
3924	 * If we are doing bigalloc, check to see if the extent returned
3925	 * by ext4_ext_find_extent() implies a cluster we can use.
3926	 */
3927	if (cluster_offset && ex &&
3928	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
3929		ar.len = allocated = map->m_len;
3930		newblock = map->m_pblk;
3931		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3932		goto got_allocated_blocks;
3933	}
3934
3935	/* find neighbour allocated blocks */
3936	ar.lleft = map->m_lblk;
3937	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3938	if (err)
3939		goto out2;
3940	ar.lright = map->m_lblk;
3941	ex2 = NULL;
3942	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
3943	if (err)
3944		goto out2;
3945
3946	/* Check if the extent after searching to the right implies a
3947	 * cluster we can use. */
3948	if ((sbi->s_cluster_ratio > 1) && ex2 &&
3949	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
3950		ar.len = allocated = map->m_len;
3951		newblock = map->m_pblk;
3952		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3953		goto got_allocated_blocks;
3954	}
3955
3956	/*
3957	 * See if request is beyond maximum number of blocks we can have in
3958	 * a single extent. For an initialized extent this limit is
3959	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3960	 * EXT_UNINIT_MAX_LEN.
3961	 */
3962	if (map->m_len > EXT_INIT_MAX_LEN &&
3963	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3964		map->m_len = EXT_INIT_MAX_LEN;
3965	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3966		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3967		map->m_len = EXT_UNINIT_MAX_LEN;
3968
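	/*
	 * For reference: with 16-bit extent lengths, EXT_INIT_MAX_LEN is
	 * 32768 blocks and EXT_UNINIT_MAX_LEN is 32767 blocks (the high bit
	 * of ee_len marks an extent uninitialized), so e.g. an initialized
	 * request of 40000 blocks is trimmed to 32768 here and the caller
	 * must come back for the rest.
	 */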
3969	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3970	newex.ee_len = cpu_to_le16(map->m_len);
3971	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
3972	if (err)
3973		allocated = ext4_ext_get_actual_len(&newex);
3974	else
3975		allocated = map->m_len;
3976
3977	/* allocate new block */
3978	ar.inode = inode;
3979	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3980	ar.logical = map->m_lblk;
3981	/*
3982	 * We calculate the offset from the beginning of the cluster
3983	 * for the logical block number, since when we allocate a
3984	 * physical cluster, the physical block should start at the
3985	 * same offset from the beginning of the cluster.  This is
3986	 * needed so that future calls to get_implied_cluster_alloc()
3987	 * work correctly.
3988	 */
3989	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
3990	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
3991	ar.goal -= offset;
3992	ar.logical -= offset;
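	/*
	 * Worked example (illustrative; numbers are assumed): with
	 * s_cluster_ratio = 4, map->m_lblk = 10 and allocated = 3:
	 *
	 *	offset = 10 & 3              = 2
	 *	ar.len = EXT4_NUM_B2C(2 + 3) = 2 clusters
	 *
	 * and ar.goal/ar.logical are pulled back by 2 blocks so the request
	 * starts on a cluster boundary.  After ext4_mb_new_blocks() below,
	 * ar.len is converted back to blocks: EXT4_C2B(2) - 2 = 6, which is
	 * then clamped to the 3 blocks actually requested.
	 */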
3993	if (S_ISREG(inode->i_mode))
3994		ar.flags = EXT4_MB_HINT_DATA;
3995	else
3996		/* disable in-core preallocation for non-regular files */
3997		ar.flags = 0;
3998	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3999		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4000	newblock = ext4_mb_new_blocks(handle, &ar, &err);
4001	if (!newblock)
4002		goto out2;
4003	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4004		  ar.goal, newblock, allocated);
4005	free_on_err = 1;
4006	allocated_clusters = ar.len;
4007	ar.len = EXT4_C2B(sbi, ar.len) - offset;
4008	if (ar.len > allocated)
4009		ar.len = allocated;
4010
4011got_allocated_blocks:
4012	/* try to insert new extent into found leaf and return */
4013	ext4_ext_store_pblock(&newex, newblock + offset);
4014	newex.ee_len = cpu_to_le16(ar.len);
4015	/* Mark uninitialized */
4016	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
4017		ext4_ext_mark_uninitialized(&newex);
4018		/*
4019		 * An io_end structure is created for every IO write to an
4020		 * uninitialized extent. To avoid unnecessary conversion,
4021		 * here we flag the IO that really needs the conversion.
4022		 * For the non-async direct IO case, flag the inode state
4023		 * so that we perform the conversion when IO is done.
4024		 */
4025		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
4026			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
4027				io->flag = EXT4_IO_END_UNWRITTEN;
4028				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
4029			} else
4030				ext4_set_inode_state(inode,
4031						     EXT4_STATE_DIO_UNWRITTEN);
4032		}
4033		if (ext4_should_dioread_nolock(inode))
4034			map->m_flags |= EXT4_MAP_UNINIT;
4035	}
4036
4037	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
4038	if (!err)
4039		err = ext4_ext_insert_extent(handle, inode, path,
4040					     &newex, flags);
4041	if (err && free_on_err) {
4042		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4043			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4044		/* free data blocks we just allocated */
4045		/* not a good idea to call discard here directly,
4046		 * but otherwise we'd need to call it on every free() */
4047		ext4_discard_preallocations(inode);
4048		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
4049				 ext4_ext_get_actual_len(&newex), fb_flags);
4050		goto out2;
4051	}
4052
4053	/* previous routine could use block we allocated */
4054	newblock = ext4_ext_pblock(&newex);
4055	allocated = ext4_ext_get_actual_len(&newex);
4056	if (allocated > map->m_len)
4057		allocated = map->m_len;
4058	map->m_flags |= EXT4_MAP_NEW;
4059
4060	/*
4061	 * Update reserved blocks/metadata blocks after successful
4062	 * block allocation which had been deferred till now.
4063	 */
4064	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4065		/*
4066		 * Check how many clusters we had reserved for this allocated range.
4067		 */
4068		reserved_clusters = get_reserved_cluster_alloc(inode,
4069						map->m_lblk, allocated);
4070		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4071			if (reserved_clusters) {
4072				/*
4073				 * We have clusters reserved for this range.
4074				 * But since we are not doing actual allocation
4075				 * and are simply using blocks from previously
4076				 * allocated cluster, we should release the
4077				 * reservation and not claim quota.
4078				 */
4079				ext4_da_update_reserve_space(inode,
4080						reserved_clusters, 0);
4081			}
4082		} else {
4083			BUG_ON(allocated_clusters < reserved_clusters);
4084			/* We will claim quota for all newly allocated blocks.*/
4085			ext4_da_update_reserve_space(inode, allocated_clusters,
4086							1);
4087			if (reserved_clusters < allocated_clusters) {
4088				struct ext4_inode_info *ei = EXT4_I(inode);
4089				int reservation = allocated_clusters -
4090						  reserved_clusters;
4091				/*
4092				 * It seems we claimed a few clusters outside
4093				 * of the range of this allocation. We should
4094				 * give them back to the reservation pool. This can
4095				 * happen in the following case:
4096				 *
4097				 * * Suppose s_cluster_ratio is 4 (i.e., each
4098				 *   cluster has 4 blocks). Thus, the clusters
4099				 *   are [0-3],[4-7],[8-11]...
4100				 * * First comes delayed allocation write for
4101				 *   logical blocks 10 & 11. Since there were no
4102				 *   previous delayed allocated blocks in the
4103				 *   range [8-11], we would reserve 1 cluster
4104				 *   for this write.
4105				 * * Next comes write for logical blocks 3 to 8.
4106				 *   In this case, we will reserve 2 clusters
4107				 *   (for [0-3] and [4-7]; and not for [8-11] as
4108				 *   that range already has delayed allocated blocks).
4109				 *   Thus total reserved clusters now becomes 3.
4110				 * * Now, during the delayed allocation writeout
4111				 *   time, we will first write blocks [3-8] and
4112				 *   allocate 3 clusters for writing these
4113				 *   blocks. Also, we would claim all these
4114				 *   three clusters above.
4115				 * * Now when we come here to writeout the
4116				 *   blocks [10-11], we would expect to claim
4117				 *   the reservation of 1 cluster we had made
4118				 *   (and we would claim it since there are no
4119				 *   more delayed allocated blocks in the range
4120				 *   [8-11]). But our reserved cluster count had
4121				 *   already gone to 0.
4122				 *
4123				 *   Thus, at step 4 above, when we determine
4124				 *   that there are still some unwritten delayed
4125				 *   allocated blocks outside of our current
4126				 *   block range, we should increment the
4127				 *   reserved clusters count so that when the
4128				 *   remaining blocks finally get written, we
4129				 *   could claim them.
4130				 */
4131				dquot_reserve_block(inode,
4132						EXT4_C2B(sbi, reservation));
4133				spin_lock(&ei->i_block_reservation_lock);
4134				ei->i_reserved_data_blocks += reservation;
4135				spin_unlock(&ei->i_block_reservation_lock);
4136			}
4137		}
4138	}
4139
4140	/*
4141	 * Cache the extent and update transaction to commit on fdatasync only
4142	 * when it is _not_ an uninitialized extent.
4143	 */
4144	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
4145		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
4146		ext4_update_inode_fsync_trans(handle, inode, 1);
4147	} else
4148		ext4_update_inode_fsync_trans(handle, inode, 0);
4149out:
4150	if (allocated > map->m_len)
4151		allocated = map->m_len;
4152	ext4_ext_show_leaf(inode, path);
4153	map->m_flags |= EXT4_MAP_MAPPED;
4154	map->m_pblk = newblock;
4155	map->m_len = allocated;
4156out2:
4157	if (path) {
4158		ext4_ext_drop_refs(path);
4159		kfree(path);
4160	}
4161	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4162		newblock, map->m_len, err ? err : allocated);
4163
4164	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
4165			punched_out : allocated;
4166
4167	return err ? err : result;
4168}
4169
4170void ext4_ext_truncate(struct inode *inode)
4171{
4172	struct address_space *mapping = inode->i_mapping;
4173	struct super_block *sb = inode->i_sb;
4174	ext4_lblk_t last_block;
4175	handle_t *handle;
4176	loff_t page_len;
4177	int err = 0;
4178
4179	/*
4180	 * finish any pending end_io work so we won't run the risk of
4181	 * converting any truncated blocks to initialized later
4182	 */
4183	ext4_flush_completed_IO(inode);
4184
4185	/*
4186	 * probably the first extent we're going to free will be the last in the block
4187	 */
4188	err = ext4_writepage_trans_blocks(inode);
4189	handle = ext4_journal_start(inode, err);
4190	if (IS_ERR(handle))
4191		return;
4192
4193	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
4194		page_len = PAGE_CACHE_SIZE -
4195			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4196
4197		err = ext4_discard_partial_page_buffers(handle,
4198			mapping, inode->i_size, page_len, 0);
4199
4200		if (err)
4201			goto out_stop;
4202	}
4203
4204	if (ext4_orphan_add(handle, inode))
4205		goto out_stop;
4206
4207	down_write(&EXT4_I(inode)->i_data_sem);
4208	ext4_ext_invalidate_cache(inode);
4209
4210	ext4_discard_preallocations(inode);
4211
4212	/*
4213	 * TODO: optimization is possible here.
4214	 * Probably we need not scan at all,
4215	 * because page truncation is enough.
4216	 */
4217
4218	/* we have to know where to truncate from in crash case */
4219	EXT4_I(inode)->i_disksize = inode->i_size;
4220	ext4_mark_inode_dirty(handle, inode);
4221
4222	last_block = (inode->i_size + sb->s_blocksize - 1)
4223			>> EXT4_BLOCK_SIZE_BITS(sb);
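	/*
	 * Example (illustrative; numbers are assumed): with a 4096-byte
	 * block size and i_size = 10000, last_block = (10000 + 4095) >> 12
	 * = 3, so extents from logical block 3 onwards are removed below.
	 */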
4224	err = ext4_ext_remove_space(inode, last_block);
4225
4226	/* In a multi-transaction truncate, we only make the final
4227	 * transaction synchronous.
4228	 */
4229	if (IS_SYNC(inode))
4230		ext4_handle_sync(handle);
4231
4232	up_write(&EXT4_I(inode)->i_data_sem);
4233
4234out_stop:
4235	/*
4236	 * If this was a simple ftruncate() and the file will remain alive,
4237	 * then we need to clear up the orphan record which we created above.
4238	 * However, if this was a real unlink then we were called by
4239	 * ext4_delete_inode(), and we allow that function to clean up the
4240	 * orphan info for us.
4241	 */
4242	if (inode->i_nlink)
4243		ext4_orphan_del(handle, inode);
4244
4245	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4246	ext4_mark_inode_dirty(handle, inode);
4247	ext4_journal_stop(handle);
4248}
4249
4250static void ext4_falloc_update_inode(struct inode *inode,
4251				int mode, loff_t new_size, int update_ctime)
4252{
4253	struct timespec now;
4254
4255	if (update_ctime) {
4256		now = current_fs_time(inode->i_sb);
4257		if (!timespec_equal(&inode->i_ctime, &now))
4258			inode->i_ctime = now;
4259	}
4260	/*
4261	 * Update only when preallocation was requested beyond
4262	 * the file size.
4263	 */
4264	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
4265		if (new_size > i_size_read(inode))
4266			i_size_write(inode, new_size);
4267		if (new_size > EXT4_I(inode)->i_disksize)
4268			ext4_update_i_disksize(inode, new_size);
4269	} else {
4270		/*
4271		 * Mark that we allocate beyond EOF so the subsequent truncate
4272		 * can proceed even if the new size is the same as i_size.
4273		 */
4274		if (new_size > i_size_read(inode))
4275			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4276	}
4277
4278}
4279
4280/*
4281 * preallocate space for a file. This implements ext4's fallocate file
4282 * operation, which gets called from the sys_fallocate system call.
4283 * For block-mapped files, posix_fallocate should fall back to the method
4284 * of writing zeroes to the required new blocks (the same behavior which is
4285 * expected for file systems which do not support the fallocate() system call).
4286 */
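/*
 * Illustrative user-space sketch (file name assumed; not part of this file):
 * this path is reached through the fallocate(2) system call, e.g.
 *
 *	int fd = open("/mnt/test/file", O_RDWR | O_CREAT, 0644);
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20);
 *
 * which preallocates 16 MiB as uninitialized extents without changing
 * i_size; omitting FALLOC_FL_KEEP_SIZE would also extend the file size.
 */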
4287long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4288{
4289	struct inode *inode = file->f_path.dentry->d_inode;
4290	handle_t *handle;
4291	loff_t new_size;
4292	unsigned int max_blocks;
4293	int ret = 0;
4294	int ret2 = 0;
4295	int retries = 0;
4296	struct ext4_map_blocks map;
4297	unsigned int credits, blkbits = inode->i_blkbits;
4298
4299	/*
4300	 * currently supporting (pre)allocate mode for extent-based
4301	 * files _only_
4302	 */
4303	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4304		return -EOPNOTSUPP;
4305
4306	/* Return error if mode is not supported */
4307	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
4308		return -EOPNOTSUPP;
4309
4310	if (mode & FALLOC_FL_PUNCH_HOLE)
4311		return ext4_punch_hole(file, offset, len);
4312
4313	trace_ext4_fallocate_enter(inode, offset, len, mode);
4314	map.m_lblk = offset >> blkbits;
4315	/*
4316	 * We can't just convert len to max_blocks because the range need not
4317	 * be block aligned: e.g. blocksize = 4096, offset = 3072 and len = 2048
4318	 */
4319	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4320		- map.m_lblk;
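	/*
	 * Continuing the example above: with blkbits = 12, offset = 3072 and
	 * len = 2048, map.m_lblk = 3072 >> 12 = 0 and
	 * EXT4_BLOCK_ALIGN(5120, 12) >> 12 = 2, so max_blocks = 2; the
	 * request touches two blocks even though len >> 12 is 0.
	 */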
4321	/*
4322	 * credits to insert 1 extent into extent tree
4323	 */
4324	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4325	mutex_lock(&inode->i_mutex);
4326	ret = inode_newsize_ok(inode, (len + offset));
4327	if (ret) {
4328		mutex_unlock(&inode->i_mutex);
4329		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4330		return ret;
4331	}
4332retry:
4333	while (ret >= 0 && ret < max_blocks) {
4334		map.m_lblk = map.m_lblk + ret;
4335		map.m_len = max_blocks = max_blocks - ret;
4336		handle = ext4_journal_start(inode, credits);
4337		if (IS_ERR(handle)) {
4338			ret = PTR_ERR(handle);
4339			break;
4340		}
4341		ret = ext4_map_blocks(handle, inode, &map,
4342				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
4343				      EXT4_GET_BLOCKS_NO_NORMALIZE);
4344		if (ret <= 0) {
4345#ifdef EXT4FS_DEBUG
4346			WARN_ON(ret <= 0);
4347			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4348				    "returned error inode#%lu, block=%u, "
4349				    "max_blocks=%u", __func__,
4350				    inode->i_ino, map.m_lblk, max_blocks);
4351#endif
4352			ext4_mark_inode_dirty(handle, inode);
4353			ret2 = ext4_journal_stop(handle);
4354			break;
4355		}
4356		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4357						blkbits) >> blkbits))
4358			new_size = offset + len;
4359		else
4360			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4361
4362		ext4_falloc_update_inode(inode, mode, new_size,
4363					 (map.m_flags & EXT4_MAP_NEW));
4364		ext4_mark_inode_dirty(handle, inode);
4365		ret2 = ext4_journal_stop(handle);
4366		if (ret2)
4367			break;
4368	}
4369	if (ret == -ENOSPC &&
4370			ext4_should_retry_alloc(inode->i_sb, &retries)) {
4371		ret = 0;
4372		goto retry;
4373	}
4374	mutex_unlock(&inode->i_mutex);
4375	trace_ext4_fallocate_exit(inode, offset, max_blocks,
4376				ret > 0 ? ret2 : ret);
4377	return ret > 0 ? ret2 : ret;
4378}
4379
4380/*
4381 * This function converts a range of blocks to written extents.
4382 * The caller of this function will pass the start offset and the size.
4383 * All unwritten extents within this range will be converted to
4384 * written extents.
4385 *
4386 * This function is called from the direct IO end_io callback
4387 * function, to convert the fallocated extents after IO is completed.
4388 * Returns 0 on success.
4389 */
4390int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4391				    ssize_t len)
4392{
4393	handle_t *handle;
4394	unsigned int max_blocks;
4395	int ret = 0;
4396	int ret2 = 0;
4397	struct ext4_map_blocks map;
4398	unsigned int credits, blkbits = inode->i_blkbits;
4399
4400	map.m_lblk = offset >> blkbits;
4401	/*
4402	 * We can't just convert len to max_blocks because the range need not
4403	 * be block aligned: e.g. blocksize = 4096, offset = 3072 and len = 2048
4404	 */
4405	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
4406		      map.m_lblk);
4407	/*
4408	 * credits to insert 1 extent into extent tree
4409	 */
4410	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4411	while (ret >= 0 && ret < max_blocks) {
4412		map.m_lblk += ret;
4413		map.m_len = (max_blocks -= ret);
4414		handle = ext4_journal_start(inode, credits);
4415		if (IS_ERR(handle)) {
4416			ret = PTR_ERR(handle);
4417			break;
4418		}
4419		ret = ext4_map_blocks(handle, inode, &map,
4420				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4421		if (ret <= 0) {
4422			WARN_ON(ret <= 0);
4423			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4424				    "returned error inode#%lu, block=%u, "
4425				    "max_blocks=%u", __func__,
4426				    inode->i_ino, map.m_lblk, map.m_len);
4427		}
4428		ext4_mark_inode_dirty(handle, inode);
4429		ret2 = ext4_journal_stop(handle);
4430		if (ret <= 0 || ret2)
4431			break;
4432	}
4433	return ret > 0 ? ret2 : ret;
4434}
4435
4436/*
4437 * Callback function called for each extent to gather FIEMAP information.
4438 */
4439static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
4440		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
4441		       void *data)
4442{
4443	__u64	logical;
4444	__u64	physical;
4445	__u64	length;
4446	__u32	flags = 0;
4447	int		ret = 0;
4448	struct fiemap_extent_info *fieinfo = data;
4449	unsigned char blksize_bits;
4450
4451	blksize_bits = inode->i_sb->s_blocksize_bits;
4452	logical = (__u64)newex->ec_block << blksize_bits;
4453
4454	if (newex->ec_start == 0) {
4455		/*
4456		 * No extent in the extent tree contains block @newex->ec_start,
4457		 * so the block may lie in 1) a hole or 2) a delayed extent.
4458		 *
4459		 * Holes or delayed extents are processed as follows.
4460		 * 1. Look up dirty pages with the specified range in the pagecache.
4461		 *    If no page is found, there is no delayed extent; return
4462		 *    with EXT_CONTINUE.
4463		 * 2. Find the 1st mapped buffer.
4464		 * 3. Check if the mapped buffer is both in the request range
4465		 *    and a delayed buffer. If not, there is no delayed extent;
4466		 *    return.
4467		 * 4. A delayed extent is found; the extent will be collected.
4468		 */
4469		ext4_lblk_t	end = 0;
4470		pgoff_t		last_offset;
4471		pgoff_t		offset;
4472		pgoff_t		index;
4473		pgoff_t		start_index = 0;
4474		struct page	**pages = NULL;
4475		struct buffer_head *bh = NULL;
4476		struct buffer_head *head = NULL;
4477		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
4478
4479		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4480		if (pages == NULL)
4481			return -ENOMEM;
4482
4483		offset = logical >> PAGE_SHIFT;
4484repeat:
4485		last_offset = offset;
4486		head = NULL;
4487		ret = find_get_pages_tag(inode->i_mapping, &offset,
4488					PAGECACHE_TAG_DIRTY, nr_pages, pages);
4489
4490		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4491			/* First time, try to find a mapped buffer. */
4492			if (ret == 0) {
4493out:
4494				for (index = 0; index < ret; index++)
4495					page_cache_release(pages[index]);
4496				/* just a hole. */
4497				kfree(pages);
4498				return EXT_CONTINUE;
4499			}
4500			index = 0;
4501
4502next_page:
4503			/* Try to find the 1st mapped buffer. */
4504			end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
4505				  blksize_bits;
4506			if (!page_has_buffers(pages[index]))
4507				goto out;
4508			head = page_buffers(pages[index]);
4509			if (!head)
4510				goto out;
4511
4512			index++;
4513			bh = head;
4514			do {
4515				if (end >= newex->ec_block +
4516					newex->ec_len)
4517					/* The buffer is out of
4518					 * the request range.
4519					 */
4520					goto out;
4521
4522				if (buffer_mapped(bh) &&
4523				    end >= newex->ec_block) {
4524					start_index = index - 1;
4525					/* get the 1st mapped buffer. */
4526					goto found_mapped_buffer;
4527				}
4528
4529				bh = bh->b_this_page;
4530				end++;
4531			} while (bh != head);
4532
4533			/* No mapped buffer in the range found in this page,
4534			 * so we need to look up the next page.
4535			 */
4536			if (index >= ret) {
4537				/* There is no page left, but we need to limit
4538				 * newex->ec_len.
4539				 */
4540				newex->ec_len = end - newex->ec_block;
4541				goto out;
4542			}
4543			goto next_page;
4544		} else {
4545			/* Find contiguous delayed buffers. */
4546			if (ret > 0 && pages[0]->index == last_offset)
4547				head = page_buffers(pages[0]);
4548			bh = head;
4549			index = 1;
4550			start_index = 0;
4551		}
4552
4553found_mapped_buffer:
4554		if (bh != NULL && buffer_delay(bh)) {
4555			/* 1st or contiguous delayed buffer found. */
4556			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4557				/*
4558				 * 1st delayed buffer found, record
4559				 * the start of extent.
4560				 */
4561				flags |= FIEMAP_EXTENT_DELALLOC;
4562				newex->ec_block = end;
4563				logical = (__u64)end << blksize_bits;
4564			}
4565			/* Find contiguous delayed buffers. */
4566			do {
4567				if (!buffer_delay(bh))
4568					goto found_delayed_extent;
4569				bh = bh->b_this_page;
4570				end++;
4571			} while (bh != head);
4572
4573			for (; index < ret; index++) {
4574				if (!page_has_buffers(pages[index])) {
4575					bh = NULL;
4576					break;
4577				}
4578				head = page_buffers(pages[index]);
4579				if (!head) {
4580					bh = NULL;
4581					break;
4582				}
4583
4584				if (pages[index]->index !=
4585				    pages[start_index]->index + index
4586				    - start_index) {
4587					/* Blocks are not contiguous. */
4588					bh = NULL;
4589					break;
4590				}
4591				bh = head;
4592				do {
4593					if (!buffer_delay(bh))
4594						/* Delayed-extent ends. */
4595						goto found_delayed_extent;
4596					bh = bh->b_this_page;
4597					end++;
4598				} while (bh != head);
4599			}
4600		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4601			/* a hole found. */
4602			goto out;
4603
4604found_delayed_extent:
4605		newex->ec_len = min(end - newex->ec_block,
4606						(ext4_lblk_t)EXT_INIT_MAX_LEN);
4607		if (ret == nr_pages && bh != NULL &&
4608			newex->ec_len < EXT_INIT_MAX_LEN &&
4609			buffer_delay(bh)) {
4610			/* The delayed extent may continue; keep scanning. */
4611			for (index = 0; index < ret; index++)
4612				page_cache_release(pages[index]);
4613			goto repeat;
4614		}
4615
4616		for (index = 0; index < ret; index++)
4617			page_cache_release(pages[index]);
4618		kfree(pages);
4619	}
4620
4621	physical = (__u64)newex->ec_start << blksize_bits;
4622	length =   (__u64)newex->ec_len << blksize_bits;
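	/*
	 * Worked example (illustrative; numbers are assumed): with a
	 * 4096-byte block size (blksize_bits = 12), ec_block = 100,
	 * ec_start = 5000 and ec_len = 25 give logical = 409600,
	 * physical = 20480000 and length = 102400 bytes for the fiemap
	 * extent reported below.
	 */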
4623
4624	if (ex && ext4_ext_is_uninitialized(ex))
4625		flags |= FIEMAP_EXTENT_UNWRITTEN;
4626
4627	if (next == EXT_MAX_BLOCKS)
4628		flags |= FIEMAP_EXTENT_LAST;
4629
4630	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4631					length, flags);
4632	if (ret < 0)
4633		return ret;
4634	if (ret == 1)
4635		return EXT_BREAK;
4636	return EXT_CONTINUE;
4637}
4638/* fiemap flags we can handle specified here */
4639#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4640
4641static int ext4_xattr_fiemap(struct inode *inode,
4642				struct fiemap_extent_info *fieinfo)
4643{
4644	__u64 physical = 0;
4645	__u64 length;
4646	__u32 flags = FIEMAP_EXTENT_LAST;
4647	int blockbits = inode->i_sb->s_blocksize_bits;
4648	int error = 0;
4649
4650	/* in-inode? */
4651	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4652		struct ext4_iloc iloc;
4653		int offset;	/* offset of xattr in inode */
4654
4655		error = ext4_get_inode_loc(inode, &iloc);
4656		if (error)
4657			return error;
4658		physical = iloc.bh->b_blocknr << blockbits;
4659		offset = EXT4_GOOD_OLD_INODE_SIZE +
4660				EXT4_I(inode)->i_extra_isize;
4661		physical += offset;
4662		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
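		/*
		 * Example (illustrative; 256-byte inodes and i_extra_isize =
		 * 32 are assumed): offset = 128 + 32 = 160, so the reported
		 * in-inode xattr region is 256 - 160 = 96 bytes at byte
		 * offset 160 into the inode's block.
		 */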
4663		flags |= FIEMAP_EXTENT_DATA_INLINE;
4664		brelse(iloc.bh);
4665	} else { /* external block */
4666		physical = EXT4_I(inode)->i_file_acl << blockbits;
4667		length = inode->i_sb->s_blocksize;
4668	}
4669
4670	if (physical)
4671		error = fiemap_fill_next_extent(fieinfo, 0, physical,
4672						length, flags);
4673	return (error < 0 ? error : 0);
4674}
4675
4676/*
4677 * ext4_ext_punch_hole
4678 *
4679 * Punches a hole of "length" bytes in a file starting
4680 * at byte "offset"
4681 *
4682 * @inode:  The inode of the file to punch a hole in
4683 * @offset: The starting byte offset of the hole
4684 * @length: The length of the hole
4685 *
4686 * Returns the number of blocks removed, or a negative value on error
4687 */
4688int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4689{
4690	struct inode *inode = file->f_path.dentry->d_inode;
4691	struct super_block *sb = inode->i_sb;
4692	struct ext4_ext_cache cache_ex;
4693	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4694	struct address_space *mapping = inode->i_mapping;
4695	struct ext4_map_blocks map;
4696	handle_t *handle;
4697	loff_t first_page, last_page, page_len;
4698	loff_t first_page_offset, last_page_offset;
4699	int ret, credits, blocks_released, err = 0;
4700
4701	/* No need to punch hole beyond i_size */
4702	if (offset >= inode->i_size)
4703		return 0;
4704
4705	/*
4706	 * If the hole extends beyond i_size, set the hole
4707	 * to end after the page that contains i_size
4708	 */
4709	if (offset + length > inode->i_size) {
4710		length = inode->i_size +
4711		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4712		   offset;
4713	}
4714
4715	first_block = (offset + sb->s_blocksize - 1) >>
4716		EXT4_BLOCK_SIZE_BITS(sb);
4717	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4718
4719	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4720	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4721
4722	first_page_offset = first_page << PAGE_CACHE_SHIFT;
4723	last_page_offset = last_page << PAGE_CACHE_SHIFT;
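	/*
	 * Worked example (illustrative; numbers are assumed): with 4096-byte
	 * blocks and pages, offset = 6000 and length = 20000 give:
	 *
	 *	first_block = 2,  last_block = 6   (blocks 2..5 are punched)
	 *	first_page  = 2,  first_page_offset = 8192
	 *	last_page   = 6,  last_page_offset  = 24576
	 *
	 * Whole pages in [8192, 24576) are released, and the partial pages
	 * at either end are zeroed via ext4_discard_partial_page_buffers()
	 * below.
	 */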
4724
4725	/*
4726	 * Write out all dirty pages to avoid race conditions,
4727	 * then release them.
4728	 */
4729	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4730		err = filemap_write_and_wait_range(mapping,
4731			offset, offset + length - 1);
4732
4733		if (err)
4734			return err;
4735	}
4736
4737	/* Now release the pages */
4738	if (last_page_offset > first_page_offset) {
4739		truncate_inode_pages_range(mapping, first_page_offset,
4740					   last_page_offset-1);
4741	}
4742
4743	/* finish any pending end_io work */
4744	ext4_flush_completed_IO(inode);
4745
4746	credits = ext4_writepage_trans_blocks(inode);
4747	handle = ext4_journal_start(inode, credits);
4748	if (IS_ERR(handle))
4749		return PTR_ERR(handle);
4750
4751	err = ext4_orphan_add(handle, inode);
4752	if (err)
4753		goto out;
4754
4755	/*
4756	 * Now we need to zero out the non-page-aligned data in the
4757	 * pages at the start and tail of the hole, and unmap the buffer
4758	 * heads for the block aligned regions of the page that were
4759	 * completely zeroed.
4760	 */
4761	if (first_page > last_page) {
4762		/*
4763		 * If the file space being truncated is contained within a page
4764		 * just zero out and unmap the middle of that page
4765		 */
4766		err = ext4_discard_partial_page_buffers(handle,
4767			mapping, offset, length, 0);
4768
4769		if (err)
4770			goto out;
4771	} else {
4772		/*
4773		 * zero out and unmap the partial page that contains
4774		 * the start of the hole
4775		 */
4776		page_len  = first_page_offset - offset;
4777		if (page_len > 0) {
4778			err = ext4_discard_partial_page_buffers(handle, mapping,
4779						   offset, page_len, 0);
4780			if (err)
4781				goto out;
4782		}
4783
4784		/*
4785		 * zero out and unmap the partial page that contains
4786		 * the end of the hole
4787		 */
4788		page_len = offset + length - last_page_offset;
4789		if (page_len > 0) {
4790			err = ext4_discard_partial_page_buffers(handle, mapping,
4791					last_page_offset, page_len, 0);
4792			if (err)
4793				goto out;
4794		}
4795	}
4796
4797
4798	/*
4799	 * If i_size is contained in the last page, we need to
4800	 * unmap and zero the partial page after i_size
4801	 */
4802	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4803	   inode->i_size % PAGE_CACHE_SIZE != 0) {
4804
4805		page_len = PAGE_CACHE_SIZE -
4806			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4807
4808		if (page_len > 0) {
4809			err = ext4_discard_partial_page_buffers(handle,
4810			  mapping, inode->i_size, page_len, 0);
4811
4812			if (err)
4813				goto out;
4814		}
4815	}
4816
4817	/* If there are no blocks to remove, return now */
4818	if (first_block >= last_block)
4819		goto out;
4820
4821	down_write(&EXT4_I(inode)->i_data_sem);
4822	ext4_ext_invalidate_cache(inode);
4823	ext4_discard_preallocations(inode);
4824
4825	/*
4826	 * Loop over all the blocks and identify blocks
4827	 * that need to be punched out
4828	 */
4829	iblock = first_block;
4830	blocks_released = 0;
4831	while (iblock < last_block) {
4832		max_blocks = last_block - iblock;
4833		num_blocks = 1;
4834		memset(&map, 0, sizeof(map));
4835		map.m_lblk = iblock;
4836		map.m_len = max_blocks;
4837		ret = ext4_ext_map_blocks(handle, inode, &map,
4838			EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4839
4840		if (ret > 0) {
4841			blocks_released += ret;
4842			num_blocks = ret;
4843		} else if (ret == 0) {
4844			/*
4845			 * If map blocks could not find the block,
4846			 * then it is in a hole.  If the hole was
4847			 * not already cached, then map blocks should
4848			 * have put it in the cache, so we can read
4849			 * the hole back out of the cache.
4850			 */
4851			memset(&cache_ex, 0, sizeof(cache_ex));
4852			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4853				!cache_ex.ec_start) {
4854
4855				/* The hole is cached */
4856				num_blocks = cache_ex.ec_block +
4857				cache_ex.ec_len - iblock;
4858
4859			} else {
4860				/* The block could not be identified */
4861				err = -EIO;
4862				break;
4863			}
4864		} else {
4865			/* Map blocks error */
4866			err = ret;
4867			break;
4868		}
4869
4870		if (num_blocks == 0) {
4871			/* This condition should never happen */
4872			ext_debug("Block lookup failed");
4873			err = -EIO;
4874			break;
4875		}
4876
4877		iblock += num_blocks;
4878	}
4879
4880	if (blocks_released > 0) {
4881		ext4_ext_invalidate_cache(inode);
4882		ext4_discard_preallocations(inode);
4883	}
4884
4885	if (IS_SYNC(inode))
4886		ext4_handle_sync(handle);
4887
4888	up_write(&EXT4_I(inode)->i_data_sem);
4889
4890out:
4891	ext4_orphan_del(handle, inode);
4892	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4893	ext4_mark_inode_dirty(handle, inode);
4894	ext4_journal_stop(handle);
4895	return err;
4896}
4897int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4898		__u64 start, __u64 len)
4899{
4900	ext4_lblk_t start_blk;
4901	int error = 0;
4902
4903	/* fallback to generic here if not in extents fmt */
4904	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4905		return generic_block_fiemap(inode, fieinfo, start, len,
4906			ext4_get_block);
4907
4908	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
4909		return -EBADR;
4910
4911	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4912		error = ext4_xattr_fiemap(inode, fieinfo);
4913	} else {
4914		ext4_lblk_t len_blks;
4915		__u64 last_blk;
4916
4917		start_blk = start >> inode->i_sb->s_blocksize_bits;
4918		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4919		if (last_blk >= EXT_MAX_BLOCKS)
4920			last_blk = EXT_MAX_BLOCKS-1;
4921		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
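		/*
		 * Example (illustrative; numbers are assumed): with a
		 * 4096-byte block size, start = 0 and len = 1 MiB give
		 * start_blk = 0, last_blk = 1048575 >> 12 = 255 and
		 * len_blks = 256.
		 */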
4922
4923		/*
4924		 * Walk the extent tree gathering extent information.
4925		 * ext4_ext_fiemap_cb will push extents back to user.
4926		 */
4927		error = ext4_ext_walk_space(inode, start_blk, len_blks,
4928					  ext4_ext_fiemap_cb, fieinfo);
4929	}
4930
4931	return error;
4932}
4933