extents.c revision f85b287a01237857a50c93868231f7e831581a27
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
44#include "ext4_jbd2.h"
45
46#include <trace/events/ext4.h>
47
48static int ext4_split_extent(handle_t *handle,
49				struct inode *inode,
50				struct ext4_ext_path *path,
51				struct ext4_map_blocks *map,
52				int split_flag,
53				int flags);
54
55static int ext4_ext_truncate_extend_restart(handle_t *handle,
56					    struct inode *inode,
57					    int needed)
58{
59	int err;
60
61	if (!ext4_handle_valid(handle))
62		return 0;
63	if (handle->h_buffer_credits > needed)
64		return 0;
65	err = ext4_journal_extend(handle, needed);
66	if (err <= 0)
67		return err;
68	err = ext4_truncate_restart_trans(handle, inode, needed);
69	if (err == 0)
70		err = -EAGAIN;
71
72	return err;
73}
74
75/*
76 * could return:
77 *  - EROFS
78 *  - ENOMEM
79 */
80static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
81				struct ext4_ext_path *path)
82{
83	if (path->p_bh) {
84		/* path points to block */
85		return ext4_journal_get_write_access(handle, path->p_bh);
86	}
87	/* path points to leaf/index in inode body */
88	/* we use in-core data, no need to protect them */
89	return 0;
90}
91
92/*
93 * could return:
94 *  - EROFS
95 *  - ENOMEM
96 *  - EIO
97 */
98#define ext4_ext_dirty(handle, inode, path) \
99		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
100static int __ext4_ext_dirty(const char *where, unsigned int line,
101			    handle_t *handle, struct inode *inode,
102			    struct ext4_ext_path *path)
103{
104	int err;
105	if (path->p_bh) {
106		/* path points to block */
107		err = __ext4_handle_dirty_metadata(where, line, handle,
108						   inode, path->p_bh);
109	} else {
110		/* path points to leaf/index in inode body */
111		err = ext4_mark_inode_dirty(handle, inode);
112	}
113	return err;
114}
115
116static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
117			      struct ext4_ext_path *path,
118			      ext4_lblk_t block)
119{
120	int depth;
121
122	if (path) {
123		struct ext4_extent *ex;
124		depth = path->p_depth;
125
126		/*
127		 * Try to predict block placement assuming that we are
128		 * filling in a file which will eventually be
129		 * non-sparse --- i.e., in the case of libbfd writing
130		 * an ELF object's sections out-of-order but in a way
131		 * that eventually results in a contiguous object or
132		 * executable file, or some database extending a table
133		 * space file.  However, this is actually somewhat
134		 * non-ideal if we are writing a sparse file such as
135		 * qemu or KVM writing a raw image file that is going
136		 * to stay fairly sparse, since it will end up
137		 * fragmenting the file system's free space.  Maybe we
138		 * should have some heuristics or some way to allow
139		 * userspace to pass a hint to the file system,
140		 * especially if the latter case turns out to be
141		 * common.
142		 */
143		ex = path[depth].p_ext;
144		if (ex) {
145			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
146			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
147
148			if (block > ext_block)
149				return ext_pblk + (block - ext_block);
150			else
151				return ext_pblk - (ext_block - block);
152		}
153
154		/* it looks like index is empty;
155		 * try to find starting block from index itself */
156		if (path[depth].p_bh)
157			return path[depth].p_bh->b_blocknr;
158	}
159
160	/* OK. use inode's group */
161	return ext4_inode_to_goal_block(inode);
162}
163
164/*
165 * Allocation for a meta data block
166 */
167static ext4_fsblk_t
168ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
169			struct ext4_ext_path *path,
170			struct ext4_extent *ex, int *err, unsigned int flags)
171{
172	ext4_fsblk_t goal, newblock;
173
174	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
175	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
176					NULL, err);
177	return newblock;
178}
179
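/*
 * Capacity helpers: how many struct ext4_extent / ext4_extent_idx
 * entries fit into a block (or into the inode's i_data for the root)
 * once the 12-byte extent header is subtracted.  For example, with
 * 4 KiB blocks and 12-byte on-disk entries that is roughly
 * (4096 - 12) / 12 = 340 entries per leaf or index block.  With
 * AGGRESSIVE_TEST the limits are clamped to a handful of entries so
 * that multi-level trees are easy to exercise.
 */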
180static inline int ext4_ext_space_block(struct inode *inode, int check)
181{
182	int size;
183
184	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
185			/ sizeof(struct ext4_extent);
186	if (!check) {
187#ifdef AGGRESSIVE_TEST
188		if (size > 6)
189			size = 6;
190#endif
191	}
192	return size;
193}
194
195static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
196{
197	int size;
198
199	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
200			/ sizeof(struct ext4_extent_idx);
201	if (!check) {
202#ifdef AGGRESSIVE_TEST
203		if (size > 5)
204			size = 5;
205#endif
206	}
207	return size;
208}
209
210static inline int ext4_ext_space_root(struct inode *inode, int check)
211{
212	int size;
213
214	size = sizeof(EXT4_I(inode)->i_data);
215	size -= sizeof(struct ext4_extent_header);
216	size /= sizeof(struct ext4_extent);
217	if (!check) {
218#ifdef AGGRESSIVE_TEST
219		if (size > 3)
220			size = 3;
221#endif
222	}
223	return size;
224}
225
226static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
227{
228	int size;
229
230	size = sizeof(EXT4_I(inode)->i_data);
231	size -= sizeof(struct ext4_extent_header);
232	size /= sizeof(struct ext4_extent_idx);
233	if (!check) {
234#ifdef AGGRESSIVE_TEST
235		if (size > 4)
236			size = 4;
237#endif
238	}
239	return size;
240}
241
242/*
243 * Calculate the number of metadata blocks needed
244 * to allocate @blocks
245 * Worst case is one block per extent
246 */
247int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
248{
249	struct ext4_inode_info *ei = EXT4_I(inode);
250	int idxs, num = 0;
251
252	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
253		/ sizeof(struct ext4_extent_idx));
254
255	/*
256	 * If the new delayed allocation block is contiguous with the
257	 * previous da block, it can share index blocks with the
258	 * previous block, so we only need to allocate a new index
259	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
260	 * an additional index block, and at idxs**3 blocks, yet
261	 * another index block.
262	 */
263	if (ei->i_da_metadata_calc_len &&
264	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
265		if ((ei->i_da_metadata_calc_len % idxs) == 0)
266			num++;
267		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
268			num++;
269		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
270			num++;
271			ei->i_da_metadata_calc_len = 0;
272		} else
273			ei->i_da_metadata_calc_len++;
274		ei->i_da_metadata_calc_last_lblock++;
275		return num;
276	}
277
278	/*
279	 * In the worst case we need a new set of index blocks at
280	 * every level of the inode's extent tree.
281	 */
282	ei->i_da_metadata_calc_len = 1;
283	ei->i_da_metadata_calc_last_lblock = lblock;
284	return ext_depth(inode) + 1;
285}
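
/*
 * Example for ext4_ext_calc_metadata_amount(): with 4 KiB blocks,
 * idxs = (4096 - 12) / 12 = 340, so a run of contiguous delayed
 * allocation blocks is charged one extra metadata block every 340
 * blocks, another every 340^2 blocks, and so on.  A block that is not
 * contiguous with the previous one falls back to the worst case of
 * ext_depth(inode) + 1 new metadata blocks.
 */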
286
287static int
288ext4_ext_max_entries(struct inode *inode, int depth)
289{
290	int max;
291
292	if (depth == ext_depth(inode)) {
293		if (depth == 0)
294			max = ext4_ext_space_root(inode, 1);
295		else
296			max = ext4_ext_space_root_idx(inode, 1);
297	} else {
298		if (depth == 0)
299			max = ext4_ext_space_block(inode, 1);
300		else
301			max = ext4_ext_space_block_idx(inode, 1);
302	}
303
304	return max;
305}
306
307static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
308{
309	ext4_fsblk_t block = ext4_ext_pblock(ext);
310	int len = ext4_ext_get_actual_len(ext);
311
312	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
313}
314
315static int ext4_valid_extent_idx(struct inode *inode,
316				struct ext4_extent_idx *ext_idx)
317{
318	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
319
320	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
321}
322
323static int ext4_valid_extent_entries(struct inode *inode,
324				struct ext4_extent_header *eh,
325				int depth)
326{
327	struct ext4_extent *ext;
328	struct ext4_extent_idx *ext_idx;
329	unsigned short entries;
330	if (eh->eh_entries == 0)
331		return 1;
332
333	entries = le16_to_cpu(eh->eh_entries);
334
335	if (depth == 0) {
336		/* leaf entries */
337		ext = EXT_FIRST_EXTENT(eh);
338		while (entries) {
339			if (!ext4_valid_extent(inode, ext))
340				return 0;
341			ext++;
342			entries--;
343		}
344	} else {
345		ext_idx = EXT_FIRST_INDEX(eh);
346		while (entries) {
347			if (!ext4_valid_extent_idx(inode, ext_idx))
348				return 0;
349			ext_idx++;
350			entries--;
351		}
352	}
353	return 1;
354}
355
356static int __ext4_ext_check(const char *function, unsigned int line,
357			    struct inode *inode, struct ext4_extent_header *eh,
358			    int depth)
359{
360	const char *error_msg;
361	int max = 0;
362
363	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
364		error_msg = "invalid magic";
365		goto corrupted;
366	}
367	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
368		error_msg = "unexpected eh_depth";
369		goto corrupted;
370	}
371	if (unlikely(eh->eh_max == 0)) {
372		error_msg = "invalid eh_max";
373		goto corrupted;
374	}
375	max = ext4_ext_max_entries(inode, depth);
376	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
377		error_msg = "too large eh_max";
378		goto corrupted;
379	}
380	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
381		error_msg = "invalid eh_entries";
382		goto corrupted;
383	}
384	if (!ext4_valid_extent_entries(inode, eh, depth)) {
385		error_msg = "invalid extent entries";
386		goto corrupted;
387	}
388	return 0;
389
390corrupted:
391	ext4_error_inode(inode, function, line, 0,
392			"bad header/extent: %s - magic %x, "
393			"entries %u, max %u(%u), depth %u(%u)",
394			error_msg, le16_to_cpu(eh->eh_magic),
395			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
396			max, le16_to_cpu(eh->eh_depth), depth);
397
398	return -EIO;
399}
400
401#define ext4_ext_check(inode, eh, depth)	\
402	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
403
404int ext4_ext_check_inode(struct inode *inode)
405{
406	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
407}
408
409#ifdef EXT_DEBUG
410static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
411{
412	int k, l = path->p_depth;
413
414	ext_debug("path:");
415	for (k = 0; k <= l; k++, path++) {
416		if (path->p_idx) {
417		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
418			    ext4_idx_pblock(path->p_idx));
419		} else if (path->p_ext) {
420			ext_debug("  %d:[%d]%d:%llu ",
421				  le32_to_cpu(path->p_ext->ee_block),
422				  ext4_ext_is_uninitialized(path->p_ext),
423				  ext4_ext_get_actual_len(path->p_ext),
424				  ext4_ext_pblock(path->p_ext));
425		} else
426			ext_debug("  []");
427	}
428	ext_debug("\n");
429}
430
431static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
432{
433	int depth = ext_depth(inode);
434	struct ext4_extent_header *eh;
435	struct ext4_extent *ex;
436	int i;
437
438	if (!path)
439		return;
440
441	eh = path[depth].p_hdr;
442	ex = EXT_FIRST_EXTENT(eh);
443
444	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
445
446	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
447		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
448			  ext4_ext_is_uninitialized(ex),
449			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
450	}
451	ext_debug("\n");
452}
453
454static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
455			ext4_fsblk_t newblock, int level)
456{
457	int depth = ext_depth(inode);
458	struct ext4_extent *ex;
459
460	if (depth != level) {
461		struct ext4_extent_idx *idx;
462		idx = path[level].p_idx;
463		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
464			ext_debug("%d: move %d:%llu in new index %llu\n", level,
465					le32_to_cpu(idx->ei_block),
466					ext4_idx_pblock(idx),
467					newblock);
468			idx++;
469		}
470
471		return;
472	}
473
474	ex = path[depth].p_ext;
475	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
476		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
477				le32_to_cpu(ex->ee_block),
478				ext4_ext_pblock(ex),
479				ext4_ext_is_uninitialized(ex),
480				ext4_ext_get_actual_len(ex),
481				newblock);
482		ex++;
483	}
484}
485
486#else
487#define ext4_ext_show_path(inode, path)
488#define ext4_ext_show_leaf(inode, path)
489#define ext4_ext_show_move(inode, path, newblock, level)
490#endif
491
492void ext4_ext_drop_refs(struct ext4_ext_path *path)
493{
494	int depth = path->p_depth;
495	int i;
496
497	for (i = 0; i <= depth; i++, path++)
498		if (path->p_bh) {
499			brelse(path->p_bh);
500			path->p_bh = NULL;
501		}
502}
503
504/*
505 * ext4_ext_binsearch_idx:
506 * binary search for the closest index of the given block
507 * the header must be checked before calling this
508 */
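/*
 * On return path->p_idx points at the rightmost index entry whose
 * ei_block is <= @block (or at the first entry if @block precedes
 * them all); the caller then descends into that subtree.
 */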
509static void
510ext4_ext_binsearch_idx(struct inode *inode,
511			struct ext4_ext_path *path, ext4_lblk_t block)
512{
513	struct ext4_extent_header *eh = path->p_hdr;
514	struct ext4_extent_idx *r, *l, *m;
515
516
517	ext_debug("binsearch for %u(idx):  ", block);
518
519	l = EXT_FIRST_INDEX(eh) + 1;
520	r = EXT_LAST_INDEX(eh);
521	while (l <= r) {
522		m = l + (r - l) / 2;
523		if (block < le32_to_cpu(m->ei_block))
524			r = m - 1;
525		else
526			l = m + 1;
527		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
528				m, le32_to_cpu(m->ei_block),
529				r, le32_to_cpu(r->ei_block));
530	}
531
532	path->p_idx = l - 1;
533	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
534		  ext4_idx_pblock(path->p_idx));
535
536#ifdef CHECK_BINSEARCH
537	{
538		struct ext4_extent_idx *chix, *ix;
539		int k;
540
541		chix = ix = EXT_FIRST_INDEX(eh);
542		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
543		  if (k != 0 &&
544		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
545				printk(KERN_DEBUG "k=%d, ix=0x%p, "
546				       "first=0x%p\n", k,
547				       ix, EXT_FIRST_INDEX(eh));
548				printk(KERN_DEBUG "%u <= %u\n",
549				       le32_to_cpu(ix->ei_block),
550				       le32_to_cpu(ix[-1].ei_block));
551			}
552			BUG_ON(k && le32_to_cpu(ix->ei_block)
553					   <= le32_to_cpu(ix[-1].ei_block));
554			if (block < le32_to_cpu(ix->ei_block))
555				break;
556			chix = ix;
557		}
558		BUG_ON(chix != path->p_idx);
559	}
560#endif
561
562}
563
564/*
565 * ext4_ext_binsearch:
566 * binary search for closest extent of the given block
567 * the header must be checked before calling this
568 */
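/*
 * Same idea as ext4_ext_binsearch_idx(), but for a leaf: on return
 * path->p_ext points at the rightmost extent whose ee_block is
 * <= @block (or at the first extent if @block precedes them all).
 * The extent found may still not cover @block; callers check that.
 */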
569static void
570ext4_ext_binsearch(struct inode *inode,
571		struct ext4_ext_path *path, ext4_lblk_t block)
572{
573	struct ext4_extent_header *eh = path->p_hdr;
574	struct ext4_extent *r, *l, *m;
575
576	if (eh->eh_entries == 0) {
577		/*
578		 * this leaf is empty:
579		 * we get such a leaf in split/add case
580		 */
581		return;
582	}
583
584	ext_debug("binsearch for %u:  ", block);
585
586	l = EXT_FIRST_EXTENT(eh) + 1;
587	r = EXT_LAST_EXTENT(eh);
588
589	while (l <= r) {
590		m = l + (r - l) / 2;
591		if (block < le32_to_cpu(m->ee_block))
592			r = m - 1;
593		else
594			l = m + 1;
595		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
596				m, le32_to_cpu(m->ee_block),
597				r, le32_to_cpu(r->ee_block));
598	}
599
600	path->p_ext = l - 1;
601	ext_debug("  -> %d:%llu:[%d]%d ",
602			le32_to_cpu(path->p_ext->ee_block),
603			ext4_ext_pblock(path->p_ext),
604			ext4_ext_is_uninitialized(path->p_ext),
605			ext4_ext_get_actual_len(path->p_ext));
606
607#ifdef CHECK_BINSEARCH
608	{
609		struct ext4_extent *chex, *ex;
610		int k;
611
612		chex = ex = EXT_FIRST_EXTENT(eh);
613		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
614			BUG_ON(k && le32_to_cpu(ex->ee_block)
615					  <= le32_to_cpu(ex[-1].ee_block));
616			if (block < le32_to_cpu(ex->ee_block))
617				break;
618			chex = ex;
619		}
620		BUG_ON(chex != path->p_ext);
621	}
622#endif
623
624}
625
626int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
627{
628	struct ext4_extent_header *eh;
629
630	eh = ext_inode_hdr(inode);
631	eh->eh_depth = 0;
632	eh->eh_entries = 0;
633	eh->eh_magic = EXT4_EXT_MAGIC;
634	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
635	ext4_mark_inode_dirty(handle, inode);
636	ext4_ext_invalidate_cache(inode);
637	return 0;
638}
639
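/*
 * ext4_ext_find_extent:
 * walks the tree from the root down to the leaf covering @block and
 * returns the path, one ext4_ext_path entry per level.  A typical
 * caller (see ext4_ext_walk_space() below) does:
 *
 *	path = ext4_ext_find_extent(inode, block, NULL);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);
 *	ex = path[ext_depth(inode)].p_ext;	(NULL for an empty leaf)
 *	...
 *	ext4_ext_drop_refs(path);
 *	kfree(path);
 */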
640struct ext4_ext_path *
641ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
642					struct ext4_ext_path *path)
643{
644	struct ext4_extent_header *eh;
645	struct buffer_head *bh;
646	short int depth, i, ppos = 0, alloc = 0;
647
648	eh = ext_inode_hdr(inode);
649	depth = ext_depth(inode);
650
651	/* account possible depth increase */
652	if (!path) {
653		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
654				GFP_NOFS);
655		if (!path)
656			return ERR_PTR(-ENOMEM);
657		alloc = 1;
658	}
659	path[0].p_hdr = eh;
660	path[0].p_bh = NULL;
661
662	i = depth;
663	/* walk through the tree */
664	while (i) {
665		int need_to_validate = 0;
666
667		ext_debug("depth %d: num %d, max %d\n",
668			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
669
670		ext4_ext_binsearch_idx(inode, path + ppos, block);
671		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
672		path[ppos].p_depth = i;
673		path[ppos].p_ext = NULL;
674
675		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
676		if (unlikely(!bh))
677			goto err;
678		if (!bh_uptodate_or_lock(bh)) {
679			trace_ext4_ext_load_extent(inode, block,
680						path[ppos].p_block);
681			if (bh_submit_read(bh) < 0) {
682				put_bh(bh);
683				goto err;
684			}
685			/* validate the extent entries */
686			need_to_validate = 1;
687		}
688		eh = ext_block_hdr(bh);
689		ppos++;
690		if (unlikely(ppos > depth)) {
691			put_bh(bh);
692			EXT4_ERROR_INODE(inode,
693					 "ppos %d > depth %d", ppos, depth);
694			goto err;
695		}
696		path[ppos].p_bh = bh;
697		path[ppos].p_hdr = eh;
698		i--;
699
700		if (need_to_validate && ext4_ext_check(inode, eh, i))
701			goto err;
702	}
703
704	path[ppos].p_depth = i;
705	path[ppos].p_ext = NULL;
706	path[ppos].p_idx = NULL;
707
708	/* find extent */
709	ext4_ext_binsearch(inode, path + ppos, block);
710	/* if not an empty leaf */
711	if (path[ppos].p_ext)
712		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
713
714	ext4_ext_show_path(inode, path);
715
716	return path;
717
718err:
719	ext4_ext_drop_refs(path);
720	if (alloc)
721		kfree(path);
722	return ERR_PTR(-EIO);
723}
724
725/*
726 * ext4_ext_insert_index:
727 * insert new index [@logical;@ptr] into the block at @curp;
728 * check where to insert: before @curp or after @curp
729 */
730static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
731				 struct ext4_ext_path *curp,
732				 int logical, ext4_fsblk_t ptr)
733{
734	struct ext4_extent_idx *ix;
735	int len, err;
736
737	err = ext4_ext_get_access(handle, inode, curp);
738	if (err)
739		return err;
740
741	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
742		EXT4_ERROR_INODE(inode,
743				 "logical %d == ei_block %d!",
744				 logical, le32_to_cpu(curp->p_idx->ei_block));
745		return -EIO;
746	}
747
748	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
749			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
750		EXT4_ERROR_INODE(inode,
751				 "eh_entries %d >= eh_max %d!",
752				 le16_to_cpu(curp->p_hdr->eh_entries),
753				 le16_to_cpu(curp->p_hdr->eh_max));
754		return -EIO;
755	}
756
757	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
758	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
759		/* insert after */
760		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
761			len = (len - 1) * sizeof(struct ext4_extent_idx);
762			len = len < 0 ? 0 : len;
763			ext_debug("insert new index %d after: %llu. "
764					"move %d from 0x%p to 0x%p\n",
765					logical, ptr, len,
766					(curp->p_idx + 1), (curp->p_idx + 2));
767			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
768		}
769		ix = curp->p_idx + 1;
770	} else {
771		/* insert before */
772		len = len * sizeof(struct ext4_extent_idx);
773		len = len < 0 ? 0 : len;
774		ext_debug("insert new index %d before: %llu. "
775				"move %d from 0x%p to 0x%p\n",
776				logical, ptr, len,
777				curp->p_idx, (curp->p_idx + 1));
778		memmove(curp->p_idx + 1, curp->p_idx, len);
779		ix = curp->p_idx;
780	}
781
782	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
783		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
784		return -EIO;
785	}
786
787	ix->ei_block = cpu_to_le32(logical);
788	ext4_idx_store_pblock(ix, ptr);
789	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
790
791	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
792		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
793		return -EIO;
794	}
795
796	err = ext4_ext_dirty(handle, inode, curp);
797	ext4_std_error(inode->i_sb, err);
798
799	return err;
800}
801
802/*
803 * ext4_ext_split:
804 * inserts new subtree into the path, using free index entry
805 * at depth @at:
806 * - allocates all needed blocks (new leaf and all intermediate index blocks)
807 * - makes decision where to split
808 * - moves remaining extents and index entries (right to the split point)
809 *   into the newly allocated blocks
810 * - initializes subtree
811 */
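/*
 * All new blocks (the leaf plus any intermediate index blocks) are
 * allocated up front into ablocks[]; if anything fails, the cleanup
 * path frees every block recorded there, and the new index is hooked
 * into the parent only as the very last step.
 */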
812static int ext4_ext_split(handle_t *handle, struct inode *inode,
813			  unsigned int flags,
814			  struct ext4_ext_path *path,
815			  struct ext4_extent *newext, int at)
816{
817	struct buffer_head *bh = NULL;
818	int depth = ext_depth(inode);
819	struct ext4_extent_header *neh;
820	struct ext4_extent_idx *fidx;
821	int i = at, k, m, a;
822	ext4_fsblk_t newblock, oldblock;
823	__le32 border;
824	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
825	int err = 0;
826
827	/* make decision: where to split? */
828	/* FIXME: now decision is simplest: at current extent */
829
830	/* if current leaf will be split, then we should use
831	 * border from split point */
832	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
833		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
834		return -EIO;
835	}
836	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
837		border = path[depth].p_ext[1].ee_block;
838		ext_debug("leaf will be split."
839				" next leaf starts at %d\n",
840				  le32_to_cpu(border));
841	} else {
842		border = newext->ee_block;
843		ext_debug("leaf will be added."
844				" next leaf starts at %d\n",
845				le32_to_cpu(border));
846	}
847
848	/*
849	 * If an error occurs, we stop processing and mark the
850	 * filesystem read-only.  The index won't be inserted and
851	 * the tree will remain in a consistent state.  The next
852	 * mount will repair the buffers too.
853	 */
854
855	/*
856	 * Get array to track all allocated blocks.
857	 * We need this to handle errors and free the blocks
858	 * on failure.
859	 */
860	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
861	if (!ablocks)
862		return -ENOMEM;
863
864	/* allocate all needed blocks */
865	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
866	for (a = 0; a < depth - at; a++) {
867		newblock = ext4_ext_new_meta_block(handle, inode, path,
868						   newext, &err, flags);
869		if (newblock == 0)
870			goto cleanup;
871		ablocks[a] = newblock;
872	}
873
874	/* initialize new leaf */
875	newblock = ablocks[--a];
876	if (unlikely(newblock == 0)) {
877		EXT4_ERROR_INODE(inode, "newblock == 0!");
878		err = -EIO;
879		goto cleanup;
880	}
881	bh = sb_getblk(inode->i_sb, newblock);
882	if (!bh) {
883		err = -EIO;
884		goto cleanup;
885	}
886	lock_buffer(bh);
887
888	err = ext4_journal_get_create_access(handle, bh);
889	if (err)
890		goto cleanup;
891
892	neh = ext_block_hdr(bh);
893	neh->eh_entries = 0;
894	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
895	neh->eh_magic = EXT4_EXT_MAGIC;
896	neh->eh_depth = 0;
897
898	/* move remainder of path[depth] to the new leaf */
899	if (unlikely(path[depth].p_hdr->eh_entries !=
900		     path[depth].p_hdr->eh_max)) {
901		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
902				 path[depth].p_hdr->eh_entries,
903				 path[depth].p_hdr->eh_max);
904		err = -EIO;
905		goto cleanup;
906	}
907	/* start copy from next extent */
908	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
909	ext4_ext_show_move(inode, path, newblock, depth);
910	if (m) {
911		struct ext4_extent *ex;
912		ex = EXT_FIRST_EXTENT(neh);
913		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
914		le16_add_cpu(&neh->eh_entries, m);
915	}
916
917	set_buffer_uptodate(bh);
918	unlock_buffer(bh);
919
920	err = ext4_handle_dirty_metadata(handle, inode, bh);
921	if (err)
922		goto cleanup;
923	brelse(bh);
924	bh = NULL;
925
926	/* correct old leaf */
927	if (m) {
928		err = ext4_ext_get_access(handle, inode, path + depth);
929		if (err)
930			goto cleanup;
931		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
932		err = ext4_ext_dirty(handle, inode, path + depth);
933		if (err)
934			goto cleanup;
935
936	}
937
938	/* create intermediate indexes */
939	k = depth - at - 1;
940	if (unlikely(k < 0)) {
941		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
942		err = -EIO;
943		goto cleanup;
944	}
945	if (k)
946		ext_debug("create %d intermediate indices\n", k);
947	/* insert new index into current index block */
948	/* current depth stored in i var */
949	i = depth - 1;
950	while (k--) {
951		oldblock = newblock;
952		newblock = ablocks[--a];
953		bh = sb_getblk(inode->i_sb, newblock);
954		if (!bh) {
955			err = -EIO;
956			goto cleanup;
957		}
958		lock_buffer(bh);
959
960		err = ext4_journal_get_create_access(handle, bh);
961		if (err)
962			goto cleanup;
963
964		neh = ext_block_hdr(bh);
965		neh->eh_entries = cpu_to_le16(1);
966		neh->eh_magic = EXT4_EXT_MAGIC;
967		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
968		neh->eh_depth = cpu_to_le16(depth - i);
969		fidx = EXT_FIRST_INDEX(neh);
970		fidx->ei_block = border;
971		ext4_idx_store_pblock(fidx, oldblock);
972
973		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
974				i, newblock, le32_to_cpu(border), oldblock);
975
976		/* move remainder of path[i] to the new index block */
977		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
978					EXT_LAST_INDEX(path[i].p_hdr))) {
979			EXT4_ERROR_INODE(inode,
980					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
981					 le32_to_cpu(path[i].p_ext->ee_block));
982			err = -EIO;
983			goto cleanup;
984		}
985		/* start copy indexes */
986		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
987		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
988				EXT_MAX_INDEX(path[i].p_hdr));
989		ext4_ext_show_move(inode, path, newblock, i);
990		if (m) {
991			memmove(++fidx, path[i].p_idx,
992				sizeof(struct ext4_extent_idx) * m);
993			le16_add_cpu(&neh->eh_entries, m);
994		}
995		set_buffer_uptodate(bh);
996		unlock_buffer(bh);
997
998		err = ext4_handle_dirty_metadata(handle, inode, bh);
999		if (err)
1000			goto cleanup;
1001		brelse(bh);
1002		bh = NULL;
1003
1004		/* correct old index */
1005		if (m) {
1006			err = ext4_ext_get_access(handle, inode, path + i);
1007			if (err)
1008				goto cleanup;
1009			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1010			err = ext4_ext_dirty(handle, inode, path + i);
1011			if (err)
1012				goto cleanup;
1013		}
1014
1015		i--;
1016	}
1017
1018	/* insert new index */
1019	err = ext4_ext_insert_index(handle, inode, path + at,
1020				    le32_to_cpu(border), newblock);
1021
1022cleanup:
1023	if (bh) {
1024		if (buffer_locked(bh))
1025			unlock_buffer(bh);
1026		brelse(bh);
1027	}
1028
1029	if (err) {
1030		/* free all allocated blocks in error case */
1031		for (i = 0; i < depth; i++) {
1032			if (!ablocks[i])
1033				continue;
1034			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1035					 EXT4_FREE_BLOCKS_METADATA);
1036		}
1037	}
1038	kfree(ablocks);
1039
1040	return err;
1041}
1042
1043/*
1044 * ext4_ext_grow_indepth:
1045 * implements tree growing procedure:
1046 * - allocates new block
1047 * - moves top-level data (index block or leaf) into the new block
1048 * - initializes new top-level, creating index that points to the
1049 *   just created block
1050 */
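/*
 * The root of the tree always lives in the inode body (i_data), so
 * growing the tree copies the current root into a freshly allocated
 * block and turns the in-inode root into a single index entry that
 * points at it, increasing eh_depth by one.
 */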
1051static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1052				 unsigned int flags,
1053				 struct ext4_extent *newext)
1054{
1055	struct ext4_extent_header *neh;
1056	struct buffer_head *bh;
1057	ext4_fsblk_t newblock;
1058	int err = 0;
1059
1060	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1061		newext, &err, flags);
1062	if (newblock == 0)
1063		return err;
1064
1065	bh = sb_getblk(inode->i_sb, newblock);
1066	if (!bh) {
1067		err = -EIO;
1068		ext4_std_error(inode->i_sb, err);
1069		return err;
1070	}
1071	lock_buffer(bh);
1072
1073	err = ext4_journal_get_create_access(handle, bh);
1074	if (err) {
1075		unlock_buffer(bh);
1076		goto out;
1077	}
1078
1079	/* move top-level index/leaf into new block */
1080	memmove(bh->b_data, EXT4_I(inode)->i_data,
1081		sizeof(EXT4_I(inode)->i_data));
1082
1083	/* set size of new block */
1084	neh = ext_block_hdr(bh);
1085	/* old root could have indexes or leaves
1086	 * so calculate eh_max the right way */
1087	if (ext_depth(inode))
1088		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1089	else
1090		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1091	neh->eh_magic = EXT4_EXT_MAGIC;
1092	set_buffer_uptodate(bh);
1093	unlock_buffer(bh);
1094
1095	err = ext4_handle_dirty_metadata(handle, inode, bh);
1096	if (err)
1097		goto out;
1098
1099	/* Update top-level index: num,max,pointer */
1100	neh = ext_inode_hdr(inode);
1101	neh->eh_entries = cpu_to_le16(1);
1102	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1103	if (neh->eh_depth == 0) {
1104		/* Root extent block becomes index block */
1105		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1106		EXT_FIRST_INDEX(neh)->ei_block =
1107			EXT_FIRST_EXTENT(neh)->ee_block;
1108	}
1109	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1110		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1111		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1112		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1113
1114	neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);
1115	ext4_mark_inode_dirty(handle, inode);
1116out:
1117	brelse(bh);
1118
1119	return err;
1120}
1121
1122/*
1123 * ext4_ext_create_new_leaf:
1124 * finds empty index and adds new leaf.
1125 * if no free index is found, then it requests in-depth growing.
1126 */
1127static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1128				    unsigned int flags,
1129				    struct ext4_ext_path *path,
1130				    struct ext4_extent *newext)
1131{
1132	struct ext4_ext_path *curp;
1133	int depth, i, err = 0;
1134
1135repeat:
1136	i = depth = ext_depth(inode);
1137
1138	/* walk up the tree and look for a free index entry */
1139	curp = path + depth;
1140	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1141		i--;
1142		curp--;
1143	}
1144
1145	/* we use already allocated block for index block,
1146	 * so subsequent data blocks should be contiguous */
1147	if (EXT_HAS_FREE_INDEX(curp)) {
1148		/* if we found index with free entry, then use that
1149		 * entry: create all needed subtree and add new leaf */
1150		err = ext4_ext_split(handle, inode, flags, path, newext, i);
1151		if (err)
1152			goto out;
1153
1154		/* refill path */
1155		ext4_ext_drop_refs(path);
1156		path = ext4_ext_find_extent(inode,
1157				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1158				    path);
1159		if (IS_ERR(path))
1160			err = PTR_ERR(path);
1161	} else {
1162		/* tree is full, time to grow in depth */
1163		err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1164		if (err)
1165			goto out;
1166
1167		/* refill path */
1168		ext4_ext_drop_refs(path);
1169		path = ext4_ext_find_extent(inode,
1170				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1171				    path);
1172		if (IS_ERR(path)) {
1173			err = PTR_ERR(path);
1174			goto out;
1175		}
1176
1177		/*
1178		 * only first (depth 0 -> 1) produces free space;
1179		 * in all other cases we have to split the grown tree
1180		 */
1181		depth = ext_depth(inode);
1182		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1183			/* now we need to split */
1184			goto repeat;
1185		}
1186	}
1187
1188out:
1189	return err;
1190}
1191
1192/*
1193 * search the closest allocated block to the left of *logical
1194 * and returns it at @logical + its physical address at @phys
1195 * if *logical is the smallest allocated block, the function
1196 * returns 0 at @phys
1197 * return value contains 0 (success) or error code
1198 */
1199static int ext4_ext_search_left(struct inode *inode,
1200				struct ext4_ext_path *path,
1201				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1202{
1203	struct ext4_extent_idx *ix;
1204	struct ext4_extent *ex;
1205	int depth, ee_len;
1206
1207	if (unlikely(path == NULL)) {
1208		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1209		return -EIO;
1210	}
1211	depth = path->p_depth;
1212	*phys = 0;
1213
1214	if (depth == 0 && path->p_ext == NULL)
1215		return 0;
1216
1217	/* usually the extent in the path covers blocks smaller
1218	 * than *logical, but it can be that the extent is the
1219	 * first one in the file */
1220
1221	ex = path[depth].p_ext;
1222	ee_len = ext4_ext_get_actual_len(ex);
1223	if (*logical < le32_to_cpu(ex->ee_block)) {
1224		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1225			EXT4_ERROR_INODE(inode,
1226					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1227					 *logical, le32_to_cpu(ex->ee_block));
1228			return -EIO;
1229		}
1230		while (--depth >= 0) {
1231			ix = path[depth].p_idx;
1232			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1233				EXT4_ERROR_INODE(inode,
1234				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1235				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1236				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1237		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1238				  depth);
1239				return -EIO;
1240			}
1241		}
1242		return 0;
1243	}
1244
1245	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1246		EXT4_ERROR_INODE(inode,
1247				 "logical %d < ee_block %d + ee_len %d!",
1248				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1249		return -EIO;
1250	}
1251
1252	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1253	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1254	return 0;
1255}
1256
1257/*
1258 * search the closest allocated block to the right of *logical
1259 * and returns it at @logical + its physical address at @phys
1260 * if *logical is the largest allocated block, the function
1261 * returns 0 at @phys
1262 * return value contains 0 (success) or error code
1263 */
1264static int ext4_ext_search_right(struct inode *inode,
1265				 struct ext4_ext_path *path,
1266				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1267				 struct ext4_extent **ret_ex)
1268{
1269	struct buffer_head *bh = NULL;
1270	struct ext4_extent_header *eh;
1271	struct ext4_extent_idx *ix;
1272	struct ext4_extent *ex;
1273	ext4_fsblk_t block;
1274	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1275	int ee_len;
1276
1277	if (unlikely(path == NULL)) {
1278		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1279		return -EIO;
1280	}
1281	depth = path->p_depth;
1282	*phys = 0;
1283
1284	if (depth == 0 && path->p_ext == NULL)
1285		return 0;
1286
1287	/* usually the extent in the path covers blocks smaller
1288	 * than *logical, but it can be that the extent is the
1289	 * first one in the file */
1290
1291	ex = path[depth].p_ext;
1292	ee_len = ext4_ext_get_actual_len(ex);
1293	if (*logical < le32_to_cpu(ex->ee_block)) {
1294		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1295			EXT4_ERROR_INODE(inode,
1296					 "first_extent(path[%d].p_hdr) != ex",
1297					 depth);
1298			return -EIO;
1299		}
1300		while (--depth >= 0) {
1301			ix = path[depth].p_idx;
1302			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1303				EXT4_ERROR_INODE(inode,
1304						 "ix != EXT_FIRST_INDEX *logical %d!",
1305						 *logical);
1306				return -EIO;
1307			}
1308		}
1309		goto found_extent;
1310	}
1311
1312	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1313		EXT4_ERROR_INODE(inode,
1314				 "logical %d < ee_block %d + ee_len %d!",
1315				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1316		return -EIO;
1317	}
1318
1319	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1320		/* next allocated block in this leaf */
1321		ex++;
1322		goto found_extent;
1323	}
1324
1325	/* go up and search for index to the right */
1326	while (--depth >= 0) {
1327		ix = path[depth].p_idx;
1328		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1329			goto got_index;
1330	}
1331
1332	/* we've gone up to the root and found no index to the right */
1333	return 0;
1334
1335got_index:
1336	/* we've found index to the right, let's
1337	 * follow it and find the closest allocated
1338	 * block to the right */
1339	ix++;
1340	block = ext4_idx_pblock(ix);
1341	while (++depth < path->p_depth) {
1342		bh = sb_bread(inode->i_sb, block);
1343		if (bh == NULL)
1344			return -EIO;
1345		eh = ext_block_hdr(bh);
1346		/* subtract from p_depth to get proper eh_depth */
1347		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1348			put_bh(bh);
1349			return -EIO;
1350		}
1351		ix = EXT_FIRST_INDEX(eh);
1352		block = ext4_idx_pblock(ix);
1353		put_bh(bh);
1354	}
1355
1356	bh = sb_bread(inode->i_sb, block);
1357	if (bh == NULL)
1358		return -EIO;
1359	eh = ext_block_hdr(bh);
1360	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1361		put_bh(bh);
1362		return -EIO;
1363	}
1364	ex = EXT_FIRST_EXTENT(eh);
1365found_extent:
1366	*logical = le32_to_cpu(ex->ee_block);
1367	*phys = ext4_ext_pblock(ex);
1368	*ret_ex = ex;
1369	if (bh)
1370		put_bh(bh);
1371	return 0;
1372}
1373
1374/*
1375 * ext4_ext_next_allocated_block:
1376 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1377 * NOTE: it considers block number from index entry as
1378 * allocated block. Thus, index entries have to be consistent
1379 * with leaves.
1380 */
1381static ext4_lblk_t
1382ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1383{
1384	int depth;
1385
1386	BUG_ON(path == NULL);
1387	depth = path->p_depth;
1388
1389	if (depth == 0 && path->p_ext == NULL)
1390		return EXT_MAX_BLOCKS;
1391
1392	while (depth >= 0) {
1393		if (depth == path->p_depth) {
1394			/* leaf */
1395			if (path[depth].p_ext !=
1396					EXT_LAST_EXTENT(path[depth].p_hdr))
1397			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1398		} else {
1399			/* index */
1400			if (path[depth].p_idx !=
1401					EXT_LAST_INDEX(path[depth].p_hdr))
1402			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1403		}
1404		depth--;
1405	}
1406
1407	return EXT_MAX_BLOCKS;
1408}
1409
1410/*
1411 * ext4_ext_next_leaf_block:
1412 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1413 */
1414static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1415{
1416	int depth;
1417
1418	BUG_ON(path == NULL);
1419	depth = path->p_depth;
1420
1421	/* zero-tree has no leaf blocks at all */
1422	if (depth == 0)
1423		return EXT_MAX_BLOCKS;
1424
1425	/* go to index block */
1426	depth--;
1427
1428	while (depth >= 0) {
1429		if (path[depth].p_idx !=
1430				EXT_LAST_INDEX(path[depth].p_hdr))
1431			return (ext4_lblk_t)
1432				le32_to_cpu(path[depth].p_idx[1].ei_block);
1433		depth--;
1434	}
1435
1436	return EXT_MAX_BLOCKS;
1437}
1438
1439/*
1440 * ext4_ext_correct_indexes:
1441 * if leaf gets modified and modified extent is first in the leaf,
1442 * then we have to correct all indexes above.
1443 * TODO: do we need to correct tree in all cases?
1444 */
1445static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1446				struct ext4_ext_path *path)
1447{
1448	struct ext4_extent_header *eh;
1449	int depth = ext_depth(inode);
1450	struct ext4_extent *ex;
1451	__le32 border;
1452	int k, err = 0;
1453
1454	eh = path[depth].p_hdr;
1455	ex = path[depth].p_ext;
1456
1457	if (unlikely(ex == NULL || eh == NULL)) {
1458		EXT4_ERROR_INODE(inode,
1459				 "ex %p == NULL or eh %p == NULL", ex, eh);
1460		return -EIO;
1461	}
1462
1463	if (depth == 0) {
1464		/* there is no tree at all */
1465		return 0;
1466	}
1467
1468	if (ex != EXT_FIRST_EXTENT(eh)) {
1469		/* we correct the tree only if the first leaf was modified */
1470		return 0;
1471	}
1472
1473	/*
1474	 * TODO: we need correction if border is smaller than current one
1475	 */
1476	k = depth - 1;
1477	border = path[depth].p_ext->ee_block;
1478	err = ext4_ext_get_access(handle, inode, path + k);
1479	if (err)
1480		return err;
1481	path[k].p_idx->ei_block = border;
1482	err = ext4_ext_dirty(handle, inode, path + k);
1483	if (err)
1484		return err;
1485
1486	while (k--) {
1487		/* change all left-side indexes */
1488		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1489			break;
1490		err = ext4_ext_get_access(handle, inode, path + k);
1491		if (err)
1492			break;
1493		path[k].p_idx->ei_block = border;
1494		err = ext4_ext_dirty(handle, inode, path + k);
1495		if (err)
1496			break;
1497	}
1498
1499	return err;
1500}
1501
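/*
 * Two extents are mergeable only if they are both initialized or both
 * uninitialized, logically and physically contiguous, and the combined
 * length still fits below the relevant limit (EXT_INIT_MAX_LEN or
 * EXT_UNINIT_MAX_LEN), which keeps the top bit of ee_len free for the
 * uninitialized-extent flag.
 */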
1502int
1503ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1504				struct ext4_extent *ex2)
1505{
1506	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1507
1508	/*
1509	 * Make sure that either both extents are uninitialized, or
1510	 * both are _not_.
1511	 */
1512	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1513		return 0;
1514
1515	if (ext4_ext_is_uninitialized(ex1))
1516		max_len = EXT_UNINIT_MAX_LEN;
1517	else
1518		max_len = EXT_INIT_MAX_LEN;
1519
1520	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1521	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1522
1523	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1524			le32_to_cpu(ex2->ee_block))
1525		return 0;
1526
1527	/*
1528	 * To allow future support for preallocated extents to be added
1529	 * as an RO_COMPAT feature, refuse to merge two extents if
1530	 * this can result in the top bit of ee_len being set.
1531	 */
1532	if (ext1_ee_len + ext2_ee_len > max_len)
1533		return 0;
1534#ifdef AGGRESSIVE_TEST
1535	if (ext1_ee_len >= 4)
1536		return 0;
1537#endif
1538
1539	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1540		return 1;
1541	return 0;
1542}
1543
1544/*
1545 * This function tries to merge the "ex" extent to the next extent in the tree.
1546 * It always tries to merge towards the right. If you want to merge towards
1547 * the left, pass "ex - 1" as argument instead of "ex".
1548 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1549 * 1 if they got merged.
1550 */
1551static int ext4_ext_try_to_merge_right(struct inode *inode,
1552				 struct ext4_ext_path *path,
1553				 struct ext4_extent *ex)
1554{
1555	struct ext4_extent_header *eh;
1556	unsigned int depth, len;
1557	int merge_done = 0;
1558	int uninitialized = 0;
1559
1560	depth = ext_depth(inode);
1561	BUG_ON(path[depth].p_hdr == NULL);
1562	eh = path[depth].p_hdr;
1563
1564	while (ex < EXT_LAST_EXTENT(eh)) {
1565		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1566			break;
1567		/* merge with next extent! */
1568		if (ext4_ext_is_uninitialized(ex))
1569			uninitialized = 1;
1570		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1571				+ ext4_ext_get_actual_len(ex + 1));
1572		if (uninitialized)
1573			ext4_ext_mark_uninitialized(ex);
1574
1575		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1576			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1577				* sizeof(struct ext4_extent);
1578			memmove(ex + 1, ex + 2, len);
1579		}
1580		le16_add_cpu(&eh->eh_entries, -1);
1581		merge_done = 1;
1582		WARN_ON(eh->eh_entries == 0);
1583		if (!eh->eh_entries)
1584			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1585	}
1586
1587	return merge_done;
1588}
1589
1590/*
1591 * This function tries to merge the @ex extent to neighbours in the tree.
1592 * return 1 if merge left else 0.
1593 */
1594static int ext4_ext_try_to_merge(struct inode *inode,
1595				  struct ext4_ext_path *path,
1596				  struct ext4_extent *ex) {
1597	struct ext4_extent_header *eh;
1598	unsigned int depth;
1599	int merge_done = 0;
1600	int ret = 0;
1601
1602	depth = ext_depth(inode);
1603	BUG_ON(path[depth].p_hdr == NULL);
1604	eh = path[depth].p_hdr;
1605
1606	if (ex > EXT_FIRST_EXTENT(eh))
1607		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1608
1609	if (!merge_done)
1610		ret = ext4_ext_try_to_merge_right(inode, path, ex);
1611
1612	return ret;
1613}
1614
1615/*
1616 * check if a portion of the "newext" extent overlaps with an
1617 * existing extent.
1618 *
1619 * If there is an overlap discovered, it updates the length of the newext
1620 * such that there will be no overlap, and then returns 1.
1621 * If there is no overlap found, it returns 0.
1622 */
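/*
 * Example (ignoring bigalloc cluster rounding): if newext describes
 * logical blocks 100..119 and an existing extent starts at block 110,
 * newext->ee_len is trimmed to 10 so that newext ends right before the
 * existing extent, and 1 is returned.
 */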
1623static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1624					   struct inode *inode,
1625					   struct ext4_extent *newext,
1626					   struct ext4_ext_path *path)
1627{
1628	ext4_lblk_t b1, b2;
1629	unsigned int depth, len1;
1630	unsigned int ret = 0;
1631
1632	b1 = le32_to_cpu(newext->ee_block);
1633	len1 = ext4_ext_get_actual_len(newext);
1634	depth = ext_depth(inode);
1635	if (!path[depth].p_ext)
1636		goto out;
1637	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1638	b2 &= ~(sbi->s_cluster_ratio - 1);
1639
1640	/*
1641	 * get the next allocated block if the extent in the path
1642	 * is before the requested block(s)
1643	 */
1644	if (b2 < b1) {
1645		b2 = ext4_ext_next_allocated_block(path);
1646		if (b2 == EXT_MAX_BLOCKS)
1647			goto out;
1648		b2 &= ~(sbi->s_cluster_ratio - 1);
1649	}
1650
1651	/* check for wrap through zero on extent logical start block*/
1652	if (b1 + len1 < b1) {
1653		len1 = EXT_MAX_BLOCKS - b1;
1654		newext->ee_len = cpu_to_le16(len1);
1655		ret = 1;
1656	}
1657
1658	/* check for overlap */
1659	if (b1 + len1 > b2) {
1660		newext->ee_len = cpu_to_le16(b2 - b1);
1661		ret = 1;
1662	}
1663out:
1664	return ret;
1665}
1666
1667/*
1668 * ext4_ext_insert_extent:
1669 * tries to merge the requested extent into the existing extent or
1670 * inserts requested extent as new one into the tree,
1671 * creating new leaf in the no-space case.
1672 */
1673int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1674				struct ext4_ext_path *path,
1675				struct ext4_extent *newext, int flag)
1676{
1677	struct ext4_extent_header *eh;
1678	struct ext4_extent *ex, *fex;
1679	struct ext4_extent *nearex; /* nearest extent */
1680	struct ext4_ext_path *npath = NULL;
1681	int depth, len, err;
1682	ext4_lblk_t next;
1683	unsigned uninitialized = 0;
1684	int flags = 0;
1685
1686	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1687		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1688		return -EIO;
1689	}
1690	depth = ext_depth(inode);
1691	ex = path[depth].p_ext;
1692	if (unlikely(path[depth].p_hdr == NULL)) {
1693		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1694		return -EIO;
1695	}
1696
1697	/* try to insert block into found extent and return */
1698	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1699		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1700		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1701			  ext4_ext_is_uninitialized(newext),
1702			  ext4_ext_get_actual_len(newext),
1703			  le32_to_cpu(ex->ee_block),
1704			  ext4_ext_is_uninitialized(ex),
1705			  ext4_ext_get_actual_len(ex),
1706			  ext4_ext_pblock(ex));
1707		err = ext4_ext_get_access(handle, inode, path + depth);
1708		if (err)
1709			return err;
1710
1711		/*
1712		 * ext4_can_extents_be_merged should have checked that either
1713		 * both extents are uninitialized, or both aren't. Thus we
1714		 * need to check only one of them here.
1715		 */
1716		if (ext4_ext_is_uninitialized(ex))
1717			uninitialized = 1;
1718		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1719					+ ext4_ext_get_actual_len(newext));
1720		if (uninitialized)
1721			ext4_ext_mark_uninitialized(ex);
1722		eh = path[depth].p_hdr;
1723		nearex = ex;
1724		goto merge;
1725	}
1726
1727	depth = ext_depth(inode);
1728	eh = path[depth].p_hdr;
1729	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1730		goto has_space;
1731
1732	/* probably next leaf has space for us? */
1733	fex = EXT_LAST_EXTENT(eh);
1734	next = EXT_MAX_BLOCKS;
1735	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1736		next = ext4_ext_next_leaf_block(path);
1737	if (next != EXT_MAX_BLOCKS) {
1738		ext_debug("next leaf block - %d\n", next);
1739		BUG_ON(npath != NULL);
1740		npath = ext4_ext_find_extent(inode, next, NULL);
1741		if (IS_ERR(npath))
1742			return PTR_ERR(npath);
1743		BUG_ON(npath->p_depth != path->p_depth);
1744		eh = npath[depth].p_hdr;
1745		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1746			ext_debug("next leaf isn't full(%d)\n",
1747				  le16_to_cpu(eh->eh_entries));
1748			path = npath;
1749			goto has_space;
1750		}
1751		ext_debug("next leaf has no free space(%d,%d)\n",
1752			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1753	}
1754
1755	/*
1756	 * There is no free space in the found leaf.
1757	 * We're gonna add a new leaf in the tree.
1758	 */
1759	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1760		flags = EXT4_MB_USE_ROOT_BLOCKS;
1761	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1762	if (err)
1763		goto cleanup;
1764	depth = ext_depth(inode);
1765	eh = path[depth].p_hdr;
1766
1767has_space:
1768	nearex = path[depth].p_ext;
1769
1770	err = ext4_ext_get_access(handle, inode, path + depth);
1771	if (err)
1772		goto cleanup;
1773
1774	if (!nearex) {
1775		/* there is no extent in this leaf, create first one */
1776		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1777				le32_to_cpu(newext->ee_block),
1778				ext4_ext_pblock(newext),
1779				ext4_ext_is_uninitialized(newext),
1780				ext4_ext_get_actual_len(newext));
1781		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1782	} else if (le32_to_cpu(newext->ee_block)
1783			   > le32_to_cpu(nearex->ee_block)) {
1784/*		BUG_ON(newext->ee_block == nearex->ee_block); */
1785		if (nearex != EXT_LAST_EXTENT(eh)) {
1786			len = EXT_MAX_EXTENT(eh) - nearex;
1787			len = (len - 1) * sizeof(struct ext4_extent);
1788			len = len < 0 ? 0 : len;
1789			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1790					"move %d from 0x%p to 0x%p\n",
1791					le32_to_cpu(newext->ee_block),
1792					ext4_ext_pblock(newext),
1793					ext4_ext_is_uninitialized(newext),
1794					ext4_ext_get_actual_len(newext),
1795					nearex, len, nearex + 1, nearex + 2);
1796			memmove(nearex + 2, nearex + 1, len);
1797		}
1798		path[depth].p_ext = nearex + 1;
1799	} else {
1800		BUG_ON(newext->ee_block == nearex->ee_block);
1801		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1802		len = len < 0 ? 0 : len;
1803		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1804				"move %d from 0x%p to 0x%p\n",
1805				le32_to_cpu(newext->ee_block),
1806				ext4_ext_pblock(newext),
1807				ext4_ext_is_uninitialized(newext),
1808				ext4_ext_get_actual_len(newext),
1809				nearex, len, nearex, nearex + 1);
1810		memmove(nearex + 1, nearex, len);
1811		path[depth].p_ext = nearex;
1812	}
1813
1814	le16_add_cpu(&eh->eh_entries, 1);
1815	nearex = path[depth].p_ext;
1816	nearex->ee_block = newext->ee_block;
1817	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1818	nearex->ee_len = newext->ee_len;
1819
1820merge:
1821	/* try to merge extents to the right */
1822	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1823		ext4_ext_try_to_merge(inode, path, nearex);
1824
1825	/* try to merge extents to the left */
1826
1827	/* time to correct all indexes above */
1828	err = ext4_ext_correct_indexes(handle, inode, path);
1829	if (err)
1830		goto cleanup;
1831
1832	err = ext4_ext_dirty(handle, inode, path + depth);
1833
1834cleanup:
1835	if (npath) {
1836		ext4_ext_drop_refs(npath);
1837		kfree(npath);
1838	}
1839	ext4_ext_invalidate_cache(inode);
1840	return err;
1841}
1842
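/*
 * ext4_ext_walk_space:
 * iterates over the logical range [@block, @block + @num) and calls
 * @func once per region, passing either a real extent or a synthetic
 * hole descriptor (cbex.ec_start == 0).  The callback's return value
 * steers the walk:
 *
 *	EXT_CONTINUE   - keep walking
 *	EXT_REPEAT     - look the same region up again
 *	EXT_BREAK      - stop early, the walk returns 0
 *	negative value - abort and return that error
 */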
1843static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1844			       ext4_lblk_t num, ext_prepare_callback func,
1845			       void *cbdata)
1846{
1847	struct ext4_ext_path *path = NULL;
1848	struct ext4_ext_cache cbex;
1849	struct ext4_extent *ex;
1850	ext4_lblk_t next, start = 0, end = 0;
1851	ext4_lblk_t last = block + num;
1852	int depth, exists, err = 0;
1853
1854	BUG_ON(func == NULL);
1855	BUG_ON(inode == NULL);
1856
1857	while (block < last && block != EXT_MAX_BLOCKS) {
1858		num = last - block;
1859		/* find extent for this block */
1860		down_read(&EXT4_I(inode)->i_data_sem);
1861		path = ext4_ext_find_extent(inode, block, path);
1862		up_read(&EXT4_I(inode)->i_data_sem);
1863		if (IS_ERR(path)) {
1864			err = PTR_ERR(path);
1865			path = NULL;
1866			break;
1867		}
1868
1869		depth = ext_depth(inode);
1870		if (unlikely(path[depth].p_hdr == NULL)) {
1871			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1872			err = -EIO;
1873			break;
1874		}
1875		ex = path[depth].p_ext;
1876		next = ext4_ext_next_allocated_block(path);
1877
1878		exists = 0;
1879		if (!ex) {
1880			/* there is no extent yet, so try to allocate
1881			 * all requested space */
1882			start = block;
1883			end = block + num;
1884		} else if (le32_to_cpu(ex->ee_block) > block) {
1885			/* need to allocate space before found extent */
1886			start = block;
1887			end = le32_to_cpu(ex->ee_block);
1888			if (block + num < end)
1889				end = block + num;
1890		} else if (block >= le32_to_cpu(ex->ee_block)
1891					+ ext4_ext_get_actual_len(ex)) {
1892			/* need to allocate space after found extent */
1893			start = block;
1894			end = block + num;
1895			if (end >= next)
1896				end = next;
1897		} else if (block >= le32_to_cpu(ex->ee_block)) {
1898			/*
1899			 * some part of requested space is covered
1900			 * by found extent
1901			 */
1902			start = block;
1903			end = le32_to_cpu(ex->ee_block)
1904				+ ext4_ext_get_actual_len(ex);
1905			if (block + num < end)
1906				end = block + num;
1907			exists = 1;
1908		} else {
1909			BUG();
1910		}
1911		BUG_ON(end <= start);
1912
1913		if (!exists) {
1914			cbex.ec_block = start;
1915			cbex.ec_len = end - start;
1916			cbex.ec_start = 0;
1917		} else {
1918			cbex.ec_block = le32_to_cpu(ex->ee_block);
1919			cbex.ec_len = ext4_ext_get_actual_len(ex);
1920			cbex.ec_start = ext4_ext_pblock(ex);
1921		}
1922
1923		if (unlikely(cbex.ec_len == 0)) {
1924			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1925			err = -EIO;
1926			break;
1927		}
1928		err = func(inode, next, &cbex, ex, cbdata);
1929		ext4_ext_drop_refs(path);
1930
1931		if (err < 0)
1932			break;
1933
1934		if (err == EXT_REPEAT)
1935			continue;
1936		else if (err == EXT_BREAK) {
1937			err = 0;
1938			break;
1939		}
1940
1941		if (ext_depth(inode) != depth) {
1942			/* depth was changed. we have to realloc path */
1943			kfree(path);
1944			path = NULL;
1945		}
1946
1947		block = cbex.ec_block + cbex.ec_len;
1948	}
1949
1950	if (path) {
1951		ext4_ext_drop_refs(path);
1952		kfree(path);
1953	}
1954
1955	return err;
1956}
1957
1958static void
1959ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1960			__u32 len, ext4_fsblk_t start)
1961{
1962	struct ext4_ext_cache *cex;
1963	BUG_ON(len == 0);
1964	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1965	trace_ext4_ext_put_in_cache(inode, block, len, start);
1966	cex = &EXT4_I(inode)->i_cached_extent;
1967	cex->ec_block = block;
1968	cex->ec_len = len;
1969	cex->ec_start = start;
1970	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1971}
1972
1973/*
1974 * ext4_ext_put_gap_in_cache:
1975 * calculate boundaries of the gap that the requested block fits into
1976 * and cache this gap
1977 */
1978static void
1979ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1980				ext4_lblk_t block)
1981{
1982	int depth = ext_depth(inode);
1983	unsigned long len;
1984	ext4_lblk_t lblock;
1985	struct ext4_extent *ex;
1986
1987	ex = path[depth].p_ext;
1988	if (ex == NULL) {
1989		/* there is no extent yet, so gap is [0;-] */
1990		lblock = 0;
1991		len = EXT_MAX_BLOCKS;
1992		ext_debug("cache gap(whole file):");
1993	} else if (block < le32_to_cpu(ex->ee_block)) {
1994		lblock = block;
1995		len = le32_to_cpu(ex->ee_block) - block;
1996		ext_debug("cache gap(before): %u [%u:%u]",
1997				block,
1998				le32_to_cpu(ex->ee_block),
1999				 ext4_ext_get_actual_len(ex));
2000	} else if (block >= le32_to_cpu(ex->ee_block)
2001			+ ext4_ext_get_actual_len(ex)) {
2002		ext4_lblk_t next;
2003		lblock = le32_to_cpu(ex->ee_block)
2004			+ ext4_ext_get_actual_len(ex);
2005
2006		next = ext4_ext_next_allocated_block(path);
2007		ext_debug("cache gap(after): [%u:%u] %u",
2008				le32_to_cpu(ex->ee_block),
2009				ext4_ext_get_actual_len(ex),
2010				block);
2011		BUG_ON(next == lblock);
2012		len = next - lblock;
2013	} else {
2014		lblock = len = 0;
2015		BUG();
2016	}
2017
2018	ext_debug(" -> %u:%lu\n", lblock, len);
2019	ext4_ext_put_in_cache(inode, lblock, len, 0);
2020}
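/*
 * Worked example of the gap boundaries above, using assumed numbers: if the
 * leaf's nearest extent covers logical blocks 100..107 (ee_block 100, actual
 * length 8) and the next allocated block is 300, then a lookup of block 50
 * caches the gap [50, 99] (len 50), while a lookup of block 200 caches the
 * gap [108, 299] (len 192).  A lookup when the file has no extent at all
 * caches [0, EXT_MAX_BLOCKS).
 */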
2021
2022/*
2023 * ext4_ext_check_cache()
2024 * Checks to see if the given block is in the cache.
2025 * If it is, the cached extent is stored in the given
2026 * cache extent pointer.  If the cached extent is a hole,
2027 * this routine should be used instead of
2028 * ext4_ext_in_cache if the calling function needs to
2029 * know the size of the hole.
2030 *
2031 * @inode: The file's inode
2032 * @block: The block to look for in the cache
2033 * @ex:    Pointer where the cached extent will be stored
2034 *         if it contains block
2035 *
2036 * Return 0 if cache is invalid; 1 if the cache is valid
2037 */
2038static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2039	struct ext4_ext_cache *ex){
2040	struct ext4_ext_cache *cex;
2041	struct ext4_sb_info *sbi;
2042	int ret = 0;
2043
2044	/*
2045	 * We borrow i_block_reservation_lock to protect i_cached_extent
2046	 */
2047	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2048	cex = &EXT4_I(inode)->i_cached_extent;
2049	sbi = EXT4_SB(inode->i_sb);
2050
2051	/* has cache valid data? */
2052	if (cex->ec_len == 0)
2053		goto errout;
2054
2055	if (in_range(block, cex->ec_block, cex->ec_len)) {
2056		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2057		ext_debug("%u cached by %u:%u:%llu\n",
2058				block,
2059				cex->ec_block, cex->ec_len, cex->ec_start);
2060		ret = 1;
2061	}
2062errout:
2063	if (!ret)
2064		sbi->extent_cache_misses++;
2065	else
2066		sbi->extent_cache_hits++;
2067	trace_ext4_ext_in_cache(inode, block, ret);
2068	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2069	return ret;
2070}
2071
2072/*
2073 * ext4_ext_in_cache()
2074 * Checks to see if the given block is in the cache.
2075 * If it is, the cached extent is stored in the given
2076 * extent pointer.
2077 *
2078 * @inode: The file's inode
2079 * @block: The block to look for in the cache
2080 * @ex:    Pointer where the cached extent will be stored
2081 *         if it contains block
2082 *
2083 * Return 0 if cache is invalid; 1 if the cache is valid
2084 */
2085static int
2086ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2087			struct ext4_extent *ex)
2088{
2089	struct ext4_ext_cache cex;
2090	int ret = 0;
2091
2092	if (ext4_ext_check_cache(inode, block, &cex)) {
2093		ex->ee_block = cpu_to_le32(cex.ec_block);
2094		ext4_ext_store_pblock(ex, cex.ec_start);
2095		ex->ee_len = cpu_to_le16(cex.ec_len);
2096		ret = 1;
2097	}
2098
2099	return ret;
2100}
2101
2102
2103/*
2104 * ext4_ext_rm_idx:
2105 * removes index from the index block.
2106 */
2107static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2108			struct ext4_ext_path *path)
2109{
2110	int err;
2111	ext4_fsblk_t leaf;
2112
2113	/* free index block */
2114	path--;
2115	leaf = ext4_idx_pblock(path->p_idx);
2116	if (unlikely(path->p_hdr->eh_entries == 0)) {
2117		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2118		return -EIO;
2119	}
2120	err = ext4_ext_get_access(handle, inode, path);
2121	if (err)
2122		return err;
2123
2124	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2125		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2126		len *= sizeof(struct ext4_extent_idx);
2127		memmove(path->p_idx, path->p_idx + 1, len);
2128	}
2129
2130	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2131	err = ext4_ext_dirty(handle, inode, path);
2132	if (err)
2133		return err;
2134	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2135	trace_ext4_ext_rm_idx(inode, leaf);
2136
2137	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2138			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2139	return err;
2140}
2141
2142/*
2143 * ext4_ext_calc_credits_for_single_extent:
2144 * This routine returns the maximum number of credits needed to insert
2145 * an extent into the extent tree.
2146 * When passing the actual path, the caller should calculate credits
2147 * under i_data_sem.
2148 */
2149int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2150						struct ext4_ext_path *path)
2151{
2152	if (path) {
2153		int depth = ext_depth(inode);
2154		int ret = 0;
2155
2156		/* probably there is space in leaf? */
2157		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2158				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2159
2160			/*
2161			 *  There is some space in the leaf; no need
2162			 *  to account for the leaf block credit.
2163			 *
2164			 *  Bitmaps, block group descriptor blocks
2165			 *  and other metadata blocks still need to be
2166			 *  accounted for.
2167			 */
2168			/* 1 bitmap, 1 block group descriptor */
2169			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2170			return ret;
2171		}
2172	}
2173
2174	return ext4_chunk_trans_blocks(inode, nrblocks);
2175}
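/*
 * Illustration with assumed values (not taken from the code above): for a
 * single-extent insert where the target leaf still has a free slot, only the
 * leaf/inode plus the usual bookkeeping metadata are dirtied, so the routine
 * asks for 2 + EXT4_META_TRANS_BLOCKS(sb) credits; if that macro happened to
 * expand to 20, the fast path would request 22 credits.  Without a path, or
 * with a full leaf, it falls back to ext4_chunk_trans_blocks(), which sizes
 * the transaction for the general case.
 */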
2176
2177/*
2178 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
2179 *
2180 * If nrblocks fit in a single extent (chunk flag is 1), then in the worst
2181 * case each index/leaf block at every tree level needs to be changed, and
2182 * if the tree splits due to inserting a new extent, the old tree's
2183 * index/leaf blocks need to be updated too.
2184 *
2185 * If the nrblocks are discontiguous, they could cause
2186 * the whole tree split more than once, but this is really rare.
2187 */
2188int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2189{
2190	int index;
2191	int depth = ext_depth(inode);
2192
2193	if (chunk)
2194		index = depth * 2;
2195	else
2196		index = depth * 3;
2197
2198	return index;
2199}
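/*
 * Worked example, derived directly from the code above: with a tree of
 * depth 2, a single-extent (chunk == 1) modification is charged
 * 2 * 2 = 4 index/leaf blocks, while a discontiguous (chunk == 0)
 * modification is charged 2 * 3 = 6.
 */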
2200
2201static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2202			      struct ext4_extent *ex,
2203			      ext4_fsblk_t *partial_cluster,
2204			      ext4_lblk_t from, ext4_lblk_t to)
2205{
2206	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2207	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2208	ext4_fsblk_t pblk;
2209	int flags = EXT4_FREE_BLOCKS_FORGET;
2210
2211	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2212		flags |= EXT4_FREE_BLOCKS_METADATA;
2213	/*
2214	 * For bigalloc file systems, we never free a partial cluster
2215	 * at the beginning of the extent.  Instead, we make a note
2216	 * that we tried freeing the cluster, and check to see if we
2217	 * need to free it on a subsequent call to ext4_remove_blocks,
2218	 * or at the end of the ext4_truncate() operation.
2219	 */
2220	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2221
2222	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2223	/*
2224	 * If we have a partial cluster, and it's different from the
2225	 * cluster of the last block, we need to explicitly free the
2226	 * partial cluster here.
2227	 */
2228	pblk = ext4_ext_pblock(ex) + ee_len - 1;
2229	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2230		ext4_free_blocks(handle, inode, NULL,
2231				 EXT4_C2B(sbi, *partial_cluster),
2232				 sbi->s_cluster_ratio, flags);
2233		*partial_cluster = 0;
2234	}
2235
2236#ifdef EXTENTS_STATS
2237	{
2238		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2239		spin_lock(&sbi->s_ext_stats_lock);
2240		sbi->s_ext_blocks += ee_len;
2241		sbi->s_ext_extents++;
2242		if (ee_len < sbi->s_ext_min)
2243			sbi->s_ext_min = ee_len;
2244		if (ee_len > sbi->s_ext_max)
2245			sbi->s_ext_max = ee_len;
2246		if (ext_depth(inode) > sbi->s_depth_max)
2247			sbi->s_depth_max = ext_depth(inode);
2248		spin_unlock(&sbi->s_ext_stats_lock);
2249	}
2250#endif
2251	if (from >= le32_to_cpu(ex->ee_block)
2252	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2253		/* tail removal */
2254		ext4_lblk_t num;
2255
2256		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2257		pblk = ext4_ext_pblock(ex) + ee_len - num;
2258		ext_debug("free last %u blocks starting %llu\n", num, pblk);
2259		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2260		/*
2261		 * If the block range to be freed didn't start at the
2262		 * beginning of a cluster, and we removed the entire
2263		 * extent, save the partial cluster here, since we
2264		 * might need to free it if we determine that the
2265		 * truncate operation has removed all of the blocks in
2266		 * the cluster.
2267		 */
2268		if (pblk & (sbi->s_cluster_ratio - 1) &&
2269		    (ee_len == num))
2270			*partial_cluster = EXT4_B2C(sbi, pblk);
2271		else
2272			*partial_cluster = 0;
2273	} else if (from == le32_to_cpu(ex->ee_block)
2274		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2275		/* head removal */
2276		ext4_lblk_t num;
2277		ext4_fsblk_t start;
2278
2279		num = to - from;
2280		start = ext4_ext_pblock(ex);
2281
2282		ext_debug("free first %u blocks starting %llu\n", num, start);
2283		ext4_free_blocks(handle, inode, NULL, start, num, flags);
2284
2285	} else {
2286		printk(KERN_INFO "strange request: removal(2) "
2287				"%u-%u from %u:%u\n",
2288				from, to, le32_to_cpu(ex->ee_block), ee_len);
2289	}
2290	return 0;
2291}
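/*
 * Worked example of the bigalloc bookkeeping above, with assumed numbers:
 * s_cluster_ratio = 4 and an extent of 8 blocks at ee_block 100, pblk 1003.
 * Removing the whole extent (from 100 to 107) takes the tail-removal branch
 * with num = 8 and pblk = 1003.  Since 1003 & 3 != 0, the freed range does
 * not start on a cluster boundary, so *partial_cluster is set to
 * EXT4_B2C(sbi, 1003) = 250 and the first (possibly shared) cluster is left
 * for a later call or for the end of the truncate to free.
 */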
2292
2293
2294/*
2295 * ext4_ext_rm_leaf() removes the extents associated with the
2296 * blocks appearing between "start" and "end", and splits the extents
2297 * if "start" and "end" appear in the same extent.
2298 *
2299 * @handle: The journal handle
2300 * @inode:  The file's inode
2301 * @path:   The path to the leaf
2302 * @start:  The first block to remove
2303 * @end:   The last block to remove
2304 */
2305static int
2306ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2307		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
2308		 ext4_lblk_t start, ext4_lblk_t end)
2309{
2310	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2311	int err = 0, correct_index = 0;
2312	int depth = ext_depth(inode), credits;
2313	struct ext4_extent_header *eh;
2314	ext4_lblk_t a, b;
2315	unsigned num;
2316	ext4_lblk_t ex_ee_block;
2317	unsigned short ex_ee_len;
2318	unsigned uninitialized = 0;
2319	struct ext4_extent *ex;
2320
2321	/* the header must be checked already in ext4_ext_remove_space() */
2322	ext_debug("truncate since %u in leaf\n", start);
2323	if (!path[depth].p_hdr)
2324		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2325	eh = path[depth].p_hdr;
2326	if (unlikely(path[depth].p_hdr == NULL)) {
2327		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2328		return -EIO;
2329	}
2330	/* find where to start removing */
2331	ex = EXT_LAST_EXTENT(eh);
2332
2333	ex_ee_block = le32_to_cpu(ex->ee_block);
2334	ex_ee_len = ext4_ext_get_actual_len(ex);
2335
2336	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2337
2338	while (ex >= EXT_FIRST_EXTENT(eh) &&
2339			ex_ee_block + ex_ee_len > start) {
2340
2341		if (ext4_ext_is_uninitialized(ex))
2342			uninitialized = 1;
2343		else
2344			uninitialized = 0;
2345
2346		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2347			 uninitialized, ex_ee_len);
2348		path[depth].p_ext = ex;
2349
2350		a = ex_ee_block > start ? ex_ee_block : start;
2351		b = ex_ee_block+ex_ee_len - 1 < end ?
2352			ex_ee_block+ex_ee_len - 1 : end;
2353
2354		ext_debug("  border %u:%u\n", a, b);
2355
2356		/* If this extent is beyond the end of the hole, skip it */
2357		if (end <= ex_ee_block) {
2358			ex--;
2359			ex_ee_block = le32_to_cpu(ex->ee_block);
2360			ex_ee_len = ext4_ext_get_actual_len(ex);
2361			continue;
2362		} else if (b != ex_ee_block + ex_ee_len - 1) {
2363			EXT4_ERROR_INODE(inode, "  bad truncate %u:%u\n",
2364					 start, end);
2365			err = -EIO;
2366			goto out;
2367		} else if (a != ex_ee_block) {
2368			/* remove tail of the extent */
2369			num = a - ex_ee_block;
2370		} else {
2371			/* remove whole extent: excellent! */
2372			num = 0;
2373		}
2374		/*
2375		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2376		 * descriptor) for each block group; assume two block
2377		 * groups plus ex_ee_len/blocks_per_block_group for
2378		 * the worst case
2379		 */
2380		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2381		if (ex == EXT_FIRST_EXTENT(eh)) {
2382			correct_index = 1;
2383			credits += (ext_depth(inode)) + 1;
2384		}
2385		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2386
2387		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2388		if (err)
2389			goto out;
2390
2391		err = ext4_ext_get_access(handle, inode, path + depth);
2392		if (err)
2393			goto out;
2394
2395		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2396					 a, b);
2397		if (err)
2398			goto out;
2399
2400		if (num == 0)
2401			/* this extent is removed; mark slot entirely unused */
2402			ext4_ext_store_pblock(ex, 0);
2403
2404		ex->ee_len = cpu_to_le16(num);
2405		/*
2406		 * Do not mark uninitialized if all the blocks in the
2407		 * extent have been removed.
2408		 */
2409		if (uninitialized && num)
2410			ext4_ext_mark_uninitialized(ex);
2411		/*
2412		 * If the extent was completely released,
2413		 * we need to remove it from the leaf
2414		 */
2415		if (num == 0) {
2416			if (end != EXT_MAX_BLOCKS - 1) {
2417				/*
2418				 * For hole punching, we need to scoot all the
2419				 * extents up when an extent is removed so that
2420				 * we don't have blank extents in the middle
2421				 */
2422				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2423					sizeof(struct ext4_extent));
2424
2425				/* Now get rid of the one at the end */
2426				memset(EXT_LAST_EXTENT(eh), 0,
2427					sizeof(struct ext4_extent));
2428			}
2429			le16_add_cpu(&eh->eh_entries, -1);
2430		} else
2431			*partial_cluster = 0;
2432
2433		err = ext4_ext_dirty(handle, inode, path + depth);
2434		if (err)
2435			goto out;
2436
2437		ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2438				ext4_ext_pblock(ex));
2439		ex--;
2440		ex_ee_block = le32_to_cpu(ex->ee_block);
2441		ex_ee_len = ext4_ext_get_actual_len(ex);
2442	}
2443
2444	if (correct_index && eh->eh_entries)
2445		err = ext4_ext_correct_indexes(handle, inode, path);
2446
2447	/*
2448	 * If there is still an entry in the leaf node, check to see if
2449	 * it references the partial cluster.  This is the only place
2450	 * where it could; if it doesn't, we can free the cluster.
2451	 */
2452	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
2453	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2454	     *partial_cluster)) {
2455		int flags = EXT4_FREE_BLOCKS_FORGET;
2456
2457		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2458			flags |= EXT4_FREE_BLOCKS_METADATA;
2459
2460		ext4_free_blocks(handle, inode, NULL,
2461				 EXT4_C2B(sbi, *partial_cluster),
2462				 sbi->s_cluster_ratio, flags);
2463		*partial_cluster = 0;
2464	}
2465
2466	/* if this leaf is free, then we should
2467	 * remove it from index block above */
2468	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2469		err = ext4_ext_rm_idx(handle, inode, path + depth);
2470
2471out:
2472	return err;
2473}
2474
2475/*
2476 * ext4_ext_more_to_rm:
2477 * returns 1 if current index has to be freed (even partial)
2478 */
2479static int
2480ext4_ext_more_to_rm(struct ext4_ext_path *path)
2481{
2482	BUG_ON(path->p_idx == NULL);
2483
2484	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2485		return 0;
2486
2487	/*
2488	 * if truncate on deeper level happened, it wasn't partial,
2489	 * so we have to consider current index for truncation
2490	 */
2491	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2492		return 0;
2493	return 1;
2494}
2495
2496static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2497{
2498	struct super_block *sb = inode->i_sb;
2499	int depth = ext_depth(inode);
2500	struct ext4_ext_path *path;
2501	ext4_fsblk_t partial_cluster = 0;
2502	handle_t *handle;
2503	int i, err;
2504
2505	ext_debug("truncate since %u\n", start);
2506
2507	/* probably first extent we're gonna free will be last in block */
2508	handle = ext4_journal_start(inode, depth + 1);
2509	if (IS_ERR(handle))
2510		return PTR_ERR(handle);
2511
2512again:
2513	ext4_ext_invalidate_cache(inode);
2514
2515	trace_ext4_ext_remove_space(inode, start, depth);
2516
2517	/*
2518	 * We start scanning from right side, freeing all the blocks
2519	 * after i_size and walking into the tree depth-wise.
2520	 */
2521	depth = ext_depth(inode);
2522	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2523	if (path == NULL) {
2524		ext4_journal_stop(handle);
2525		return -ENOMEM;
2526	}
2527	path[0].p_depth = depth;
2528	path[0].p_hdr = ext_inode_hdr(inode);
2529	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2530		err = -EIO;
2531		goto out;
2532	}
2533	i = err = 0;
2534
2535	while (i >= 0 && err == 0) {
2536		if (i == depth) {
2537			/* this is leaf block */
2538			err = ext4_ext_rm_leaf(handle, inode, path,
2539					       &partial_cluster, start,
2540					       EXT_MAX_BLOCKS - 1);
2541			/* root level has p_bh == NULL, brelse() eats this */
2542			brelse(path[i].p_bh);
2543			path[i].p_bh = NULL;
2544			i--;
2545			continue;
2546		}
2547
2548		/* this is index block */
2549		if (!path[i].p_hdr) {
2550			ext_debug("initialize header\n");
2551			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2552		}
2553
2554		if (!path[i].p_idx) {
2555			/* this level hasn't been touched yet */
2556			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2557			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2558			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2559				  path[i].p_hdr,
2560				  le16_to_cpu(path[i].p_hdr->eh_entries));
2561		} else {
2562			/* we were already here, look at the next index */
2563			path[i].p_idx--;
2564		}
2565
2566		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2567				i, EXT_FIRST_INDEX(path[i].p_hdr),
2568				path[i].p_idx);
2569		if (ext4_ext_more_to_rm(path + i)) {
2570			struct buffer_head *bh;
2571			/* go to the next level */
2572			ext_debug("move to level %d (block %llu)\n",
2573				  i + 1, ext4_idx_pblock(path[i].p_idx));
2574			memset(path + i + 1, 0, sizeof(*path));
2575			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2576			if (!bh) {
2577				/* should we reset i_size? */
2578				err = -EIO;
2579				break;
2580			}
2581			if (WARN_ON(i + 1 > depth)) {
2582				err = -EIO;
2583				break;
2584			}
2585			if (ext4_ext_check(inode, ext_block_hdr(bh),
2586							depth - i - 1)) {
2587				err = -EIO;
2588				break;
2589			}
2590			path[i + 1].p_bh = bh;
2591
2592			/* save actual number of indexes since this
2593			 * number is changed at the next iteration */
2594			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2595			i++;
2596		} else {
2597			/* we finished processing this index, go up */
2598			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2599				/* index is empty, remove it;
2600				 * handle must already be prepared by
2601				 * ext4_ext_rm_leaf() */
2602				err = ext4_ext_rm_idx(handle, inode, path + i);
2603			}
2604			/* root level has p_bh == NULL, brelse() eats this */
2605			brelse(path[i].p_bh);
2606			path[i].p_bh = NULL;
2607			i--;
2608			ext_debug("return to level %d\n", i);
2609		}
2610	}
2611
2612	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
2613			path->p_hdr->eh_entries);
2614
2615	/* If we still have something in the partial cluster and we have removed
2616	 * even the first extent, then we should free the blocks in the partial
2617	 * cluster as well. */
2618	if (partial_cluster && path->p_hdr->eh_entries == 0) {
2619		int flags = EXT4_FREE_BLOCKS_FORGET;
2620
2621		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2622			flags |= EXT4_FREE_BLOCKS_METADATA;
2623
2624		ext4_free_blocks(handle, inode, NULL,
2625				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2626				 EXT4_SB(sb)->s_cluster_ratio, flags);
2627		partial_cluster = 0;
2628	}
2629
2630	/* TODO: flexible tree reduction should be here */
2631	if (path->p_hdr->eh_entries == 0) {
2632		/*
2633		 * truncate to zero freed all the tree,
2634		 * so we need to correct eh_depth
2635		 */
2636		err = ext4_ext_get_access(handle, inode, path);
2637		if (err == 0) {
2638			ext_inode_hdr(inode)->eh_depth = 0;
2639			ext_inode_hdr(inode)->eh_max =
2640				cpu_to_le16(ext4_ext_space_root(inode, 0));
2641			err = ext4_ext_dirty(handle, inode, path);
2642		}
2643	}
2644out:
2645	ext4_ext_drop_refs(path);
2646	kfree(path);
2647	if (err == -EAGAIN)
2648		goto again;
2649	ext4_journal_stop(handle);
2650
2651	return err;
2652}
2653
2654/*
2655 * called at mount time
2656 */
2657void ext4_ext_init(struct super_block *sb)
2658{
2659	/*
2660	 * possible initialization would be here
2661	 */
2662
2663	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2664#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2665		printk(KERN_INFO "EXT4-fs: file extents enabled");
2666#ifdef AGGRESSIVE_TEST
2667		printk(", aggressive tests");
2668#endif
2669#ifdef CHECK_BINSEARCH
2670		printk(", check binsearch");
2671#endif
2672#ifdef EXTENTS_STATS
2673		printk(", stats");
2674#endif
2675		printk("\n");
2676#endif
2677#ifdef EXTENTS_STATS
2678		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2679		EXT4_SB(sb)->s_ext_min = 1 << 30;
2680		EXT4_SB(sb)->s_ext_max = 0;
2681#endif
2682	}
2683}
2684
2685/*
2686 * called at umount time
2687 */
2688void ext4_ext_release(struct super_block *sb)
2689{
2690	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2691		return;
2692
2693#ifdef EXTENTS_STATS
2694	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2695		struct ext4_sb_info *sbi = EXT4_SB(sb);
2696		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2697			sbi->s_ext_blocks, sbi->s_ext_extents,
2698			sbi->s_ext_blocks / sbi->s_ext_extents);
2699		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2700			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2701	}
2702#endif
2703}
2704
2705/* FIXME!! we need to try to merge to left or right after zero-out  */
2706static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2707{
2708	ext4_fsblk_t ee_pblock;
2709	unsigned int ee_len;
2710	int ret;
2711
2712	ee_len    = ext4_ext_get_actual_len(ex);
2713	ee_pblock = ext4_ext_pblock(ex);
2714
2715	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2716	if (ret > 0)
2717		ret = 0;
2718
2719	return ret;
2720}
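/*
 * Example of what the helper above does, with assumed numbers: for an
 * uninitialized extent of 7 blocks at physical block 5000 it issues
 * sb_issue_zeroout(sb, 5000, 7, GFP_NOFS), and any positive return value
 * is normalized to 0, so callers only ever see 0 or a negative errno.
 */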
2721
2722/*
2723 * used by extent splitting.
2724 */
2725#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
2726					due to ENOSPC */
2727#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
2728#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
2729
2730/*
2731 * ext4_split_extent_at() splits an extent at given block.
2732 *
2733 * @handle: the journal handle
2734 * @inode: the file inode
2735 * @path: the path to the extent
2736 * @split: the logical block where the extent is split.
2737 * @split_flag: indicates if the extent could be zeroed out if the split
2738 *		 fails, and the states (init or uninit) of the new extents.
2739 * @flags: flags used to insert the new extent into the extent tree.
2740 *
2741 *
2742 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
2743 * states of which are determined by split_flag.
2744 *
2745 * There are two cases:
2746 *  a> the extent is split into two extents.
2747 *  b> no split is needed, and the extent is just marked.
2748 *
2749 * return 0 on success.
2750 */
2751static int ext4_split_extent_at(handle_t *handle,
2752			     struct inode *inode,
2753			     struct ext4_ext_path *path,
2754			     ext4_lblk_t split,
2755			     int split_flag,
2756			     int flags)
2757{
2758	ext4_fsblk_t newblock;
2759	ext4_lblk_t ee_block;
2760	struct ext4_extent *ex, newex, orig_ex;
2761	struct ext4_extent *ex2 = NULL;
2762	unsigned int ee_len, depth;
2763	int err = 0;
2764
2765	ext_debug("ext4_split_extent_at: inode %lu, logical"
2766		"block %llu\n", inode->i_ino, (unsigned long long)split);
2767
2768	ext4_ext_show_leaf(inode, path);
2769
2770	depth = ext_depth(inode);
2771	ex = path[depth].p_ext;
2772	ee_block = le32_to_cpu(ex->ee_block);
2773	ee_len = ext4_ext_get_actual_len(ex);
2774	newblock = split - ee_block + ext4_ext_pblock(ex);
2775
2776	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2777
2778	err = ext4_ext_get_access(handle, inode, path + depth);
2779	if (err)
2780		goto out;
2781
2782	if (split == ee_block) {
2783		/*
2784		 * case b: block @split is the block that the extent begins with,
2785		 * so we just change the state of the extent, and splitting
2786		 * is not needed.
2787		 */
2788		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2789			ext4_ext_mark_uninitialized(ex);
2790		else
2791			ext4_ext_mark_initialized(ex);
2792
2793		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2794			ext4_ext_try_to_merge(inode, path, ex);
2795
2796		err = ext4_ext_dirty(handle, inode, path + depth);
2797		goto out;
2798	}
2799
2800	/* case a */
2801	memcpy(&orig_ex, ex, sizeof(orig_ex));
2802	ex->ee_len = cpu_to_le16(split - ee_block);
2803	if (split_flag & EXT4_EXT_MARK_UNINIT1)
2804		ext4_ext_mark_uninitialized(ex);
2805
2806	/*
2807	 * path may lead to a new leaf, not to the original leaf any more,
2808	 * after ext4_ext_insert_extent() returns.
2809	 */
2810	err = ext4_ext_dirty(handle, inode, path + depth);
2811	if (err)
2812		goto fix_extent_len;
2813
2814	ex2 = &newex;
2815	ex2->ee_block = cpu_to_le32(split);
2816	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
2817	ext4_ext_store_pblock(ex2, newblock);
2818	if (split_flag & EXT4_EXT_MARK_UNINIT2)
2819		ext4_ext_mark_uninitialized(ex2);
2820
2821	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2822	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2823		err = ext4_ext_zeroout(inode, &orig_ex);
2824		if (err)
2825			goto fix_extent_len;
2826		/* update the extent length and mark as initialized */
2827		ex->ee_len = cpu_to_le16(ee_len);
2828		ext4_ext_try_to_merge(inode, path, ex);
2829		err = ext4_ext_dirty(handle, inode, path + depth);
2830		goto out;
2831	} else if (err)
2832		goto fix_extent_len;
2833
2834out:
2835	ext4_ext_show_leaf(inode, path);
2836	return err;
2837
2838fix_extent_len:
2839	ex->ee_len = orig_ex.ee_len;
2840	ext4_ext_dirty(handle, inode, path + depth);
2841	return err;
2842}
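/*
 * Worked example for ext4_split_extent_at(), with assumed numbers: an extent
 * with ee_block 200, length 50 and pblk 5000, split at logical block 230,
 * becomes [200, 229] (length 30, pblk 5000) plus a new extent [230, 249]
 * (length 20) whose pblk is 230 - 200 + 5000 = 5030.  If @split equals
 * ee_block (here 200), no split happens and only the init/uninit state of
 * the extent is changed.
 */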
2843
2844/*
2845 * ext4_split_extent() splits an extent and marks the extent covered
2846 * by @map as @split_flag indicates.
2847 *
2848 * It may result in splitting the extent into multiple extents (up to three).
2849 * There are three possibilities:
2850 *   a> There is no split required
2851 *   b> Splits in two extents: Split is happening at either end of the extent
2852 *   c> Splits in three extents: Someone is splitting in the middle of the extent
2853 *
2854 */
2855static int ext4_split_extent(handle_t *handle,
2856			      struct inode *inode,
2857			      struct ext4_ext_path *path,
2858			      struct ext4_map_blocks *map,
2859			      int split_flag,
2860			      int flags)
2861{
2862	ext4_lblk_t ee_block;
2863	struct ext4_extent *ex;
2864	unsigned int ee_len, depth;
2865	int err = 0;
2866	int uninitialized;
2867	int split_flag1, flags1;
2868
2869	depth = ext_depth(inode);
2870	ex = path[depth].p_ext;
2871	ee_block = le32_to_cpu(ex->ee_block);
2872	ee_len = ext4_ext_get_actual_len(ex);
2873	uninitialized = ext4_ext_is_uninitialized(ex);
2874
2875	if (map->m_lblk + map->m_len < ee_block + ee_len) {
2876		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2877			      EXT4_EXT_MAY_ZEROOUT : 0;
2878		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2879		if (uninitialized)
2880			split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2881				       EXT4_EXT_MARK_UNINIT2;
2882		err = ext4_split_extent_at(handle, inode, path,
2883				map->m_lblk + map->m_len, split_flag1, flags1);
2884		if (err)
2885			goto out;
2886	}
2887
2888	ext4_ext_drop_refs(path);
2889	path = ext4_ext_find_extent(inode, map->m_lblk, path);
2890	if (IS_ERR(path))
2891		return PTR_ERR(path);
2892
2893	if (map->m_lblk >= ee_block) {
2894		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2895			      EXT4_EXT_MAY_ZEROOUT : 0;
2896		if (uninitialized)
2897			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2898		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2899			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2900		err = ext4_split_extent_at(handle, inode, path,
2901				map->m_lblk, split_flag1, flags);
2902		if (err)
2903			goto out;
2904	}
2905
2906	ext4_ext_show_leaf(inode, path);
2907out:
2908	return err ? err : map->m_len;
2909}
2910
2911#define EXT4_EXT_ZERO_LEN 7
2912/*
2913 * This function is called by ext4_ext_map_blocks() if someone tries to write
2914 * to an uninitialized extent. It may result in splitting the uninitialized
2915 * extent into multiple extents (up to three - one initialized and two
2916 * uninitialized).
2917 * There are three possibilities:
2918 *   a> There is no split required: Entire extent should be initialized
2919 *   b> Splits in two extents: Write is happening at either end of the extent
2920 *   c> Splits in three extents: Someone is writing in the middle of the extent
2921 */
2922static int ext4_ext_convert_to_initialized(handle_t *handle,
2923					   struct inode *inode,
2924					   struct ext4_map_blocks *map,
2925					   struct ext4_ext_path *path)
2926{
2927	struct ext4_map_blocks split_map;
2928	struct ext4_extent zero_ex;
2929	struct ext4_extent *ex;
2930	ext4_lblk_t ee_block, eof_block;
2931	unsigned int ee_len, depth;
2932	int allocated;
2933	int err = 0;
2934	int split_flag = 0;
2935
2936	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2937		"block %llu, max_blocks %u\n", inode->i_ino,
2938		(unsigned long long)map->m_lblk, map->m_len);
2939
2940	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2941		inode->i_sb->s_blocksize_bits;
2942	if (eof_block < map->m_lblk + map->m_len)
2943		eof_block = map->m_lblk + map->m_len;
2944
2945	depth = ext_depth(inode);
2946	ex = path[depth].p_ext;
2947	ee_block = le32_to_cpu(ex->ee_block);
2948	ee_len = ext4_ext_get_actual_len(ex);
2949	allocated = ee_len - (map->m_lblk - ee_block);
2950
2951	WARN_ON(map->m_lblk < ee_block);
2952	/*
2953	 * It is safe to convert extent to initialized via explicit
2954	 * zeroout only if the extent is fully inside i_size or new_size.
2955	 */
2956	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2957
2958	/* If the extent has fewer than 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2959	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2960	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2961		err = ext4_ext_zeroout(inode, ex);
2962		if (err)
2963			goto out;
2964
2965		err = ext4_ext_get_access(handle, inode, path + depth);
2966		if (err)
2967			goto out;
2968		ext4_ext_mark_initialized(ex);
2969		ext4_ext_try_to_merge(inode, path, ex);
2970		err = ext4_ext_dirty(handle, inode, path + depth);
2971		goto out;
2972	}
2973
2974	/*
2975	 * four cases:
2976	 * 1. split the extent into three extents.
2977	 * 2. split the extent into two extents, zeroout the first half.
2978	 * 3. split the extent into two extents, zeroout the second half.
2979	 * 4. split the extent into two extents without zeroing out.
2980	 */
2981	split_map.m_lblk = map->m_lblk;
2982	split_map.m_len = map->m_len;
2983
2984	if (allocated > map->m_len) {
2985		if (allocated <= EXT4_EXT_ZERO_LEN &&
2986		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2987			/* case 3 */
2988			zero_ex.ee_block =
2989					 cpu_to_le32(map->m_lblk);
2990			zero_ex.ee_len = cpu_to_le16(allocated);
2991			ext4_ext_store_pblock(&zero_ex,
2992				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
2993			err = ext4_ext_zeroout(inode, &zero_ex);
2994			if (err)
2995				goto out;
2996			split_map.m_lblk = map->m_lblk;
2997			split_map.m_len = allocated;
2998		} else if ((map->m_lblk - ee_block + map->m_len <
2999			   EXT4_EXT_ZERO_LEN) &&
3000			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3001			/* case 2 */
3002			if (map->m_lblk != ee_block) {
3003				zero_ex.ee_block = ex->ee_block;
3004				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3005							ee_block);
3006				ext4_ext_store_pblock(&zero_ex,
3007						      ext4_ext_pblock(ex));
3008				err = ext4_ext_zeroout(inode, &zero_ex);
3009				if (err)
3010					goto out;
3011			}
3012
3013			split_map.m_lblk = ee_block;
3014			split_map.m_len = map->m_lblk - ee_block + map->m_len;
3015			allocated = map->m_len;
3016		}
3017	}
3018
3019	allocated = ext4_split_extent(handle, inode, path,
3020				       &split_map, split_flag, 0);
3021	if (allocated < 0)
3022		err = allocated;
3023
3024out:
3025	return err ? err : allocated;
3026}
3027
3028/*
3029 * This function is called by ext4_ext_map_blocks() from
3030 * ext4_get_blocks_dio_write() when DIO is used to write
3031 * to an uninitialized extent.
3032 *
3033 * Writing to an uninitialized extent may result in splitting the uninitialized
3034 * extent into multiple initialized/uninitialized extents (up to three).
3035 * There are three possibilities:
3036 *   a> There is no split required: Entire extent should be uninitialized
3037 *   b> Splits in two extents: Write is happening at either end of the extent
3038 *   c> Splits in three extents: Someone is writing in the middle of the extent
3039 *
3040 * One or more index blocks may be needed if the extent tree grows after
3041 * the uninitialized extent is split. To prevent ENOSPC from occurring at IO
3042 * completion time, we need to split the uninitialized extent before DIO
3043 * submits the IO. The uninitialized extent found at this time will be split
3044 * into three uninitialized extents (at most). After IO completes, the part
3045 * that was filled will be converted to initialized by the end_io callback
3046 * via ext4_convert_unwritten_extents().
3047 *
3048 * Returns the size of uninitialized extent to be written on success.
3049 */
3050static int ext4_split_unwritten_extents(handle_t *handle,
3051					struct inode *inode,
3052					struct ext4_map_blocks *map,
3053					struct ext4_ext_path *path,
3054					int flags)
3055{
3056	ext4_lblk_t eof_block;
3057	ext4_lblk_t ee_block;
3058	struct ext4_extent *ex;
3059	unsigned int ee_len;
3060	int split_flag = 0, depth;
3061
3062	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
3063		"block %llu, max_blocks %u\n", inode->i_ino,
3064		(unsigned long long)map->m_lblk, map->m_len);
3065
3066	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3067		inode->i_sb->s_blocksize_bits;
3068	if (eof_block < map->m_lblk + map->m_len)
3069		eof_block = map->m_lblk + map->m_len;
3070	/*
3071	 * It is safe to convert extent to initialized via explicit
3072	 * zeroout only if the extent is fully inside i_size or new_size.
3073	 */
3074	depth = ext_depth(inode);
3075	ex = path[depth].p_ext;
3076	ee_block = le32_to_cpu(ex->ee_block);
3077	ee_len = ext4_ext_get_actual_len(ex);
3078
3079	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3080	split_flag |= EXT4_EXT_MARK_UNINIT2;
3081
3082	flags |= EXT4_GET_BLOCKS_PRE_IO;
3083	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3084}
3085
3086static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3087					      struct inode *inode,
3088					      struct ext4_ext_path *path)
3089{
3090	struct ext4_extent *ex;
3091	int depth;
3092	int err = 0;
3093
3094	depth = ext_depth(inode);
3095	ex = path[depth].p_ext;
3096
3097	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3098		"block %llu, max_blocks %u\n", inode->i_ino,
3099		(unsigned long long)le32_to_cpu(ex->ee_block),
3100		ext4_ext_get_actual_len(ex));
3101
3102	err = ext4_ext_get_access(handle, inode, path + depth);
3103	if (err)
3104		goto out;
3105	/* first mark the extent as initialized */
3106	ext4_ext_mark_initialized(ex);
3107
3108	/* note: ext4_ext_correct_indexes() isn't needed here because
3109	 * borders are not changed
3110	 */
3111	ext4_ext_try_to_merge(inode, path, ex);
3112
3113	/* Mark modified extent as dirty */
3114	err = ext4_ext_dirty(handle, inode, path + depth);
3115out:
3116	ext4_ext_show_leaf(inode, path);
3117	return err;
3118}
3119
3120static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3121			sector_t block, int count)
3122{
3123	int i;
3124	for (i = 0; i < count; i++)
3125		unmap_underlying_metadata(bdev, block + i);
3126}
3127
3128/*
3129 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3130 */
3131static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3132			      ext4_lblk_t lblk,
3133			      struct ext4_ext_path *path,
3134			      unsigned int len)
3135{
3136	int i, depth;
3137	struct ext4_extent_header *eh;
3138	struct ext4_extent *last_ex;
3139
3140	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3141		return 0;
3142
3143	depth = ext_depth(inode);
3144	eh = path[depth].p_hdr;
3145
3146	if (unlikely(!eh->eh_entries)) {
3147		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3148				 "EOFBLOCKS_FL set");
3149		return -EIO;
3150	}
3151	last_ex = EXT_LAST_EXTENT(eh);
3152	/*
3153	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3154	 * last block in the last extent in the file.  We test this by
3155	 * first checking to see if the caller to
3156	 * ext4_ext_get_blocks() was interested in the last block (or
3157	 * a block beyond the last block) in the current extent.  If
3158	 * this turns out to be false, we can bail out from this
3159	 * function immediately.
3160	 */
3161	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3162	    ext4_ext_get_actual_len(last_ex))
3163		return 0;
3164	/*
3165	 * If the caller does appear to be planning to write at or
3166	 * beyond the end of the current extent, we then test to see
3167	 * if the current extent is the last extent in the file, by
3168	 * checking to make sure it was reached via the rightmost node
3169	 * at each level of the tree.
3170	 */
3171	for (i = depth-1; i >= 0; i--)
3172		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3173			return 0;
3174	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3175	return ext4_mark_inode_dirty(handle, inode);
3176}
3177
3178/**
3179 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3180 *
3181 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
3182 * whether there are any buffers marked for delayed allocation. It returns '1'
3183 * on the first delalloc'ed buffer head found. If no buffer head in the given
3184 * range is marked for delalloc, it returns 0.
3185 * lblk_start should always be <= lblk_end.
3186 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3187 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3188 * block sooner). This is useful when blocks are truncated sequentially from
3189 * lblk_start towards lblk_end.
3190 */
3191static int ext4_find_delalloc_range(struct inode *inode,
3192				    ext4_lblk_t lblk_start,
3193				    ext4_lblk_t lblk_end,
3194				    int search_hint_reverse)
3195{
3196	struct address_space *mapping = inode->i_mapping;
3197	struct buffer_head *head, *bh = NULL;
3198	struct page *page;
3199	ext4_lblk_t i, pg_lblk;
3200	pgoff_t index;
3201
3202	/* reverse search won't work if fs block size is less than page size */
3203	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3204		search_hint_reverse = 0;
3205
3206	if (search_hint_reverse)
3207		i = lblk_end;
3208	else
3209		i = lblk_start;
3210
3211	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3212
3213	while ((i >= lblk_start) && (i <= lblk_end)) {
3214		page = find_get_page(mapping, index);
3215		if (!page)
3216			goto nextpage;
3217
3218		if (!page_has_buffers(page))
3219			goto nextpage;
3220
3221		head = page_buffers(page);
3222		if (!head)
3223			goto nextpage;
3224
3225		bh = head;
3226		pg_lblk = index << (PAGE_CACHE_SHIFT -
3227						inode->i_blkbits);
3228		do {
3229			if (unlikely(pg_lblk < lblk_start)) {
3230				/*
3231				 * This is possible when fs block size is less
3232				 * than page size and our cluster starts/ends in
3233				 * the middle of the page. So we need to skip the
3234				 * initial few blocks until we reach lblk_start.
3235				 */
3236				pg_lblk++;
3237				continue;
3238			}
3239
3240			/* Check if the buffer is delayed allocated and that it
3241			 * is not yet mapped. (when da-buffers are mapped during
3242			 * their writeout, their da_mapped bit is set.)
3243			 */
3244			if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3245				page_cache_release(page);
3246				trace_ext4_find_delalloc_range(inode,
3247						lblk_start, lblk_end,
3248						search_hint_reverse,
3249						1, i);
3250				return 1;
3251			}
3252			if (search_hint_reverse)
3253				i--;
3254			else
3255				i++;
3256		} while ((i >= lblk_start) && (i <= lblk_end) &&
3257				((bh = bh->b_this_page) != head));
3258nextpage:
3259		if (page)
3260			page_cache_release(page);
3261		/*
3262		 * Move to next page. 'i' will be the first lblk in the next
3263		 * page.
3264		 */
3265		if (search_hint_reverse)
3266			index--;
3267		else
3268			index++;
3269		i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3270	}
3271
3272	trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3273					search_hint_reverse, 0, 0);
3274	return 0;
3275}
3276
3277int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
3278			       int search_hint_reverse)
3279{
3280	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3281	ext4_lblk_t lblk_start, lblk_end;
3282	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3283	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3284
3285	return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3286					search_hint_reverse);
3287}
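/*
 * Example of the rounding done above, with an assumed cluster ratio of 16:
 * for lblk 37, lblk_start = 37 & ~15 = 32 and lblk_end = 47, so the whole
 * cluster [32, 47] is scanned for delayed-allocated buffers.
 */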
3288
3289/**
3290 * Determines how many complete clusters (out of those specified by the 'map')
3291 * are under delalloc and had quota reserved for them.
3292 * This function is called when we are writing out the blocks that were
3293 * originally written with their allocation delayed, but then the space was
3294 * allocated using fallocate() before the delayed allocation could be resolved.
3295 * The cases to look for are:
3296 * ('=' indicates delayed allocated blocks
3297 *  '-' indicates non-delayed allocated blocks)
3298 * (a) partial clusters towards beginning and/or end outside of allocated range
3299 *     are not delalloc'ed.
3300 *	Ex:
3301 *	|----c---=|====c====|====c====|===-c----|
3302 *	         |++++++ allocated ++++++|
3303 *	==> 4 complete clusters in above example
3304 *
3305 * (b) partial cluster (outside of allocated range) towards either end is
3306 *     marked for delayed allocation. In this case, we will exclude that
3307 *     cluster.
3308 *	Ex:
3309 *	|----====c========|========c========|
3310 *	     |++++++ allocated ++++++|
3311 *	==> 1 complete cluster in above example
3312 *
3313 *	Ex:
3314 *	|================c================|
3315 *            |++++++ allocated ++++++|
3316 *	==> 0 complete clusters in above example
3317 *
3318 * ext4_da_update_reserve_space() will be called only if we
3319 * determine here that there were some "entire" clusters that span
3320 * this 'allocated' range.
3321 * In the non-bigalloc case, this function will just end up returning num_blks
3322 * without ever calling ext4_find_delalloc_range.
3323 */
3324static unsigned int
3325get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3326			   unsigned int num_blks)
3327{
3328	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3329	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3330	ext4_lblk_t lblk_from, lblk_to, c_offset;
3331	unsigned int allocated_clusters = 0;
3332
3333	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3334	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3335
3336	/* max possible clusters for this allocation */
3337	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3338
3339	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3340
3341	/* Check towards left side */
3342	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3343	if (c_offset) {
3344		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3345		lblk_to = lblk_from + c_offset - 1;
3346
3347		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3348			allocated_clusters--;
3349	}
3350
3351	/* Now check towards right. */
3352	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3353	if (allocated_clusters && c_offset) {
3354		lblk_from = lblk_start + num_blks;
3355		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3356
3357		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3358			allocated_clusters--;
3359	}
3360
3361	return allocated_clusters;
3362}
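/*
 * Worked example for the cluster accounting above, with assumed numbers:
 * s_cluster_ratio = 4, lblk_start = 5, num_blks = 13 (blocks 5..17).
 * alloc_cluster_start = 1 and alloc_cluster_end = 4, so at most 4 clusters
 * are counted.  The left partial cluster makes us probe block 4, and the
 * right partial cluster makes us probe blocks 18..19; each probed range
 * found to be delalloc'ed drops the count by one, matching case (b) in the
 * comment above.
 */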
3363
3364static int
3365ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3366			struct ext4_map_blocks *map,
3367			struct ext4_ext_path *path, int flags,
3368			unsigned int allocated, ext4_fsblk_t newblock)
3369{
3370	int ret = 0;
3371	int err = 0;
3372	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3373
3374	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3375		  "block %llu, max_blocks %u, flags %d, allocated %u",
3376		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3377		  flags, allocated);
3378	ext4_ext_show_leaf(inode, path);
3379
3380	trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
3381						    newblock);
3382
3383	/* get_block() before submit the IO, split the extent */
3384	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3385		ret = ext4_split_unwritten_extents(handle, inode, map,
3386						   path, flags);
3387		/*
3388		 * Flag the inode (non-AIO case) or the end_io struct (AIO case)
3389		 * so that this IO is converted to written when it is
3390		 * completed
3391		 */
3392		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3393			io->flag = EXT4_IO_END_UNWRITTEN;
3394			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3395		} else
3396			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3397		if (ext4_should_dioread_nolock(inode))
3398			map->m_flags |= EXT4_MAP_UNINIT;
3399		goto out;
3400	}
3401	/* IO end_io complete, convert the filled extent to written */
3402	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3403		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3404							path);
3405		if (ret >= 0) {
3406			ext4_update_inode_fsync_trans(handle, inode, 1);
3407			err = check_eofblocks_fl(handle, inode, map->m_lblk,
3408						 path, map->m_len);
3409		} else
3410			err = ret;
3411		goto out2;
3412	}
3413	/* buffered IO case */
3414	/*
3415	 * repeated fallocate creation request:
3416	 * we already have an unwritten extent
3417	 */
3418	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3419		goto map_out;
3420
3421	/* buffered READ or buffered write_begin() lookup */
3422	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3423		/*
3424		 * We have blocks reserved already.  We
3425		 * return allocated blocks so that delalloc
3426		 * won't do block reservation for us.  But
3427		 * the buffer head will be unmapped so that
3428		 * a read from the block returns 0s.
3429		 */
3430		map->m_flags |= EXT4_MAP_UNWRITTEN;
3431		goto out1;
3432	}
3433
3434	/* buffered write, writepage time, convert */
3435	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3436	if (ret >= 0)
3437		ext4_update_inode_fsync_trans(handle, inode, 1);
3438out:
3439	if (ret <= 0) {
3440		err = ret;
3441		goto out2;
3442	} else
3443		allocated = ret;
3444	map->m_flags |= EXT4_MAP_NEW;
3445	/*
3446	 * if we allocated more blocks than requested
3447	 * we need to make sure we unmap the extra blocks
3448	 * allocated. The actually needed blocks will get
3449	 * unmapped later when we find the buffer_head marked
3450	 * new.
3451	 */
3452	if (allocated > map->m_len) {
3453		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3454					newblock + map->m_len,
3455					allocated - map->m_len);
3456		allocated = map->m_len;
3457	}
3458
3459	/*
3460	 * If we have done fallocate at an offset that is already
3461	 * delayed allocated, we would have block reservation
3462	 * and quota reservation done in the delayed write path.
3463	 * But fallocate would have already updated the quota and block
3464	 * count for this offset. So cancel these reservations.
3465	 */
3466	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3467		unsigned int reserved_clusters;
3468		reserved_clusters = get_reserved_cluster_alloc(inode,
3469				map->m_lblk, map->m_len);
3470		if (reserved_clusters)
3471			ext4_da_update_reserve_space(inode,
3472						     reserved_clusters,
3473						     0);
3474	}
3475
3476map_out:
3477	map->m_flags |= EXT4_MAP_MAPPED;
3478	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
3479		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3480					 map->m_len);
3481		if (err < 0)
3482			goto out2;
3483	}
3484out1:
3485	if (allocated > map->m_len)
3486		allocated = map->m_len;
3487	ext4_ext_show_leaf(inode, path);
3488	map->m_pblk = newblock;
3489	map->m_len = allocated;
3490out2:
3491	if (path) {
3492		ext4_ext_drop_refs(path);
3493		kfree(path);
3494	}
3495	return err ? err : allocated;
3496}
3497
3498/*
3499 * get_implied_cluster_alloc - check to see if the requested
3500 * allocation (in the map structure) overlaps with a cluster already
3501 * allocated in an extent.
3502 *	@sb	The filesystem superblock structure
3503 *	@map	The requested lblk->pblk mapping
3504 *	@ex	The extent structure which might contain an implied
3505 *			cluster allocation
3506 *
3507 * This function is called by ext4_ext_map_blocks() after we failed to
3508 * find blocks that were already in the inode's extent tree.  Hence,
3509 * we know that the beginning of the requested region cannot overlap
3510 * the extent from the inode's extent tree.  There are three cases we
3511 * want to catch.  The first is this case:
3512 *
3513 *		 |--- cluster # N--|
3514 *    |--- extent ---|	|---- requested region ---|
3515 *			|==========|
3516 *
3517 * The second case that we need to test for is this one:
3518 *
3519 *   |--------- cluster # N ----------------|
3520 *	   |--- requested region --|   |------- extent ----|
3521 *	   |=======================|
3522 *
3523 * The third case is when the requested region lies between two extents
3524 * within the same cluster:
3525 *          |------------- cluster # N-------------|
3526 * |----- ex -----|                  |---- ex_right ----|
3527 *                  |------ requested region ------|
3528 *                  |================|
3529 *
3530 * In each of the above cases, we need to set the map->m_pblk and
3531 * map->m_len so they correspond to the extent labelled as
3532 * "|====|" from cluster #N, since it is already in use for data in
3533 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
3534 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3535 * as a new "allocated" block region.  Otherwise, we will return 0 and
3536 * ext4_ext_map_blocks() will then allocate one or more new clusters
3537 * by calling ext4_mb_new_blocks().
3538 */
3539static int get_implied_cluster_alloc(struct super_block *sb,
3540				     struct ext4_map_blocks *map,
3541				     struct ext4_extent *ex,
3542				     struct ext4_ext_path *path)
3543{
3544	struct ext4_sb_info *sbi = EXT4_SB(sb);
3545	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3546	ext4_lblk_t ex_cluster_start, ex_cluster_end;
3547	ext4_lblk_t rr_cluster_start, rr_cluster_end;
3548	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3549	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3550	unsigned short ee_len = ext4_ext_get_actual_len(ex);
3551
3552	/* The extent passed in that we are trying to match */
3553	ex_cluster_start = EXT4_B2C(sbi, ee_block);
3554	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3555
3556	/* The requested region passed into ext4_map_blocks() */
3557	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3558	rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
3559
3560	if ((rr_cluster_start == ex_cluster_end) ||
3561	    (rr_cluster_start == ex_cluster_start)) {
3562		if (rr_cluster_start == ex_cluster_end)
3563			ee_start += ee_len - 1;
3564		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3565			c_offset;
3566		map->m_len = min(map->m_len,
3567				 (unsigned) sbi->s_cluster_ratio - c_offset);
3568		/*
3569		 * Check for and handle this case:
3570		 *
3571		 *   |--------- cluster # N-------------|
3572		 *		       |------- extent ----|
3573		 *	   |--- requested region ---|
3574		 *	   |===========|
3575		 */
3576
3577		if (map->m_lblk < ee_block)
3578			map->m_len = min(map->m_len, ee_block - map->m_lblk);
3579
3580		/*
3581		 * Check for the case where there is already another allocated
3582		 * block to the right of 'ex' but before the end of the cluster.
3583		 *
3584		 *          |------------- cluster # N-------------|
3585		 * |----- ex -----|                  |---- ex_right ----|
3586		 *                  |------ requested region ------|
3587		 *                  |================|
3588		 */
3589		if (map->m_lblk > ee_block) {
3590			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3591			map->m_len = min(map->m_len, next - map->m_lblk);
3592		}
3593
3594		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3595		return 1;
3596	}
3597
3598	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3599	return 0;
3600}
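/*
 * Worked example with assumed numbers and a cluster ratio of 16: an extent
 * maps logical blocks 96..103 to physical blocks 4096..4103, and the
 * requested region starts at logical block 105 with m_len 20.  Both fall in
 * cluster #6, so the function reports an implied mapping of
 * m_pblk = (4103 & ~15) + (105 & 15) = 4105 and trims m_len to at most
 * 16 - 9 = 7 blocks (and no further than the next allocated block), then
 * returns 1.
 */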
3601
3602
3603/*
3604 * Block allocation/map/preallocation routine for extents based files
3605 *
3606 *
3607 * Need to be called with
3608 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3609 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3610 *
3611 * return > 0, number of blocks already mapped/allocated
3612 *          if create == 0 and these are pre-allocated blocks
3613 *          	buffer head is unmapped
3614 *          otherwise blocks are mapped
3615 *
3616 * return = 0, if plain look up failed (blocks have not been allocated)
3617 *          buffer head is unmapped
3618 *
3619 * return < 0, error case.
3620 */
3621int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3622			struct ext4_map_blocks *map, int flags)
3623{
3624	struct ext4_ext_path *path = NULL;
3625	struct ext4_extent newex, *ex, *ex2;
3626	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3627	ext4_fsblk_t newblock = 0;
3628	int free_on_err = 0, err = 0, depth, ret;
3629	unsigned int allocated = 0, offset = 0;
3630	unsigned int allocated_clusters = 0, reserved_clusters = 0;
3631	unsigned int punched_out = 0;
3632	unsigned int result = 0;
3633	struct ext4_allocation_request ar;
3634	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3635	ext4_lblk_t cluster_offset;
3636	struct ext4_map_blocks punch_map;
3637
3638	ext_debug("blocks %u/%u requested for inode %lu\n",
3639		  map->m_lblk, map->m_len, inode->i_ino);
3640	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3641
3642	/* check in cache */
3643	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3644		ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3645		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3646			if ((sbi->s_cluster_ratio > 1) &&
3647			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3648				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3649
3650			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3651				/*
3652				 * block isn't allocated yet and
3653				 * user doesn't want to allocate it
3654				 */
3655				goto out2;
3656			}
3657			/* we should allocate requested block */
3658		} else {
3659			/* block is already allocated */
3660			if (sbi->s_cluster_ratio > 1)
3661				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3662			newblock = map->m_lblk
3663				   - le32_to_cpu(newex.ee_block)
3664				   + ext4_ext_pblock(&newex);
3665			/* number of remaining blocks in the extent */
3666			allocated = ext4_ext_get_actual_len(&newex) -
3667				(map->m_lblk - le32_to_cpu(newex.ee_block));
3668			goto out;
3669		}
3670	}
3671
3672	/* find extent for this block */
3673	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3674	if (IS_ERR(path)) {
3675		err = PTR_ERR(path);
3676		path = NULL;
3677		goto out2;
3678	}
3679
3680	depth = ext_depth(inode);
3681
3682	/*
3683	 * a consistent leaf must not be empty;
3684	 * this situation is possible, though, _during_ tree modification;
3685	 * this is why the assert can't be put in ext4_ext_find_extent()
3686	 */
3687	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3688		EXT4_ERROR_INODE(inode, "bad extent address "
3689				 "lblock: %lu, depth: %d pblock %lld",
3690				 (unsigned long) map->m_lblk, depth,
3691				 path[depth].p_block);
3692		err = -EIO;
3693		goto out2;
3694	}
3695
3696	ex = path[depth].p_ext;
3697	if (ex) {
3698		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3699		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3700		unsigned short ee_len;
3701
3702		/*
3703		 * Uninitialized extents are treated as holes, except that
3704		 * we split out initialized portions during a write.
3705		 */
3706		ee_len = ext4_ext_get_actual_len(ex);
3707
3708		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
3709
3710		/* if found extent covers block, simply return it */
3711		if (in_range(map->m_lblk, ee_block, ee_len)) {
3712			ext4_fsblk_t partial_cluster = 0;
3713
3714			newblock = map->m_lblk - ee_block + ee_start;
3715			/* number of remaining blocks in the extent */
3716			allocated = ee_len - (map->m_lblk - ee_block);
3717			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3718				  ee_block, ee_len, newblock);
3719
3720			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3721				/*
3722				 * Do not put uninitialized extent
3723				 * in the cache
3724				 */
3725				if (!ext4_ext_is_uninitialized(ex)) {
3726					ext4_ext_put_in_cache(inode, ee_block,
3727						ee_len, ee_start);
3728					goto out;
3729				}
3730				ret = ext4_ext_handle_uninitialized_extents(
3731					handle, inode, map, path, flags,
3732					allocated, newblock);
3733				return ret;
3734			}
3735
3736			/*
3737			 * Punch out the map length, but only to the
3738			 * end of the extent
3739			 */
3740			punched_out = allocated < map->m_len ?
3741				allocated : map->m_len;
3742
3743			/*
3744			 * Since extents need to be converted to
3745			 * uninitialized, they must fit in an
3746			 * uninitialized extent
3747			 */
3748			if (punched_out > EXT_UNINIT_MAX_LEN)
3749				punched_out = EXT_UNINIT_MAX_LEN;
3750
3751			punch_map.m_lblk = map->m_lblk;
3752			punch_map.m_pblk = newblock;
3753			punch_map.m_len = punched_out;
3754			punch_map.m_flags = 0;
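			/*
			 * For illustration: with an extent covering logical
			 * blocks 100-149 and a punch request starting at
			 * block 120, allocated is 30, so punched_out is at
			 * most 30 blocks; the caller (ext4_ext_punch_hole())
			 * loops over the rest of the requested range.
			 */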
3755
3756			/* Check to see if the extent needs to be split */
3757			if (punch_map.m_len != ee_len ||
3758				punch_map.m_lblk != ee_block) {
3759
3760				ret = ext4_split_extent(handle, inode,
3761				path, &punch_map, 0,
3762				EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3763				EXT4_GET_BLOCKS_PRE_IO);
3764
3765				if (ret < 0) {
3766					err = ret;
3767					goto out2;
3768				}
3769				/*
3770				 * find extent for the block at
3771				 * the start of the hole
3772				 */
3773				ext4_ext_drop_refs(path);
3774				kfree(path);
3775
3776				path = ext4_ext_find_extent(inode,
3777				map->m_lblk, NULL);
3778				if (IS_ERR(path)) {
3779					err = PTR_ERR(path);
3780					path = NULL;
3781					goto out2;
3782				}
3783
3784				depth = ext_depth(inode);
3785				ex = path[depth].p_ext;
3786				ee_len = ext4_ext_get_actual_len(ex);
3787				ee_block = le32_to_cpu(ex->ee_block);
3788				ee_start = ext4_ext_pblock(ex);
3789
3790			}
3791
3792			ext4_ext_mark_uninitialized(ex);
3793
3794			ext4_ext_invalidate_cache(inode);
3795
3796			err = ext4_ext_rm_leaf(handle, inode, path,
3797					       &partial_cluster, map->m_lblk,
3798					       map->m_lblk + punched_out);
3799
3800			if (!err && path->p_hdr->eh_entries == 0) {
3801				/*
3802				 * Punch hole freed all of this sub tree,
3803				 * so we need to correct eh_depth
3804				 */
3805				err = ext4_ext_get_access(handle, inode, path);
3806				if (err == 0) {
3807					ext_inode_hdr(inode)->eh_depth = 0;
3808					ext_inode_hdr(inode)->eh_max =
3809					cpu_to_le16(ext4_ext_space_root(
3810						inode, 0));
3811
3812					err = ext4_ext_dirty(
3813						handle, inode, path);
3814				}
3815			}
3816
3817			goto out2;
3818		}
3819	}
3820
3821	if ((sbi->s_cluster_ratio > 1) &&
3822	    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3823		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3824
3825	/*
3826	 * requested block isn't allocated yet;
3827	 * we must not try to create a block if the create flag is zero
3828	 */
3829	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3830		/*
3831		 * put just found gap into cache to speed up
3832		 * subsequent requests
3833		 */
3834		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3835		goto out2;
3836	}
3837
3838	/*
3839	 * Okay, we need to do block allocation.
3840	 */
3841	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
3842	newex.ee_block = cpu_to_le32(map->m_lblk);
3843	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
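	/*
	 * Note: s_cluster_ratio is a power of two, so the mask above is
	 * m_lblk modulo the cluster size.  For example, with a cluster
	 * ratio of 16 and map->m_lblk == 35, cluster_offset is 35 & 15 == 3,
	 * i.e. the request starts 3 blocks into its cluster.
	 */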
3844
3845	/*
3846	 * If we are doing bigalloc, check to see if the extent returned
3847	 * by ext4_ext_find_extent() implies a cluster we can use.
3848	 */
3849	if (cluster_offset && ex &&
3850	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
3851		ar.len = allocated = map->m_len;
3852		newblock = map->m_pblk;
3853		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3854		goto got_allocated_blocks;
3855	}
3856
3857	/* find neighbour allocated blocks */
3858	ar.lleft = map->m_lblk;
3859	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3860	if (err)
3861		goto out2;
3862	ar.lright = map->m_lblk;
3863	ex2 = NULL;
3864	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
3865	if (err)
3866		goto out2;
3867
3868	/* Check if the extent after searching to the right implies a
3869	 * cluster we can use. */
3870	if ((sbi->s_cluster_ratio > 1) && ex2 &&
3871	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
3872		ar.len = allocated = map->m_len;
3873		newblock = map->m_pblk;
3874		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3875		goto got_allocated_blocks;
3876	}
3877
3878	/*
3879	 * See if request is beyond maximum number of blocks we can have in
3880	 * a single extent. For an initialized extent this limit is
3881	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3882	 * EXT_UNINIT_MAX_LEN.
3883	 */
3884	if (map->m_len > EXT_INIT_MAX_LEN &&
3885	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3886		map->m_len = EXT_INIT_MAX_LEN;
3887	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3888		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3889		map->m_len = EXT_UNINIT_MAX_LEN;
3890
3891	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3892	newex.ee_len = cpu_to_le16(map->m_len);
3893	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
3894	if (err)
3895		allocated = ext4_ext_get_actual_len(&newex);
3896	else
3897		allocated = map->m_len;
3898
3899	/* allocate new block */
3900	ar.inode = inode;
3901	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3902	ar.logical = map->m_lblk;
3903	/*
3904	 * We calculate the offset from the beginning of the cluster
3905	 * for the logical block number, since when we allocate a
3906	 * physical cluster, the physical block should start at the
3907	 * same offset from the beginning of the cluster.  This is
3908	 * needed so that future calls to get_implied_cluster_alloc()
3909	 * work correctly.
3910	 */
3911	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
3912	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
3913	ar.goal -= offset;
3914	ar.logical -= offset;
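	/*
	 * For example, assuming a cluster ratio of 4: for map->m_lblk == 10
	 * and allocated == 5, offset is 2, ar.len is EXT4_NUM_B2C(sbi, 7) == 2
	 * clusters, and ar.logical is moved back to block 8, so the
	 * allocation request is aligned to the start of its cluster.
	 */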
3915	if (S_ISREG(inode->i_mode))
3916		ar.flags = EXT4_MB_HINT_DATA;
3917	else
3918		/* disable in-core preallocation for non-regular files */
3919		ar.flags = 0;
3920	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3921		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3922	newblock = ext4_mb_new_blocks(handle, &ar, &err);
3923	if (!newblock)
3924		goto out2;
3925	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3926		  ar.goal, newblock, allocated);
3927	free_on_err = 1;
3928	allocated_clusters = ar.len;
3929	ar.len = EXT4_C2B(sbi, ar.len) - offset;
3930	if (ar.len > allocated)
3931		ar.len = allocated;
3932
3933got_allocated_blocks:
3934	/* try to insert new extent into found leaf and return */
3935	ext4_ext_store_pblock(&newex, newblock + offset);
3936	newex.ee_len = cpu_to_le16(ar.len);
3937	/* Mark uninitialized */
3938	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
3939		ext4_ext_mark_uninitialized(&newex);
3940		/*
3941		 * An io_end structure is created for every IO write to an
3942		 * uninitialized extent. To avoid unnecessary conversion,
3943		 * here we flag the IO that really needs the conversion.
3944		 * For the non-async direct IO case, flag the inode state
3945		 * that we need to perform conversion when IO is done.
3946		 */
3947		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3948			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3949				io->flag = EXT4_IO_END_UNWRITTEN;
3950				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3951			} else
3952				ext4_set_inode_state(inode,
3953						     EXT4_STATE_DIO_UNWRITTEN);
3954		}
3955		if (ext4_should_dioread_nolock(inode))
3956			map->m_flags |= EXT4_MAP_UNINIT;
3957	}
3958
3959	err = 0;
3960	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
3961		err = check_eofblocks_fl(handle, inode, map->m_lblk,
3962					 path, ar.len);
3963	if (!err)
3964		err = ext4_ext_insert_extent(handle, inode, path,
3965					     &newex, flags);
3966	if (err && free_on_err) {
3967		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
3968			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
3969		/* free data blocks we just allocated */
3970		/* not a good idea to call discard here directly,
3971		 * but otherwise we'd need to call it on every free() */
3972		ext4_discard_preallocations(inode);
3973		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3974				 ext4_ext_get_actual_len(&newex), fb_flags);
3975		goto out2;
3976	}
3977
3978	/* previous routine could use block we allocated */
3979	newblock = ext4_ext_pblock(&newex);
3980	allocated = ext4_ext_get_actual_len(&newex);
3981	if (allocated > map->m_len)
3982		allocated = map->m_len;
3983	map->m_flags |= EXT4_MAP_NEW;
3984
3985	/*
3986	 * Update reserved blocks/metadata blocks after successful
3987	 * block allocation which had been deferred till now.
3988	 */
3989	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3990		/*
3991		 * Check how many clusters we had reserved for this allocated range.
3992		 */
3993		reserved_clusters = get_reserved_cluster_alloc(inode,
3994						map->m_lblk, allocated);
3995		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
3996			if (reserved_clusters) {
3997				/*
3998				 * We have clusters reserved for this range.
3999				 * But since we are not doing actual allocation
4000				 * and are simply using blocks from previously
4001				 * allocated cluster, we should release the
4002				 * reservation and not claim quota.
4003				 */
4004				ext4_da_update_reserve_space(inode,
4005						reserved_clusters, 0);
4006			}
4007		} else {
4008			BUG_ON(allocated_clusters < reserved_clusters);
4009			/* We will claim quota for all newly allocated blocks.*/
4010			ext4_da_update_reserve_space(inode, allocated_clusters,
4011							1);
4012			if (reserved_clusters < allocated_clusters) {
4013				struct ext4_inode_info *ei = EXT4_I(inode);
4014				int reservation = allocated_clusters -
4015						  reserved_clusters;
4016				/*
4017				 * It seems we claimed a few clusters outside of
4018				 * the range of this allocation. We should give
4019				 * them back to the reservation pool. This can
4020				 * happen in the following case:
4021				 *
4022				 * * Suppose s_cluster_ratio is 4 (i.e., each
4023				 *   cluster has 4 blocks). Thus, the clusters
4024				 *   are [0-3],[4-7],[8-11]...
4025				 * * First comes a delayed allocation write for
4026				 *   logical blocks 10 & 11. Since there were no
4027				 *   previous delayed allocated blocks in the
4028				 *   range [8-11], we would reserve 1 cluster
4029				 *   for this write.
4030				 * * Next comes a write for logical blocks 3 to 8.
4031				 *   In this case, we will reserve 2 clusters
4032				 *   (for [0-3] and [4-7]; and not for [8-11], as
4033				 *   that range already has delayed allocated blocks).
4034				 *   The total reserved clusters now becomes 3.
4035				 * * Now, during the delayed allocation writeout
4036				 *   time, we will first write blocks [3-8] and
4037				 *   allocate 3 clusters for writing these
4038				 *   blocks. Also, we would claim all three
4039				 *   of these clusters.
4040				 * * Now when we come here to write out the
4041				 *   blocks [10-11], we would expect to claim
4042				 *   the reservation of 1 cluster we had made
4043				 *   (and we would claim it since there are no
4044				 *   more delayed allocated blocks in the range
4045				 *   [8-11]). But our reserved cluster count had
4046				 *   already gone to 0.
4047				 *
4048				 *   Thus, at step 4 above, when we determine
4049				 *   that there are still some unwritten delayed
4050				 *   allocated blocks outside of our current
4051				 *   block range, we should increment the
4052				 *   reserved clusters count so that when the
4053				 *   remaining blocks finally get written, we
4054				 *   can claim them.
4055				 */
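				/*
				 * In the example above, this adjustment happens
				 * at the writeout of blocks [3-8]: three clusters
				 * are allocated there, but the reservation made
				 * for blocks [10-11] must survive, so one cluster
				 * of reservation is re-added to the pool here.
				 */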
4056				dquot_reserve_block(inode,
4057						EXT4_C2B(sbi, reservation));
4058				spin_lock(&ei->i_block_reservation_lock);
4059				ei->i_reserved_data_blocks += reservation;
4060				spin_unlock(&ei->i_block_reservation_lock);
4061			}
4062		}
4063	}
4064
4065	/*
4066	 * Cache the extent and update transaction to commit on fdatasync only
4067	 * when it is _not_ an uninitialized extent.
4068	 */
4069	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
4070		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
4071		ext4_update_inode_fsync_trans(handle, inode, 1);
4072	} else
4073		ext4_update_inode_fsync_trans(handle, inode, 0);
4074out:
4075	if (allocated > map->m_len)
4076		allocated = map->m_len;
4077	ext4_ext_show_leaf(inode, path);
4078	map->m_flags |= EXT4_MAP_MAPPED;
4079	map->m_pblk = newblock;
4080	map->m_len = allocated;
4081out2:
4082	if (path) {
4083		ext4_ext_drop_refs(path);
4084		kfree(path);
4085	}
4086	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4087		newblock, map->m_len, err ? err : allocated);
4088
4089	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
4090			punched_out : allocated;
4091
4092	return err ? err : result;
4093}
4094
4095void ext4_ext_truncate(struct inode *inode)
4096{
4097	struct address_space *mapping = inode->i_mapping;
4098	struct super_block *sb = inode->i_sb;
4099	ext4_lblk_t last_block;
4100	handle_t *handle;
4101	loff_t page_len;
4102	int err = 0;
4103
4104	/*
4105	 * finish any pending end_io work so we won't run the risk of
4106	 * converting any truncated blocks to initialized later
4107	 */
4108	ext4_flush_completed_IO(inode);
4109
4110	/*
4111	 * probably the first extent we're going to free will be the last in the block
4112	 */
4113	err = ext4_writepage_trans_blocks(inode);
4114	handle = ext4_journal_start(inode, err);
4115	if (IS_ERR(handle))
4116		return;
4117
4118	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
4119		page_len = PAGE_CACHE_SIZE -
4120			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4121
4122		err = ext4_discard_partial_page_buffers(handle,
4123			mapping, inode->i_size, page_len, 0);
4124
4125		if (err)
4126			goto out_stop;
4127	}
4128
4129	if (ext4_orphan_add(handle, inode))
4130		goto out_stop;
4131
4132	down_write(&EXT4_I(inode)->i_data_sem);
4133	ext4_ext_invalidate_cache(inode);
4134
4135	ext4_discard_preallocations(inode);
4136
4137	/*
4138	 * TODO: optimization is possible here.
4139	 * Probably we need not scan at all,
4140	 * because page truncation is enough.
4141	 */
4142
4143	/* we have to know where to truncate from in crash case */
4144	EXT4_I(inode)->i_disksize = inode->i_size;
4145	ext4_mark_inode_dirty(handle, inode);
4146
4147	last_block = (inode->i_size + sb->s_blocksize - 1)
4148			>> EXT4_BLOCK_SIZE_BITS(sb);
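	/*
	 * For example, with a 4096-byte block size and i_size == 10000,
	 * last_block is 3, so logical blocks 0-2 (which still hold data
	 * within i_size) are kept and everything from block 3 on is removed.
	 */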
4149	err = ext4_ext_remove_space(inode, last_block);
4150
4151	/* In a multi-transaction truncate, we only make the final
4152	 * transaction synchronous.
4153	 */
4154	if (IS_SYNC(inode))
4155		ext4_handle_sync(handle);
4156
4157	up_write(&EXT4_I(inode)->i_data_sem);
4158
4159out_stop:
4160	/*
4161	 * If this was a simple ftruncate() and the file will remain alive,
4162	 * then we need to clear up the orphan record which we created above.
4163	 * However, if this was a real unlink then we were called by
4164	 * ext4_delete_inode(), and we allow that function to clean up the
4165	 * orphan info for us.
4166	 */
4167	if (inode->i_nlink)
4168		ext4_orphan_del(handle, inode);
4169
4170	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4171	ext4_mark_inode_dirty(handle, inode);
4172	ext4_journal_stop(handle);
4173}
4174
4175static void ext4_falloc_update_inode(struct inode *inode,
4176				int mode, loff_t new_size, int update_ctime)
4177{
4178	struct timespec now;
4179
4180	if (update_ctime) {
4181		now = current_fs_time(inode->i_sb);
4182		if (!timespec_equal(&inode->i_ctime, &now))
4183			inode->i_ctime = now;
4184	}
4185	/*
4186	 * Update only when preallocation was requested beyond
4187	 * the file size.
4188	 */
4189	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
4190		if (new_size > i_size_read(inode))
4191			i_size_write(inode, new_size);
4192		if (new_size > EXT4_I(inode)->i_disksize)
4193			ext4_update_i_disksize(inode, new_size);
4194	} else {
4195		/*
4196		 * Mark that we allocate beyond EOF so the subsequent truncate
4197		 * can proceed even if the new size is the same as i_size.
4198		 */
4199		if (new_size > i_size_read(inode))
4200			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4201	}
4202
4203}
4204
4205/*
4206 * Preallocate space for a file. This implements ext4's fallocate file
4207 * operation, which is called from the sys_fallocate system call.
4208 * For block-mapped files, posix_fallocate should fall back to the method
4209 * of writing zeroes to the required new blocks (the same behavior which is
4210 * expected of file systems that do not support the fallocate() system call).
4211 */
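/*
 * For example, a minimal userspace trigger for this path (assuming an
 * ext4 file opened read-write) would be:
 *
 *	fd = open("/mnt/ext4/file", O_RDWR);
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 1024 * 1024);
 *
 * which preallocates 16 MiB of uninitialized extents without changing
 * i_size; see ext4_falloc_update_inode() above.
 */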
4212long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4213{
4214	struct inode *inode = file->f_path.dentry->d_inode;
4215	handle_t *handle;
4216	loff_t new_size;
4217	unsigned int max_blocks;
4218	int ret = 0;
4219	int ret2 = 0;
4220	int retries = 0;
4221	int flags;
4222	struct ext4_map_blocks map;
4223	unsigned int credits, blkbits = inode->i_blkbits;
4224
4225	/*
4226	 * currently supporting (pre)allocate mode for extent-based
4227	 * files _only_
4228	 */
4229	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4230		return -EOPNOTSUPP;
4231
4232	/* Return error if mode is not supported */
4233	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
4234		return -EOPNOTSUPP;
4235
4236	if (mode & FALLOC_FL_PUNCH_HOLE)
4237		return ext4_punch_hole(file, offset, len);
4238
4239	trace_ext4_fallocate_enter(inode, offset, len, mode);
4240	map.m_lblk = offset >> blkbits;
4241	/*
4242	 * We can't just convert len to max_blocks: e.g., if blocksize = 4096,
4243	 * offset = 3072 and len = 2048, the range spans two blocks but len >> blkbits == 0
4244	 */
4245	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4246		- map.m_lblk;
4247	/*
4248	 * credits to insert 1 extent into extent tree
4249	 */
4250	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4251	mutex_lock(&inode->i_mutex);
4252	ret = inode_newsize_ok(inode, (len + offset));
4253	if (ret) {
4254		mutex_unlock(&inode->i_mutex);
4255		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4256		return ret;
4257	}
4258	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
4259		EXT4_GET_BLOCKS_NO_NORMALIZE;
4260	if (mode & FALLOC_FL_KEEP_SIZE)
4261		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4262retry:
4263	while (ret >= 0 && ret < max_blocks) {
4264		map.m_lblk = map.m_lblk + ret;
4265		map.m_len = max_blocks = max_blocks - ret;
4266		handle = ext4_journal_start(inode, credits);
4267		if (IS_ERR(handle)) {
4268			ret = PTR_ERR(handle);
4269			break;
4270		}
4271		ret = ext4_map_blocks(handle, inode, &map, flags);
4272		if (ret <= 0) {
4273#ifdef EXT4FS_DEBUG
4274			WARN_ON(ret <= 0);
4275			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4276				    "returned error inode#%lu, block=%u, "
4277				    "max_blocks=%u", __func__,
4278				    inode->i_ino, map.m_lblk, max_blocks);
4279#endif
4280			ext4_mark_inode_dirty(handle, inode);
4281			ret2 = ext4_journal_stop(handle);
4282			break;
4283		}
4284		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4285						blkbits) >> blkbits))
4286			new_size = offset + len;
4287		else
4288			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4289
4290		ext4_falloc_update_inode(inode, mode, new_size,
4291					 (map.m_flags & EXT4_MAP_NEW));
4292		ext4_mark_inode_dirty(handle, inode);
4293		ret2 = ext4_journal_stop(handle);
4294		if (ret2)
4295			break;
4296	}
4297	if (ret == -ENOSPC &&
4298			ext4_should_retry_alloc(inode->i_sb, &retries)) {
4299		ret = 0;
4300		goto retry;
4301	}
4302	mutex_unlock(&inode->i_mutex);
4303	trace_ext4_fallocate_exit(inode, offset, max_blocks,
4304				ret > 0 ? ret2 : ret);
4305	return ret > 0 ? ret2 : ret;
4306}
4307
4308/*
4309 * This function converts a range of blocks to written extents.
4310 * The caller of this function will pass the start offset and the size.
4311 * All unwritten extents within this range will be converted to
4312 * written extents.
4313 *
4314 * This function is called from the direct IO end io callback
4315 * function, to convert the fallocated extents after IO is completed.
4316 * Returns 0 on success.
4317 */
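/*
 * For example, when an async direct IO write of 1 MiB at offset 0
 * completes on a 4 KiB block file system, the end_io path calls this
 * with offset = 0 and len = 1048576, so max_blocks is 256 and the loop
 * below converts the unwritten extents covering those blocks, one
 * transaction per ext4_map_blocks() call.
 */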
4318int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4319				    ssize_t len)
4320{
4321	handle_t *handle;
4322	unsigned int max_blocks;
4323	int ret = 0;
4324	int ret2 = 0;
4325	struct ext4_map_blocks map;
4326	unsigned int credits, blkbits = inode->i_blkbits;
4327
4328	map.m_lblk = offset >> blkbits;
4329	/*
4330	 * We can't just convert len to max_blocks: e.g., if blocksize = 4096,
4331	 * offset = 3072 and len = 2048, the range spans two blocks but len >> blkbits == 0
4332	 */
4333	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
4334		      map.m_lblk);
4335	/*
4336	 * credits to insert 1 extent into extent tree
4337	 */
4338	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4339	while (ret >= 0 && ret < max_blocks) {
4340		map.m_lblk += ret;
4341		map.m_len = (max_blocks -= ret);
4342		handle = ext4_journal_start(inode, credits);
4343		if (IS_ERR(handle)) {
4344			ret = PTR_ERR(handle);
4345			break;
4346		}
4347		ret = ext4_map_blocks(handle, inode, &map,
4348				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4349		if (ret <= 0) {
4350			WARN_ON(ret <= 0);
4351			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4352				    "returned error inode#%lu, block=%u, "
4353				    "max_blocks=%u", __func__,
4354				    inode->i_ino, map.m_lblk, map.m_len);
4355		}
4356		ext4_mark_inode_dirty(handle, inode);
4357		ret2 = ext4_journal_stop(handle);
4358		if (ret <= 0 || ret2)
4359			break;
4360	}
4361	return ret > 0 ? ret2 : ret;
4362}
4363
4364/*
4365 * Callback function called for each extent to gather FIEMAP information.
4366 */
4367static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
4368		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
4369		       void *data)
4370{
4371	__u64	logical;
4372	__u64	physical;
4373	__u64	length;
4374	__u32	flags = 0;
4375	int		ret = 0;
4376	struct fiemap_extent_info *fieinfo = data;
4377	unsigned char blksize_bits;
4378
4379	blksize_bits = inode->i_sb->s_blocksize_bits;
4380	logical = (__u64)newex->ec_block << blksize_bits;
4381
4382	if (newex->ec_start == 0) {
4383		/*
4384		 * No extent in the extent tree contains block @newex->ec_start,
4385		 * so the block may lie in 1) a hole or 2) a delayed extent.
4386		 *
4387		 * Holes or delayed extents are processed as follows.
4388		 * 1. Look up dirty pages with the specified range in the pagecache.
4389		 *    If no page is found, there is no delayed extent and we
4390		 *    return with EXT_CONTINUE.
4391		 * 2. Find the 1st mapped buffer.
4392		 * 3. Check if the mapped buffer is both in the request range
4393		 *    and a delayed buffer. If not, there is no delayed extent,
4394		 *    so return.
4395		 * 4. A delayed extent is found; the extent will be collected.
4396		 */
4397		ext4_lblk_t	end = 0;
4398		pgoff_t		last_offset;
4399		pgoff_t		offset;
4400		pgoff_t		index;
4401		pgoff_t		start_index = 0;
4402		struct page	**pages = NULL;
4403		struct buffer_head *bh = NULL;
4404		struct buffer_head *head = NULL;
4405		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
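		/*
		 * For example, on a 64-bit system with 4 KiB pages this
		 * gives room for 512 page pointers per find_get_pages_tag()
		 * batch below.
		 */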
4406
4407		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4408		if (pages == NULL)
4409			return -ENOMEM;
4410
4411		offset = logical >> PAGE_SHIFT;
4412repeat:
4413		last_offset = offset;
4414		head = NULL;
4415		ret = find_get_pages_tag(inode->i_mapping, &offset,
4416					PAGECACHE_TAG_DIRTY, nr_pages, pages);
4417
4418		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4419			/* First time, try to find a mapped buffer. */
4420			if (ret == 0) {
4421out:
4422				for (index = 0; index < ret; index++)
4423					page_cache_release(pages[index]);
4424				/* just a hole. */
4425				kfree(pages);
4426				return EXT_CONTINUE;
4427			}
4428			index = 0;
4429
4430next_page:
4431			/* Try to find the 1st mapped buffer. */
4432			end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
4433				  blksize_bits;
4434			if (!page_has_buffers(pages[index]))
4435				goto out;
4436			head = page_buffers(pages[index]);
4437			if (!head)
4438				goto out;
4439
4440			index++;
4441			bh = head;
4442			do {
4443				if (end >= newex->ec_block +
4444					newex->ec_len)
4445					/* The buffer is out of
4446					 * the request range.
4447					 */
4448					goto out;
4449
4450				if (buffer_mapped(bh) &&
4451				    end >= newex->ec_block) {
4452					start_index = index - 1;
4453					/* get the 1st mapped buffer. */
4454					goto found_mapped_buffer;
4455				}
4456
4457				bh = bh->b_this_page;
4458				end++;
4459			} while (bh != head);
4460
4461			/* No mapped buffer in the range was found in this page;
4462			 * we need to look up the next page.
4463			 */
4464			if (index >= ret) {
4465				/* There is no page left, but we need to limit
4466				 * newex->ec_len.
4467				 */
4468				newex->ec_len = end - newex->ec_block;
4469				goto out;
4470			}
4471			goto next_page;
4472		} else {
4473			/* Find contiguous delayed buffers. */
4474			if (ret > 0 && pages[0]->index == last_offset)
4475				head = page_buffers(pages[0]);
4476			bh = head;
4477			index = 1;
4478			start_index = 0;
4479		}
4480
4481found_mapped_buffer:
4482		if (bh != NULL && buffer_delay(bh)) {
4483			/* 1st or contiguous delayed buffer found. */
4484			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4485				/*
4486				 * 1st delayed buffer found, record
4487				 * the start of extent.
4488				 */
4489				flags |= FIEMAP_EXTENT_DELALLOC;
4490				newex->ec_block = end;
4491				logical = (__u64)end << blksize_bits;
4492			}
4493			/* Find contiguous delayed buffers. */
4494			do {
4495				if (!buffer_delay(bh))
4496					goto found_delayed_extent;
4497				bh = bh->b_this_page;
4498				end++;
4499			} while (bh != head);
4500
4501			for (; index < ret; index++) {
4502				if (!page_has_buffers(pages[index])) {
4503					bh = NULL;
4504					break;
4505				}
4506				head = page_buffers(pages[index]);
4507				if (!head) {
4508					bh = NULL;
4509					break;
4510				}
4511
4512				if (pages[index]->index !=
4513				    pages[start_index]->index + index
4514				    - start_index) {
4515					/* Blocks are not contiguous. */
4516					bh = NULL;
4517					break;
4518				}
4519				bh = head;
4520				do {
4521					if (!buffer_delay(bh))
4522						/* Delayed-extent ends. */
4523						goto found_delayed_extent;
4524					bh = bh->b_this_page;
4525					end++;
4526				} while (bh != head);
4527			}
4528		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4529			/* a hole found. */
4530			goto out;
4531
4532found_delayed_extent:
4533		newex->ec_len = min(end - newex->ec_block,
4534						(ext4_lblk_t)EXT_INIT_MAX_LEN);
4535		if (ret == nr_pages && bh != NULL &&
4536			newex->ec_len < EXT_INIT_MAX_LEN &&
4537			buffer_delay(bh)) {
4538			/* The extent is not fully collected yet; continue scanning. */
4539			for (index = 0; index < ret; index++)
4540				page_cache_release(pages[index]);
4541			goto repeat;
4542		}
4543
4544		for (index = 0; index < ret; index++)
4545			page_cache_release(pages[index]);
4546		kfree(pages);
4547	}
4548
4549	physical = (__u64)newex->ec_start << blksize_bits;
4550	length =   (__u64)newex->ec_len << blksize_bits;
4551
4552	if (ex && ext4_ext_is_uninitialized(ex))
4553		flags |= FIEMAP_EXTENT_UNWRITTEN;
4554
4555	if (next == EXT_MAX_BLOCKS)
4556		flags |= FIEMAP_EXTENT_LAST;
4557
4558	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4559					length, flags);
4560	if (ret < 0)
4561		return ret;
4562	if (ret == 1)
4563		return EXT_BREAK;
4564	return EXT_CONTINUE;
4565}
4566/* fiemap flags we can handle are specified here */
4567#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4568
4569static int ext4_xattr_fiemap(struct inode *inode,
4570				struct fiemap_extent_info *fieinfo)
4571{
4572	__u64 physical = 0;
4573	__u64 length;
4574	__u32 flags = FIEMAP_EXTENT_LAST;
4575	int blockbits = inode->i_sb->s_blocksize_bits;
4576	int error = 0;
4577
4578	/* in-inode? */
4579	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4580		struct ext4_iloc iloc;
4581		int offset;	/* offset of xattr in inode */
4582
4583		error = ext4_get_inode_loc(inode, &iloc);
4584		if (error)
4585			return error;
4586		physical = iloc.bh->b_blocknr << blockbits;
4587		offset = EXT4_GOOD_OLD_INODE_SIZE +
4588				EXT4_I(inode)->i_extra_isize;
4589		physical += offset;
4590		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
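		/*
		 * For example, with 256-byte on-disk inodes and
		 * i_extra_isize == 32, offset is 128 + 32 == 160, so the
		 * in-inode xattr area reported here is 96 bytes long.
		 */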
4591		flags |= FIEMAP_EXTENT_DATA_INLINE;
4592		brelse(iloc.bh);
4593	} else { /* external block */
4594		physical = EXT4_I(inode)->i_file_acl << blockbits;
4595		length = inode->i_sb->s_blocksize;
4596	}
4597
4598	if (physical)
4599		error = fiemap_fill_next_extent(fieinfo, 0, physical,
4600						length, flags);
4601	return (error < 0 ? error : 0);
4602}
4603
4604/*
4605 * ext4_ext_punch_hole
4606 *
4607 * Punches a hole of "length" bytes in a file starting
4608 * at byte "offset".
4609 *
4610 * @file:   The file to punch a hole in
4611 * @offset: The starting byte offset of the hole
4612 * @length: The length of the hole
4613 *
4614 * Returns 0 on success or a negative error code on failure
4615 */
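/*
 * For example, userspace reaches this via ext4_fallocate() with
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  offset, length);
 *
 * since the VFS requires FALLOC_FL_KEEP_SIZE to be set together with
 * FALLOC_FL_PUNCH_HOLE.
 */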
4616int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4617{
4618	struct inode *inode = file->f_path.dentry->d_inode;
4619	struct super_block *sb = inode->i_sb;
4620	struct ext4_ext_cache cache_ex;
4621	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4622	struct address_space *mapping = inode->i_mapping;
4623	struct ext4_map_blocks map;
4624	handle_t *handle;
4625	loff_t first_page, last_page, page_len;
4626	loff_t first_page_offset, last_page_offset;
4627	int ret, credits, blocks_released, err = 0;
4628
4629	/* No need to punch hole beyond i_size */
4630	if (offset >= inode->i_size)
4631		return 0;
4632
4633	/*
4634	 * If the hole extends beyond i_size, set the hole
4635	 * to end after the page that contains i_size
4636	 */
4637	if (offset + length > inode->i_size) {
4638		length = inode->i_size +
4639		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4640		   offset;
4641	}
4642
4643	first_block = (offset + sb->s_blocksize - 1) >>
4644		EXT4_BLOCK_SIZE_BITS(sb);
4645	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
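	/*
	 * Only whole blocks inside the hole are removed: first_block rounds
	 * up and last_block rounds down.  For example, with a 4096-byte
	 * block size, offset == 1000 and length == 10000, the hole covers
	 * bytes [1000, 11000), so first_block is 1, last_block is 2, and
	 * only block 1 is punched out; the partial pages around the hole
	 * are zeroed below instead.
	 */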
4646
4647	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4648	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4649
4650	first_page_offset = first_page << PAGE_CACHE_SHIFT;
4651	last_page_offset = last_page << PAGE_CACHE_SHIFT;
4652
4653	/*
4654	 * Write out all dirty pages to avoid race conditions,
4655	 * then release them.
4656	 */
4657	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4658		err = filemap_write_and_wait_range(mapping,
4659			offset, offset + length - 1);
4660
4661		if (err)
4662			return err;
4663	}
4664
4665	/* Now release the pages */
4666	if (last_page_offset > first_page_offset) {
4667		truncate_inode_pages_range(mapping, first_page_offset,
4668					   last_page_offset-1);
4669	}
4670
4671	/* finish any pending end_io work */
4672	ext4_flush_completed_IO(inode);
4673
4674	credits = ext4_writepage_trans_blocks(inode);
4675	handle = ext4_journal_start(inode, credits);
4676	if (IS_ERR(handle))
4677		return PTR_ERR(handle);
4678
4679	err = ext4_orphan_add(handle, inode);
4680	if (err)
4681		goto out;
4682
4683	/*
4684	 * Now we need to zero out the non-page-aligned data in the
4685	 * pages at the start and tail of the hole, and unmap the buffer
4686	 * heads for the block aligned regions of the page that were
4687	 * completely zeroed.
4688	 */
4689	if (first_page > last_page) {
4690		/*
4691		 * If the file space being truncated is contained within a page
4692		 * just zero out and unmap the middle of that page
4693		 */
4694		err = ext4_discard_partial_page_buffers(handle,
4695			mapping, offset, length, 0);
4696
4697		if (err)
4698			goto out;
4699	} else {
4700		/*
4701		 * zero out and unmap the partial page that contains
4702		 * the start of the hole
4703		 */
4704		page_len  = first_page_offset - offset;
4705		if (page_len > 0) {
4706			err = ext4_discard_partial_page_buffers(handle, mapping,
4707						   offset, page_len, 0);
4708			if (err)
4709				goto out;
4710		}
4711
4712		/*
4713		 * zero out and unmap the partial page that contains
4714		 * the end of the hole
4715		 */
4716		page_len = offset + length - last_page_offset;
4717		if (page_len > 0) {
4718			err = ext4_discard_partial_page_buffers(handle, mapping,
4719					last_page_offset, page_len, 0);
4720			if (err)
4721				goto out;
4722		}
4723	}
4724
4725
4726	/*
4727	 * If i_size is contained in the last page, we need to
4728	 * unmap and zero the partial page after i_size
4729	 */
4730	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4731	   inode->i_size % PAGE_CACHE_SIZE != 0) {
4732
4733		page_len = PAGE_CACHE_SIZE -
4734			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4735
4736		if (page_len > 0) {
4737			err = ext4_discard_partial_page_buffers(handle,
4738			  mapping, inode->i_size, page_len, 0);
4739
4740			if (err)
4741				goto out;
4742		}
4743	}
4744
4745	/* If there are no blocks to remove, return now */
4746	if (first_block >= last_block)
4747		goto out;
4748
4749	down_write(&EXT4_I(inode)->i_data_sem);
4750	ext4_ext_invalidate_cache(inode);
4751	ext4_discard_preallocations(inode);
4752
4753	/*
4754	 * Loop over all the blocks and identify blocks
4755	 * that need to be punched out
4756	 */
4757	iblock = first_block;
4758	blocks_released = 0;
4759	while (iblock < last_block) {
4760		max_blocks = last_block - iblock;
4761		num_blocks = 1;
4762		memset(&map, 0, sizeof(map));
4763		map.m_lblk = iblock;
4764		map.m_len = max_blocks;
4765		ret = ext4_ext_map_blocks(handle, inode, &map,
4766			EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4767
4768		if (ret > 0) {
4769			blocks_released += ret;
4770			num_blocks = ret;
4771		} else if (ret == 0) {
4772			/*
4773			 * If map blocks could not find the block,
4774			 * then it is in a hole.  If the hole was
4775			 * not already cached, then map blocks should
4776			 * put it in the cache.  So we can get the hole
4777			 * out of the cache.
4778			 */
4779			memset(&cache_ex, 0, sizeof(cache_ex));
4780			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4781				!cache_ex.ec_start) {
4782
4783				/* The hole is cached */
4784				num_blocks = cache_ex.ec_block +
4785				cache_ex.ec_len - iblock;
4786
4787			} else {
4788				/* The block could not be identified */
4789				err = -EIO;
4790				break;
4791			}
4792		} else {
4793			/* Map blocks error */
4794			err = ret;
4795			break;
4796		}
4797
4798		if (num_blocks == 0) {
4799			/* This condition should never happen */
4800			ext_debug("Block lookup failed");
4801			err = -EIO;
4802			break;
4803		}
4804
4805		iblock += num_blocks;
4806	}
4807
4808	if (blocks_released > 0) {
4809		ext4_ext_invalidate_cache(inode);
4810		ext4_discard_preallocations(inode);
4811	}
4812
4813	if (IS_SYNC(inode))
4814		ext4_handle_sync(handle);
4815
4816	up_write(&EXT4_I(inode)->i_data_sem);
4817
4818out:
4819	ext4_orphan_del(handle, inode);
4820	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4821	ext4_mark_inode_dirty(handle, inode);
4822	ext4_journal_stop(handle);
4823	return err;
4824}
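
/*
 * For example, userspace can exercise this via the FIEMAP ioctl,
 * roughly (error handling omitted):
 *
 *	struct fiemap fm = { 0 };
 *	fm.fm_start = 0;
 *	fm.fm_length = FIEMAP_MAX_OFFSET;
 *	fm.fm_flags = FIEMAP_FLAG_SYNC;
 *	fm.fm_extent_count = 0;		(just count the extents)
 *	ioctl(fd, FS_IOC_FIEMAP, &fm);
 *
 * which reaches ext4_fiemap() through the VFS ->fiemap hook.
 */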
4825int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4826		__u64 start, __u64 len)
4827{
4828	ext4_lblk_t start_blk;
4829	int error = 0;
4830
4831	/* fallback to generic here if not in extents fmt */
4832	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4833		return generic_block_fiemap(inode, fieinfo, start, len,
4834			ext4_get_block);
4835
4836	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
4837		return -EBADR;
4838
4839	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4840		error = ext4_xattr_fiemap(inode, fieinfo);
4841	} else {
4842		ext4_lblk_t len_blks;
4843		__u64 last_blk;
4844
4845		start_blk = start >> inode->i_sb->s_blocksize_bits;
4846		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4847		if (last_blk >= EXT_MAX_BLOCKS)
4848			last_blk = EXT_MAX_BLOCKS-1;
4849		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4850
4851		/*
4852		 * Walk the extent tree gathering extent information.
4853		 * ext4_ext_fiemap_cb will push extents back to user.
4854		 */
4855		error = ext4_ext_walk_space(inode, start_blk, len_blks,
4856					  ext4_ext_fiemap_cb, fieinfo);
4857	}
4858
4859	return error;
4860}
4861