extents.c revision e7b319e39776bd0e9c0c7855b023dafed2c93d27
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public Licens
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
44#include "ext4_jbd2.h"
45
46#include <trace/events/ext4.h>
47
48static int ext4_split_extent(handle_t *handle,
49				struct inode *inode,
50				struct ext4_ext_path *path,
51				struct ext4_map_blocks *map,
52				int split_flag,
53				int flags);
54
55static int ext4_ext_truncate_extend_restart(handle_t *handle,
56					    struct inode *inode,
57					    int needed)
58{
59	int err;
60
61	if (!ext4_handle_valid(handle))
62		return 0;
63	if (handle->h_buffer_credits > needed)
64		return 0;
65	err = ext4_journal_extend(handle, needed);
66	if (err <= 0)
67		return err;
68	err = ext4_truncate_restart_trans(handle, inode, needed);
69	if (err == 0)
70		err = -EAGAIN;
71
72	return err;
73}
74
75/*
76 * could return:
77 *  - EROFS
78 *  - ENOMEM
79 */
80static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
81				struct ext4_ext_path *path)
82{
83	if (path->p_bh) {
84		/* path points to block */
85		return ext4_journal_get_write_access(handle, path->p_bh);
86	}
87	/* path points to leaf/index in inode body */
88	/* we use in-core data, no need to protect them */
89	return 0;
90}
91
92/*
93 * could return:
94 *  - EROFS
95 *  - ENOMEM
96 *  - EIO
97 */
98#define ext4_ext_dirty(handle, inode, path) \
99		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
100static int __ext4_ext_dirty(const char *where, unsigned int line,
101			    handle_t *handle, struct inode *inode,
102			    struct ext4_ext_path *path)
103{
104	int err;
105	if (path->p_bh) {
106		/* path points to block */
107		err = __ext4_handle_dirty_metadata(where, line, handle,
108						   inode, path->p_bh);
109	} else {
110		/* path points to leaf/index in inode body */
111		err = ext4_mark_inode_dirty(handle, inode);
112	}
113	return err;
114}
115
116static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
117			      struct ext4_ext_path *path,
118			      ext4_lblk_t block)
119{
120	if (path) {
121		int depth = path->p_depth;
122		struct ext4_extent *ex;
123
124		/*
125		 * Try to predict block placement assuming that we are
126		 * filling in a file which will eventually be
127		 * non-sparse --- i.e., in the case of libbfd writing
128		 * an ELF object sections out-of-order but in a way
129		 * the eventually results in a contiguous object or
130		 * executable file, or some database extending a table
131		 * space file.  However, this is actually somewhat
132		 * non-ideal if we are writing a sparse file such as
133		 * qemu or KVM writing a raw image file that is going
134		 * to stay fairly sparse, since it will end up
135		 * fragmenting the file system's free space.  Maybe we
136		 * should have some hueristics or some way to allow
137		 * userspace to pass a hint to file system,
138		 * especially if the latter case turns out to be
139		 * common.
140		 */
141		ex = path[depth].p_ext;
142		if (ex) {
143			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
144			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
145
146			if (block > ext_block)
147				return ext_pblk + (block - ext_block);
148			else
149				return ext_pblk - (ext_block - block);
150		}
151
152		/* it looks like index is empty;
153		 * try to find starting block from index itself */
154		if (path[depth].p_bh)
155			return path[depth].p_bh->b_blocknr;
156	}
157
158	/* OK. use inode's group */
159	return ext4_inode_to_goal_block(inode);
160}
161
162/*
163 * Allocation for a meta data block
164 */
165static ext4_fsblk_t
166ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
167			struct ext4_ext_path *path,
168			struct ext4_extent *ex, int *err, unsigned int flags)
169{
170	ext4_fsblk_t goal, newblock;
171
172	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
173	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
174					NULL, err);
175	return newblock;
176}
177
178static inline int ext4_ext_space_block(struct inode *inode, int check)
179{
180	int size;
181
182	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
183			/ sizeof(struct ext4_extent);
184#ifdef AGGRESSIVE_TEST
185	if (!check && size > 6)
186		size = 6;
187#endif
188	return size;
189}
190
191static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
192{
193	int size;
194
195	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
196			/ sizeof(struct ext4_extent_idx);
197#ifdef AGGRESSIVE_TEST
198	if (!check && size > 5)
199		size = 5;
200#endif
201	return size;
202}
203
204static inline int ext4_ext_space_root(struct inode *inode, int check)
205{
206	int size;
207
208	size = sizeof(EXT4_I(inode)->i_data);
209	size -= sizeof(struct ext4_extent_header);
210	size /= sizeof(struct ext4_extent);
211#ifdef AGGRESSIVE_TEST
212	if (!check && size > 3)
213		size = 3;
214#endif
215	return size;
216}
217
218static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
219{
220	int size;
221
222	size = sizeof(EXT4_I(inode)->i_data);
223	size -= sizeof(struct ext4_extent_header);
224	size /= sizeof(struct ext4_extent_idx);
225#ifdef AGGRESSIVE_TEST
226	if (!check && size > 4)
227		size = 4;
228#endif
229	return size;
230}
231
232/*
233 * Calculate the number of metadata blocks needed
234 * to allocate @blocks
235 * Worse case is one block per extent
236 */
237int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
238{
239	struct ext4_inode_info *ei = EXT4_I(inode);
240	int idxs;
241
242	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
243		/ sizeof(struct ext4_extent_idx));
244
245	/*
246	 * If the new delayed allocation block is contiguous with the
247	 * previous da block, it can share index blocks with the
248	 * previous block, so we only need to allocate a new index
249	 * block every idxs leaf blocks.  At ldxs**2 blocks, we need
250	 * an additional index block, and at ldxs**3 blocks, yet
251	 * another index blocks.
252	 */
253	if (ei->i_da_metadata_calc_len &&
254	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
255		int num = 0;
256
257		if ((ei->i_da_metadata_calc_len % idxs) == 0)
258			num++;
259		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
260			num++;
261		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
262			num++;
263			ei->i_da_metadata_calc_len = 0;
264		} else
265			ei->i_da_metadata_calc_len++;
266		ei->i_da_metadata_calc_last_lblock++;
267		return num;
268	}
269
270	/*
271	 * In the worst case we need a new set of index blocks at
272	 * every level of the inode's extent tree.
273	 */
274	ei->i_da_metadata_calc_len = 1;
275	ei->i_da_metadata_calc_last_lblock = lblock;
276	return ext_depth(inode) + 1;
277}
278
279static int
280ext4_ext_max_entries(struct inode *inode, int depth)
281{
282	int max;
283
284	if (depth == ext_depth(inode)) {
285		if (depth == 0)
286			max = ext4_ext_space_root(inode, 1);
287		else
288			max = ext4_ext_space_root_idx(inode, 1);
289	} else {
290		if (depth == 0)
291			max = ext4_ext_space_block(inode, 1);
292		else
293			max = ext4_ext_space_block_idx(inode, 1);
294	}
295
296	return max;
297}
298
299static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
300{
301	ext4_fsblk_t block = ext4_ext_pblock(ext);
302	int len = ext4_ext_get_actual_len(ext);
303
304	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
305}
306
307static int ext4_valid_extent_idx(struct inode *inode,
308				struct ext4_extent_idx *ext_idx)
309{
310	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
311
312	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
313}
314
315static int ext4_valid_extent_entries(struct inode *inode,
316				struct ext4_extent_header *eh,
317				int depth)
318{
319	unsigned short entries;
320	if (eh->eh_entries == 0)
321		return 1;
322
323	entries = le16_to_cpu(eh->eh_entries);
324
325	if (depth == 0) {
326		/* leaf entries */
327		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
328		while (entries) {
329			if (!ext4_valid_extent(inode, ext))
330				return 0;
331			ext++;
332			entries--;
333		}
334	} else {
335		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
336		while (entries) {
337			if (!ext4_valid_extent_idx(inode, ext_idx))
338				return 0;
339			ext_idx++;
340			entries--;
341		}
342	}
343	return 1;
344}
345
346static int __ext4_ext_check(const char *function, unsigned int line,
347			    struct inode *inode, struct ext4_extent_header *eh,
348			    int depth)
349{
350	const char *error_msg;
351	int max = 0;
352
353	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
354		error_msg = "invalid magic";
355		goto corrupted;
356	}
357	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
358		error_msg = "unexpected eh_depth";
359		goto corrupted;
360	}
361	if (unlikely(eh->eh_max == 0)) {
362		error_msg = "invalid eh_max";
363		goto corrupted;
364	}
365	max = ext4_ext_max_entries(inode, depth);
366	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
367		error_msg = "too large eh_max";
368		goto corrupted;
369	}
370	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
371		error_msg = "invalid eh_entries";
372		goto corrupted;
373	}
374	if (!ext4_valid_extent_entries(inode, eh, depth)) {
375		error_msg = "invalid extent entries";
376		goto corrupted;
377	}
378	return 0;
379
380corrupted:
381	ext4_error_inode(inode, function, line, 0,
382			"bad header/extent: %s - magic %x, "
383			"entries %u, max %u(%u), depth %u(%u)",
384			error_msg, le16_to_cpu(eh->eh_magic),
385			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
386			max, le16_to_cpu(eh->eh_depth), depth);
387
388	return -EIO;
389}
390
391#define ext4_ext_check(inode, eh, depth)	\
392	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
393
394int ext4_ext_check_inode(struct inode *inode)
395{
396	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
397}
398
399#ifdef EXT_DEBUG
400static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
401{
402	int k, l = path->p_depth;
403
404	ext_debug("path:");
405	for (k = 0; k <= l; k++, path++) {
406		if (path->p_idx) {
407		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
408			    ext4_idx_pblock(path->p_idx));
409		} else if (path->p_ext) {
410			ext_debug("  %d:[%d]%d:%llu ",
411				  le32_to_cpu(path->p_ext->ee_block),
412				  ext4_ext_is_uninitialized(path->p_ext),
413				  ext4_ext_get_actual_len(path->p_ext),
414				  ext4_ext_pblock(path->p_ext));
415		} else
416			ext_debug("  []");
417	}
418	ext_debug("\n");
419}
420
421static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
422{
423	int depth = ext_depth(inode);
424	struct ext4_extent_header *eh;
425	struct ext4_extent *ex;
426	int i;
427
428	if (!path)
429		return;
430
431	eh = path[depth].p_hdr;
432	ex = EXT_FIRST_EXTENT(eh);
433
434	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
435
436	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
437		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
438			  ext4_ext_is_uninitialized(ex),
439			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
440	}
441	ext_debug("\n");
442}
443
444static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
445			ext4_fsblk_t newblock, int level)
446{
447	int depth = ext_depth(inode);
448	struct ext4_extent *ex;
449
450	if (depth != level) {
451		struct ext4_extent_idx *idx;
452		idx = path[level].p_idx;
453		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
454			ext_debug("%d: move %d:%llu in new index %llu\n", level,
455					le32_to_cpu(idx->ei_block),
456					ext4_idx_pblock(idx),
457					newblock);
458			idx++;
459		}
460
461		return;
462	}
463
464	ex = path[depth].p_ext;
465	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
466		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
467				le32_to_cpu(ex->ee_block),
468				ext4_ext_pblock(ex),
469				ext4_ext_is_uninitialized(ex),
470				ext4_ext_get_actual_len(ex),
471				newblock);
472		ex++;
473	}
474}
475
476#else
477#define ext4_ext_show_path(inode, path)
478#define ext4_ext_show_leaf(inode, path)
479#define ext4_ext_show_move(inode, path, newblock, level)
480#endif
481
482void ext4_ext_drop_refs(struct ext4_ext_path *path)
483{
484	int depth = path->p_depth;
485	int i;
486
487	for (i = 0; i <= depth; i++, path++)
488		if (path->p_bh) {
489			brelse(path->p_bh);
490			path->p_bh = NULL;
491		}
492}
493
494/*
495 * ext4_ext_binsearch_idx:
496 * binary search for the closest index of the given block
497 * the header must be checked before calling this
498 */
499static void
500ext4_ext_binsearch_idx(struct inode *inode,
501			struct ext4_ext_path *path, ext4_lblk_t block)
502{
503	struct ext4_extent_header *eh = path->p_hdr;
504	struct ext4_extent_idx *r, *l, *m;
505
506
507	ext_debug("binsearch for %u(idx):  ", block);
508
509	l = EXT_FIRST_INDEX(eh) + 1;
510	r = EXT_LAST_INDEX(eh);
511	while (l <= r) {
512		m = l + (r - l) / 2;
513		if (block < le32_to_cpu(m->ei_block))
514			r = m - 1;
515		else
516			l = m + 1;
517		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
518				m, le32_to_cpu(m->ei_block),
519				r, le32_to_cpu(r->ei_block));
520	}
521
522	path->p_idx = l - 1;
523	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
524		  ext4_idx_pblock(path->p_idx));
525
526#ifdef CHECK_BINSEARCH
527	{
528		struct ext4_extent_idx *chix, *ix;
529		int k;
530
531		chix = ix = EXT_FIRST_INDEX(eh);
532		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
533		  if (k != 0 &&
534		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
535				printk(KERN_DEBUG "k=%d, ix=0x%p, "
536				       "first=0x%p\n", k,
537				       ix, EXT_FIRST_INDEX(eh));
538				printk(KERN_DEBUG "%u <= %u\n",
539				       le32_to_cpu(ix->ei_block),
540				       le32_to_cpu(ix[-1].ei_block));
541			}
542			BUG_ON(k && le32_to_cpu(ix->ei_block)
543					   <= le32_to_cpu(ix[-1].ei_block));
544			if (block < le32_to_cpu(ix->ei_block))
545				break;
546			chix = ix;
547		}
548		BUG_ON(chix != path->p_idx);
549	}
550#endif
551
552}
553
554/*
555 * ext4_ext_binsearch:
556 * binary search for closest extent of the given block
557 * the header must be checked before calling this
558 */
559static void
560ext4_ext_binsearch(struct inode *inode,
561		struct ext4_ext_path *path, ext4_lblk_t block)
562{
563	struct ext4_extent_header *eh = path->p_hdr;
564	struct ext4_extent *r, *l, *m;
565
566	if (eh->eh_entries == 0) {
567		/*
568		 * this leaf is empty:
569		 * we get such a leaf in split/add case
570		 */
571		return;
572	}
573
574	ext_debug("binsearch for %u:  ", block);
575
576	l = EXT_FIRST_EXTENT(eh) + 1;
577	r = EXT_LAST_EXTENT(eh);
578
579	while (l <= r) {
580		m = l + (r - l) / 2;
581		if (block < le32_to_cpu(m->ee_block))
582			r = m - 1;
583		else
584			l = m + 1;
585		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
586				m, le32_to_cpu(m->ee_block),
587				r, le32_to_cpu(r->ee_block));
588	}
589
590	path->p_ext = l - 1;
591	ext_debug("  -> %d:%llu:[%d]%d ",
592			le32_to_cpu(path->p_ext->ee_block),
593			ext4_ext_pblock(path->p_ext),
594			ext4_ext_is_uninitialized(path->p_ext),
595			ext4_ext_get_actual_len(path->p_ext));
596
597#ifdef CHECK_BINSEARCH
598	{
599		struct ext4_extent *chex, *ex;
600		int k;
601
602		chex = ex = EXT_FIRST_EXTENT(eh);
603		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
604			BUG_ON(k && le32_to_cpu(ex->ee_block)
605					  <= le32_to_cpu(ex[-1].ee_block));
606			if (block < le32_to_cpu(ex->ee_block))
607				break;
608			chex = ex;
609		}
610		BUG_ON(chex != path->p_ext);
611	}
612#endif
613
614}
615
616int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
617{
618	struct ext4_extent_header *eh;
619
620	eh = ext_inode_hdr(inode);
621	eh->eh_depth = 0;
622	eh->eh_entries = 0;
623	eh->eh_magic = EXT4_EXT_MAGIC;
624	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
625	ext4_mark_inode_dirty(handle, inode);
626	ext4_ext_invalidate_cache(inode);
627	return 0;
628}
629
630struct ext4_ext_path *
631ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
632					struct ext4_ext_path *path)
633{
634	struct ext4_extent_header *eh;
635	struct buffer_head *bh;
636	short int depth, i, ppos = 0, alloc = 0;
637
638	eh = ext_inode_hdr(inode);
639	depth = ext_depth(inode);
640
641	/* account possible depth increase */
642	if (!path) {
643		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
644				GFP_NOFS);
645		if (!path)
646			return ERR_PTR(-ENOMEM);
647		alloc = 1;
648	}
649	path[0].p_hdr = eh;
650	path[0].p_bh = NULL;
651
652	i = depth;
653	/* walk through the tree */
654	while (i) {
655		int need_to_validate = 0;
656
657		ext_debug("depth %d: num %d, max %d\n",
658			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
659
660		ext4_ext_binsearch_idx(inode, path + ppos, block);
661		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
662		path[ppos].p_depth = i;
663		path[ppos].p_ext = NULL;
664
665		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
666		if (unlikely(!bh))
667			goto err;
668		if (!bh_uptodate_or_lock(bh)) {
669			trace_ext4_ext_load_extent(inode, block,
670						path[ppos].p_block);
671			if (bh_submit_read(bh) < 0) {
672				put_bh(bh);
673				goto err;
674			}
675			/* validate the extent entries */
676			need_to_validate = 1;
677		}
678		eh = ext_block_hdr(bh);
679		ppos++;
680		if (unlikely(ppos > depth)) {
681			put_bh(bh);
682			EXT4_ERROR_INODE(inode,
683					 "ppos %d > depth %d", ppos, depth);
684			goto err;
685		}
686		path[ppos].p_bh = bh;
687		path[ppos].p_hdr = eh;
688		i--;
689
690		if (need_to_validate && ext4_ext_check(inode, eh, i))
691			goto err;
692	}
693
694	path[ppos].p_depth = i;
695	path[ppos].p_ext = NULL;
696	path[ppos].p_idx = NULL;
697
698	/* find extent */
699	ext4_ext_binsearch(inode, path + ppos, block);
700	/* if not an empty leaf */
701	if (path[ppos].p_ext)
702		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
703
704	ext4_ext_show_path(inode, path);
705
706	return path;
707
708err:
709	ext4_ext_drop_refs(path);
710	if (alloc)
711		kfree(path);
712	return ERR_PTR(-EIO);
713}
714
715/*
716 * ext4_ext_insert_index:
717 * insert new index [@logical;@ptr] into the block at @curp;
718 * check where to insert: before @curp or after @curp
719 */
720static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
721				 struct ext4_ext_path *curp,
722				 int logical, ext4_fsblk_t ptr)
723{
724	struct ext4_extent_idx *ix;
725	int len, err;
726
727	err = ext4_ext_get_access(handle, inode, curp);
728	if (err)
729		return err;
730
731	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
732		EXT4_ERROR_INODE(inode,
733				 "logical %d == ei_block %d!",
734				 logical, le32_to_cpu(curp->p_idx->ei_block));
735		return -EIO;
736	}
737
738	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
739			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
740		EXT4_ERROR_INODE(inode,
741				 "eh_entries %d >= eh_max %d!",
742				 le16_to_cpu(curp->p_hdr->eh_entries),
743				 le16_to_cpu(curp->p_hdr->eh_max));
744		return -EIO;
745	}
746
747	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
748		/* insert after */
749		ext_debug("insert new index %d after: %llu\n", logical, ptr);
750		ix = curp->p_idx + 1;
751	} else {
752		/* insert before */
753		ext_debug("insert new index %d before: %llu\n", logical, ptr);
754		ix = curp->p_idx;
755	}
756
757	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
758	BUG_ON(len < 0);
759	if (len > 0) {
760		ext_debug("insert new index %d: "
761				"move %d indices from 0x%p to 0x%p\n",
762				logical, len, ix, ix + 1);
763		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
764	}
765
766	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
767		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
768		return -EIO;
769	}
770
771	ix->ei_block = cpu_to_le32(logical);
772	ext4_idx_store_pblock(ix, ptr);
773	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
774
775	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
776		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
777		return -EIO;
778	}
779
780	err = ext4_ext_dirty(handle, inode, curp);
781	ext4_std_error(inode->i_sb, err);
782
783	return err;
784}
785
786/*
787 * ext4_ext_split:
788 * inserts new subtree into the path, using free index entry
789 * at depth @at:
790 * - allocates all needed blocks (new leaf and all intermediate index blocks)
791 * - makes decision where to split
792 * - moves remaining extents and index entries (right to the split point)
793 *   into the newly allocated blocks
794 * - initializes subtree
795 */
796static int ext4_ext_split(handle_t *handle, struct inode *inode,
797			  unsigned int flags,
798			  struct ext4_ext_path *path,
799			  struct ext4_extent *newext, int at)
800{
801	struct buffer_head *bh = NULL;
802	int depth = ext_depth(inode);
803	struct ext4_extent_header *neh;
804	struct ext4_extent_idx *fidx;
805	int i = at, k, m, a;
806	ext4_fsblk_t newblock, oldblock;
807	__le32 border;
808	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
809	int err = 0;
810
811	/* make decision: where to split? */
812	/* FIXME: now decision is simplest: at current extent */
813
814	/* if current leaf will be split, then we should use
815	 * border from split point */
816	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
817		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
818		return -EIO;
819	}
820	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
821		border = path[depth].p_ext[1].ee_block;
822		ext_debug("leaf will be split."
823				" next leaf starts at %d\n",
824				  le32_to_cpu(border));
825	} else {
826		border = newext->ee_block;
827		ext_debug("leaf will be added."
828				" next leaf starts at %d\n",
829				le32_to_cpu(border));
830	}
831
832	/*
833	 * If error occurs, then we break processing
834	 * and mark filesystem read-only. index won't
835	 * be inserted and tree will be in consistent
836	 * state. Next mount will repair buffers too.
837	 */
838
839	/*
840	 * Get array to track all allocated blocks.
841	 * We need this to handle errors and free blocks
842	 * upon them.
843	 */
844	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
845	if (!ablocks)
846		return -ENOMEM;
847
848	/* allocate all needed blocks */
849	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
850	for (a = 0; a < depth - at; a++) {
851		newblock = ext4_ext_new_meta_block(handle, inode, path,
852						   newext, &err, flags);
853		if (newblock == 0)
854			goto cleanup;
855		ablocks[a] = newblock;
856	}
857
858	/* initialize new leaf */
859	newblock = ablocks[--a];
860	if (unlikely(newblock == 0)) {
861		EXT4_ERROR_INODE(inode, "newblock == 0!");
862		err = -EIO;
863		goto cleanup;
864	}
865	bh = sb_getblk(inode->i_sb, newblock);
866	if (!bh) {
867		err = -EIO;
868		goto cleanup;
869	}
870	lock_buffer(bh);
871
872	err = ext4_journal_get_create_access(handle, bh);
873	if (err)
874		goto cleanup;
875
876	neh = ext_block_hdr(bh);
877	neh->eh_entries = 0;
878	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
879	neh->eh_magic = EXT4_EXT_MAGIC;
880	neh->eh_depth = 0;
881
882	/* move remainder of path[depth] to the new leaf */
883	if (unlikely(path[depth].p_hdr->eh_entries !=
884		     path[depth].p_hdr->eh_max)) {
885		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
886				 path[depth].p_hdr->eh_entries,
887				 path[depth].p_hdr->eh_max);
888		err = -EIO;
889		goto cleanup;
890	}
891	/* start copy from next extent */
892	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
893	ext4_ext_show_move(inode, path, newblock, depth);
894	if (m) {
895		struct ext4_extent *ex;
896		ex = EXT_FIRST_EXTENT(neh);
897		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
898		le16_add_cpu(&neh->eh_entries, m);
899	}
900
901	set_buffer_uptodate(bh);
902	unlock_buffer(bh);
903
904	err = ext4_handle_dirty_metadata(handle, inode, bh);
905	if (err)
906		goto cleanup;
907	brelse(bh);
908	bh = NULL;
909
910	/* correct old leaf */
911	if (m) {
912		err = ext4_ext_get_access(handle, inode, path + depth);
913		if (err)
914			goto cleanup;
915		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
916		err = ext4_ext_dirty(handle, inode, path + depth);
917		if (err)
918			goto cleanup;
919
920	}
921
922	/* create intermediate indexes */
923	k = depth - at - 1;
924	if (unlikely(k < 0)) {
925		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
926		err = -EIO;
927		goto cleanup;
928	}
929	if (k)
930		ext_debug("create %d intermediate indices\n", k);
931	/* insert new index into current index block */
932	/* current depth stored in i var */
933	i = depth - 1;
934	while (k--) {
935		oldblock = newblock;
936		newblock = ablocks[--a];
937		bh = sb_getblk(inode->i_sb, newblock);
938		if (!bh) {
939			err = -EIO;
940			goto cleanup;
941		}
942		lock_buffer(bh);
943
944		err = ext4_journal_get_create_access(handle, bh);
945		if (err)
946			goto cleanup;
947
948		neh = ext_block_hdr(bh);
949		neh->eh_entries = cpu_to_le16(1);
950		neh->eh_magic = EXT4_EXT_MAGIC;
951		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
952		neh->eh_depth = cpu_to_le16(depth - i);
953		fidx = EXT_FIRST_INDEX(neh);
954		fidx->ei_block = border;
955		ext4_idx_store_pblock(fidx, oldblock);
956
957		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
958				i, newblock, le32_to_cpu(border), oldblock);
959
960		/* move remainder of path[i] to the new index block */
961		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
962					EXT_LAST_INDEX(path[i].p_hdr))) {
963			EXT4_ERROR_INODE(inode,
964					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
965					 le32_to_cpu(path[i].p_ext->ee_block));
966			err = -EIO;
967			goto cleanup;
968		}
969		/* start copy indexes */
970		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
971		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
972				EXT_MAX_INDEX(path[i].p_hdr));
973		ext4_ext_show_move(inode, path, newblock, i);
974		if (m) {
975			memmove(++fidx, path[i].p_idx,
976				sizeof(struct ext4_extent_idx) * m);
977			le16_add_cpu(&neh->eh_entries, m);
978		}
979		set_buffer_uptodate(bh);
980		unlock_buffer(bh);
981
982		err = ext4_handle_dirty_metadata(handle, inode, bh);
983		if (err)
984			goto cleanup;
985		brelse(bh);
986		bh = NULL;
987
988		/* correct old index */
989		if (m) {
990			err = ext4_ext_get_access(handle, inode, path + i);
991			if (err)
992				goto cleanup;
993			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
994			err = ext4_ext_dirty(handle, inode, path + i);
995			if (err)
996				goto cleanup;
997		}
998
999		i--;
1000	}
1001
1002	/* insert new index */
1003	err = ext4_ext_insert_index(handle, inode, path + at,
1004				    le32_to_cpu(border), newblock);
1005
1006cleanup:
1007	if (bh) {
1008		if (buffer_locked(bh))
1009			unlock_buffer(bh);
1010		brelse(bh);
1011	}
1012
1013	if (err) {
1014		/* free all allocated blocks in error case */
1015		for (i = 0; i < depth; i++) {
1016			if (!ablocks[i])
1017				continue;
1018			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1019					 EXT4_FREE_BLOCKS_METADATA);
1020		}
1021	}
1022	kfree(ablocks);
1023
1024	return err;
1025}
1026
1027/*
1028 * ext4_ext_grow_indepth:
1029 * implements tree growing procedure:
1030 * - allocates new block
1031 * - moves top-level data (index block or leaf) into the new block
1032 * - initializes new top-level, creating index that points to the
1033 *   just created block
1034 */
1035static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1036				 unsigned int flags,
1037				 struct ext4_extent *newext)
1038{
1039	struct ext4_extent_header *neh;
1040	struct buffer_head *bh;
1041	ext4_fsblk_t newblock;
1042	int err = 0;
1043
1044	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1045		newext, &err, flags);
1046	if (newblock == 0)
1047		return err;
1048
1049	bh = sb_getblk(inode->i_sb, newblock);
1050	if (!bh) {
1051		err = -EIO;
1052		ext4_std_error(inode->i_sb, err);
1053		return err;
1054	}
1055	lock_buffer(bh);
1056
1057	err = ext4_journal_get_create_access(handle, bh);
1058	if (err) {
1059		unlock_buffer(bh);
1060		goto out;
1061	}
1062
1063	/* move top-level index/leaf into new block */
1064	memmove(bh->b_data, EXT4_I(inode)->i_data,
1065		sizeof(EXT4_I(inode)->i_data));
1066
1067	/* set size of new block */
1068	neh = ext_block_hdr(bh);
1069	/* old root could have indexes or leaves
1070	 * so calculate e_max right way */
1071	if (ext_depth(inode))
1072		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1073	else
1074		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1075	neh->eh_magic = EXT4_EXT_MAGIC;
1076	set_buffer_uptodate(bh);
1077	unlock_buffer(bh);
1078
1079	err = ext4_handle_dirty_metadata(handle, inode, bh);
1080	if (err)
1081		goto out;
1082
1083	/* Update top-level index: num,max,pointer */
1084	neh = ext_inode_hdr(inode);
1085	neh->eh_entries = cpu_to_le16(1);
1086	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1087	if (neh->eh_depth == 0) {
1088		/* Root extent block becomes index block */
1089		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1090		EXT_FIRST_INDEX(neh)->ei_block =
1091			EXT_FIRST_EXTENT(neh)->ee_block;
1092	}
1093	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1094		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1095		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1096		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1097
1098	neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);
1099	ext4_mark_inode_dirty(handle, inode);
1100out:
1101	brelse(bh);
1102
1103	return err;
1104}
1105
1106/*
1107 * ext4_ext_create_new_leaf:
1108 * finds empty index and adds new leaf.
1109 * if no free index is found, then it requests in-depth growing.
1110 */
1111static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1112				    unsigned int flags,
1113				    struct ext4_ext_path *path,
1114				    struct ext4_extent *newext)
1115{
1116	struct ext4_ext_path *curp;
1117	int depth, i, err = 0;
1118
1119repeat:
1120	i = depth = ext_depth(inode);
1121
1122	/* walk up to the tree and look for free index entry */
1123	curp = path + depth;
1124	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1125		i--;
1126		curp--;
1127	}
1128
1129	/* we use already allocated block for index block,
1130	 * so subsequent data blocks should be contiguous */
1131	if (EXT_HAS_FREE_INDEX(curp)) {
1132		/* if we found index with free entry, then use that
1133		 * entry: create all needed subtree and add new leaf */
1134		err = ext4_ext_split(handle, inode, flags, path, newext, i);
1135		if (err)
1136			goto out;
1137
1138		/* refill path */
1139		ext4_ext_drop_refs(path);
1140		path = ext4_ext_find_extent(inode,
1141				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1142				    path);
1143		if (IS_ERR(path))
1144			err = PTR_ERR(path);
1145	} else {
1146		/* tree is full, time to grow in depth */
1147		err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1148		if (err)
1149			goto out;
1150
1151		/* refill path */
1152		ext4_ext_drop_refs(path);
1153		path = ext4_ext_find_extent(inode,
1154				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1155				    path);
1156		if (IS_ERR(path)) {
1157			err = PTR_ERR(path);
1158			goto out;
1159		}
1160
1161		/*
1162		 * only first (depth 0 -> 1) produces free space;
1163		 * in all other cases we have to split the grown tree
1164		 */
1165		depth = ext_depth(inode);
1166		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1167			/* now we need to split */
1168			goto repeat;
1169		}
1170	}
1171
1172out:
1173	return err;
1174}
1175
1176/*
1177 * search the closest allocated block to the left for *logical
1178 * and returns it at @logical + it's physical address at @phys
1179 * if *logical is the smallest allocated block, the function
1180 * returns 0 at @phys
1181 * return value contains 0 (success) or error code
1182 */
1183static int ext4_ext_search_left(struct inode *inode,
1184				struct ext4_ext_path *path,
1185				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1186{
1187	struct ext4_extent_idx *ix;
1188	struct ext4_extent *ex;
1189	int depth, ee_len;
1190
1191	if (unlikely(path == NULL)) {
1192		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1193		return -EIO;
1194	}
1195	depth = path->p_depth;
1196	*phys = 0;
1197
1198	if (depth == 0 && path->p_ext == NULL)
1199		return 0;
1200
1201	/* usually extent in the path covers blocks smaller
1202	 * then *logical, but it can be that extent is the
1203	 * first one in the file */
1204
1205	ex = path[depth].p_ext;
1206	ee_len = ext4_ext_get_actual_len(ex);
1207	if (*logical < le32_to_cpu(ex->ee_block)) {
1208		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1209			EXT4_ERROR_INODE(inode,
1210					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1211					 *logical, le32_to_cpu(ex->ee_block));
1212			return -EIO;
1213		}
1214		while (--depth >= 0) {
1215			ix = path[depth].p_idx;
1216			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1217				EXT4_ERROR_INODE(inode,
1218				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1219				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1220				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1221		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1222				  depth);
1223				return -EIO;
1224			}
1225		}
1226		return 0;
1227	}
1228
1229	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1230		EXT4_ERROR_INODE(inode,
1231				 "logical %d < ee_block %d + ee_len %d!",
1232				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1233		return -EIO;
1234	}
1235
1236	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1237	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1238	return 0;
1239}
1240
1241/*
1242 * search the closest allocated block to the right for *logical
1243 * and returns it at @logical + it's physical address at @phys
1244 * if *logical is the largest allocated block, the function
1245 * returns 0 at @phys
1246 * return value contains 0 (success) or error code
1247 */
1248static int ext4_ext_search_right(struct inode *inode,
1249				 struct ext4_ext_path *path,
1250				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1251				 struct ext4_extent **ret_ex)
1252{
1253	struct buffer_head *bh = NULL;
1254	struct ext4_extent_header *eh;
1255	struct ext4_extent_idx *ix;
1256	struct ext4_extent *ex;
1257	ext4_fsblk_t block;
1258	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1259	int ee_len;
1260
1261	if (unlikely(path == NULL)) {
1262		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1263		return -EIO;
1264	}
1265	depth = path->p_depth;
1266	*phys = 0;
1267
1268	if (depth == 0 && path->p_ext == NULL)
1269		return 0;
1270
1271	/* usually extent in the path covers blocks smaller
1272	 * then *logical, but it can be that extent is the
1273	 * first one in the file */
1274
1275	ex = path[depth].p_ext;
1276	ee_len = ext4_ext_get_actual_len(ex);
1277	if (*logical < le32_to_cpu(ex->ee_block)) {
1278		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1279			EXT4_ERROR_INODE(inode,
1280					 "first_extent(path[%d].p_hdr) != ex",
1281					 depth);
1282			return -EIO;
1283		}
1284		while (--depth >= 0) {
1285			ix = path[depth].p_idx;
1286			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1287				EXT4_ERROR_INODE(inode,
1288						 "ix != EXT_FIRST_INDEX *logical %d!",
1289						 *logical);
1290				return -EIO;
1291			}
1292		}
1293		goto found_extent;
1294	}
1295
1296	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1297		EXT4_ERROR_INODE(inode,
1298				 "logical %d < ee_block %d + ee_len %d!",
1299				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1300		return -EIO;
1301	}
1302
1303	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1304		/* next allocated block in this leaf */
1305		ex++;
1306		goto found_extent;
1307	}
1308
1309	/* go up and search for index to the right */
1310	while (--depth >= 0) {
1311		ix = path[depth].p_idx;
1312		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1313			goto got_index;
1314	}
1315
1316	/* we've gone up to the root and found no index to the right */
1317	return 0;
1318
1319got_index:
1320	/* we've found index to the right, let's
1321	 * follow it and find the closest allocated
1322	 * block to the right */
1323	ix++;
1324	block = ext4_idx_pblock(ix);
1325	while (++depth < path->p_depth) {
1326		bh = sb_bread(inode->i_sb, block);
1327		if (bh == NULL)
1328			return -EIO;
1329		eh = ext_block_hdr(bh);
1330		/* subtract from p_depth to get proper eh_depth */
1331		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1332			put_bh(bh);
1333			return -EIO;
1334		}
1335		ix = EXT_FIRST_INDEX(eh);
1336		block = ext4_idx_pblock(ix);
1337		put_bh(bh);
1338	}
1339
1340	bh = sb_bread(inode->i_sb, block);
1341	if (bh == NULL)
1342		return -EIO;
1343	eh = ext_block_hdr(bh);
1344	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1345		put_bh(bh);
1346		return -EIO;
1347	}
1348	ex = EXT_FIRST_EXTENT(eh);
1349found_extent:
1350	*logical = le32_to_cpu(ex->ee_block);
1351	*phys = ext4_ext_pblock(ex);
1352	*ret_ex = ex;
1353	if (bh)
1354		put_bh(bh);
1355	return 0;
1356}
1357
1358/*
1359 * ext4_ext_next_allocated_block:
1360 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1361 * NOTE: it considers block number from index entry as
1362 * allocated block. Thus, index entries have to be consistent
1363 * with leaves.
1364 */
1365static ext4_lblk_t
1366ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1367{
1368	int depth;
1369
1370	BUG_ON(path == NULL);
1371	depth = path->p_depth;
1372
1373	if (depth == 0 && path->p_ext == NULL)
1374		return EXT_MAX_BLOCKS;
1375
1376	while (depth >= 0) {
1377		if (depth == path->p_depth) {
1378			/* leaf */
1379			if (path[depth].p_ext &&
1380				path[depth].p_ext !=
1381					EXT_LAST_EXTENT(path[depth].p_hdr))
1382			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1383		} else {
1384			/* index */
1385			if (path[depth].p_idx !=
1386					EXT_LAST_INDEX(path[depth].p_hdr))
1387			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1388		}
1389		depth--;
1390	}
1391
1392	return EXT_MAX_BLOCKS;
1393}
1394
1395/*
1396 * ext4_ext_next_leaf_block:
1397 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1398 */
1399static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1400{
1401	int depth;
1402
1403	BUG_ON(path == NULL);
1404	depth = path->p_depth;
1405
1406	/* zero-tree has no leaf blocks at all */
1407	if (depth == 0)
1408		return EXT_MAX_BLOCKS;
1409
1410	/* go to index block */
1411	depth--;
1412
1413	while (depth >= 0) {
1414		if (path[depth].p_idx !=
1415				EXT_LAST_INDEX(path[depth].p_hdr))
1416			return (ext4_lblk_t)
1417				le32_to_cpu(path[depth].p_idx[1].ei_block);
1418		depth--;
1419	}
1420
1421	return EXT_MAX_BLOCKS;
1422}
1423
1424/*
1425 * ext4_ext_correct_indexes:
1426 * if leaf gets modified and modified extent is first in the leaf,
1427 * then we have to correct all indexes above.
1428 * TODO: do we need to correct tree in all cases?
1429 */
1430static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1431				struct ext4_ext_path *path)
1432{
1433	struct ext4_extent_header *eh;
1434	int depth = ext_depth(inode);
1435	struct ext4_extent *ex;
1436	__le32 border;
1437	int k, err = 0;
1438
1439	eh = path[depth].p_hdr;
1440	ex = path[depth].p_ext;
1441
1442	if (unlikely(ex == NULL || eh == NULL)) {
1443		EXT4_ERROR_INODE(inode,
1444				 "ex %p == NULL or eh %p == NULL", ex, eh);
1445		return -EIO;
1446	}
1447
1448	if (depth == 0) {
1449		/* there is no tree at all */
1450		return 0;
1451	}
1452
1453	if (ex != EXT_FIRST_EXTENT(eh)) {
1454		/* we correct tree if first leaf got modified only */
1455		return 0;
1456	}
1457
1458	/*
1459	 * TODO: we need correction if border is smaller than current one
1460	 */
1461	k = depth - 1;
1462	border = path[depth].p_ext->ee_block;
1463	err = ext4_ext_get_access(handle, inode, path + k);
1464	if (err)
1465		return err;
1466	path[k].p_idx->ei_block = border;
1467	err = ext4_ext_dirty(handle, inode, path + k);
1468	if (err)
1469		return err;
1470
1471	while (k--) {
1472		/* change all left-side indexes */
1473		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1474			break;
1475		err = ext4_ext_get_access(handle, inode, path + k);
1476		if (err)
1477			break;
1478		path[k].p_idx->ei_block = border;
1479		err = ext4_ext_dirty(handle, inode, path + k);
1480		if (err)
1481			break;
1482	}
1483
1484	return err;
1485}
1486
1487int
1488ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1489				struct ext4_extent *ex2)
1490{
1491	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1492
1493	/*
1494	 * Make sure that either both extents are uninitialized, or
1495	 * both are _not_.
1496	 */
1497	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1498		return 0;
1499
1500	if (ext4_ext_is_uninitialized(ex1))
1501		max_len = EXT_UNINIT_MAX_LEN;
1502	else
1503		max_len = EXT_INIT_MAX_LEN;
1504
1505	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1506	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1507
1508	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1509			le32_to_cpu(ex2->ee_block))
1510		return 0;
1511
1512	/*
1513	 * To allow future support for preallocated extents to be added
1514	 * as an RO_COMPAT feature, refuse to merge to extents if
1515	 * this can result in the top bit of ee_len being set.
1516	 */
1517	if (ext1_ee_len + ext2_ee_len > max_len)
1518		return 0;
1519#ifdef AGGRESSIVE_TEST
1520	if (ext1_ee_len >= 4)
1521		return 0;
1522#endif
1523
1524	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1525		return 1;
1526	return 0;
1527}
1528
1529/*
1530 * This function tries to merge the "ex" extent to the next extent in the tree.
1531 * It always tries to merge towards right. If you want to merge towards
1532 * left, pass "ex - 1" as argument instead of "ex".
1533 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1534 * 1 if they got merged.
1535 */
1536static int ext4_ext_try_to_merge_right(struct inode *inode,
1537				 struct ext4_ext_path *path,
1538				 struct ext4_extent *ex)
1539{
1540	struct ext4_extent_header *eh;
1541	unsigned int depth, len;
1542	int merge_done = 0;
1543	int uninitialized = 0;
1544
1545	depth = ext_depth(inode);
1546	BUG_ON(path[depth].p_hdr == NULL);
1547	eh = path[depth].p_hdr;
1548
1549	while (ex < EXT_LAST_EXTENT(eh)) {
1550		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1551			break;
1552		/* merge with next extent! */
1553		if (ext4_ext_is_uninitialized(ex))
1554			uninitialized = 1;
1555		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1556				+ ext4_ext_get_actual_len(ex + 1));
1557		if (uninitialized)
1558			ext4_ext_mark_uninitialized(ex);
1559
1560		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1561			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1562				* sizeof(struct ext4_extent);
1563			memmove(ex + 1, ex + 2, len);
1564		}
1565		le16_add_cpu(&eh->eh_entries, -1);
1566		merge_done = 1;
1567		WARN_ON(eh->eh_entries == 0);
1568		if (!eh->eh_entries)
1569			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1570	}
1571
1572	return merge_done;
1573}
1574
1575/*
1576 * This function tries to merge the @ex extent to neighbours in the tree.
1577 * return 1 if merge left else 0.
1578 */
1579static int ext4_ext_try_to_merge(struct inode *inode,
1580				  struct ext4_ext_path *path,
1581				  struct ext4_extent *ex) {
1582	struct ext4_extent_header *eh;
1583	unsigned int depth;
1584	int merge_done = 0;
1585	int ret = 0;
1586
1587	depth = ext_depth(inode);
1588	BUG_ON(path[depth].p_hdr == NULL);
1589	eh = path[depth].p_hdr;
1590
1591	if (ex > EXT_FIRST_EXTENT(eh))
1592		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1593
1594	if (!merge_done)
1595		ret = ext4_ext_try_to_merge_right(inode, path, ex);
1596
1597	return ret;
1598}
1599
1600/*
1601 * check if a portion of the "newext" extent overlaps with an
1602 * existing extent.
1603 *
1604 * If there is an overlap discovered, it updates the length of the newext
1605 * such that there will be no overlap, and then returns 1.
1606 * If there is no overlap found, it returns 0.
1607 */
1608static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1609					   struct inode *inode,
1610					   struct ext4_extent *newext,
1611					   struct ext4_ext_path *path)
1612{
1613	ext4_lblk_t b1, b2;
1614	unsigned int depth, len1;
1615	unsigned int ret = 0;
1616
1617	b1 = le32_to_cpu(newext->ee_block);
1618	len1 = ext4_ext_get_actual_len(newext);
1619	depth = ext_depth(inode);
1620	if (!path[depth].p_ext)
1621		goto out;
1622	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1623	b2 &= ~(sbi->s_cluster_ratio - 1);
1624
1625	/*
1626	 * get the next allocated block if the extent in the path
1627	 * is before the requested block(s)
1628	 */
1629	if (b2 < b1) {
1630		b2 = ext4_ext_next_allocated_block(path);
1631		if (b2 == EXT_MAX_BLOCKS)
1632			goto out;
1633		b2 &= ~(sbi->s_cluster_ratio - 1);
1634	}
1635
1636	/* check for wrap through zero on extent logical start block*/
1637	if (b1 + len1 < b1) {
1638		len1 = EXT_MAX_BLOCKS - b1;
1639		newext->ee_len = cpu_to_le16(len1);
1640		ret = 1;
1641	}
1642
1643	/* check for overlap */
1644	if (b1 + len1 > b2) {
1645		newext->ee_len = cpu_to_le16(b2 - b1);
1646		ret = 1;
1647	}
1648out:
1649	return ret;
1650}
1651
1652/*
1653 * ext4_ext_insert_extent:
1654 * tries to merge requsted extent into the existing extent or
1655 * inserts requested extent as new one into the tree,
1656 * creating new leaf in the no-space case.
1657 */
1658int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1659				struct ext4_ext_path *path,
1660				struct ext4_extent *newext, int flag)
1661{
1662	struct ext4_extent_header *eh;
1663	struct ext4_extent *ex, *fex;
1664	struct ext4_extent *nearex; /* nearest extent */
1665	struct ext4_ext_path *npath = NULL;
1666	int depth, len, err;
1667	ext4_lblk_t next;
1668	unsigned uninitialized = 0;
1669	int flags = 0;
1670
1671	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1672		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1673		return -EIO;
1674	}
1675	depth = ext_depth(inode);
1676	ex = path[depth].p_ext;
1677	if (unlikely(path[depth].p_hdr == NULL)) {
1678		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1679		return -EIO;
1680	}
1681
1682	/* try to insert block into found extent and return */
1683	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1684		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1685		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1686			  ext4_ext_is_uninitialized(newext),
1687			  ext4_ext_get_actual_len(newext),
1688			  le32_to_cpu(ex->ee_block),
1689			  ext4_ext_is_uninitialized(ex),
1690			  ext4_ext_get_actual_len(ex),
1691			  ext4_ext_pblock(ex));
1692		err = ext4_ext_get_access(handle, inode, path + depth);
1693		if (err)
1694			return err;
1695
1696		/*
1697		 * ext4_can_extents_be_merged should have checked that either
1698		 * both extents are uninitialized, or both aren't. Thus we
1699		 * need to check only one of them here.
1700		 */
1701		if (ext4_ext_is_uninitialized(ex))
1702			uninitialized = 1;
1703		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1704					+ ext4_ext_get_actual_len(newext));
1705		if (uninitialized)
1706			ext4_ext_mark_uninitialized(ex);
1707		eh = path[depth].p_hdr;
1708		nearex = ex;
1709		goto merge;
1710	}
1711
1712	depth = ext_depth(inode);
1713	eh = path[depth].p_hdr;
1714	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1715		goto has_space;
1716
1717	/* probably next leaf has space for us? */
1718	fex = EXT_LAST_EXTENT(eh);
1719	next = EXT_MAX_BLOCKS;
1720	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1721		next = ext4_ext_next_leaf_block(path);
1722	if (next != EXT_MAX_BLOCKS) {
1723		ext_debug("next leaf block - %d\n", next);
1724		BUG_ON(npath != NULL);
1725		npath = ext4_ext_find_extent(inode, next, NULL);
1726		if (IS_ERR(npath))
1727			return PTR_ERR(npath);
1728		BUG_ON(npath->p_depth != path->p_depth);
1729		eh = npath[depth].p_hdr;
1730		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1731			ext_debug("next leaf isn't full(%d)\n",
1732				  le16_to_cpu(eh->eh_entries));
1733			path = npath;
1734			goto has_space;
1735		}
1736		ext_debug("next leaf has no free space(%d,%d)\n",
1737			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1738	}
1739
1740	/*
1741	 * There is no free space in the found leaf.
1742	 * We're gonna add a new leaf in the tree.
1743	 */
1744	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1745		flags = EXT4_MB_USE_ROOT_BLOCKS;
1746	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1747	if (err)
1748		goto cleanup;
1749	depth = ext_depth(inode);
1750	eh = path[depth].p_hdr;
1751
1752has_space:
1753	nearex = path[depth].p_ext;
1754
1755	err = ext4_ext_get_access(handle, inode, path + depth);
1756	if (err)
1757		goto cleanup;
1758
1759	if (!nearex) {
1760		/* there is no extent in this leaf, create first one */
1761		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1762				le32_to_cpu(newext->ee_block),
1763				ext4_ext_pblock(newext),
1764				ext4_ext_is_uninitialized(newext),
1765				ext4_ext_get_actual_len(newext));
1766		nearex = EXT_FIRST_EXTENT(eh);
1767	} else {
1768		if (le32_to_cpu(newext->ee_block)
1769			   > le32_to_cpu(nearex->ee_block)) {
1770			/* Insert after */
1771			ext_debug("insert %d:%llu:[%d]%d %s before: "
1772					"nearest 0x%p\n"
1773					le32_to_cpu(newext->ee_block),
1774					ext4_ext_pblock(newext),
1775					ext4_ext_is_uninitialized(newext),
1776					ext4_ext_get_actual_len(newext),
1777					nearex);
1778			nearex++;
1779		} else {
1780			/* Insert before */
1781			BUG_ON(newext->ee_block == nearex->ee_block);
1782			ext_debug("insert %d:%llu:[%d]%d %s after: "
1783					"nearest 0x%p\n"
1784					le32_to_cpu(newext->ee_block),
1785					ext4_ext_pblock(newext),
1786					ext4_ext_is_uninitialized(newext),
1787					ext4_ext_get_actual_len(newext),
1788					nearex);
1789		}
1790		len = EXT_LAST_EXTENT(eh) - nearex + 1;
1791		if (len > 0) {
1792			ext_debug("insert %d:%llu:[%d]%d: "
1793					"move %d extents from 0x%p to 0x%p\n",
1794					le32_to_cpu(newext->ee_block),
1795					ext4_ext_pblock(newext),
1796					ext4_ext_is_uninitialized(newext),
1797					ext4_ext_get_actual_len(newext),
1798					len, nearex, nearex + 1);
1799			memmove(nearex + 1, nearex,
1800				len * sizeof(struct ext4_extent));
1801		}
1802	}
1803
1804	le16_add_cpu(&eh->eh_entries, 1);
1805	path[depth].p_ext = nearex;
1806	nearex->ee_block = newext->ee_block;
1807	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1808	nearex->ee_len = newext->ee_len;
1809
1810merge:
1811	/* try to merge extents to the right */
1812	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1813		ext4_ext_try_to_merge(inode, path, nearex);
1814
1815	/* try to merge extents to the left */
1816
1817	/* time to correct all indexes above */
1818	err = ext4_ext_correct_indexes(handle, inode, path);
1819	if (err)
1820		goto cleanup;
1821
1822	err = ext4_ext_dirty(handle, inode, path + depth);
1823
1824cleanup:
1825	if (npath) {
1826		ext4_ext_drop_refs(npath);
1827		kfree(npath);
1828	}
1829	ext4_ext_invalidate_cache(inode);
1830	return err;
1831}
1832
1833static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1834			       ext4_lblk_t num, ext_prepare_callback func,
1835			       void *cbdata)
1836{
1837	struct ext4_ext_path *path = NULL;
1838	struct ext4_ext_cache cbex;
1839	struct ext4_extent *ex;
1840	ext4_lblk_t next, start = 0, end = 0;
1841	ext4_lblk_t last = block + num;
1842	int depth, exists, err = 0;
1843
1844	BUG_ON(func == NULL);
1845	BUG_ON(inode == NULL);
1846
1847	while (block < last && block != EXT_MAX_BLOCKS) {
1848		num = last - block;
1849		/* find extent for this block */
1850		down_read(&EXT4_I(inode)->i_data_sem);
1851		path = ext4_ext_find_extent(inode, block, path);
1852		up_read(&EXT4_I(inode)->i_data_sem);
1853		if (IS_ERR(path)) {
1854			err = PTR_ERR(path);
1855			path = NULL;
1856			break;
1857		}
1858
1859		depth = ext_depth(inode);
1860		if (unlikely(path[depth].p_hdr == NULL)) {
1861			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1862			err = -EIO;
1863			break;
1864		}
1865		ex = path[depth].p_ext;
1866		next = ext4_ext_next_allocated_block(path);
1867
1868		exists = 0;
1869		if (!ex) {
1870			/* there is no extent yet, so try to allocate
1871			 * all requested space */
1872			start = block;
1873			end = block + num;
1874		} else if (le32_to_cpu(ex->ee_block) > block) {
1875			/* need to allocate space before found extent */
1876			start = block;
1877			end = le32_to_cpu(ex->ee_block);
1878			if (block + num < end)
1879				end = block + num;
1880		} else if (block >= le32_to_cpu(ex->ee_block)
1881					+ ext4_ext_get_actual_len(ex)) {
1882			/* need to allocate space after found extent */
1883			start = block;
1884			end = block + num;
1885			if (end >= next)
1886				end = next;
1887		} else if (block >= le32_to_cpu(ex->ee_block)) {
1888			/*
1889			 * some part of requested space is covered
1890			 * by found extent
1891			 */
1892			start = block;
1893			end = le32_to_cpu(ex->ee_block)
1894				+ ext4_ext_get_actual_len(ex);
1895			if (block + num < end)
1896				end = block + num;
1897			exists = 1;
1898		} else {
1899			BUG();
1900		}
1901		BUG_ON(end <= start);
1902
1903		if (!exists) {
1904			cbex.ec_block = start;
1905			cbex.ec_len = end - start;
1906			cbex.ec_start = 0;
1907		} else {
1908			cbex.ec_block = le32_to_cpu(ex->ee_block);
1909			cbex.ec_len = ext4_ext_get_actual_len(ex);
1910			cbex.ec_start = ext4_ext_pblock(ex);
1911		}
1912
1913		if (unlikely(cbex.ec_len == 0)) {
1914			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1915			err = -EIO;
1916			break;
1917		}
1918		err = func(inode, next, &cbex, ex, cbdata);
1919		ext4_ext_drop_refs(path);
1920
1921		if (err < 0)
1922			break;
1923
1924		if (err == EXT_REPEAT)
1925			continue;
1926		else if (err == EXT_BREAK) {
1927			err = 0;
1928			break;
1929		}
1930
1931		if (ext_depth(inode) != depth) {
1932			/* depth was changed. we have to realloc path */
1933			kfree(path);
1934			path = NULL;
1935		}
1936
1937		block = cbex.ec_block + cbex.ec_len;
1938	}
1939
1940	if (path) {
1941		ext4_ext_drop_refs(path);
1942		kfree(path);
1943	}
1944
1945	return err;
1946}
1947
1948static void
1949ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1950			__u32 len, ext4_fsblk_t start)
1951{
1952	struct ext4_ext_cache *cex;
1953	BUG_ON(len == 0);
1954	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1955	trace_ext4_ext_put_in_cache(inode, block, len, start);
1956	cex = &EXT4_I(inode)->i_cached_extent;
1957	cex->ec_block = block;
1958	cex->ec_len = len;
1959	cex->ec_start = start;
1960	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1961}
1962
1963/*
1964 * ext4_ext_put_gap_in_cache:
1965 * calculate boundaries of the gap that the requested block fits into
1966 * and cache this gap
1967 */
1968static void
1969ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1970				ext4_lblk_t block)
1971{
1972	int depth = ext_depth(inode);
1973	unsigned long len;
1974	ext4_lblk_t lblock;
1975	struct ext4_extent *ex;
1976
1977	ex = path[depth].p_ext;
1978	if (ex == NULL) {
1979		/* there is no extent yet, so gap is [0;-] */
1980		lblock = 0;
1981		len = EXT_MAX_BLOCKS;
1982		ext_debug("cache gap(whole file):");
1983	} else if (block < le32_to_cpu(ex->ee_block)) {
1984		lblock = block;
1985		len = le32_to_cpu(ex->ee_block) - block;
1986		ext_debug("cache gap(before): %u [%u:%u]",
1987				block,
1988				le32_to_cpu(ex->ee_block),
1989				 ext4_ext_get_actual_len(ex));
1990	} else if (block >= le32_to_cpu(ex->ee_block)
1991			+ ext4_ext_get_actual_len(ex)) {
1992		ext4_lblk_t next;
1993		lblock = le32_to_cpu(ex->ee_block)
1994			+ ext4_ext_get_actual_len(ex);
1995
1996		next = ext4_ext_next_allocated_block(path);
1997		ext_debug("cache gap(after): [%u:%u] %u",
1998				le32_to_cpu(ex->ee_block),
1999				ext4_ext_get_actual_len(ex),
2000				block);
2001		BUG_ON(next == lblock);
2002		len = next - lblock;
2003	} else {
2004		lblock = len = 0;
2005		BUG();
2006	}
2007
2008	ext_debug(" -> %u:%lu\n", lblock, len);
2009	ext4_ext_put_in_cache(inode, lblock, len, 0);
2010}
2011
2012/*
2013 * ext4_ext_check_cache()
2014 * Checks to see if the given block is in the cache.
2015 * If it is, the cached extent is stored in the given
2016 * cache extent pointer.  If the cached extent is a hole,
2017 * this routine should be used instead of
2018 * ext4_ext_in_cache if the calling function needs to
2019 * know the size of the hole.
2020 *
2021 * @inode: The files inode
2022 * @block: The block to look for in the cache
2023 * @ex:    Pointer where the cached extent will be stored
2024 *         if it contains block
2025 *
2026 * Return 0 if cache is invalid; 1 if the cache is valid
2027 */
2028static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2029	struct ext4_ext_cache *ex){
2030	struct ext4_ext_cache *cex;
2031	struct ext4_sb_info *sbi;
2032	int ret = 0;
2033
2034	/*
2035	 * We borrow i_block_reservation_lock to protect i_cached_extent
2036	 */
2037	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2038	cex = &EXT4_I(inode)->i_cached_extent;
2039	sbi = EXT4_SB(inode->i_sb);
2040
2041	/* has cache valid data? */
2042	if (cex->ec_len == 0)
2043		goto errout;
2044
2045	if (in_range(block, cex->ec_block, cex->ec_len)) {
2046		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2047		ext_debug("%u cached by %u:%u:%llu\n",
2048				block,
2049				cex->ec_block, cex->ec_len, cex->ec_start);
2050		ret = 1;
2051	}
2052errout:
2053	if (!ret)
2054		sbi->extent_cache_misses++;
2055	else
2056		sbi->extent_cache_hits++;
2057	trace_ext4_ext_in_cache(inode, block, ret);
2058	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2059	return ret;
2060}
2061
2062/*
2063 * ext4_ext_in_cache()
2064 * Checks to see if the given block is in the cache.
2065 * If it is, the cached extent is stored in the given
2066 * extent pointer.
2067 *
2068 * @inode: The files inode
2069 * @block: The block to look for in the cache
2070 * @ex:    Pointer where the cached extent will be stored
2071 *         if it contains block
2072 *
2073 * Return 0 if cache is invalid; 1 if the cache is valid
2074 */
2075static int
2076ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2077			struct ext4_extent *ex)
2078{
2079	struct ext4_ext_cache cex;
2080	int ret = 0;
2081
2082	if (ext4_ext_check_cache(inode, block, &cex)) {
2083		ex->ee_block = cpu_to_le32(cex.ec_block);
2084		ext4_ext_store_pblock(ex, cex.ec_start);
2085		ex->ee_len = cpu_to_le16(cex.ec_len);
2086		ret = 1;
2087	}
2088
2089	return ret;
2090}
2091
2092
2093/*
2094 * ext4_ext_rm_idx:
2095 * removes index from the index block.
2096 */
2097static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2098			struct ext4_ext_path *path)
2099{
2100	int err;
2101	ext4_fsblk_t leaf;
2102
2103	/* free index block */
2104	path--;
2105	leaf = ext4_idx_pblock(path->p_idx);
2106	if (unlikely(path->p_hdr->eh_entries == 0)) {
2107		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2108		return -EIO;
2109	}
2110	err = ext4_ext_get_access(handle, inode, path);
2111	if (err)
2112		return err;
2113
2114	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2115		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2116		len *= sizeof(struct ext4_extent_idx);
2117		memmove(path->p_idx, path->p_idx + 1, len);
2118	}
2119
2120	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2121	err = ext4_ext_dirty(handle, inode, path);
2122	if (err)
2123		return err;
2124	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2125	trace_ext4_ext_rm_idx(inode, leaf);
2126
2127	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2128			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2129	return err;
2130}
2131
2132/*
2133 * ext4_ext_calc_credits_for_single_extent:
2134 * This routine returns max. credits that needed to insert an extent
2135 * to the extent tree.
2136 * When pass the actual path, the caller should calculate credits
2137 * under i_data_sem.
2138 */
2139int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2140						struct ext4_ext_path *path)
2141{
2142	if (path) {
2143		int depth = ext_depth(inode);
2144		int ret = 0;
2145
2146		/* probably there is space in leaf? */
2147		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2148				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2149
2150			/*
2151			 *  There are some space in the leaf tree, no
2152			 *  need to account for leaf block credit
2153			 *
2154			 *  bitmaps and block group descriptor blocks
2155			 *  and other metadata blocks still need to be
2156			 *  accounted.
2157			 */
2158			/* 1 bitmap, 1 block group descriptor */
2159			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2160			return ret;
2161		}
2162	}
2163
2164	return ext4_chunk_trans_blocks(inode, nrblocks);
2165}
2166
2167/*
2168 * How many index/leaf blocks need to change/allocate to modify nrblocks?
2169 *
2170 * if nrblocks are fit in a single extent (chunk flag is 1), then
2171 * in the worse case, each tree level index/leaf need to be changed
2172 * if the tree split due to insert a new extent, then the old tree
2173 * index/leaf need to be updated too
2174 *
2175 * If the nrblocks are discontiguous, they could cause
2176 * the whole tree split more than once, but this is really rare.
2177 */
2178int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2179{
2180	int index;
2181	int depth = ext_depth(inode);
2182
2183	if (chunk)
2184		index = depth * 2;
2185	else
2186		index = depth * 3;
2187
2188	return index;
2189}
2190
2191static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2192			      struct ext4_extent *ex,
2193			      ext4_fsblk_t *partial_cluster,
2194			      ext4_lblk_t from, ext4_lblk_t to)
2195{
2196	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2197	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2198	ext4_fsblk_t pblk;
2199	int flags = EXT4_FREE_BLOCKS_FORGET;
2200
2201	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2202		flags |= EXT4_FREE_BLOCKS_METADATA;
2203	/*
2204	 * For bigalloc file systems, we never free a partial cluster
2205	 * at the beginning of the extent.  Instead, we make a note
2206	 * that we tried freeing the cluster, and check to see if we
2207	 * need to free it on a subsequent call to ext4_remove_blocks,
2208	 * or at the end of the ext4_truncate() operation.
2209	 */
2210	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2211
2212	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2213	/*
2214	 * If we have a partial cluster, and it's different from the
2215	 * cluster of the last block, we need to explicitly free the
2216	 * partial cluster here.
2217	 */
2218	pblk = ext4_ext_pblock(ex) + ee_len - 1;
2219	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2220		ext4_free_blocks(handle, inode, NULL,
2221				 EXT4_C2B(sbi, *partial_cluster),
2222				 sbi->s_cluster_ratio, flags);
2223		*partial_cluster = 0;
2224	}
2225
2226#ifdef EXTENTS_STATS
2227	{
2228		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2229		spin_lock(&sbi->s_ext_stats_lock);
2230		sbi->s_ext_blocks += ee_len;
2231		sbi->s_ext_extents++;
2232		if (ee_len < sbi->s_ext_min)
2233			sbi->s_ext_min = ee_len;
2234		if (ee_len > sbi->s_ext_max)
2235			sbi->s_ext_max = ee_len;
2236		if (ext_depth(inode) > sbi->s_depth_max)
2237			sbi->s_depth_max = ext_depth(inode);
2238		spin_unlock(&sbi->s_ext_stats_lock);
2239	}
2240#endif
2241	if (from >= le32_to_cpu(ex->ee_block)
2242	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2243		/* tail removal */
2244		ext4_lblk_t num;
2245
2246		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2247		pblk = ext4_ext_pblock(ex) + ee_len - num;
2248		ext_debug("free last %u blocks starting %llu\n", num, pblk);
2249		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2250		/*
2251		 * If the block range to be freed didn't start at the
2252		 * beginning of a cluster, and we removed the entire
2253		 * extent, save the partial cluster here, since we
2254		 * might need to delete if we determine that the
2255		 * truncate operation has removed all of the blocks in
2256		 * the cluster.
2257		 */
2258		if (pblk & (sbi->s_cluster_ratio - 1) &&
2259		    (ee_len == num))
2260			*partial_cluster = EXT4_B2C(sbi, pblk);
2261		else
2262			*partial_cluster = 0;
2263	} else if (from == le32_to_cpu(ex->ee_block)
2264		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2265		/* head removal */
2266		ext4_lblk_t num;
2267		ext4_fsblk_t start;
2268
2269		num = to - from;
2270		start = ext4_ext_pblock(ex);
2271
2272		ext_debug("free first %u blocks starting %llu\n", num, start);
2273		ext4_free_blocks(handle, inode, NULL, start, num, flags);
2274
2275	} else {
2276		printk(KERN_INFO "strange request: removal(2) "
2277				"%u-%u from %u:%u\n",
2278				from, to, le32_to_cpu(ex->ee_block), ee_len);
2279	}
2280	return 0;
2281}
2282
2283
2284/*
2285 * ext4_ext_rm_leaf() Removes the extents associated with the
2286 * blocks appearing between "start" and "end", and splits the extents
2287 * if "start" and "end" appear in the same extent
2288 *
2289 * @handle: The journal handle
2290 * @inode:  The files inode
2291 * @path:   The path to the leaf
2292 * @start:  The first block to remove
2293 * @end:   The last block to remove
2294 */
2295static int
2296ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2297		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
2298		 ext4_lblk_t start, ext4_lblk_t end)
2299{
2300	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2301	int err = 0, correct_index = 0;
2302	int depth = ext_depth(inode), credits;
2303	struct ext4_extent_header *eh;
2304	ext4_lblk_t a, b;
2305	unsigned num;
2306	ext4_lblk_t ex_ee_block;
2307	unsigned short ex_ee_len;
2308	unsigned uninitialized = 0;
2309	struct ext4_extent *ex;
2310
2311	/* the header must be checked already in ext4_ext_remove_space() */
2312	ext_debug("truncate since %u in leaf\n", start);
2313	if (!path[depth].p_hdr)
2314		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2315	eh = path[depth].p_hdr;
2316	if (unlikely(path[depth].p_hdr == NULL)) {
2317		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2318		return -EIO;
2319	}
2320	/* find where to start removing */
2321	ex = EXT_LAST_EXTENT(eh);
2322
2323	ex_ee_block = le32_to_cpu(ex->ee_block);
2324	ex_ee_len = ext4_ext_get_actual_len(ex);
2325
2326	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2327
2328	while (ex >= EXT_FIRST_EXTENT(eh) &&
2329			ex_ee_block + ex_ee_len > start) {
2330
2331		if (ext4_ext_is_uninitialized(ex))
2332			uninitialized = 1;
2333		else
2334			uninitialized = 0;
2335
2336		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2337			 uninitialized, ex_ee_len);
2338		path[depth].p_ext = ex;
2339
2340		a = ex_ee_block > start ? ex_ee_block : start;
2341		b = ex_ee_block+ex_ee_len - 1 < end ?
2342			ex_ee_block+ex_ee_len - 1 : end;
2343
2344		ext_debug("  border %u:%u\n", a, b);
2345
2346		/* If this extent is beyond the end of the hole, skip it */
2347		if (end <= ex_ee_block) {
2348			ex--;
2349			ex_ee_block = le32_to_cpu(ex->ee_block);
2350			ex_ee_len = ext4_ext_get_actual_len(ex);
2351			continue;
2352		} else if (b != ex_ee_block + ex_ee_len - 1) {
2353			EXT4_ERROR_INODE(inode,"  bad truncate %u:%u\n",
2354					 start, end);
2355			err = -EIO;
2356			goto out;
2357		} else if (a != ex_ee_block) {
2358			/* remove tail of the extent */
2359			num = a - ex_ee_block;
2360		} else {
2361			/* remove whole extent: excellent! */
2362			num = 0;
2363		}
2364		/*
2365		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2366		 * descriptor) for each block group; assume two block
2367		 * groups plus ex_ee_len/blocks_per_block_group for
2368		 * the worst case
2369		 */
2370		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2371		if (ex == EXT_FIRST_EXTENT(eh)) {
2372			correct_index = 1;
2373			credits += (ext_depth(inode)) + 1;
2374		}
2375		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2376
2377		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2378		if (err)
2379			goto out;
2380
2381		err = ext4_ext_get_access(handle, inode, path + depth);
2382		if (err)
2383			goto out;
2384
2385		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2386					 a, b);
2387		if (err)
2388			goto out;
2389
2390		if (num == 0)
2391			/* this extent is removed; mark slot entirely unused */
2392			ext4_ext_store_pblock(ex, 0);
2393
2394		ex->ee_len = cpu_to_le16(num);
2395		/*
2396		 * Do not mark uninitialized if all the blocks in the
2397		 * extent have been removed.
2398		 */
2399		if (uninitialized && num)
2400			ext4_ext_mark_uninitialized(ex);
2401		/*
2402		 * If the extent was completely released,
2403		 * we need to remove it from the leaf
2404		 */
2405		if (num == 0) {
2406			if (end != EXT_MAX_BLOCKS - 1) {
2407				/*
2408				 * For hole punching, we need to scoot all the
2409				 * extents up when an extent is removed so that
2410				 * we dont have blank extents in the middle
2411				 */
2412				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2413					sizeof(struct ext4_extent));
2414
2415				/* Now get rid of the one at the end */
2416				memset(EXT_LAST_EXTENT(eh), 0,
2417					sizeof(struct ext4_extent));
2418			}
2419			le16_add_cpu(&eh->eh_entries, -1);
2420		} else
2421			*partial_cluster = 0;
2422
2423		err = ext4_ext_dirty(handle, inode, path + depth);
2424		if (err)
2425			goto out;
2426
2427		ext_debug("new extent: %u:%u:%llu\n", block, num,
2428				ext4_ext_pblock(ex));
2429		ex--;
2430		ex_ee_block = le32_to_cpu(ex->ee_block);
2431		ex_ee_len = ext4_ext_get_actual_len(ex);
2432	}
2433
2434	if (correct_index && eh->eh_entries)
2435		err = ext4_ext_correct_indexes(handle, inode, path);
2436
2437	/*
2438	 * If there is still a entry in the leaf node, check to see if
2439	 * it references the partial cluster.  This is the only place
2440	 * where it could; if it doesn't, we can free the cluster.
2441	 */
2442	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
2443	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2444	     *partial_cluster)) {
2445		int flags = EXT4_FREE_BLOCKS_FORGET;
2446
2447		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2448			flags |= EXT4_FREE_BLOCKS_METADATA;
2449
2450		ext4_free_blocks(handle, inode, NULL,
2451				 EXT4_C2B(sbi, *partial_cluster),
2452				 sbi->s_cluster_ratio, flags);
2453		*partial_cluster = 0;
2454	}
2455
2456	/* if this leaf is free, then we should
2457	 * remove it from index block above */
2458	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2459		err = ext4_ext_rm_idx(handle, inode, path + depth);
2460
2461out:
2462	return err;
2463}
2464
2465/*
2466 * ext4_ext_more_to_rm:
2467 * returns 1 if current index has to be freed (even partial)
2468 */
2469static int
2470ext4_ext_more_to_rm(struct ext4_ext_path *path)
2471{
2472	BUG_ON(path->p_idx == NULL);
2473
2474	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2475		return 0;
2476
2477	/*
2478	 * if truncate on deeper level happened, it wasn't partial,
2479	 * so we have to consider current index for truncation
2480	 */
2481	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2482		return 0;
2483	return 1;
2484}
2485
2486static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2487{
2488	struct super_block *sb = inode->i_sb;
2489	int depth = ext_depth(inode);
2490	struct ext4_ext_path *path;
2491	ext4_fsblk_t partial_cluster = 0;
2492	handle_t *handle;
2493	int i, err;
2494
2495	ext_debug("truncate since %u\n", start);
2496
2497	/* probably first extent we're gonna free will be last in block */
2498	handle = ext4_journal_start(inode, depth + 1);
2499	if (IS_ERR(handle))
2500		return PTR_ERR(handle);
2501
2502again:
2503	ext4_ext_invalidate_cache(inode);
2504
2505	trace_ext4_ext_remove_space(inode, start, depth);
2506
2507	/*
2508	 * We start scanning from right side, freeing all the blocks
2509	 * after i_size and walking into the tree depth-wise.
2510	 */
2511	depth = ext_depth(inode);
2512	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2513	if (path == NULL) {
2514		ext4_journal_stop(handle);
2515		return -ENOMEM;
2516	}
2517	path[0].p_depth = depth;
2518	path[0].p_hdr = ext_inode_hdr(inode);
2519	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2520		err = -EIO;
2521		goto out;
2522	}
2523	i = err = 0;
2524
2525	while (i >= 0 && err == 0) {
2526		if (i == depth) {
2527			/* this is leaf block */
2528			err = ext4_ext_rm_leaf(handle, inode, path,
2529					       &partial_cluster, start,
2530					       EXT_MAX_BLOCKS - 1);
2531			/* root level has p_bh == NULL, brelse() eats this */
2532			brelse(path[i].p_bh);
2533			path[i].p_bh = NULL;
2534			i--;
2535			continue;
2536		}
2537
2538		/* this is index block */
2539		if (!path[i].p_hdr) {
2540			ext_debug("initialize header\n");
2541			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2542		}
2543
2544		if (!path[i].p_idx) {
2545			/* this level hasn't been touched yet */
2546			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2547			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2548			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2549				  path[i].p_hdr,
2550				  le16_to_cpu(path[i].p_hdr->eh_entries));
2551		} else {
2552			/* we were already here, see at next index */
2553			path[i].p_idx--;
2554		}
2555
2556		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2557				i, EXT_FIRST_INDEX(path[i].p_hdr),
2558				path[i].p_idx);
2559		if (ext4_ext_more_to_rm(path + i)) {
2560			struct buffer_head *bh;
2561			/* go to the next level */
2562			ext_debug("move to level %d (block %llu)\n",
2563				  i + 1, ext4_idx_pblock(path[i].p_idx));
2564			memset(path + i + 1, 0, sizeof(*path));
2565			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2566			if (!bh) {
2567				/* should we reset i_size? */
2568				err = -EIO;
2569				break;
2570			}
2571			if (WARN_ON(i + 1 > depth)) {
2572				err = -EIO;
2573				break;
2574			}
2575			if (ext4_ext_check(inode, ext_block_hdr(bh),
2576							depth - i - 1)) {
2577				err = -EIO;
2578				break;
2579			}
2580			path[i + 1].p_bh = bh;
2581
2582			/* save actual number of indexes since this
2583			 * number is changed at the next iteration */
2584			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2585			i++;
2586		} else {
2587			/* we finished processing this index, go up */
2588			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2589				/* index is empty, remove it;
2590				 * handle must be already prepared by the
2591				 * truncatei_leaf() */
2592				err = ext4_ext_rm_idx(handle, inode, path + i);
2593			}
2594			/* root level has p_bh == NULL, brelse() eats this */
2595			brelse(path[i].p_bh);
2596			path[i].p_bh = NULL;
2597			i--;
2598			ext_debug("return to level %d\n", i);
2599		}
2600	}
2601
2602	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
2603			path->p_hdr->eh_entries);
2604
2605	/* If we still have something in the partial cluster and we have removed
2606	 * even the first extent, then we should free the blocks in the partial
2607	 * cluster as well. */
2608	if (partial_cluster && path->p_hdr->eh_entries == 0) {
2609		int flags = EXT4_FREE_BLOCKS_FORGET;
2610
2611		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2612			flags |= EXT4_FREE_BLOCKS_METADATA;
2613
2614		ext4_free_blocks(handle, inode, NULL,
2615				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2616				 EXT4_SB(sb)->s_cluster_ratio, flags);
2617		partial_cluster = 0;
2618	}
2619
2620	/* TODO: flexible tree reduction should be here */
2621	if (path->p_hdr->eh_entries == 0) {
2622		/*
2623		 * truncate to zero freed all the tree,
2624		 * so we need to correct eh_depth
2625		 */
2626		err = ext4_ext_get_access(handle, inode, path);
2627		if (err == 0) {
2628			ext_inode_hdr(inode)->eh_depth = 0;
2629			ext_inode_hdr(inode)->eh_max =
2630				cpu_to_le16(ext4_ext_space_root(inode, 0));
2631			err = ext4_ext_dirty(handle, inode, path);
2632		}
2633	}
2634out:
2635	ext4_ext_drop_refs(path);
2636	kfree(path);
2637	if (err == -EAGAIN)
2638		goto again;
2639	ext4_journal_stop(handle);
2640
2641	return err;
2642}
2643
2644/*
2645 * called at mount time
2646 */
2647void ext4_ext_init(struct super_block *sb)
2648{
2649	/*
2650	 * possible initialization would be here
2651	 */
2652
2653	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2654#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2655		printk(KERN_INFO "EXT4-fs: file extents enabled");
2656#ifdef AGGRESSIVE_TEST
2657		printk(", aggressive tests");
2658#endif
2659#ifdef CHECK_BINSEARCH
2660		printk(", check binsearch");
2661#endif
2662#ifdef EXTENTS_STATS
2663		printk(", stats");
2664#endif
2665		printk("\n");
2666#endif
2667#ifdef EXTENTS_STATS
2668		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2669		EXT4_SB(sb)->s_ext_min = 1 << 30;
2670		EXT4_SB(sb)->s_ext_max = 0;
2671#endif
2672	}
2673}
2674
2675/*
2676 * called at umount time
2677 */
2678void ext4_ext_release(struct super_block *sb)
2679{
2680	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2681		return;
2682
2683#ifdef EXTENTS_STATS
2684	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2685		struct ext4_sb_info *sbi = EXT4_SB(sb);
2686		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2687			sbi->s_ext_blocks, sbi->s_ext_extents,
2688			sbi->s_ext_blocks / sbi->s_ext_extents);
2689		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2690			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2691	}
2692#endif
2693}
2694
2695/* FIXME!! we need to try to merge to left or right after zero-out  */
2696static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2697{
2698	ext4_fsblk_t ee_pblock;
2699	unsigned int ee_len;
2700	int ret;
2701
2702	ee_len    = ext4_ext_get_actual_len(ex);
2703	ee_pblock = ext4_ext_pblock(ex);
2704
2705	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2706	if (ret > 0)
2707		ret = 0;
2708
2709	return ret;
2710}
2711
2712/*
2713 * used by extent splitting.
2714 */
2715#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
2716					due to ENOSPC */
2717#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
2718#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
2719
2720/*
2721 * ext4_split_extent_at() splits an extent at given block.
2722 *
2723 * @handle: the journal handle
2724 * @inode: the file inode
2725 * @path: the path to the extent
2726 * @split: the logical block where the extent is splitted.
2727 * @split_flags: indicates if the extent could be zeroout if split fails, and
2728 *		 the states(init or uninit) of new extents.
2729 * @flags: flags used to insert new extent to extent tree.
2730 *
2731 *
2732 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
2733 * of which are deterimined by split_flag.
2734 *
2735 * There are two cases:
2736 *  a> the extent are splitted into two extent.
2737 *  b> split is not needed, and just mark the extent.
2738 *
2739 * return 0 on success.
2740 */
2741static int ext4_split_extent_at(handle_t *handle,
2742			     struct inode *inode,
2743			     struct ext4_ext_path *path,
2744			     ext4_lblk_t split,
2745			     int split_flag,
2746			     int flags)
2747{
2748	ext4_fsblk_t newblock;
2749	ext4_lblk_t ee_block;
2750	struct ext4_extent *ex, newex, orig_ex;
2751	struct ext4_extent *ex2 = NULL;
2752	unsigned int ee_len, depth;
2753	int err = 0;
2754
2755	ext_debug("ext4_split_extents_at: inode %lu, logical"
2756		"block %llu\n", inode->i_ino, (unsigned long long)split);
2757
2758	ext4_ext_show_leaf(inode, path);
2759
2760	depth = ext_depth(inode);
2761	ex = path[depth].p_ext;
2762	ee_block = le32_to_cpu(ex->ee_block);
2763	ee_len = ext4_ext_get_actual_len(ex);
2764	newblock = split - ee_block + ext4_ext_pblock(ex);
2765
2766	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2767
2768	err = ext4_ext_get_access(handle, inode, path + depth);
2769	if (err)
2770		goto out;
2771
2772	if (split == ee_block) {
2773		/*
2774		 * case b: block @split is the block that the extent begins with
2775		 * then we just change the state of the extent, and splitting
2776		 * is not needed.
2777		 */
2778		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2779			ext4_ext_mark_uninitialized(ex);
2780		else
2781			ext4_ext_mark_initialized(ex);
2782
2783		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2784			ext4_ext_try_to_merge(inode, path, ex);
2785
2786		err = ext4_ext_dirty(handle, inode, path + depth);
2787		goto out;
2788	}
2789
2790	/* case a */
2791	memcpy(&orig_ex, ex, sizeof(orig_ex));
2792	ex->ee_len = cpu_to_le16(split - ee_block);
2793	if (split_flag & EXT4_EXT_MARK_UNINIT1)
2794		ext4_ext_mark_uninitialized(ex);
2795
2796	/*
2797	 * path may lead to new leaf, not to original leaf any more
2798	 * after ext4_ext_insert_extent() returns,
2799	 */
2800	err = ext4_ext_dirty(handle, inode, path + depth);
2801	if (err)
2802		goto fix_extent_len;
2803
2804	ex2 = &newex;
2805	ex2->ee_block = cpu_to_le32(split);
2806	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
2807	ext4_ext_store_pblock(ex2, newblock);
2808	if (split_flag & EXT4_EXT_MARK_UNINIT2)
2809		ext4_ext_mark_uninitialized(ex2);
2810
2811	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2812	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2813		err = ext4_ext_zeroout(inode, &orig_ex);
2814		if (err)
2815			goto fix_extent_len;
2816		/* update the extent length and mark as initialized */
2817		ex->ee_len = cpu_to_le32(ee_len);
2818		ext4_ext_try_to_merge(inode, path, ex);
2819		err = ext4_ext_dirty(handle, inode, path + depth);
2820		goto out;
2821	} else if (err)
2822		goto fix_extent_len;
2823
2824out:
2825	ext4_ext_show_leaf(inode, path);
2826	return err;
2827
2828fix_extent_len:
2829	ex->ee_len = orig_ex.ee_len;
2830	ext4_ext_dirty(handle, inode, path + depth);
2831	return err;
2832}
2833
2834/*
2835 * ext4_split_extents() splits an extent and mark extent which is covered
2836 * by @map as split_flags indicates
2837 *
2838 * It may result in splitting the extent into multiple extents (upto three)
2839 * There are three possibilities:
2840 *   a> There is no split required
2841 *   b> Splits in two extents: Split is happening at either end of the extent
2842 *   c> Splits in three extents: Somone is splitting in middle of the extent
2843 *
2844 */
2845static int ext4_split_extent(handle_t *handle,
2846			      struct inode *inode,
2847			      struct ext4_ext_path *path,
2848			      struct ext4_map_blocks *map,
2849			      int split_flag,
2850			      int flags)
2851{
2852	ext4_lblk_t ee_block;
2853	struct ext4_extent *ex;
2854	unsigned int ee_len, depth;
2855	int err = 0;
2856	int uninitialized;
2857	int split_flag1, flags1;
2858
2859	depth = ext_depth(inode);
2860	ex = path[depth].p_ext;
2861	ee_block = le32_to_cpu(ex->ee_block);
2862	ee_len = ext4_ext_get_actual_len(ex);
2863	uninitialized = ext4_ext_is_uninitialized(ex);
2864
2865	if (map->m_lblk + map->m_len < ee_block + ee_len) {
2866		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2867			      EXT4_EXT_MAY_ZEROOUT : 0;
2868		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2869		if (uninitialized)
2870			split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2871				       EXT4_EXT_MARK_UNINIT2;
2872		err = ext4_split_extent_at(handle, inode, path,
2873				map->m_lblk + map->m_len, split_flag1, flags1);
2874		if (err)
2875			goto out;
2876	}
2877
2878	ext4_ext_drop_refs(path);
2879	path = ext4_ext_find_extent(inode, map->m_lblk, path);
2880	if (IS_ERR(path))
2881		return PTR_ERR(path);
2882
2883	if (map->m_lblk >= ee_block) {
2884		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2885			      EXT4_EXT_MAY_ZEROOUT : 0;
2886		if (uninitialized)
2887			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2888		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2889			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2890		err = ext4_split_extent_at(handle, inode, path,
2891				map->m_lblk, split_flag1, flags);
2892		if (err)
2893			goto out;
2894	}
2895
2896	ext4_ext_show_leaf(inode, path);
2897out:
2898	return err ? err : map->m_len;
2899}
2900
2901#define EXT4_EXT_ZERO_LEN 7
2902/*
2903 * This function is called by ext4_ext_map_blocks() if someone tries to write
2904 * to an uninitialized extent. It may result in splitting the uninitialized
2905 * extent into multiple extents (up to three - one initialized and two
2906 * uninitialized).
2907 * There are three possibilities:
2908 *   a> There is no split required: Entire extent should be initialized
2909 *   b> Splits in two extents: Write is happening at either end of the extent
2910 *   c> Splits in three extents: Somone is writing in middle of the extent
2911 *
2912 * Pre-conditions:
2913 *  - The extent pointed to by 'path' is uninitialized.
2914 *  - The extent pointed to by 'path' contains a superset
2915 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
2916 *
2917 * Post-conditions on success:
2918 *  - the returned value is the number of blocks beyond map->l_lblk
2919 *    that are allocated and initialized.
2920 *    It is guaranteed to be >= map->m_len.
2921 */
2922static int ext4_ext_convert_to_initialized(handle_t *handle,
2923					   struct inode *inode,
2924					   struct ext4_map_blocks *map,
2925					   struct ext4_ext_path *path)
2926{
2927	struct ext4_extent_header *eh;
2928	struct ext4_map_blocks split_map;
2929	struct ext4_extent zero_ex;
2930	struct ext4_extent *ex;
2931	ext4_lblk_t ee_block, eof_block;
2932	unsigned int ee_len, depth;
2933	int allocated;
2934	int err = 0;
2935	int split_flag = 0;
2936
2937	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2938		"block %llu, max_blocks %u\n", inode->i_ino,
2939		(unsigned long long)map->m_lblk, map->m_len);
2940
2941	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2942		inode->i_sb->s_blocksize_bits;
2943	if (eof_block < map->m_lblk + map->m_len)
2944		eof_block = map->m_lblk + map->m_len;
2945
2946	depth = ext_depth(inode);
2947	eh = path[depth].p_hdr;
2948	ex = path[depth].p_ext;
2949	ee_block = le32_to_cpu(ex->ee_block);
2950	ee_len = ext4_ext_get_actual_len(ex);
2951	allocated = ee_len - (map->m_lblk - ee_block);
2952
2953	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
2954
2955	/* Pre-conditions */
2956	BUG_ON(!ext4_ext_is_uninitialized(ex));
2957	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
2958	BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
2959
2960	/*
2961	 * Attempt to transfer newly initialized blocks from the currently
2962	 * uninitialized extent to its left neighbor. This is much cheaper
2963	 * than an insertion followed by a merge as those involve costly
2964	 * memmove() calls. This is the common case in steady state for
2965	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
2966	 * writes.
2967	 *
2968	 * Limitations of the current logic:
2969	 *  - L1: we only deal with writes at the start of the extent.
2970	 *    The approach could be extended to writes at the end
2971	 *    of the extent but this scenario was deemed less common.
2972	 *  - L2: we do not deal with writes covering the whole extent.
2973	 *    This would require removing the extent if the transfer
2974	 *    is possible.
2975	 *  - L3: we only attempt to merge with an extent stored in the
2976	 *    same extent tree node.
2977	 */
2978	if ((map->m_lblk == ee_block) &&	/*L1*/
2979		(map->m_len < ee_len) &&	/*L2*/
2980		(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
2981		struct ext4_extent *prev_ex;
2982		ext4_lblk_t prev_lblk;
2983		ext4_fsblk_t prev_pblk, ee_pblk;
2984		unsigned int prev_len, write_len;
2985
2986		prev_ex = ex - 1;
2987		prev_lblk = le32_to_cpu(prev_ex->ee_block);
2988		prev_len = ext4_ext_get_actual_len(prev_ex);
2989		prev_pblk = ext4_ext_pblock(prev_ex);
2990		ee_pblk = ext4_ext_pblock(ex);
2991		write_len = map->m_len;
2992
2993		/*
2994		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
2995		 * upon those conditions:
2996		 * - C1: prev_ex is initialized,
2997		 * - C2: prev_ex is logically abutting ex,
2998		 * - C3: prev_ex is physically abutting ex,
2999		 * - C4: prev_ex can receive the additional blocks without
3000		 *   overflowing the (initialized) length limit.
3001		 */
3002		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
3003			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
3004			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
3005			(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
3006			err = ext4_ext_get_access(handle, inode, path + depth);
3007			if (err)
3008				goto out;
3009
3010			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3011				map, ex, prev_ex);
3012
3013			/* Shift the start of ex by 'write_len' blocks */
3014			ex->ee_block = cpu_to_le32(ee_block + write_len);
3015			ext4_ext_store_pblock(ex, ee_pblk + write_len);
3016			ex->ee_len = cpu_to_le16(ee_len - write_len);
3017			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3018
3019			/* Extend prev_ex by 'write_len' blocks */
3020			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
3021
3022			/* Mark the block containing both extents as dirty */
3023			ext4_ext_dirty(handle, inode, path + depth);
3024
3025			/* Update path to point to the right extent */
3026			path[depth].p_ext = prev_ex;
3027
3028			/* Result: number of initialized blocks past m_lblk */
3029			allocated = write_len;
3030			goto out;
3031		}
3032	}
3033
3034	WARN_ON(map->m_lblk < ee_block);
3035	/*
3036	 * It is safe to convert extent to initialized via explicit
3037	 * zeroout only if extent is fully insde i_size or new_size.
3038	 */
3039	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3040
3041	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
3042	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
3043	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3044		err = ext4_ext_zeroout(inode, ex);
3045		if (err)
3046			goto out;
3047
3048		err = ext4_ext_get_access(handle, inode, path + depth);
3049		if (err)
3050			goto out;
3051		ext4_ext_mark_initialized(ex);
3052		ext4_ext_try_to_merge(inode, path, ex);
3053		err = ext4_ext_dirty(handle, inode, path + depth);
3054		goto out;
3055	}
3056
3057	/*
3058	 * four cases:
3059	 * 1. split the extent into three extents.
3060	 * 2. split the extent into two extents, zeroout the first half.
3061	 * 3. split the extent into two extents, zeroout the second half.
3062	 * 4. split the extent into two extents with out zeroout.
3063	 */
3064	split_map.m_lblk = map->m_lblk;
3065	split_map.m_len = map->m_len;
3066
3067	if (allocated > map->m_len) {
3068		if (allocated <= EXT4_EXT_ZERO_LEN &&
3069		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3070			/* case 3 */
3071			zero_ex.ee_block =
3072					 cpu_to_le32(map->m_lblk);
3073			zero_ex.ee_len = cpu_to_le16(allocated);
3074			ext4_ext_store_pblock(&zero_ex,
3075				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3076			err = ext4_ext_zeroout(inode, &zero_ex);
3077			if (err)
3078				goto out;
3079			split_map.m_lblk = map->m_lblk;
3080			split_map.m_len = allocated;
3081		} else if ((map->m_lblk - ee_block + map->m_len <
3082			   EXT4_EXT_ZERO_LEN) &&
3083			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3084			/* case 2 */
3085			if (map->m_lblk != ee_block) {
3086				zero_ex.ee_block = ex->ee_block;
3087				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3088							ee_block);
3089				ext4_ext_store_pblock(&zero_ex,
3090						      ext4_ext_pblock(ex));
3091				err = ext4_ext_zeroout(inode, &zero_ex);
3092				if (err)
3093					goto out;
3094			}
3095
3096			split_map.m_lblk = ee_block;
3097			split_map.m_len = map->m_lblk - ee_block + map->m_len;
3098			allocated = map->m_len;
3099		}
3100	}
3101
3102	allocated = ext4_split_extent(handle, inode, path,
3103				       &split_map, split_flag, 0);
3104	if (allocated < 0)
3105		err = allocated;
3106
3107out:
3108	return err ? err : allocated;
3109}
3110
3111/*
3112 * This function is called by ext4_ext_map_blocks() from
3113 * ext4_get_blocks_dio_write() when DIO to write
3114 * to an uninitialized extent.
3115 *
3116 * Writing to an uninitialized extent may result in splitting the uninitialized
3117 * extent into multiple /initialized uninitialized extents (up to three)
3118 * There are three possibilities:
3119 *   a> There is no split required: Entire extent should be uninitialized
3120 *   b> Splits in two extents: Write is happening at either end of the extent
3121 *   c> Splits in three extents: Somone is writing in middle of the extent
3122 *
3123 * One of more index blocks maybe needed if the extent tree grow after
3124 * the uninitialized extent split. To prevent ENOSPC occur at the IO
3125 * complete, we need to split the uninitialized extent before DIO submit
3126 * the IO. The uninitialized extent called at this time will be split
3127 * into three uninitialized extent(at most). After IO complete, the part
3128 * being filled will be convert to initialized by the end_io callback function
3129 * via ext4_convert_unwritten_extents().
3130 *
3131 * Returns the size of uninitialized extent to be written on success.
3132 */
3133static int ext4_split_unwritten_extents(handle_t *handle,
3134					struct inode *inode,
3135					struct ext4_map_blocks *map,
3136					struct ext4_ext_path *path,
3137					int flags)
3138{
3139	ext4_lblk_t eof_block;
3140	ext4_lblk_t ee_block;
3141	struct ext4_extent *ex;
3142	unsigned int ee_len;
3143	int split_flag = 0, depth;
3144
3145	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
3146		"block %llu, max_blocks %u\n", inode->i_ino,
3147		(unsigned long long)map->m_lblk, map->m_len);
3148
3149	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3150		inode->i_sb->s_blocksize_bits;
3151	if (eof_block < map->m_lblk + map->m_len)
3152		eof_block = map->m_lblk + map->m_len;
3153	/*
3154	 * It is safe to convert extent to initialized via explicit
3155	 * zeroout only if extent is fully insde i_size or new_size.
3156	 */
3157	depth = ext_depth(inode);
3158	ex = path[depth].p_ext;
3159	ee_block = le32_to_cpu(ex->ee_block);
3160	ee_len = ext4_ext_get_actual_len(ex);
3161
3162	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3163	split_flag |= EXT4_EXT_MARK_UNINIT2;
3164
3165	flags |= EXT4_GET_BLOCKS_PRE_IO;
3166	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3167}
3168
3169static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3170					      struct inode *inode,
3171					      struct ext4_ext_path *path)
3172{
3173	struct ext4_extent *ex;
3174	int depth;
3175	int err = 0;
3176
3177	depth = ext_depth(inode);
3178	ex = path[depth].p_ext;
3179
3180	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3181		"block %llu, max_blocks %u\n", inode->i_ino,
3182		(unsigned long long)le32_to_cpu(ex->ee_block),
3183		ext4_ext_get_actual_len(ex));
3184
3185	err = ext4_ext_get_access(handle, inode, path + depth);
3186	if (err)
3187		goto out;
3188	/* first mark the extent as initialized */
3189	ext4_ext_mark_initialized(ex);
3190
3191	/* note: ext4_ext_correct_indexes() isn't needed here because
3192	 * borders are not changed
3193	 */
3194	ext4_ext_try_to_merge(inode, path, ex);
3195
3196	/* Mark modified extent as dirty */
3197	err = ext4_ext_dirty(handle, inode, path + depth);
3198out:
3199	ext4_ext_show_leaf(inode, path);
3200	return err;
3201}
3202
3203static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3204			sector_t block, int count)
3205{
3206	int i;
3207	for (i = 0; i < count; i++)
3208                unmap_underlying_metadata(bdev, block + i);
3209}
3210
3211/*
3212 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3213 */
3214static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3215			      ext4_lblk_t lblk,
3216			      struct ext4_ext_path *path,
3217			      unsigned int len)
3218{
3219	int i, depth;
3220	struct ext4_extent_header *eh;
3221	struct ext4_extent *last_ex;
3222
3223	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3224		return 0;
3225
3226	depth = ext_depth(inode);
3227	eh = path[depth].p_hdr;
3228
3229	if (unlikely(!eh->eh_entries)) {
3230		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3231				 "EOFBLOCKS_FL set");
3232		return -EIO;
3233	}
3234	last_ex = EXT_LAST_EXTENT(eh);
3235	/*
3236	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3237	 * last block in the last extent in the file.  We test this by
3238	 * first checking to see if the caller to
3239	 * ext4_ext_get_blocks() was interested in the last block (or
3240	 * a block beyond the last block) in the current extent.  If
3241	 * this turns out to be false, we can bail out from this
3242	 * function immediately.
3243	 */
3244	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3245	    ext4_ext_get_actual_len(last_ex))
3246		return 0;
3247	/*
3248	 * If the caller does appear to be planning to write at or
3249	 * beyond the end of the current extent, we then test to see
3250	 * if the current extent is the last extent in the file, by
3251	 * checking to make sure it was reached via the rightmost node
3252	 * at each level of the tree.
3253	 */
3254	for (i = depth-1; i >= 0; i--)
3255		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3256			return 0;
3257	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3258	return ext4_mark_inode_dirty(handle, inode);
3259}
3260
3261/**
3262 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3263 *
3264 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
3265 * whether there are any buffers marked for delayed allocation. It returns '1'
3266 * on the first delalloc'ed buffer head found. If no buffer head in the given
3267 * range is marked for delalloc, it returns 0.
3268 * lblk_start should always be <= lblk_end.
3269 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3270 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3271 * block sooner). This is useful when blocks are truncated sequentially from
3272 * lblk_start towards lblk_end.
3273 */
3274static int ext4_find_delalloc_range(struct inode *inode,
3275				    ext4_lblk_t lblk_start,
3276				    ext4_lblk_t lblk_end,
3277				    int search_hint_reverse)
3278{
3279	struct address_space *mapping = inode->i_mapping;
3280	struct buffer_head *head, *bh = NULL;
3281	struct page *page;
3282	ext4_lblk_t i, pg_lblk;
3283	pgoff_t index;
3284
3285	/* reverse search wont work if fs block size is less than page size */
3286	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3287		search_hint_reverse = 0;
3288
3289	if (search_hint_reverse)
3290		i = lblk_end;
3291	else
3292		i = lblk_start;
3293
3294	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3295
3296	while ((i >= lblk_start) && (i <= lblk_end)) {
3297		page = find_get_page(mapping, index);
3298		if (!page)
3299			goto nextpage;
3300
3301		if (!page_has_buffers(page))
3302			goto nextpage;
3303
3304		head = page_buffers(page);
3305		if (!head)
3306			goto nextpage;
3307
3308		bh = head;
3309		pg_lblk = index << (PAGE_CACHE_SHIFT -
3310						inode->i_blkbits);
3311		do {
3312			if (unlikely(pg_lblk < lblk_start)) {
3313				/*
3314				 * This is possible when fs block size is less
3315				 * than page size and our cluster starts/ends in
3316				 * middle of the page. So we need to skip the
3317				 * initial few blocks till we reach the 'lblk'
3318				 */
3319				pg_lblk++;
3320				continue;
3321			}
3322
3323			/* Check if the buffer is delayed allocated and that it
3324			 * is not yet mapped. (when da-buffers are mapped during
3325			 * their writeout, their da_mapped bit is set.)
3326			 */
3327			if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3328				page_cache_release(page);
3329				trace_ext4_find_delalloc_range(inode,
3330						lblk_start, lblk_end,
3331						search_hint_reverse,
3332						1, i);
3333				return 1;
3334			}
3335			if (search_hint_reverse)
3336				i--;
3337			else
3338				i++;
3339		} while ((i >= lblk_start) && (i <= lblk_end) &&
3340				((bh = bh->b_this_page) != head));
3341nextpage:
3342		if (page)
3343			page_cache_release(page);
3344		/*
3345		 * Move to next page. 'i' will be the first lblk in the next
3346		 * page.
3347		 */
3348		if (search_hint_reverse)
3349			index--;
3350		else
3351			index++;
3352		i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3353	}
3354
3355	trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3356					search_hint_reverse, 0, 0);
3357	return 0;
3358}
3359
3360int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
3361			       int search_hint_reverse)
3362{
3363	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3364	ext4_lblk_t lblk_start, lblk_end;
3365	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3366	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3367
3368	return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3369					search_hint_reverse);
3370}
3371
3372/**
3373 * Determines how many complete clusters (out of those specified by the 'map')
3374 * are under delalloc and were reserved quota for.
3375 * This function is called when we are writing out the blocks that were
3376 * originally written with their allocation delayed, but then the space was
3377 * allocated using fallocate() before the delayed allocation could be resolved.
3378 * The cases to look for are:
3379 * ('=' indicated delayed allocated blocks
3380 *  '-' indicates non-delayed allocated blocks)
3381 * (a) partial clusters towards beginning and/or end outside of allocated range
3382 *     are not delalloc'ed.
3383 *	Ex:
3384 *	|----c---=|====c====|====c====|===-c----|
3385 *	         |++++++ allocated ++++++|
3386 *	==> 4 complete clusters in above example
3387 *
3388 * (b) partial cluster (outside of allocated range) towards either end is
3389 *     marked for delayed allocation. In this case, we will exclude that
3390 *     cluster.
3391 *	Ex:
3392 *	|----====c========|========c========|
3393 *	     |++++++ allocated ++++++|
3394 *	==> 1 complete clusters in above example
3395 *
3396 *	Ex:
3397 *	|================c================|
3398 *            |++++++ allocated ++++++|
3399 *	==> 0 complete clusters in above example
3400 *
3401 * The ext4_da_update_reserve_space will be called only if we
3402 * determine here that there were some "entire" clusters that span
3403 * this 'allocated' range.
3404 * In the non-bigalloc case, this function will just end up returning num_blks
3405 * without ever calling ext4_find_delalloc_range.
3406 */
3407static unsigned int
3408get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3409			   unsigned int num_blks)
3410{
3411	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3412	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3413	ext4_lblk_t lblk_from, lblk_to, c_offset;
3414	unsigned int allocated_clusters = 0;
3415
3416	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3417	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3418
3419	/* max possible clusters for this allocation */
3420	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3421
3422	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3423
3424	/* Check towards left side */
3425	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3426	if (c_offset) {
3427		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3428		lblk_to = lblk_from + c_offset - 1;
3429
3430		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3431			allocated_clusters--;
3432	}
3433
3434	/* Now check towards right. */
3435	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3436	if (allocated_clusters && c_offset) {
3437		lblk_from = lblk_start + num_blks;
3438		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3439
3440		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3441			allocated_clusters--;
3442	}
3443
3444	return allocated_clusters;
3445}
3446
3447static int
3448ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3449			struct ext4_map_blocks *map,
3450			struct ext4_ext_path *path, int flags,
3451			unsigned int allocated, ext4_fsblk_t newblock)
3452{
3453	int ret = 0;
3454	int err = 0;
3455	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3456
3457	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3458		  "block %llu, max_blocks %u, flags %d, allocated %u",
3459		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3460		  flags, allocated);
3461	ext4_ext_show_leaf(inode, path);
3462
3463	trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
3464						    newblock);
3465
3466	/* get_block() before submit the IO, split the extent */
3467	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3468		ret = ext4_split_unwritten_extents(handle, inode, map,
3469						   path, flags);
3470		/*
3471		 * Flag the inode(non aio case) or end_io struct (aio case)
3472		 * that this IO needs to conversion to written when IO is
3473		 * completed
3474		 */
3475		if (io) {
3476			if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
3477				io->flag = EXT4_IO_END_UNWRITTEN;
3478				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3479			}
3480		} else
3481			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3482		if (ext4_should_dioread_nolock(inode))
3483			map->m_flags |= EXT4_MAP_UNINIT;
3484		goto out;
3485	}
3486	/* IO end_io complete, convert the filled extent to written */
3487	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3488		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3489							path);
3490		if (ret >= 0) {
3491			ext4_update_inode_fsync_trans(handle, inode, 1);
3492			err = check_eofblocks_fl(handle, inode, map->m_lblk,
3493						 path, map->m_len);
3494		} else
3495			err = ret;
3496		goto out2;
3497	}
3498	/* buffered IO case */
3499	/*
3500	 * repeat fallocate creation request
3501	 * we already have an unwritten extent
3502	 */
3503	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3504		goto map_out;
3505
3506	/* buffered READ or buffered write_begin() lookup */
3507	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3508		/*
3509		 * We have blocks reserved already.  We
3510		 * return allocated blocks so that delalloc
3511		 * won't do block reservation for us.  But
3512		 * the buffer head will be unmapped so that
3513		 * a read from the block returns 0s.
3514		 */
3515		map->m_flags |= EXT4_MAP_UNWRITTEN;
3516		goto out1;
3517	}
3518
3519	/* buffered write, writepage time, convert*/
3520	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3521	if (ret >= 0)
3522		ext4_update_inode_fsync_trans(handle, inode, 1);
3523out:
3524	if (ret <= 0) {
3525		err = ret;
3526		goto out2;
3527	} else
3528		allocated = ret;
3529	map->m_flags |= EXT4_MAP_NEW;
3530	/*
3531	 * if we allocated more blocks than requested
3532	 * we need to make sure we unmap the extra block
3533	 * allocated. The actual needed block will get
3534	 * unmapped later when we find the buffer_head marked
3535	 * new.
3536	 */
3537	if (allocated > map->m_len) {
3538		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3539					newblock + map->m_len,
3540					allocated - map->m_len);
3541		allocated = map->m_len;
3542	}
3543
3544	/*
3545	 * If we have done fallocate with the offset that is already
3546	 * delayed allocated, we would have block reservation
3547	 * and quota reservation done in the delayed write path.
3548	 * But fallocate would have already updated quota and block
3549	 * count for this offset. So cancel these reservation
3550	 */
3551	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3552		unsigned int reserved_clusters;
3553		reserved_clusters = get_reserved_cluster_alloc(inode,
3554				map->m_lblk, map->m_len);
3555		if (reserved_clusters)
3556			ext4_da_update_reserve_space(inode,
3557						     reserved_clusters,
3558						     0);
3559	}
3560
3561map_out:
3562	map->m_flags |= EXT4_MAP_MAPPED;
3563	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
3564		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3565					 map->m_len);
3566		if (err < 0)
3567			goto out2;
3568	}
3569out1:
3570	if (allocated > map->m_len)
3571		allocated = map->m_len;
3572	ext4_ext_show_leaf(inode, path);
3573	map->m_pblk = newblock;
3574	map->m_len = allocated;
3575out2:
3576	if (path) {
3577		ext4_ext_drop_refs(path);
3578		kfree(path);
3579	}
3580	return err ? err : allocated;
3581}
3582
3583/*
3584 * get_implied_cluster_alloc - check to see if the requested
3585 * allocation (in the map structure) overlaps with a cluster already
3586 * allocated in an extent.
3587 *	@sb	The filesystem superblock structure
3588 *	@map	The requested lblk->pblk mapping
3589 *	@ex	The extent structure which might contain an implied
3590 *			cluster allocation
3591 *
3592 * This function is called by ext4_ext_map_blocks() after we failed to
3593 * find blocks that were already in the inode's extent tree.  Hence,
3594 * we know that the beginning of the requested region cannot overlap
3595 * the extent from the inode's extent tree.  There are three cases we
3596 * want to catch.  The first is this case:
3597 *
3598 *		 |--- cluster # N--|
3599 *    |--- extent ---|	|---- requested region ---|
3600 *			|==========|
3601 *
3602 * The second case that we need to test for is this one:
3603 *
3604 *   |--------- cluster # N ----------------|
3605 *	   |--- requested region --|   |------- extent ----|
3606 *	   |=======================|
3607 *
3608 * The third case is when the requested region lies between two extents
3609 * within the same cluster:
3610 *          |------------- cluster # N-------------|
3611 * |----- ex -----|                  |---- ex_right ----|
3612 *                  |------ requested region ------|
3613 *                  |================|
3614 *
3615 * In each of the above cases, we need to set the map->m_pblk and
3616 * map->m_len so it corresponds to the return the extent labelled as
3617 * "|====|" from cluster #N, since it is already in use for data in
3618 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
3619 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3620 * as a new "allocated" block region.  Otherwise, we will return 0 and
3621 * ext4_ext_map_blocks() will then allocate one or more new clusters
3622 * by calling ext4_mb_new_blocks().
3623 */
3624static int get_implied_cluster_alloc(struct super_block *sb,
3625				     struct ext4_map_blocks *map,
3626				     struct ext4_extent *ex,
3627				     struct ext4_ext_path *path)
3628{
3629	struct ext4_sb_info *sbi = EXT4_SB(sb);
3630	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3631	ext4_lblk_t ex_cluster_start, ex_cluster_end;
3632	ext4_lblk_t rr_cluster_start, rr_cluster_end;
3633	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3634	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3635	unsigned short ee_len = ext4_ext_get_actual_len(ex);
3636
3637	/* The extent passed in that we are trying to match */
3638	ex_cluster_start = EXT4_B2C(sbi, ee_block);
3639	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3640
3641	/* The requested region passed into ext4_map_blocks() */
3642	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3643	rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
3644
3645	if ((rr_cluster_start == ex_cluster_end) ||
3646	    (rr_cluster_start == ex_cluster_start)) {
3647		if (rr_cluster_start == ex_cluster_end)
3648			ee_start += ee_len - 1;
3649		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3650			c_offset;
3651		map->m_len = min(map->m_len,
3652				 (unsigned) sbi->s_cluster_ratio - c_offset);
3653		/*
3654		 * Check for and handle this case:
3655		 *
3656		 *   |--------- cluster # N-------------|
3657		 *		       |------- extent ----|
3658		 *	   |--- requested region ---|
3659		 *	   |===========|
3660		 */
3661
3662		if (map->m_lblk < ee_block)
3663			map->m_len = min(map->m_len, ee_block - map->m_lblk);
3664
3665		/*
3666		 * Check for the case where there is already another allocated
3667		 * block to the right of 'ex' but before the end of the cluster.
3668		 *
3669		 *          |------------- cluster # N-------------|
3670		 * |----- ex -----|                  |---- ex_right ----|
3671		 *                  |------ requested region ------|
3672		 *                  |================|
3673		 */
3674		if (map->m_lblk > ee_block) {
3675			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3676			map->m_len = min(map->m_len, next - map->m_lblk);
3677		}
3678
3679		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3680		return 1;
3681	}
3682
3683	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3684	return 0;
3685}
3686
3687
3688/*
3689 * Block allocation/map/preallocation routine for extents based files
3690 *
3691 *
3692 * Need to be called with
3693 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3694 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3695 *
3696 * return > 0, number of of blocks already mapped/allocated
3697 *          if create == 0 and these are pre-allocated blocks
3698 *          	buffer head is unmapped
3699 *          otherwise blocks are mapped
3700 *
3701 * return = 0, if plain look up failed (blocks have not been allocated)
3702 *          buffer head is unmapped
3703 *
3704 * return < 0, error case.
3705 */
3706int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3707			struct ext4_map_blocks *map, int flags)
3708{
3709	struct ext4_ext_path *path = NULL;
3710	struct ext4_extent newex, *ex, *ex2;
3711	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3712	ext4_fsblk_t newblock = 0;
3713	int free_on_err = 0, err = 0, depth, ret;
3714	unsigned int allocated = 0, offset = 0;
3715	unsigned int allocated_clusters = 0;
3716	unsigned int punched_out = 0;
3717	unsigned int result = 0;
3718	struct ext4_allocation_request ar;
3719	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3720	ext4_lblk_t cluster_offset;
3721
3722	ext_debug("blocks %u/%u requested for inode %lu\n",
3723		  map->m_lblk, map->m_len, inode->i_ino);
3724	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3725
3726	/* check in cache */
3727	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3728		ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3729		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3730			if ((sbi->s_cluster_ratio > 1) &&
3731			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3732				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3733
3734			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3735				/*
3736				 * block isn't allocated yet and
3737				 * user doesn't want to allocate it
3738				 */
3739				goto out2;
3740			}
3741			/* we should allocate requested block */
3742		} else {
3743			/* block is already allocated */
3744			if (sbi->s_cluster_ratio > 1)
3745				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3746			newblock = map->m_lblk
3747				   - le32_to_cpu(newex.ee_block)
3748				   + ext4_ext_pblock(&newex);
3749			/* number of remaining blocks in the extent */
3750			allocated = ext4_ext_get_actual_len(&newex) -
3751				(map->m_lblk - le32_to_cpu(newex.ee_block));
3752			goto out;
3753		}
3754	}
3755
3756	/* find extent for this block */
3757	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3758	if (IS_ERR(path)) {
3759		err = PTR_ERR(path);
3760		path = NULL;
3761		goto out2;
3762	}
3763
3764	depth = ext_depth(inode);
3765
3766	/*
3767	 * consistent leaf must not be empty;
3768	 * this situation is possible, though, _during_ tree modification;
3769	 * this is why assert can't be put in ext4_ext_find_extent()
3770	 */
3771	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3772		EXT4_ERROR_INODE(inode, "bad extent address "
3773				 "lblock: %lu, depth: %d pblock %lld",
3774				 (unsigned long) map->m_lblk, depth,
3775				 path[depth].p_block);
3776		err = -EIO;
3777		goto out2;
3778	}
3779
3780	ex = path[depth].p_ext;
3781	if (ex) {
3782		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3783		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3784		unsigned short ee_len;
3785
3786		/*
3787		 * Uninitialized extents are treated as holes, except that
3788		 * we split out initialized portions during a write.
3789		 */
3790		ee_len = ext4_ext_get_actual_len(ex);
3791
3792		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
3793
3794		/* if found extent covers block, simply return it */
3795		if (in_range(map->m_lblk, ee_block, ee_len)) {
3796			struct ext4_map_blocks punch_map;
3797			ext4_fsblk_t partial_cluster = 0;
3798
3799			newblock = map->m_lblk - ee_block + ee_start;
3800			/* number of remaining blocks in the extent */
3801			allocated = ee_len - (map->m_lblk - ee_block);
3802			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3803				  ee_block, ee_len, newblock);
3804
3805			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3806				/*
3807				 * Do not put uninitialized extent
3808				 * in the cache
3809				 */
3810				if (!ext4_ext_is_uninitialized(ex)) {
3811					ext4_ext_put_in_cache(inode, ee_block,
3812						ee_len, ee_start);
3813					goto out;
3814				}
3815				ret = ext4_ext_handle_uninitialized_extents(
3816					handle, inode, map, path, flags,
3817					allocated, newblock);
3818				return ret;
3819			}
3820
3821			/*
3822			 * Punch out the map length, but only to the
3823			 * end of the extent
3824			 */
3825			punched_out = allocated < map->m_len ?
3826				allocated : map->m_len;
3827
3828			/*
3829			 * Sense extents need to be converted to
3830			 * uninitialized, they must fit in an
3831			 * uninitialized extent
3832			 */
3833			if (punched_out > EXT_UNINIT_MAX_LEN)
3834				punched_out = EXT_UNINIT_MAX_LEN;
3835
3836			punch_map.m_lblk = map->m_lblk;
3837			punch_map.m_pblk = newblock;
3838			punch_map.m_len = punched_out;
3839			punch_map.m_flags = 0;
3840
3841			/* Check to see if the extent needs to be split */
3842			if (punch_map.m_len != ee_len ||
3843				punch_map.m_lblk != ee_block) {
3844
3845				ret = ext4_split_extent(handle, inode,
3846				path, &punch_map, 0,
3847				EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3848				EXT4_GET_BLOCKS_PRE_IO);
3849
3850				if (ret < 0) {
3851					err = ret;
3852					goto out2;
3853				}
3854				/*
3855				 * find extent for the block at
3856				 * the start of the hole
3857				 */
3858				ext4_ext_drop_refs(path);
3859				kfree(path);
3860
3861				path = ext4_ext_find_extent(inode,
3862				map->m_lblk, NULL);
3863				if (IS_ERR(path)) {
3864					err = PTR_ERR(path);
3865					path = NULL;
3866					goto out2;
3867				}
3868
3869				depth = ext_depth(inode);
3870				ex = path[depth].p_ext;
3871				ee_len = ext4_ext_get_actual_len(ex);
3872				ee_block = le32_to_cpu(ex->ee_block);
3873				ee_start = ext4_ext_pblock(ex);
3874
3875			}
3876
3877			ext4_ext_mark_uninitialized(ex);
3878
3879			ext4_ext_invalidate_cache(inode);
3880
3881			err = ext4_ext_rm_leaf(handle, inode, path,
3882					       &partial_cluster, map->m_lblk,
3883					       map->m_lblk + punched_out);
3884
3885			if (!err && path->p_hdr->eh_entries == 0) {
3886				/*
3887				 * Punch hole freed all of this sub tree,
3888				 * so we need to correct eh_depth
3889				 */
3890				err = ext4_ext_get_access(handle, inode, path);
3891				if (err == 0) {
3892					ext_inode_hdr(inode)->eh_depth = 0;
3893					ext_inode_hdr(inode)->eh_max =
3894					cpu_to_le16(ext4_ext_space_root(
3895						inode, 0));
3896
3897					err = ext4_ext_dirty(
3898						handle, inode, path);
3899				}
3900			}
3901
3902			goto out2;
3903		}
3904	}
3905
3906	if ((sbi->s_cluster_ratio > 1) &&
3907	    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3908		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3909
3910	/*
3911	 * requested block isn't allocated yet;
3912	 * we couldn't try to create block if create flag is zero
3913	 */
3914	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3915		/*
3916		 * put just found gap into cache to speed up
3917		 * subsequent requests
3918		 */
3919		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3920		goto out2;
3921	}
3922
3923	/*
3924	 * Okay, we need to do block allocation.
3925	 */
3926	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
3927	newex.ee_block = cpu_to_le32(map->m_lblk);
3928	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3929
3930	/*
3931	 * If we are doing bigalloc, check to see if the extent returned
3932	 * by ext4_ext_find_extent() implies a cluster we can use.
3933	 */
3934	if (cluster_offset && ex &&
3935	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
3936		ar.len = allocated = map->m_len;
3937		newblock = map->m_pblk;
3938		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3939		goto got_allocated_blocks;
3940	}
3941
3942	/* find neighbour allocated blocks */
3943	ar.lleft = map->m_lblk;
3944	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3945	if (err)
3946		goto out2;
3947	ar.lright = map->m_lblk;
3948	ex2 = NULL;
3949	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
3950	if (err)
3951		goto out2;
3952
3953	/* Check if the extent after searching to the right implies a
3954	 * cluster we can use. */
3955	if ((sbi->s_cluster_ratio > 1) && ex2 &&
3956	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
3957		ar.len = allocated = map->m_len;
3958		newblock = map->m_pblk;
3959		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3960		goto got_allocated_blocks;
3961	}
3962
3963	/*
3964	 * See if request is beyond maximum number of blocks we can have in
3965	 * a single extent. For an initialized extent this limit is
3966	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3967	 * EXT_UNINIT_MAX_LEN.
3968	 */
3969	if (map->m_len > EXT_INIT_MAX_LEN &&
3970	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3971		map->m_len = EXT_INIT_MAX_LEN;
3972	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3973		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3974		map->m_len = EXT_UNINIT_MAX_LEN;
3975
3976	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3977	newex.ee_len = cpu_to_le16(map->m_len);
3978	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
3979	if (err)
3980		allocated = ext4_ext_get_actual_len(&newex);
3981	else
3982		allocated = map->m_len;
3983
3984	/* allocate new block */
3985	ar.inode = inode;
3986	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3987	ar.logical = map->m_lblk;
3988	/*
3989	 * We calculate the offset from the beginning of the cluster
3990	 * for the logical block number, since when we allocate a
3991	 * physical cluster, the physical block should start at the
3992	 * same offset from the beginning of the cluster.  This is
3993	 * needed so that future calls to get_implied_cluster_alloc()
3994	 * work correctly.
3995	 */
3996	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
3997	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
3998	ar.goal -= offset;
3999	ar.logical -= offset;
4000	if (S_ISREG(inode->i_mode))
4001		ar.flags = EXT4_MB_HINT_DATA;
4002	else
4003		/* disable in-core preallocation for non-regular files */
4004		ar.flags = 0;
4005	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4006		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4007	newblock = ext4_mb_new_blocks(handle, &ar, &err);
4008	if (!newblock)
4009		goto out2;
4010	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4011		  ar.goal, newblock, allocated);
4012	free_on_err = 1;
4013	allocated_clusters = ar.len;
4014	ar.len = EXT4_C2B(sbi, ar.len) - offset;
4015	if (ar.len > allocated)
4016		ar.len = allocated;
4017
4018got_allocated_blocks:
4019	/* try to insert new extent into found leaf and return */
4020	ext4_ext_store_pblock(&newex, newblock + offset);
4021	newex.ee_len = cpu_to_le16(ar.len);
4022	/* Mark uninitialized */
4023	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
4024		ext4_ext_mark_uninitialized(&newex);
4025		/*
4026		 * io_end structure was created for every IO write to an
4027		 * uninitialized extent. To avoid unnecessary conversion,
4028		 * here we flag the IO that really needs the conversion.
4029		 * For non asycn direct IO case, flag the inode state
4030		 * that we need to perform conversion when IO is done.
4031		 */
4032		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
4033			if (io) {
4034				if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
4035					io->flag = EXT4_IO_END_UNWRITTEN;
4036					atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
4037				}
4038			} else
4039				ext4_set_inode_state(inode,
4040						     EXT4_STATE_DIO_UNWRITTEN);
4041		}
4042		if (ext4_should_dioread_nolock(inode))
4043			map->m_flags |= EXT4_MAP_UNINIT;
4044	}
4045
4046	err = 0;
4047	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4048		err = check_eofblocks_fl(handle, inode, map->m_lblk,
4049					 path, ar.len);
4050	if (!err)
4051		err = ext4_ext_insert_extent(handle, inode, path,
4052					     &newex, flags);
4053	if (err && free_on_err) {
4054		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4055			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4056		/* free data blocks we just allocated */
4057		/* not a good idea to call discard here directly,
4058		 * but otherwise we'd need to call it every free() */
4059		ext4_discard_preallocations(inode);
4060		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
4061				 ext4_ext_get_actual_len(&newex), fb_flags);
4062		goto out2;
4063	}
4064
4065	/* previous routine could use block we allocated */
4066	newblock = ext4_ext_pblock(&newex);
4067	allocated = ext4_ext_get_actual_len(&newex);
4068	if (allocated > map->m_len)
4069		allocated = map->m_len;
4070	map->m_flags |= EXT4_MAP_NEW;
4071
4072	/*
4073	 * Update reserved blocks/metadata blocks after successful
4074	 * block allocation which had been deferred till now.
4075	 */
4076	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4077		unsigned int reserved_clusters;
4078		/*
4079		 * Check how many clusters we had reserved this allocated range
4080		 */
4081		reserved_clusters = get_reserved_cluster_alloc(inode,
4082						map->m_lblk, allocated);
4083		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4084			if (reserved_clusters) {
4085				/*
4086				 * We have clusters reserved for this range.
4087				 * But since we are not doing actual allocation
4088				 * and are simply using blocks from previously
4089				 * allocated cluster, we should release the
4090				 * reservation and not claim quota.
4091				 */
4092				ext4_da_update_reserve_space(inode,
4093						reserved_clusters, 0);
4094			}
4095		} else {
4096			BUG_ON(allocated_clusters < reserved_clusters);
4097			/* We will claim quota for all newly allocated blocks.*/
4098			ext4_da_update_reserve_space(inode, allocated_clusters,
4099							1);
4100			if (reserved_clusters < allocated_clusters) {
4101				struct ext4_inode_info *ei = EXT4_I(inode);
4102				int reservation = allocated_clusters -
4103						  reserved_clusters;
4104				/*
4105				 * It seems we claimed few clusters outside of
4106				 * the range of this allocation. We should give
4107				 * it back to the reservation pool. This can
4108				 * happen in the following case:
4109				 *
4110				 * * Suppose s_cluster_ratio is 4 (i.e., each
4111				 *   cluster has 4 blocks. Thus, the clusters
4112				 *   are [0-3],[4-7],[8-11]...
4113				 * * First comes delayed allocation write for
4114				 *   logical blocks 10 & 11. Since there were no
4115				 *   previous delayed allocated blocks in the
4116				 *   range [8-11], we would reserve 1 cluster
4117				 *   for this write.
4118				 * * Next comes write for logical blocks 3 to 8.
4119				 *   In this case, we will reserve 2 clusters
4120				 *   (for [0-3] and [4-7]; and not for [8-11] as
4121				 *   that range has a delayed allocated blocks.
4122				 *   Thus total reserved clusters now becomes 3.
4123				 * * Now, during the delayed allocation writeout
4124				 *   time, we will first write blocks [3-8] and
4125				 *   allocate 3 clusters for writing these
4126				 *   blocks. Also, we would claim all these
4127				 *   three clusters above.
4128				 * * Now when we come here to writeout the
4129				 *   blocks [10-11], we would expect to claim
4130				 *   the reservation of 1 cluster we had made
4131				 *   (and we would claim it since there are no
4132				 *   more delayed allocated blocks in the range
4133				 *   [8-11]. But our reserved cluster count had
4134				 *   already gone to 0.
4135				 *
4136				 *   Thus, at the step 4 above when we determine
4137				 *   that there are still some unwritten delayed
4138				 *   allocated blocks outside of our current
4139				 *   block range, we should increment the
4140				 *   reserved clusters count so that when the
4141				 *   remaining blocks finally gets written, we
4142				 *   could claim them.
4143				 */
4144				dquot_reserve_block(inode,
4145						EXT4_C2B(sbi, reservation));
4146				spin_lock(&ei->i_block_reservation_lock);
4147				ei->i_reserved_data_blocks += reservation;
4148				spin_unlock(&ei->i_block_reservation_lock);
4149			}
4150		}
4151	}
4152
4153	/*
4154	 * Cache the extent and update transaction to commit on fdatasync only
4155	 * when it is _not_ an uninitialized extent.
4156	 */
4157	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
4158		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
4159		ext4_update_inode_fsync_trans(handle, inode, 1);
4160	} else
4161		ext4_update_inode_fsync_trans(handle, inode, 0);
4162out:
4163	if (allocated > map->m_len)
4164		allocated = map->m_len;
4165	ext4_ext_show_leaf(inode, path);
4166	map->m_flags |= EXT4_MAP_MAPPED;
4167	map->m_pblk = newblock;
4168	map->m_len = allocated;
4169out2:
4170	if (path) {
4171		ext4_ext_drop_refs(path);
4172		kfree(path);
4173	}
4174	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
4175			punched_out : allocated;
4176
4177	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4178		newblock, map->m_len, err ? err : result);
4179
4180	return err ? err : result;
4181}
4182
4183void ext4_ext_truncate(struct inode *inode)
4184{
4185	struct address_space *mapping = inode->i_mapping;
4186	struct super_block *sb = inode->i_sb;
4187	ext4_lblk_t last_block;
4188	handle_t *handle;
4189	loff_t page_len;
4190	int err = 0;
4191
4192	/*
4193	 * finish any pending end_io work so we won't run the risk of
4194	 * converting any truncated blocks to initialized later
4195	 */
4196	ext4_flush_completed_IO(inode);
4197
4198	/*
4199	 * probably first extent we're gonna free will be last in block
4200	 */
4201	err = ext4_writepage_trans_blocks(inode);
4202	handle = ext4_journal_start(inode, err);
4203	if (IS_ERR(handle))
4204		return;
4205
4206	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
4207		page_len = PAGE_CACHE_SIZE -
4208			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4209
4210		err = ext4_discard_partial_page_buffers(handle,
4211			mapping, inode->i_size, page_len, 0);
4212
4213		if (err)
4214			goto out_stop;
4215	}
4216
4217	if (ext4_orphan_add(handle, inode))
4218		goto out_stop;
4219
4220	down_write(&EXT4_I(inode)->i_data_sem);
4221	ext4_ext_invalidate_cache(inode);
4222
4223	ext4_discard_preallocations(inode);
4224
4225	/*
4226	 * TODO: optimization is possible here.
4227	 * Probably we need not scan at all,
4228	 * because page truncation is enough.
4229	 */
4230
4231	/* we have to know where to truncate from in crash case */
4232	EXT4_I(inode)->i_disksize = inode->i_size;
4233	ext4_mark_inode_dirty(handle, inode);
4234
4235	last_block = (inode->i_size + sb->s_blocksize - 1)
4236			>> EXT4_BLOCK_SIZE_BITS(sb);
4237	err = ext4_ext_remove_space(inode, last_block);
4238
4239	/* In a multi-transaction truncate, we only make the final
4240	 * transaction synchronous.
4241	 */
4242	if (IS_SYNC(inode))
4243		ext4_handle_sync(handle);
4244
4245	up_write(&EXT4_I(inode)->i_data_sem);
4246
4247out_stop:
4248	/*
4249	 * If this was a simple ftruncate() and the file will remain alive,
4250	 * then we need to clear up the orphan record which we created above.
4251	 * However, if this was a real unlink then we were called by
4252	 * ext4_delete_inode(), and we allow that function to clean up the
4253	 * orphan info for us.
4254	 */
4255	if (inode->i_nlink)
4256		ext4_orphan_del(handle, inode);
4257
4258	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4259	ext4_mark_inode_dirty(handle, inode);
4260	ext4_journal_stop(handle);
4261}
4262
4263static void ext4_falloc_update_inode(struct inode *inode,
4264				int mode, loff_t new_size, int update_ctime)
4265{
4266	struct timespec now;
4267
4268	if (update_ctime) {
4269		now = current_fs_time(inode->i_sb);
4270		if (!timespec_equal(&inode->i_ctime, &now))
4271			inode->i_ctime = now;
4272	}
4273	/*
4274	 * Update only when preallocation was requested beyond
4275	 * the file size.
4276	 */
4277	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
4278		if (new_size > i_size_read(inode))
4279			i_size_write(inode, new_size);
4280		if (new_size > EXT4_I(inode)->i_disksize)
4281			ext4_update_i_disksize(inode, new_size);
4282	} else {
4283		/*
4284		 * Mark that we allocate beyond EOF so the subsequent truncate
4285		 * can proceed even if the new size is the same as i_size.
4286		 */
4287		if (new_size > i_size_read(inode))
4288			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4289	}
4290
4291}
4292
4293/*
4294 * preallocate space for a file. This implements ext4's fallocate file
4295 * operation, which gets called from sys_fallocate system call.
4296 * For block-mapped files, posix_fallocate should fall back to the method
4297 * of writing zeroes to the required new blocks (the same behavior which is
4298 * expected for file systems which do not support fallocate() system call).
4299 */
4300long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4301{
4302	struct inode *inode = file->f_path.dentry->d_inode;
4303	handle_t *handle;
4304	loff_t new_size;
4305	unsigned int max_blocks;
4306	int ret = 0;
4307	int ret2 = 0;
4308	int retries = 0;
4309	int flags;
4310	struct ext4_map_blocks map;
4311	unsigned int credits, blkbits = inode->i_blkbits;
4312
4313	/*
4314	 * currently supporting (pre)allocate mode for extent-based
4315	 * files _only_
4316	 */
4317	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4318		return -EOPNOTSUPP;
4319
4320	/* Return error if mode is not supported */
4321	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
4322		return -EOPNOTSUPP;
4323
4324	if (mode & FALLOC_FL_PUNCH_HOLE)
4325		return ext4_punch_hole(file, offset, len);
4326
4327	trace_ext4_fallocate_enter(inode, offset, len, mode);
4328	map.m_lblk = offset >> blkbits;
4329	/*
4330	 * We can't just convert len to max_blocks because
4331	 * If blocksize = 4096 offset = 3072 and len = 2048
4332	 */
4333	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4334		- map.m_lblk;
4335	/*
4336	 * credits to insert 1 extent into extent tree
4337	 */
4338	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4339	mutex_lock(&inode->i_mutex);
4340	ret = inode_newsize_ok(inode, (len + offset));
4341	if (ret) {
4342		mutex_unlock(&inode->i_mutex);
4343		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4344		return ret;
4345	}
4346	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
4347		EXT4_GET_BLOCKS_NO_NORMALIZE;
4348	if (mode & FALLOC_FL_KEEP_SIZE)
4349		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4350retry:
4351	while (ret >= 0 && ret < max_blocks) {
4352		map.m_lblk = map.m_lblk + ret;
4353		map.m_len = max_blocks = max_blocks - ret;
4354		handle = ext4_journal_start(inode, credits);
4355		if (IS_ERR(handle)) {
4356			ret = PTR_ERR(handle);
4357			break;
4358		}
4359		ret = ext4_map_blocks(handle, inode, &map, flags);
4360		if (ret <= 0) {
4361#ifdef EXT4FS_DEBUG
4362			WARN_ON(ret <= 0);
4363			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4364				    "returned error inode#%lu, block=%u, "
4365				    "max_blocks=%u", __func__,
4366				    inode->i_ino, map.m_lblk, max_blocks);
4367#endif
4368			ext4_mark_inode_dirty(handle, inode);
4369			ret2 = ext4_journal_stop(handle);
4370			break;
4371		}
4372		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4373						blkbits) >> blkbits))
4374			new_size = offset + len;
4375		else
4376			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4377
4378		ext4_falloc_update_inode(inode, mode, new_size,
4379					 (map.m_flags & EXT4_MAP_NEW));
4380		ext4_mark_inode_dirty(handle, inode);
4381		ret2 = ext4_journal_stop(handle);
4382		if (ret2)
4383			break;
4384	}
4385	if (ret == -ENOSPC &&
4386			ext4_should_retry_alloc(inode->i_sb, &retries)) {
4387		ret = 0;
4388		goto retry;
4389	}
4390	mutex_unlock(&inode->i_mutex);
4391	trace_ext4_fallocate_exit(inode, offset, max_blocks,
4392				ret > 0 ? ret2 : ret);
4393	return ret > 0 ? ret2 : ret;
4394}
4395
4396/*
4397 * This function convert a range of blocks to written extents
4398 * The caller of this function will pass the start offset and the size.
4399 * all unwritten extents within this range will be converted to
4400 * written extents.
4401 *
4402 * This function is called from the direct IO end io call back
4403 * function, to convert the fallocated extents after IO is completed.
4404 * Returns 0 on success.
4405 */
4406int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4407				    ssize_t len)
4408{
4409	handle_t *handle;
4410	unsigned int max_blocks;
4411	int ret = 0;
4412	int ret2 = 0;
4413	struct ext4_map_blocks map;
4414	unsigned int credits, blkbits = inode->i_blkbits;
4415
4416	map.m_lblk = offset >> blkbits;
4417	/*
4418	 * We can't just convert len to max_blocks because
4419	 * If blocksize = 4096 offset = 3072 and len = 2048
4420	 */
4421	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
4422		      map.m_lblk);
4423	/*
4424	 * credits to insert 1 extent into extent tree
4425	 */
4426	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4427	while (ret >= 0 && ret < max_blocks) {
4428		map.m_lblk += ret;
4429		map.m_len = (max_blocks -= ret);
4430		handle = ext4_journal_start(inode, credits);
4431		if (IS_ERR(handle)) {
4432			ret = PTR_ERR(handle);
4433			break;
4434		}
4435		ret = ext4_map_blocks(handle, inode, &map,
4436				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4437		if (ret <= 0) {
4438			WARN_ON(ret <= 0);
4439			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4440				    "returned error inode#%lu, block=%u, "
4441				    "max_blocks=%u", __func__,
4442				    inode->i_ino, map.m_lblk, map.m_len);
4443		}
4444		ext4_mark_inode_dirty(handle, inode);
4445		ret2 = ext4_journal_stop(handle);
4446		if (ret <= 0 || ret2 )
4447			break;
4448	}
4449	return ret > 0 ? ret2 : ret;
4450}
4451
4452/*
4453 * Callback function called for each extent to gather FIEMAP information.
4454 */
4455static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
4456		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
4457		       void *data)
4458{
4459	__u64	logical;
4460	__u64	physical;
4461	__u64	length;
4462	__u32	flags = 0;
4463	int		ret = 0;
4464	struct fiemap_extent_info *fieinfo = data;
4465	unsigned char blksize_bits;
4466
4467	blksize_bits = inode->i_sb->s_blocksize_bits;
4468	logical = (__u64)newex->ec_block << blksize_bits;
4469
4470	if (newex->ec_start == 0) {
4471		/*
4472		 * No extent in extent-tree contains block @newex->ec_start,
4473		 * then the block may stay in 1)a hole or 2)delayed-extent.
4474		 *
4475		 * Holes or delayed-extents are processed as follows.
4476		 * 1. lookup dirty pages with specified range in pagecache.
4477		 *    If no page is got, then there is no delayed-extent and
4478		 *    return with EXT_CONTINUE.
4479		 * 2. find the 1st mapped buffer,
4480		 * 3. check if the mapped buffer is both in the request range
4481		 *    and a delayed buffer. If not, there is no delayed-extent,
4482		 *    then return.
4483		 * 4. a delayed-extent is found, the extent will be collected.
4484		 */
4485		ext4_lblk_t	end = 0;
4486		pgoff_t		last_offset;
4487		pgoff_t		offset;
4488		pgoff_t		index;
4489		pgoff_t		start_index = 0;
4490		struct page	**pages = NULL;
4491		struct buffer_head *bh = NULL;
4492		struct buffer_head *head = NULL;
4493		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
4494
4495		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4496		if (pages == NULL)
4497			return -ENOMEM;
4498
4499		offset = logical >> PAGE_SHIFT;
4500repeat:
4501		last_offset = offset;
4502		head = NULL;
4503		ret = find_get_pages_tag(inode->i_mapping, &offset,
4504					PAGECACHE_TAG_DIRTY, nr_pages, pages);
4505
4506		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4507			/* First time, try to find a mapped buffer. */
4508			if (ret == 0) {
4509out:
4510				for (index = 0; index < ret; index++)
4511					page_cache_release(pages[index]);
4512				/* just a hole. */
4513				kfree(pages);
4514				return EXT_CONTINUE;
4515			}
4516			index = 0;
4517
4518next_page:
4519			/* Try to find the 1st mapped buffer. */
4520			end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
4521				  blksize_bits;
4522			if (!page_has_buffers(pages[index]))
4523				goto out;
4524			head = page_buffers(pages[index]);
4525			if (!head)
4526				goto out;
4527
4528			index++;
4529			bh = head;
4530			do {
4531				if (end >= newex->ec_block +
4532					newex->ec_len)
4533					/* The buffer is out of
4534					 * the request range.
4535					 */
4536					goto out;
4537
4538				if (buffer_mapped(bh) &&
4539				    end >= newex->ec_block) {
4540					start_index = index - 1;
4541					/* get the 1st mapped buffer. */
4542					goto found_mapped_buffer;
4543				}
4544
4545				bh = bh->b_this_page;
4546				end++;
4547			} while (bh != head);
4548
4549			/* No mapped buffer in the range found in this page,
4550			 * We need to look up next page.
4551			 */
4552			if (index >= ret) {
4553				/* There is no page left, but we need to limit
4554				 * newex->ec_len.
4555				 */
4556				newex->ec_len = end - newex->ec_block;
4557				goto out;
4558			}
4559			goto next_page;
4560		} else {
4561			/*Find contiguous delayed buffers. */
4562			if (ret > 0 && pages[0]->index == last_offset)
4563				head = page_buffers(pages[0]);
4564			bh = head;
4565			index = 1;
4566			start_index = 0;
4567		}
4568
4569found_mapped_buffer:
4570		if (bh != NULL && buffer_delay(bh)) {
4571			/* 1st or contiguous delayed buffer found. */
4572			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4573				/*
4574				 * 1st delayed buffer found, record
4575				 * the start of extent.
4576				 */
4577				flags |= FIEMAP_EXTENT_DELALLOC;
4578				newex->ec_block = end;
4579				logical = (__u64)end << blksize_bits;
4580			}
4581			/* Find contiguous delayed buffers. */
4582			do {
4583				if (!buffer_delay(bh))
4584					goto found_delayed_extent;
4585				bh = bh->b_this_page;
4586				end++;
4587			} while (bh != head);
4588
4589			for (; index < ret; index++) {
4590				if (!page_has_buffers(pages[index])) {
4591					bh = NULL;
4592					break;
4593				}
4594				head = page_buffers(pages[index]);
4595				if (!head) {
4596					bh = NULL;
4597					break;
4598				}
4599
4600				if (pages[index]->index !=
4601				    pages[start_index]->index + index
4602				    - start_index) {
4603					/* Blocks are not contiguous. */
4604					bh = NULL;
4605					break;
4606				}
4607				bh = head;
4608				do {
4609					if (!buffer_delay(bh))
4610						/* Delayed-extent ends. */
4611						goto found_delayed_extent;
4612					bh = bh->b_this_page;
4613					end++;
4614				} while (bh != head);
4615			}
4616		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4617			/* a hole found. */
4618			goto out;
4619
4620found_delayed_extent:
4621		newex->ec_len = min(end - newex->ec_block,
4622						(ext4_lblk_t)EXT_INIT_MAX_LEN);
4623		if (ret == nr_pages && bh != NULL &&
4624			newex->ec_len < EXT_INIT_MAX_LEN &&
4625			buffer_delay(bh)) {
4626			/* Have not collected an extent and continue. */
4627			for (index = 0; index < ret; index++)
4628				page_cache_release(pages[index]);
4629			goto repeat;
4630		}
4631
4632		for (index = 0; index < ret; index++)
4633			page_cache_release(pages[index]);
4634		kfree(pages);
4635	}
4636
4637	physical = (__u64)newex->ec_start << blksize_bits;
4638	length =   (__u64)newex->ec_len << blksize_bits;
4639
4640	if (ex && ext4_ext_is_uninitialized(ex))
4641		flags |= FIEMAP_EXTENT_UNWRITTEN;
4642
4643	if (next == EXT_MAX_BLOCKS)
4644		flags |= FIEMAP_EXTENT_LAST;
4645
4646	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4647					length, flags);
4648	if (ret < 0)
4649		return ret;
4650	if (ret == 1)
4651		return EXT_BREAK;
4652	return EXT_CONTINUE;
4653}
4654/* fiemap flags we can handle specified here */
4655#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4656
4657static int ext4_xattr_fiemap(struct inode *inode,
4658				struct fiemap_extent_info *fieinfo)
4659{
4660	__u64 physical = 0;
4661	__u64 length;
4662	__u32 flags = FIEMAP_EXTENT_LAST;
4663	int blockbits = inode->i_sb->s_blocksize_bits;
4664	int error = 0;
4665
4666	/* in-inode? */
4667	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4668		struct ext4_iloc iloc;
4669		int offset;	/* offset of xattr in inode */
4670
4671		error = ext4_get_inode_loc(inode, &iloc);
4672		if (error)
4673			return error;
4674		physical = iloc.bh->b_blocknr << blockbits;
4675		offset = EXT4_GOOD_OLD_INODE_SIZE +
4676				EXT4_I(inode)->i_extra_isize;
4677		physical += offset;
4678		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4679		flags |= FIEMAP_EXTENT_DATA_INLINE;
4680		brelse(iloc.bh);
4681	} else { /* external block */
4682		physical = EXT4_I(inode)->i_file_acl << blockbits;
4683		length = inode->i_sb->s_blocksize;
4684	}
4685
4686	if (physical)
4687		error = fiemap_fill_next_extent(fieinfo, 0, physical,
4688						length, flags);
4689	return (error < 0 ? error : 0);
4690}
4691
4692/*
4693 * ext4_ext_punch_hole
4694 *
4695 * Punches a hole of "length" bytes in a file starting
4696 * at byte "offset"
4697 *
4698 * @inode:  The inode of the file to punch a hole in
4699 * @offset: The starting byte offset of the hole
4700 * @length: The length of the hole
4701 *
4702 * Returns the number of blocks removed or negative on err
4703 */
4704int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4705{
4706	struct inode *inode = file->f_path.dentry->d_inode;
4707	struct super_block *sb = inode->i_sb;
4708	struct ext4_ext_cache cache_ex;
4709	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4710	struct address_space *mapping = inode->i_mapping;
4711	struct ext4_map_blocks map;
4712	handle_t *handle;
4713	loff_t first_page, last_page, page_len;
4714	loff_t first_page_offset, last_page_offset;
4715	int ret, credits, blocks_released, err = 0;
4716
4717	/* No need to punch hole beyond i_size */
4718	if (offset >= inode->i_size)
4719		return 0;
4720
4721	/*
4722	 * If the hole extends beyond i_size, set the hole
4723	 * to end after the page that contains i_size
4724	 */
4725	if (offset + length > inode->i_size) {
4726		length = inode->i_size +
4727		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4728		   offset;
4729	}
4730
4731	first_block = (offset + sb->s_blocksize - 1) >>
4732		EXT4_BLOCK_SIZE_BITS(sb);
4733	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4734
4735	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4736	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4737
4738	first_page_offset = first_page << PAGE_CACHE_SHIFT;
4739	last_page_offset = last_page << PAGE_CACHE_SHIFT;
4740
4741	/*
4742	 * Write out all dirty pages to avoid race conditions
4743	 * Then release them.
4744	 */
4745	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4746		err = filemap_write_and_wait_range(mapping,
4747			offset, offset + length - 1);
4748
4749		if (err)
4750			return err;
4751	}
4752
4753	/* Now release the pages */
4754	if (last_page_offset > first_page_offset) {
4755		truncate_inode_pages_range(mapping, first_page_offset,
4756					   last_page_offset-1);
4757	}
4758
4759	/* finish any pending end_io work */
4760	ext4_flush_completed_IO(inode);
4761
4762	credits = ext4_writepage_trans_blocks(inode);
4763	handle = ext4_journal_start(inode, credits);
4764	if (IS_ERR(handle))
4765		return PTR_ERR(handle);
4766
4767	err = ext4_orphan_add(handle, inode);
4768	if (err)
4769		goto out;
4770
4771	/*
4772	 * Now we need to zero out the non-page-aligned data in the
4773	 * pages at the start and tail of the hole, and unmap the buffer
4774	 * heads for the block aligned regions of the page that were
4775	 * completely zeroed.
4776	 */
4777	if (first_page > last_page) {
4778		/*
4779		 * If the file space being truncated is contained within a page
4780		 * just zero out and unmap the middle of that page
4781		 */
4782		err = ext4_discard_partial_page_buffers(handle,
4783			mapping, offset, length, 0);
4784
4785		if (err)
4786			goto out;
4787	} else {
4788		/*
4789		 * zero out and unmap the partial page that contains
4790		 * the start of the hole
4791		 */
4792		page_len  = first_page_offset - offset;
4793		if (page_len > 0) {
4794			err = ext4_discard_partial_page_buffers(handle, mapping,
4795						   offset, page_len, 0);
4796			if (err)
4797				goto out;
4798		}
4799
4800		/*
4801		 * zero out and unmap the partial page that contains
4802		 * the end of the hole
4803		 */
4804		page_len = offset + length - last_page_offset;
4805		if (page_len > 0) {
4806			err = ext4_discard_partial_page_buffers(handle, mapping,
4807					last_page_offset, page_len, 0);
4808			if (err)
4809				goto out;
4810		}
4811	}
4812
4813
4814	/*
4815	 * If i_size is contained in the last page, we need to
4816	 * unmap and zero the partial page after i_size
4817	 */
4818	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4819	   inode->i_size % PAGE_CACHE_SIZE != 0) {
4820
4821		page_len = PAGE_CACHE_SIZE -
4822			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4823
4824		if (page_len > 0) {
4825			err = ext4_discard_partial_page_buffers(handle,
4826			  mapping, inode->i_size, page_len, 0);
4827
4828			if (err)
4829				goto out;
4830		}
4831	}
4832
4833	/* If there are no blocks to remove, return now */
4834	if (first_block >= last_block)
4835		goto out;
4836
4837	down_write(&EXT4_I(inode)->i_data_sem);
4838	ext4_ext_invalidate_cache(inode);
4839	ext4_discard_preallocations(inode);
4840
4841	/*
4842	 * Loop over all the blocks and identify blocks
4843	 * that need to be punched out
4844	 */
4845	iblock = first_block;
4846	blocks_released = 0;
4847	while (iblock < last_block) {
4848		max_blocks = last_block - iblock;
4849		num_blocks = 1;
4850		memset(&map, 0, sizeof(map));
4851		map.m_lblk = iblock;
4852		map.m_len = max_blocks;
4853		ret = ext4_ext_map_blocks(handle, inode, &map,
4854			EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4855
4856		if (ret > 0) {
4857			blocks_released += ret;
4858			num_blocks = ret;
4859		} else if (ret == 0) {
4860			/*
4861			 * If map blocks could not find the block,
4862			 * then it is in a hole.  If the hole was
4863			 * not already cached, then map blocks should
4864			 * put it in the cache.  So we can get the hole
4865			 * out of the cache
4866			 */
4867			memset(&cache_ex, 0, sizeof(cache_ex));
4868			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4869				!cache_ex.ec_start) {
4870
4871				/* The hole is cached */
4872				num_blocks = cache_ex.ec_block +
4873				cache_ex.ec_len - iblock;
4874
4875			} else {
4876				/* The block could not be identified */
4877				err = -EIO;
4878				break;
4879			}
4880		} else {
4881			/* Map blocks error */
4882			err = ret;
4883			break;
4884		}
4885
4886		if (num_blocks == 0) {
4887			/* This condition should never happen */
4888			ext_debug("Block lookup failed");
4889			err = -EIO;
4890			break;
4891		}
4892
4893		iblock += num_blocks;
4894	}
4895
4896	if (blocks_released > 0) {
4897		ext4_ext_invalidate_cache(inode);
4898		ext4_discard_preallocations(inode);
4899	}
4900
4901	if (IS_SYNC(inode))
4902		ext4_handle_sync(handle);
4903
4904	up_write(&EXT4_I(inode)->i_data_sem);
4905
4906out:
4907	ext4_orphan_del(handle, inode);
4908	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4909	ext4_mark_inode_dirty(handle, inode);
4910	ext4_journal_stop(handle);
4911	return err;
4912}
4913int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4914		__u64 start, __u64 len)
4915{
4916	ext4_lblk_t start_blk;
4917	int error = 0;
4918
4919	/* fallback to generic here if not in extents fmt */
4920	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4921		return generic_block_fiemap(inode, fieinfo, start, len,
4922			ext4_get_block);
4923
4924	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
4925		return -EBADR;
4926
4927	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4928		error = ext4_xattr_fiemap(inode, fieinfo);
4929	} else {
4930		ext4_lblk_t len_blks;
4931		__u64 last_blk;
4932
4933		start_blk = start >> inode->i_sb->s_blocksize_bits;
4934		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4935		if (last_blk >= EXT_MAX_BLOCKS)
4936			last_blk = EXT_MAX_BLOCKS-1;
4937		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4938
4939		/*
4940		 * Walk the extent tree gathering extent information.
4941		 * ext4_ext_fiemap_cb will push extents back to user.
4942		 */
4943		error = ext4_ext_walk_space(inode, start_blk, len_blks,
4944					  ext4_ext_fiemap_cb, fieinfo);
4945	}
4946
4947	return error;
4948}
4949