extents.c revision 4b1f1660710c2ddcefc08c289e66a32b6bc2a2e3
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
					due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */

#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */

static __le32 ext4_extent_block_csum(struct inode *inode,
				     struct ext4_extent_header *eh)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
			   EXT4_EXTENT_TAIL_OFFSET(eh));
	return cpu_to_le32(csum);
}

static int ext4_extent_block_csum_verify(struct inode *inode,
					 struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return 1;

	et = find_ext4_extent_tail(eh);
	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
		return 0;
	return 1;
}

static void ext4_extent_block_csum_set(struct inode *inode,
				       struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return;

	et = find_ext4_extent_tail(eh);
	et->et_checksum = ext4_extent_block_csum(inode, eh);
}

static int ext4_split_extent(handle_t *handle,
				struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_map_blocks *map,
				int split_flag,
				int flags);

static int ext4_split_extent_at(handle_t *handle,
			     struct inode *inode,
			     struct ext4_ext_path *path,
			     ext4_lblk_t split,
			     int split_flag,
			     int flags);

static int ext4_find_delayed_extent(struct inode *inode,
				    struct extent_status *newes);

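/*
 * Make sure the handle has at least @needed credits, restarting the
 * transaction if it cannot be extended in place.  Returning -EAGAIN
 * signals that the transaction was restarted, so the caller may need
 * to re-validate any cached extent-tree state.
 */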
static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits > needed)
		return 0;
	err = ext4_journal_extend(handle, needed);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	if (err == 0)
		err = -EAGAIN;

	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	if (path->p_bh) {
		/* path points to block */
		BUFFER_TRACE(path->p_bh, "get_write_access");
		return ext4_journal_get_write_access(handle, path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return 0;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
		     struct inode *inode, struct ext4_ext_path *path)
{
	int err;

	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (path->p_bh) {
		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
		/* path points to block */
		err = __ext4_handle_dirty_metadata(where, line, handle,
						   inode, path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

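/*
 * Heuristic goal for the block allocator: e.g. if the extent in the
 * path maps logical block 100 onto physical block 5000 and we want
 * logical block 103, the goal below comes out as physical block 5003.
 */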
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	if (path) {
		int depth = path->p_depth;
		struct ext4_extent *ex;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * ELF object sections out-of-order but in a way that
		 * eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file.  However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space.  Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	return ext4_inode_to_goal_block(inode);
}

/*
 * Allocation for a meta data block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err, unsigned int flags)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
					NULL, err);
	return newblock;
}

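/*
 * Capacity of the tree nodes below (a sketch, assuming 12-byte
 * ext4_extent/ext4_extent_idx entries and a 12-byte header): with 4k
 * blocks an index or leaf block holds (4096 - 12) / 12 = 340 entries,
 * while the 60-byte i_data root holds (60 - 12) / 12 = 4.
 * AGGRESSIVE_TEST shrinks these limits to exercise tree growth.
 */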
static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 6)
		size = 6;
#endif
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 5)
		size = 5;
#endif
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 3)
		size = 3;
#endif
	return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 4)
		size = 4;
#endif
	return size;
}

/*
 * Calculate the number of metadata blocks needed
 * to allocate a new block at @lblock
 * The worst case is one metadata block per extent
 */
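/*
 * For example (a sketch, assuming 340 index entries per block as
 * above): the first block of a discontiguous write is charged
 * ext_depth() + 1 metadata blocks, and each following contiguous
 * block is charged none, except that roughly every 340th block adds
 * one index block, every 340^2th a second one, and so on, mirroring
 * how often each level of the tree gains a new node.
 */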
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs;

	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
	 */
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		int num = 0;

		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
	 */
	ei->i_da_metadata_calc_len = 1;
	ei->i_da_metadata_calc_last_lblock = lblock;
	return ext_depth(inode) + 1;
}

static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
	int max;

	if (depth == ext_depth(inode)) {
		if (depth == 0)
			max = ext4_ext_space_root(inode, 1);
		else
			max = ext4_ext_space_root_idx(inode, 1);
	} else {
		if (depth == 0)
			max = ext4_ext_space_block(inode, 1);
		else
			max = ext4_ext_space_block_idx(inode, 1);
	}

	return max;
}

static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
	ext4_fsblk_t block = ext4_ext_pblock(ext);
	int len = ext4_ext_get_actual_len(ext);
	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
	ext4_lblk_t last = lblock + len - 1;

	if (lblock > last)
		return 0;
	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}

static int ext4_valid_extent_idx(struct inode *inode,
				struct ext4_extent_idx *ext_idx)
{
	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}

static int ext4_valid_extent_entries(struct inode *inode,
				struct ext4_extent_header *eh,
				int depth)
{
	unsigned short entries;
	if (eh->eh_entries == 0)
		return 1;

	entries = le16_to_cpu(eh->eh_entries);

	if (depth == 0) {
		/* leaf entries */
		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
		ext4_fsblk_t pblock = 0;
		ext4_lblk_t lblock = 0;
		ext4_lblk_t prev = 0;
		int len = 0;
		while (entries) {
			if (!ext4_valid_extent(inode, ext))
				return 0;

			/* Check for overlapping extents */
			lblock = le32_to_cpu(ext->ee_block);
			len = ext4_ext_get_actual_len(ext);
			if ((lblock <= prev) && prev) {
				pblock = ext4_ext_pblock(ext);
				es->s_last_error_block = cpu_to_le64(pblock);
				return 0;
			}
			ext++;
			entries--;
			prev = lblock + len - 1;
		}
	} else {
		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
		while (entries) {
			if (!ext4_valid_extent_idx(inode, ext_idx))
				return 0;
			ext_idx++;
			entries--;
		}
	}
	return 1;
}

static int __ext4_ext_check(const char *function, unsigned int line,
			    struct inode *inode, struct ext4_extent_header *eh,
			    int depth, ext4_fsblk_t pblk)
{
	const char *error_msg;
	int max = 0;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	if (!ext4_valid_extent_entries(inode, eh, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	/* Verify checksum on non-root extent tree nodes */
	if (ext_depth(inode) != depth &&
	    !ext4_extent_block_csum_verify(inode, eh)) {
		error_msg = "extent tree corrupted";
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error_inode(inode, function, line, 0,
			 "pblk %llu bad header/extent: %s - magic %x, "
			 "entries %u, max %u(%u), depth %u(%u)",
			 (unsigned long long) pblk, error_msg,
			 le16_to_cpu(eh->eh_magic),
			 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
			 max, le16_to_cpu(eh->eh_depth), depth);
	return -EIO;
}

#define ext4_ext_check(inode, eh, depth, pblk)			\
	__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))

int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}

static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
			 struct inode *inode, ext4_fsblk_t pblk, int depth,
			 int flags)
{
	struct buffer_head		*bh;
	int				err;

	bh = sb_getblk(inode->i_sb, pblk);
	if (unlikely(!bh))
		return ERR_PTR(-ENOMEM);

	if (!bh_uptodate_or_lock(bh)) {
		trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
		err = bh_submit_read(bh);
		if (err < 0)
			goto errout;
	}
	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
		return bh;
	err = __ext4_ext_check(function, line, inode,
			       ext_block_hdr(bh), depth, pblk);
	if (err)
		goto errout;
	set_buffer_verified(bh);
	/*
	 * If this is a leaf block, cache all of its entries
	 */
	if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
		struct ext4_extent_header *eh = ext_block_hdr(bh);
		struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
		ext4_lblk_t prev = 0;
		int i;

		for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
			unsigned int status = EXTENT_STATUS_WRITTEN;
			ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
			int len = ext4_ext_get_actual_len(ex);

			if (prev && (prev != lblk))
				ext4_es_cache_extent(inode, prev,
						     lblk - prev, ~0,
						     EXTENT_STATUS_HOLE);

			if (ext4_ext_is_unwritten(ex))
				status = EXTENT_STATUS_UNWRITTEN;
			ext4_es_cache_extent(inode, lblk, len,
					     ext4_ext_pblock(ex), status);
			prev = lblk + len;
		}
	}
	return bh;
errout:
	put_bh(bh);
	return ERR_PTR(err);
}

#define read_extent_tree_block(inode, pblk, depth, flags)		\
	__read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
				 (depth), (flags))

/*
 * This function is called to cache a file's extent information in the
 * extent status tree
 */
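/*
 * The loop below is an iterative depth-first walk that uses path[] as
 * an explicit stack: i is the current level and p_idx is the next
 * child to visit at that level.
 */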
int ext4_ext_precache(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_ext_path *path = NULL;
	struct buffer_head *bh;
	int i = 0, depth, ret = 0;

	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		return 0;	/* not an extent-mapped inode */

	down_read(&ei->i_data_sem);
	depth = ext_depth(inode);

	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
		       GFP_NOFS);
	if (path == NULL) {
		up_read(&ei->i_data_sem);
		return -ENOMEM;
	}

	/* Don't cache anything if there are no external extent blocks */
	if (depth == 0)
		goto out;
	path[0].p_hdr = ext_inode_hdr(inode);
	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
	if (ret)
		goto out;
	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
	while (i >= 0) {
		/*
		 * If this is a leaf block or we've reached the end of
		 * the index block, go up
		 */
		if ((i == depth) ||
		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
			brelse(path[i].p_bh);
			path[i].p_bh = NULL;
			i--;
			continue;
		}
		bh = read_extent_tree_block(inode,
					    ext4_idx_pblock(path[i].p_idx++),
					    depth - i - 1,
					    EXT4_EX_FORCE_CACHE);
		if (IS_ERR(bh)) {
			ret = PTR_ERR(bh);
			break;
		}
		i++;
		path[i].p_bh = bh;
		path[i].p_hdr = ext_block_hdr(bh);
		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
	}
	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
	up_read(&ei->i_data_sem);
	ext4_ext_drop_refs(path);
	kfree(path);
	return ret;
}

#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
	int k, l = path->p_depth;

	ext_debug("path:");
	for (k = 0; k <= l; k++, path++) {
		if (path->p_idx) {
			ext_debug("  %d->%llu",
				  le32_to_cpu(path->p_idx->ei_block),
				  ext4_idx_pblock(path->p_idx));
		} else if (path->p_ext) {
			ext_debug("  %d:[%d]%d:%llu ",
				  le32_to_cpu(path->p_ext->ee_block),
				  ext4_ext_is_unwritten(path->p_ext),
				  ext4_ext_get_actual_len(path->p_ext),
				  ext4_ext_pblock(path->p_ext));
		} else
			ext_debug("  []");
	}
	ext_debug("\n");
}

static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
	int depth = ext_depth(inode);
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int i;

	if (!path)
		return;

	eh = path[depth].p_hdr;
	ex = EXT_FIRST_EXTENT(eh);

	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);

	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
			  ext4_ext_is_unwritten(ex),
			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
	}
	ext_debug("\n");
}

static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
			ext4_fsblk_t newblock, int level)
{
	int depth = ext_depth(inode);
	struct ext4_extent *ex;

	if (depth != level) {
		struct ext4_extent_idx *idx;
		idx = path[level].p_idx;
		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
			ext_debug("%d: move %d:%llu in new index %llu\n", level,
					le32_to_cpu(idx->ei_block),
					ext4_idx_pblock(idx),
					newblock);
			idx++;
		}

		return;
	}

	ex = path[depth].p_ext;
	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
				le32_to_cpu(ex->ee_block),
				ext4_ext_pblock(ex),
				ext4_ext_is_unwritten(ex),
				ext4_ext_get_actual_len(ex),
				newblock);
		ex++;
	}
}

#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#define ext4_ext_show_move(inode, path, newblock, level)
#endif

void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth = path->p_depth;
	int i;

	for (i = 0; i <= depth; i++, path++)
		if (path->p_bh) {
			brelse(path->p_bh);
			path->p_bh = NULL;
		}
}

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;

	ext_debug("binsearch for %u(idx):  ", block);

	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	path->p_idx = l - 1;
	ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
			if (k != 0 && le32_to_cpu(ix->ei_block) <=
			    le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					   <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}

/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u:  ", block);

	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
				m, le32_to_cpu(m->ee_block),
				r, le32_to_cpu(r->ee_block));
	}

	path->p_ext = l - 1;
	ext_debug("  -> %d:%llu:[%d]%d ",
			le32_to_cpu(path->p_ext->ee_block),
			ext4_ext_pblock(path->p_ext),
			ext4_ext_is_unwritten(path->p_ext),
			ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
					  <= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}
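
/*
 * Both searches above leave the path pointing at the rightmost entry
 * whose start block does not exceed @block (or at the first entry if
 * @block precedes them all); e.g. searching for 150 among extents
 * starting at 0, 100 and 200 selects the one starting at 100.
 */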

int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
	struct ext4_extent_header *eh;

	eh = ext_inode_hdr(inode);
	eh->eh_depth = 0;
	eh->eh_entries = 0;
	eh->eh_magic = EXT4_EXT_MAGIC;
	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
	ext4_mark_inode_dirty(handle, inode);
	return 0;
}

struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
		     struct ext4_ext_path *path, int flags)
{
	struct ext4_extent_header *eh;
	struct buffer_head *bh;
	short int depth, i, ppos = 0, alloc = 0;
	int ret;

	eh = ext_inode_hdr(inode);
	depth = ext_depth(inode);

	/* account possible depth increase */
	if (!path) {
		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
				GFP_NOFS);
		if (!path)
			return ERR_PTR(-ENOMEM);
		alloc = 1;
	}
	path[0].p_hdr = eh;
	path[0].p_bh = NULL;

	i = depth;
	/* walk through the tree */
	while (i) {
		ext_debug("depth %d: num %d, max %d\n",
			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

		ext4_ext_binsearch_idx(inode, path + ppos, block);
		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
		path[ppos].p_depth = i;
		path[ppos].p_ext = NULL;

		bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
					    flags);
		if (IS_ERR(bh)) {
			ret = PTR_ERR(bh);
			goto err;
		}

		eh = ext_block_hdr(bh);
		ppos++;
		if (unlikely(ppos > depth)) {
			put_bh(bh);
			EXT4_ERROR_INODE(inode,
					 "ppos %d > depth %d", ppos, depth);
			ret = -EIO;
			goto err;
		}
		path[ppos].p_bh = bh;
		path[ppos].p_hdr = eh;
	}

	path[ppos].p_depth = i;
	path[ppos].p_ext = NULL;
	path[ppos].p_idx = NULL;

	/* find extent */
	ext4_ext_binsearch(inode, path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].p_ext)
		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	ext4_ext_drop_refs(path);
	if (alloc)
		kfree(path);
	return ERR_PTR(ret);
}

/*
 * ext4_ext_insert_index:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
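/*
 * For example, if the block holds indices starting at 10, 30 and 50
 * and we insert logical 40, curp->p_idx points at the 30 entry, the
 * 50 entry is shifted right with memmove() and the new index lands
 * between them.
 */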
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				 struct ext4_ext_path *curp,
				 int logical, ext4_fsblk_t ptr)
{
	struct ext4_extent_idx *ix;
	int len, err;

	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		return err;

	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d == ei_block %d!",
				 logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EIO;
	}

	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
		EXT4_ERROR_INODE(inode,
				 "eh_entries %d >= eh_max %d!",
				 le16_to_cpu(curp->p_hdr->eh_entries),
				 le16_to_cpu(curp->p_hdr->eh_max));
		return -EIO;
	}

	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
		ext_debug("insert new index %d after: %llu\n", logical, ptr);
		ix = curp->p_idx + 1;
	} else {
		/* insert before */
		ext_debug("insert new index %d before: %llu\n", logical, ptr);
		ix = curp->p_idx;
	}

	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
	BUG_ON(len < 0);
	if (len > 0) {
		ext_debug("insert new index %d: "
				"move %d indices from 0x%p to 0x%p\n",
				logical, len, ix, ix + 1);
		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
	}

	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
		return -EIO;
	}

	ix->ei_block = cpu_to_le32(logical);
	ext4_idx_store_pblock(ix, ptr);
	le16_add_cpu(&curp->p_hdr->eh_entries, 1);

	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
		return -EIO;
	}

	err = ext4_ext_dirty(handle, inode, curp);
	ext4_std_error(inode->i_sb, err);

	return err;
}

/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
			  unsigned int flags,
			  struct ext4_ext_path *path,
			  struct ext4_extent *newext, int at)
{
	struct buffer_head *bh = NULL;
	int depth = ext_depth(inode);
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	int i = at, k, m, a;
	ext4_fsblk_t newblock, oldblock;
	__le32 border;
	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
	int err = 0;

	/* make decision: where to split? */
	/* FIXME: now decision is simplest: at current extent */

	/* if current leaf will be split, then we should use
	 * border from split point */
	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
		return -EIO;
	}
	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
		border = path[depth].p_ext[1].ee_block;
		ext_debug("leaf will be split."
				" next leaf starts at %d\n",
				  le32_to_cpu(border));
	} else {
		border = newext->ee_block;
		ext_debug("leaf will be added."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	}

	/*
	 * If an error occurs, we break processing and mark the
	 * filesystem read-only. The index won't be inserted and the
	 * tree will remain in a consistent state. The next mount will
	 * repair the buffers too.
	 */

	/*
	 * Get an array to track all allocated blocks.
	 * We need it to handle errors and to free those
	 * blocks upon failure.
	 */
	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
	if (!ablocks)
		return -ENOMEM;

	/* allocate all needed blocks */
	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
						   newext, &err, flags);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
	}

	/* initialize new leaf */
	newblock = ablocks[--a];
	if (unlikely(newblock == 0)) {
		EXT4_ERROR_INODE(inode, "newblock == 0!");
		err = -EIO;
		goto cleanup;
	}
	bh = sb_getblk(inode->i_sb, newblock);
	if (unlikely(!bh)) {
		err = -ENOMEM;
		goto cleanup;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err)
		goto cleanup;

	neh = ext_block_hdr(bh);
	neh->eh_entries = 0;
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;

	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
		     path[depth].p_hdr->eh_max)) {
		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
				 path[depth].p_hdr->eh_entries,
				 path[depth].p_hdr->eh_max);
		err = -EIO;
		goto cleanup;
	}
	/* start copy from next extent */
	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
	ext4_ext_show_move(inode, path, newblock, depth);
	if (m) {
		struct ext4_extent *ex;
		ex = EXT_FIRST_EXTENT(neh);
		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
		le16_add_cpu(&neh->eh_entries, m);
	}

	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto cleanup;
	brelse(bh);
	bh = NULL;

	/* correct old leaf */
	if (m) {
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto cleanup;
		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto cleanup;

	}

	/* create intermediate indexes */
	k = depth - at - 1;
	if (unlikely(k < 0)) {
		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
		err = -EIO;
		goto cleanup;
	}
	if (k)
		ext_debug("create %d intermediate indices\n", k);
	/* insert new index into current index block */
	/* current depth stored in i var */
	i = depth - 1;
	while (k--) {
		oldblock = newblock;
		newblock = ablocks[--a];
		bh = sb_getblk(inode->i_sb, newblock);
		if (unlikely(!bh)) {
			err = -ENOMEM;
			goto cleanup;
		}
		lock_buffer(bh);

		err = ext4_journal_get_create_access(handle, bh);
		if (err)
			goto cleanup;

		neh = ext_block_hdr(bh);
		neh->eh_entries = cpu_to_le16(1);
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);

		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
				i, newblock, le32_to_cpu(border), oldblock);

		/* move remainder of path[i] to the new index block */
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
					 le32_to_cpu(path[i].p_ext->ee_block));
			err = -EIO;
			goto cleanup;
		}
		/* start copy indexes */
		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
				EXT_MAX_INDEX(path[i].p_hdr));
		ext4_ext_show_move(inode, path, newblock, i);
		if (m) {
			memmove(++fidx, path[i].p_idx,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
		ext4_extent_block_csum_set(inode, neh);
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto cleanup;
		brelse(bh);
		bh = NULL;

		/* correct old index */
		if (m) {
			err = ext4_ext_get_access(handle, inode, path + i);
			if (err)
				goto cleanup;
			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
			err = ext4_ext_dirty(handle, inode, path + i);
			if (err)
				goto cleanup;
		}

		i--;
	}

	/* insert new index */
	err = ext4_ext_insert_index(handle, inode, path + at,
				    le32_to_cpu(border), newblock);

cleanup:
	if (bh) {
		if (buffer_locked(bh))
			unlock_buffer(bh);
		brelse(bh);
	}

	if (err) {
		/* free all allocated blocks in error case */
		for (i = 0; i < depth; i++) {
			if (!ablocks[i])
				continue;
			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
		}
	}
	kfree(ablocks);

	return err;
}

/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 */
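/*
 * Schematically (the root always stays in the inode's i_data):
 *
 *	before:	root: [ex1 ex2 ex3 ex4]
 *	after:	root: [idx1] ---> new block: [ex1 ex2 ex3 ex4]
 */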
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
				 unsigned int flags,
				 struct ext4_extent *newext)
{
	struct ext4_extent_header *neh;
	struct buffer_head *bh;
	ext4_fsblk_t newblock;
	int err = 0;

	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
		newext, &err, flags);
	if (newblock == 0)
		return err;

	bh = sb_getblk(inode->i_sb, newblock);
	if (unlikely(!bh))
		return -ENOMEM;
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	/* move top-level index/leaf into new block */
	memmove(bh->b_data, EXT4_I(inode)->i_data,
		sizeof(EXT4_I(inode)->i_data));

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* the old root could have indexes or leaves
	 * so calculate eh_max the right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* Update top-level index: num,max,pointer */
	neh = ext_inode_hdr(inode);
	neh->eh_entries = cpu_to_le16(1);
	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
	if (neh->eh_depth == 0) {
		/* Root extent block becomes index block */
		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
		EXT_FIRST_INDEX(neh)->ei_block =
			EXT_FIRST_EXTENT(neh)->ee_block;
	}
	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

	le16_add_cpu(&neh->eh_depth, 1);
	ext4_mark_inode_dirty(handle, inode);
out:
	brelse(bh);

	return err;
}

/*
 * ext4_ext_create_new_leaf:
 * finds empty index and adds new leaf.
 * if no free index is found, then it requests in-depth growing.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
				    unsigned int mb_flags,
				    unsigned int gb_flags,
				    struct ext4_ext_path *path,
				    struct ext4_extent *newext)
{
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	i = depth = ext_depth(inode);

	/* walk up the tree and look for a free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path, gb_flags);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path, gb_flags);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}

/*
 * search the closest allocated block to the left for *logical
 * and returns it at @logical + its physical address at @phys;
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys
 * the return value contains 0 (success) or an error code
 */
static int ext4_ext_search_left(struct inode *inode,
				struct ext4_ext_path *path,
				ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
				  depth);
				return -EIO;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext4_ext_pblock(ex) + ee_len - 1;
	return 0;
}

/*
 * search the closest allocated block to the right for *logical
 * and returns it at @logical + its physical address at @phys;
 * if *logical is the largest allocated block, the function
 * returns 0 at @phys
 * the return value contains 0 (success) or an error code
 */
static int ext4_ext_search_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
				 struct ext4_extent **ret_ex)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	ext4_fsblk_t block;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "first_extent(path[%d].p_hdr) != ex",
					 depth);
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
						 "ix != EXT_FIRST_INDEX *logical %d!",
						 *logical);
				return -EIO;
			}
		}
		goto found_extent;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		goto found_extent;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	block = ext4_idx_pblock(ix);
	while (++depth < path->p_depth) {
		/* subtract from p_depth to get proper eh_depth */
		bh = read_extent_tree_block(inode, block,
					    path->p_depth - depth, 0);
		if (IS_ERR(bh))
			return PTR_ERR(bh);
		eh = ext_block_hdr(bh);
		ix = EXT_FIRST_INDEX(eh);
		block = ext4_idx_pblock(ix);
		put_bh(bh);
	}

	bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
	if (IS_ERR(bh))
		return PTR_ERR(bh);
	eh = ext_block_hdr(bh);
	ex = EXT_FIRST_EXTENT(eh);
found_extent:
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext4_ext_pblock(ex);
	*ret_ex = ex;
	if (bh)
		put_bh(bh);
	return 0;
}

/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
static ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	if (depth == 0 && path->p_ext == NULL)
		return EXT_MAX_BLOCKS;

	while (depth >= 0) {
		if (depth == path->p_depth) {
			/* leaf */
			if (path[depth].p_ext &&
				path[depth].p_ext !=
					EXT_LAST_EXTENT(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_ext[1].ee_block);
		} else {
			/* index */
			if (path[depth].p_idx !=
					EXT_LAST_INDEX(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_idx[1].ei_block);
		}
		depth--;
	}

	return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	/* a zero-depth tree has no leaf blocks at all */
	if (depth == 0)
		return EXT_MAX_BLOCKS;

	/* go to index block */
	depth--;

	while (depth >= 0) {
		if (path[depth].p_idx !=
				EXT_LAST_INDEX(path[depth].p_hdr))
			return (ext4_lblk_t)
				le32_to_cpu(path[depth].p_idx[1].ei_block);
		depth--;
	}

	return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_correct_indexes:
 * if leaf gets modified and modified extent is first in the leaf,
 * then we have to correct all indexes above.
 * TODO: do we need to correct tree in all cases?
 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	int depth = ext_depth(inode);
	struct ext4_extent *ex;
	__le32 border;
	int k, err = 0;

	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	if (unlikely(ex == NULL || eh == NULL)) {
		EXT4_ERROR_INODE(inode,
				 "ex %p == NULL or eh %p == NULL", ex, eh);
		return -EIO;
	}

	if (depth == 0) {
		/* there is no tree at all */
		return 0;
	}

	if (ex != EXT_FIRST_EXTENT(eh)) {
		/* we correct tree if first leaf got modified only */
		return 0;
	}

	/*
	 * TODO: we need correction if border is smaller than current one
	 */
	k = depth - 1;
	border = path[depth].p_ext->ee_block;
	err = ext4_ext_get_access(handle, inode, path + k);
	if (err)
		return err;
	path[k].p_idx->ei_block = border;
	err = ext4_ext_dirty(handle, inode, path + k);
	if (err)
		return err;

	while (k--) {
		/* change all left-side indexes */
		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
			break;
		err = ext4_ext_get_access(handle, inode, path + k);
		if (err)
			break;
		path[k].p_idx->ei_block = border;
		err = ext4_ext_dirty(handle, inode, path + k);
		if (err)
			break;
	}

	return err;
}

int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
				struct ext4_extent *ex2)
{
	unsigned short ext1_ee_len, ext2_ee_len;

	/*
	 * Make sure that both extents are initialized. We don't merge
	 * unwritten extents so that we can be sure that end_io code has
	 * the extent that was written properly split out and conversion to
	 * initialized is trivial.
	 */
	if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
		return 0;

	ext1_ee_len = ext4_ext_get_actual_len(ex1);
	ext2_ee_len = ext4_ext_get_actual_len(ex2);

	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
			le32_to_cpu(ex2->ee_block))
		return 0;

	/*
	 * To allow future support for preallocated extents to be added
	 * as an RO_COMPAT feature, refuse to merge two extents if
	 * this can result in the top bit of ee_len being set.
	 */
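	/*
	 * (The top bit of ee_len flags an unwritten extent, so an
	 * initialized extent can map at most 1 << 15 = 32768 blocks
	 * and an unwritten one at most 32767.)
	 */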
	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
		return 0;
	if (ext4_ext_is_unwritten(ex1) &&
	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
		return 0;
#ifdef AGGRESSIVE_TEST
	if (ext1_ee_len >= 4)
		return 0;
#endif

	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
		return 1;
	return 0;
}

/*
 * This function tries to merge the "ex" extent to the next extent in the tree.
 * It always tries to merge towards right. If you want to merge towards
 * left, pass "ex - 1" as argument instead of "ex".
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
static int ext4_ext_try_to_merge_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 struct ext4_extent *ex)
{
	struct ext4_extent_header *eh;
	unsigned int depth, len;
	int merge_done = 0, unwritten;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	while (ex < EXT_LAST_EXTENT(eh)) {
		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
			break;
		/* merge with next extent! */
		unwritten = ext4_ext_is_unwritten(ex);
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
				+ ext4_ext_get_actual_len(ex + 1));
		if (unwritten)
			ext4_ext_mark_unwritten(ex);

		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
			len = (EXT_LAST_EXTENT(eh) - ex - 1)
				* sizeof(struct ext4_extent);
			memmove(ex + 1, ex + 2, len);
		}
		le16_add_cpu(&eh->eh_entries, -1);
		merge_done = 1;
		WARN_ON(eh->eh_entries == 0);
		if (!eh->eh_entries)
			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
	}

	return merge_done;
}

/*
 * This function does a very simple check to see if we can collapse
 * an extent tree with a single extent tree leaf block into the inode.
 */
static void ext4_ext_try_to_merge_up(handle_t *handle,
				     struct inode *inode,
				     struct ext4_ext_path *path)
{
	size_t s;
	unsigned max_root = ext4_ext_space_root(inode, 0);
	ext4_fsblk_t blk;

	if ((path[0].p_depth != 1) ||
	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
		return;

	/*
	 * We need to modify the block allocation bitmap and the block
	 * group descriptor to release the extent tree block.  If we
	 * can't get the journal credits, give up.
	 */
	if (ext4_journal_extend(handle, 2))
		return;

	/*
	 * Copy the extent data up to the inode
	 */
	blk = ext4_idx_pblock(path[0].p_idx);
	s = le16_to_cpu(path[1].p_hdr->eh_entries) *
		sizeof(struct ext4_extent_idx);
	s += sizeof(struct ext4_extent_header);

	memcpy(path[0].p_hdr, path[1].p_hdr, s);
	path[0].p_depth = 0;
	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
	path[0].p_hdr->eh_max = cpu_to_le16(max_root);

	brelse(path[1].p_bh);
	ext4_free_blocks(handle, inode, NULL, blk, 1,
			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
}

/*
 * This function tries to merge the @ex extent with its neighbours in
 * the tree: the left neighbour first, then the right one.
 */
static void ext4_ext_try_to_merge(handle_t *handle,
				  struct inode *inode,
				  struct ext4_ext_path *path,
				  struct ext4_extent *ex) {
	struct ext4_extent_header *eh;
	unsigned int depth;
	int merge_done = 0;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	if (ex > EXT_FIRST_EXTENT(eh))
		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);

	if (!merge_done)
		(void) ext4_ext_try_to_merge_right(inode, path, ex);

	ext4_ext_try_to_merge_up(handle, inode, path);
}

/*
 * check if a portion of the "newext" extent overlaps with an
 * existing extent.
 *
 * If there is an overlap discovered, it updates the length of the newext
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
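/*
 * For example, if newext covers blocks 10..29 and the next allocated
 * block is 25, newext->ee_len gets trimmed to 15 (blocks 10..24) and
 * the function returns 1.
 */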
static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
					   struct inode *inode,
					   struct ext4_extent *newext,
					   struct ext4_ext_path *path)
{
	ext4_lblk_t b1, b2;
	unsigned int depth, len1;
	unsigned int ret = 0;

	b1 = le32_to_cpu(newext->ee_block);
	len1 = ext4_ext_get_actual_len(newext);
	depth = ext_depth(inode);
	if (!path[depth].p_ext)
		goto out;
	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));

	/*
	 * get the next allocated block if the extent in the path
	 * is before the requested block(s)
	 */
	if (b2 < b1) {
		b2 = ext4_ext_next_allocated_block(path);
		if (b2 == EXT_MAX_BLOCKS)
			goto out;
		b2 = EXT4_LBLK_CMASK(sbi, b2);
	}

	/* check for wrap through zero on extent logical start block */
	if (b1 + len1 < b1) {
		len1 = EXT_MAX_BLOCKS - b1;
		newext->ee_len = cpu_to_le16(len1);
		ret = 1;
	}

	/* check for overlap */
	if (b1 + len1 > b2) {
		newext->ee_len = cpu_to_le16(b2 - b1);
		ret = 1;
	}
out:
	return ret;
}

/*
 * ext4_ext_insert_extent:
 * tries to merge the requested extent into the existing extent or
 * inserts the requested extent as a new one into the tree,
 * creating a new leaf in the no-space case.
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int gb_flags)
{
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *fex;
	struct ext4_extent *nearex; /* nearest extent */
	struct ext4_ext_path *npath = NULL;
	int depth, len, err;
	ext4_lblk_t next;
	int mb_flags = 0, unwritten;

	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
		return -EIO;
	}
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	eh = path[depth].p_hdr;
	if (unlikely(path[depth].p_hdr == NULL)) {
		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
		return -EIO;
	}

	/* try to insert block into found extent and return */
	if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {

		/*
		 * Try to see whether we should rather test the extent
		 * to the right of ex, or to the left of ex. This is
		 * because ext4_ext_find_extent() can return either an
		 * extent on the left, or on the right of the searched
		 * position. This will make merging more effective.
		 */
1932		if (ex < EXT_LAST_EXTENT(eh) &&
1933		    (le32_to_cpu(ex->ee_block) +
1934		    ext4_ext_get_actual_len(ex) <
1935		    le32_to_cpu(newext->ee_block))) {
1936			ex += 1;
1937			goto prepend;
1938		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1939			   (le32_to_cpu(newext->ee_block) +
1940			   ext4_ext_get_actual_len(newext) <
1941			   le32_to_cpu(ex->ee_block)))
1942			ex -= 1;
1943
1944		/* Try to append newex to the ex */
1945		if (ext4_can_extents_be_merged(inode, ex, newext)) {
			ext_debug("append [%d]%d block to %u:[%d]%d "
				  "(from %llu)\n",
1948				  ext4_ext_is_unwritten(newext),
1949				  ext4_ext_get_actual_len(newext),
1950				  le32_to_cpu(ex->ee_block),
1951				  ext4_ext_is_unwritten(ex),
1952				  ext4_ext_get_actual_len(ex),
1953				  ext4_ext_pblock(ex));
1954			err = ext4_ext_get_access(handle, inode,
1955						  path + depth);
1956			if (err)
1957				return err;
1958			unwritten = ext4_ext_is_unwritten(ex);
1959			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1960					+ ext4_ext_get_actual_len(newext));
1961			if (unwritten)
1962				ext4_ext_mark_unwritten(ex);
1963			eh = path[depth].p_hdr;
1964			nearex = ex;
1965			goto merge;
1966		}
1967
1968prepend:
1969		/* Try to prepend newex to the ex */
1970		if (ext4_can_extents_be_merged(inode, newext, ex)) {
			ext_debug("prepend %u[%d]%d block to %u:[%d]%d "
				  "(from %llu)\n",
1973				  le32_to_cpu(newext->ee_block),
1974				  ext4_ext_is_unwritten(newext),
1975				  ext4_ext_get_actual_len(newext),
1976				  le32_to_cpu(ex->ee_block),
1977				  ext4_ext_is_unwritten(ex),
1978				  ext4_ext_get_actual_len(ex),
1979				  ext4_ext_pblock(ex));
1980			err = ext4_ext_get_access(handle, inode,
1981						  path + depth);
1982			if (err)
1983				return err;
1984
1985			unwritten = ext4_ext_is_unwritten(ex);
1986			ex->ee_block = newext->ee_block;
1987			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
1988			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1989					+ ext4_ext_get_actual_len(newext));
1990			if (unwritten)
1991				ext4_ext_mark_unwritten(ex);
1992			eh = path[depth].p_hdr;
1993			nearex = ex;
1994			goto merge;
1995		}
1996	}
1997
1998	depth = ext_depth(inode);
1999	eh = path[depth].p_hdr;
2000	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2001		goto has_space;
2002
	/* perhaps the next leaf has space for us? */
2004	fex = EXT_LAST_EXTENT(eh);
2005	next = EXT_MAX_BLOCKS;
2006	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2007		next = ext4_ext_next_leaf_block(path);
2008	if (next != EXT_MAX_BLOCKS) {
2009		ext_debug("next leaf block - %u\n", next);
2010		BUG_ON(npath != NULL);
2011		npath = ext4_ext_find_extent(inode, next, NULL, 0);
2012		if (IS_ERR(npath))
2013			return PTR_ERR(npath);
2014		BUG_ON(npath->p_depth != path->p_depth);
2015		eh = npath[depth].p_hdr;
2016		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2017			ext_debug("next leaf isn't full(%d)\n",
2018				  le16_to_cpu(eh->eh_entries));
2019			path = npath;
2020			goto has_space;
2021		}
2022		ext_debug("next leaf has no free space(%d,%d)\n",
2023			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2024	}
2025
2026	/*
2027	 * There is no free space in the found leaf.
2028	 * We're gonna add a new leaf in the tree.
2029	 */
2030	if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2031		mb_flags = EXT4_MB_USE_RESERVED;
2032	err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2033				       path, newext);
2034	if (err)
2035		goto cleanup;
2036	depth = ext_depth(inode);
2037	eh = path[depth].p_hdr;
2038
2039has_space:
2040	nearex = path[depth].p_ext;
2041
2042	err = ext4_ext_get_access(handle, inode, path + depth);
2043	if (err)
2044		goto cleanup;
2045
2046	if (!nearex) {
2047		/* there is no extent in this leaf, create first one */
2048		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2049				le32_to_cpu(newext->ee_block),
2050				ext4_ext_pblock(newext),
2051				ext4_ext_is_unwritten(newext),
2052				ext4_ext_get_actual_len(newext));
2053		nearex = EXT_FIRST_EXTENT(eh);
2054	} else {
2055		if (le32_to_cpu(newext->ee_block)
2056			   > le32_to_cpu(nearex->ee_block)) {
2057			/* Insert after */
2058			ext_debug("insert %u:%llu:[%d]%d before: "
2059					"nearest %p\n",
2060					le32_to_cpu(newext->ee_block),
2061					ext4_ext_pblock(newext),
2062					ext4_ext_is_unwritten(newext),
2063					ext4_ext_get_actual_len(newext),
2064					nearex);
2065			nearex++;
2066		} else {
2067			/* Insert before */
2068			BUG_ON(newext->ee_block == nearex->ee_block);
2069			ext_debug("insert %u:%llu:[%d]%d after: "
2070					"nearest %p\n",
2071					le32_to_cpu(newext->ee_block),
2072					ext4_ext_pblock(newext),
2073					ext4_ext_is_unwritten(newext),
2074					ext4_ext_get_actual_len(newext),
2075					nearex);
2076		}
2077		len = EXT_LAST_EXTENT(eh) - nearex + 1;
2078		if (len > 0) {
2079			ext_debug("insert %u:%llu:[%d]%d: "
2080					"move %d extents from 0x%p to 0x%p\n",
2081					le32_to_cpu(newext->ee_block),
2082					ext4_ext_pblock(newext),
2083					ext4_ext_is_unwritten(newext),
2084					ext4_ext_get_actual_len(newext),
2085					len, nearex, nearex + 1);
2086			memmove(nearex + 1, nearex,
2087				len * sizeof(struct ext4_extent));
2088		}
2089	}
2090
2091	le16_add_cpu(&eh->eh_entries, 1);
2092	path[depth].p_ext = nearex;
2093	nearex->ee_block = newext->ee_block;
2094	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2095	nearex->ee_len = newext->ee_len;
2096
2097merge:
2098	/* try to merge extents */
2099	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2100		ext4_ext_try_to_merge(handle, inode, path, nearex);
2101
2102
2103	/* time to correct all indexes above */
2104	err = ext4_ext_correct_indexes(handle, inode, path);
2105	if (err)
2106		goto cleanup;
2107
2108	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2109
2110cleanup:
2111	if (npath) {
2112		ext4_ext_drop_refs(npath);
2113		kfree(npath);
2114	}
2115	return err;
2116}
2117
2118static int ext4_fill_fiemap_extents(struct inode *inode,
2119				    ext4_lblk_t block, ext4_lblk_t num,
2120				    struct fiemap_extent_info *fieinfo)
2121{
2122	struct ext4_ext_path *path = NULL;
2123	struct ext4_extent *ex;
2124	struct extent_status es;
2125	ext4_lblk_t next, next_del, start = 0, end = 0;
2126	ext4_lblk_t last = block + num;
2127	int exists, depth = 0, err = 0;
2128	unsigned int flags = 0;
2129	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2130
2131	while (block < last && block != EXT_MAX_BLOCKS) {
2132		num = last - block;
2133		/* find extent for this block */
2134		down_read(&EXT4_I(inode)->i_data_sem);
2135
2136		if (path && ext_depth(inode) != depth) {
2137			/* depth was changed. we have to realloc path */
2138			kfree(path);
2139			path = NULL;
2140		}
2141
2142		path = ext4_ext_find_extent(inode, block, path, 0);
2143		if (IS_ERR(path)) {
2144			up_read(&EXT4_I(inode)->i_data_sem);
2145			err = PTR_ERR(path);
2146			path = NULL;
2147			break;
2148		}
2149
2150		depth = ext_depth(inode);
2151		if (unlikely(path[depth].p_hdr == NULL)) {
2152			up_read(&EXT4_I(inode)->i_data_sem);
2153			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2154			err = -EIO;
2155			break;
2156		}
2157		ex = path[depth].p_ext;
2158		next = ext4_ext_next_allocated_block(path);
2159		ext4_ext_drop_refs(path);
2160
2161		flags = 0;
2162		exists = 0;
2163		if (!ex) {
2164			/* there is no extent yet, so try to allocate
2165			 * all requested space */
2166			start = block;
2167			end = block + num;
2168		} else if (le32_to_cpu(ex->ee_block) > block) {
2169			/* need to allocate space before found extent */
2170			start = block;
2171			end = le32_to_cpu(ex->ee_block);
2172			if (block + num < end)
2173				end = block + num;
2174		} else if (block >= le32_to_cpu(ex->ee_block)
2175					+ ext4_ext_get_actual_len(ex)) {
2176			/* need to allocate space after found extent */
2177			start = block;
2178			end = block + num;
2179			if (end >= next)
2180				end = next;
2181		} else if (block >= le32_to_cpu(ex->ee_block)) {
2182			/*
2183			 * some part of requested space is covered
2184			 * by found extent
2185			 */
2186			start = block;
2187			end = le32_to_cpu(ex->ee_block)
2188				+ ext4_ext_get_actual_len(ex);
2189			if (block + num < end)
2190				end = block + num;
2191			exists = 1;
2192		} else {
2193			BUG();
2194		}
2195		BUG_ON(end <= start);
2196
2197		if (!exists) {
2198			es.es_lblk = start;
2199			es.es_len = end - start;
2200			es.es_pblk = 0;
2201		} else {
2202			es.es_lblk = le32_to_cpu(ex->ee_block);
2203			es.es_len = ext4_ext_get_actual_len(ex);
2204			es.es_pblk = ext4_ext_pblock(ex);
2205			if (ext4_ext_is_unwritten(ex))
2206				flags |= FIEMAP_EXTENT_UNWRITTEN;
2207		}
2208
2209		/*
2210		 * Find delayed extent and update es accordingly. We call
		 * it even in the !exists case to find out whether es is the
2212		 * last existing extent or not.
2213		 */
2214		next_del = ext4_find_delayed_extent(inode, &es);
2215		if (!exists && next_del) {
2216			exists = 1;
2217			flags |= (FIEMAP_EXTENT_DELALLOC |
2218				  FIEMAP_EXTENT_UNKNOWN);
2219		}
2220		up_read(&EXT4_I(inode)->i_data_sem);
2221
2222		if (unlikely(es.es_len == 0)) {
2223			EXT4_ERROR_INODE(inode, "es.es_len == 0");
2224			err = -EIO;
2225			break;
2226		}
2227
2228		/*
2229		 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
		 * We need to check next == EXT_MAX_BLOCKS because an extent
		 * can carry both unwritten and delayed status at once: when
		 * an extent is delayed allocated and then allocated by
		 * fallocate, the status tree tracks both states in a single
		 * extent.
		 *
		 * So we could return an unwritten and delayed extent whose
		 * start block is equal to 'next'.
2238		 */
2239		if (next == next_del && next == EXT_MAX_BLOCKS) {
2240			flags |= FIEMAP_EXTENT_LAST;
2241			if (unlikely(next_del != EXT_MAX_BLOCKS ||
2242				     next != EXT_MAX_BLOCKS)) {
2243				EXT4_ERROR_INODE(inode,
2244						 "next extent == %u, next "
2245						 "delalloc extent = %u",
2246						 next, next_del);
2247				err = -EIO;
2248				break;
2249			}
2250		}
2251
2252		if (exists) {
2253			err = fiemap_fill_next_extent(fieinfo,
2254				(__u64)es.es_lblk << blksize_bits,
2255				(__u64)es.es_pblk << blksize_bits,
2256				(__u64)es.es_len << blksize_bits,
2257				flags);
2258			if (err < 0)
2259				break;
2260			if (err == 1) {
2261				err = 0;
2262				break;
2263			}
2264		}
2265
2266		block = es.es_lblk + es.es_len;
2267	}
2268
2269	if (path) {
2270		ext4_ext_drop_refs(path);
2271		kfree(path);
2272	}
2273
2274	return err;
2275}
2276
2277/*
2278 * ext4_ext_put_gap_in_cache:
2279 * calculate boundaries of the gap that the requested block fits into
2280 * and cache this gap
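 *
 * Sketch: if block 50 is requested and it falls between an extent that
 * ends at block 9 and the next allocated extent starting at block 100,
 * the gap 10..99 is inserted into the extent status tree as a hole,
 * provided no delayed-allocation range overlaps it.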
2281 */
2282static void
2283ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2284				ext4_lblk_t block)
2285{
2286	int depth = ext_depth(inode);
2287	unsigned long len = 0;
2288	ext4_lblk_t lblock = 0;
2289	struct ext4_extent *ex;
2290
2291	ex = path[depth].p_ext;
2292	if (ex == NULL) {
2293		/*
2294		 * there is no extent yet, so gap is [0;-] and we
2295		 * don't cache it
2296		 */
2297		ext_debug("cache gap(whole file):");
2298	} else if (block < le32_to_cpu(ex->ee_block)) {
2299		lblock = block;
2300		len = le32_to_cpu(ex->ee_block) - block;
2301		ext_debug("cache gap(before): %u [%u:%u]",
2302				block,
2303				le32_to_cpu(ex->ee_block),
2304				 ext4_ext_get_actual_len(ex));
2305		if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2306			ext4_es_insert_extent(inode, lblock, len, ~0,
2307					      EXTENT_STATUS_HOLE);
2308	} else if (block >= le32_to_cpu(ex->ee_block)
2309			+ ext4_ext_get_actual_len(ex)) {
2310		ext4_lblk_t next;
2311		lblock = le32_to_cpu(ex->ee_block)
2312			+ ext4_ext_get_actual_len(ex);
2313
2314		next = ext4_ext_next_allocated_block(path);
2315		ext_debug("cache gap(after): [%u:%u] %u",
2316				le32_to_cpu(ex->ee_block),
2317				ext4_ext_get_actual_len(ex),
2318				block);
2319		BUG_ON(next == lblock);
2320		len = next - lblock;
2321		if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2322			ext4_es_insert_extent(inode, lblock, len, ~0,
2323					      EXTENT_STATUS_HOLE);
2324	} else {
2325		BUG();
2326	}
2327
2328	ext_debug(" -> %u:%lu\n", lblock, len);
2329}
2330
2331/*
2332 * ext4_ext_rm_idx:
2333 * removes index from the index block.
2334 */
2335static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2336			struct ext4_ext_path *path, int depth)
2337{
2338	int err;
2339	ext4_fsblk_t leaf;
2340
2341	/* free index block */
2342	depth--;
2343	path = path + depth;
2344	leaf = ext4_idx_pblock(path->p_idx);
2345	if (unlikely(path->p_hdr->eh_entries == 0)) {
2346		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2347		return -EIO;
2348	}
2349	err = ext4_ext_get_access(handle, inode, path);
2350	if (err)
2351		return err;
2352
2353	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2354		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2355		len *= sizeof(struct ext4_extent_idx);
2356		memmove(path->p_idx, path->p_idx + 1, len);
2357	}
2358
2359	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2360	err = ext4_ext_dirty(handle, inode, path);
2361	if (err)
2362		return err;
2363	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2364	trace_ext4_ext_rm_idx(inode, leaf);
2365
2366	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2367			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2368
2369	while (--depth >= 0) {
2370		if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2371			break;
2372		path--;
2373		err = ext4_ext_get_access(handle, inode, path);
2374		if (err)
2375			break;
2376		path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2377		err = ext4_ext_dirty(handle, inode, path);
2378		if (err)
2379			break;
2380	}
2381	return err;
2382}
2383
2384/*
2385 * ext4_ext_calc_credits_for_single_extent:
 * This routine returns the max. credits needed to insert an extent
 * into the extent tree.
 * When passing the actual path, the caller should calculate credits
 * under i_data_sem.
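 *
 * For example (the constants come from the code below): if the leaf
 * pointed to by the path still has a free slot, the estimate is
 * 2 blocks (1 bitmap + 1 block group descriptor) plus
 * EXT4_META_TRANS_BLOCKS(sb); otherwise it falls back to
 * ext4_chunk_trans_blocks().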
2390 */
2391int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2392						struct ext4_ext_path *path)
2393{
2394	if (path) {
2395		int depth = ext_depth(inode);
2396		int ret = 0;
2397
		/* perhaps there is space in the leaf? */
2399		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2400				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2401
2402			/*
			 *  There is some space in the leaf, so no
			 *  need to account for the leaf block credit.
			 *
			 *  Bitmaps, block group descriptor blocks
			 *  and other metadata blocks still need to be
			 *  accounted for.
2409			 */
2410			/* 1 bitmap, 1 block group descriptor */
2411			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2412			return ret;
2413		}
2414	}
2415
2416	return ext4_chunk_trans_blocks(inode, nrblocks);
2417}
2418
2419/*
2420 * How many index/leaf blocks need to change/allocate to add @extents extents?
2421 *
 * If we add a single extent, then in the worst case, each tree level
 * index/leaf may need to be changed in case of a tree split.
2424 *
2425 * If more extents are inserted, they could cause the whole tree split more
2426 * than once, but this is really rare.
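 *
 * Example (derived from the formula below): for a tree of depth 2,
 * inserting a single extent is budgeted at 2 * 2 = 4 index/leaf
 * blocks, and inserting several extents at 3 * 2 = 6.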
2427 */
2428int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2429{
2430	int index;
2431	int depth;
2432
2433	/* If we are converting the inline data, only one is needed here. */
2434	if (ext4_has_inline_data(inode))
2435		return 1;
2436
2437	depth = ext_depth(inode);
2438
2439	if (extents <= 1)
2440		index = depth * 2;
2441	else
2442		index = depth * 3;
2443
2444	return index;
2445}
2446
2447static inline int get_default_free_blocks_flags(struct inode *inode)
2448{
2449	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2450		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2451	else if (ext4_should_journal_data(inode))
2452		return EXT4_FREE_BLOCKS_FORGET;
2453	return 0;
2454}
2455
2456static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2457			      struct ext4_extent *ex,
2458			      long long *partial_cluster,
2459			      ext4_lblk_t from, ext4_lblk_t to)
2460{
2461	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2462	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2463	ext4_fsblk_t pblk;
2464	int flags = get_default_free_blocks_flags(inode);
2465
2466	/*
2467	 * For bigalloc file systems, we never free a partial cluster
2468	 * at the beginning of the extent.  Instead, we make a note
2469	 * that we tried freeing the cluster, and check to see if we
2470	 * need to free it on a subsequent call to ext4_remove_blocks,
2471	 * or at the end of the ext4_truncate() operation.
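	 *
	 * Sketch (assuming a cluster ratio of 4 blocks per cluster): if
	 * an extent starts at physical block 102, blocks 100-103 share a
	 * cluster, and blocks 100-101 may still belong to another extent.
	 * Removing blocks 102-103 therefore must not free the cluster
	 * outright; it is only noted in *partial_cluster for a later
	 * decision.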
2472	 */
2473	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2474
2475	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2476	/*
2477	 * If we have a partial cluster, and it's different from the
2478	 * cluster of the last block, we need to explicitly free the
2479	 * partial cluster here.
2480	 */
2481	pblk = ext4_ext_pblock(ex) + ee_len - 1;
2482	if ((*partial_cluster > 0) &&
2483	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2484		ext4_free_blocks(handle, inode, NULL,
2485				 EXT4_C2B(sbi, *partial_cluster),
2486				 sbi->s_cluster_ratio, flags);
2487		*partial_cluster = 0;
2488	}
2489
2490#ifdef EXTENTS_STATS
2491	{
2492		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2493		spin_lock(&sbi->s_ext_stats_lock);
2494		sbi->s_ext_blocks += ee_len;
2495		sbi->s_ext_extents++;
2496		if (ee_len < sbi->s_ext_min)
2497			sbi->s_ext_min = ee_len;
2498		if (ee_len > sbi->s_ext_max)
2499			sbi->s_ext_max = ee_len;
2500		if (ext_depth(inode) > sbi->s_depth_max)
2501			sbi->s_depth_max = ext_depth(inode);
2502		spin_unlock(&sbi->s_ext_stats_lock);
2503	}
2504#endif
2505	if (from >= le32_to_cpu(ex->ee_block)
2506	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2507		/* tail removal */
2508		ext4_lblk_t num;
2509		unsigned int unaligned;
2510
2511		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2512		pblk = ext4_ext_pblock(ex) + ee_len - num;
2513		/*
		 * Usually we want to free the partial cluster at the end of
		 * the extent, except when the cluster is still used by
		 * another extent (partial_cluster is negative).
2517		 */
2518		if (*partial_cluster < 0 &&
2519		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
2520			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2521
2522		ext_debug("free last %u blocks starting %llu partial %lld\n",
2523			  num, pblk, *partial_cluster);
2524		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2525		/*
2526		 * If the block range to be freed didn't start at the
2527		 * beginning of a cluster, and we removed the entire
2528		 * extent and the cluster is not used by any other extent,
2529		 * save the partial cluster here, since we might need to
		 * delete it if we determine that the truncate operation has
2531		 * removed all of the blocks in the cluster.
2532		 *
2533		 * On the other hand, if we did not manage to free the whole
2534		 * extent, we have to mark the cluster as used (store negative
2535		 * cluster number in partial_cluster).
2536		 */
2537		unaligned = EXT4_PBLK_COFF(sbi, pblk);
2538		if (unaligned && (ee_len == num) &&
2539		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
2540			*partial_cluster = EXT4_B2C(sbi, pblk);
2541		else if (unaligned)
2542			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
2543		else if (*partial_cluster > 0)
2544			*partial_cluster = 0;
2545	} else
2546		ext4_error(sbi->s_sb, "strange request: removal(2) "
2547			   "%u-%u from %u:%u\n",
2548			   from, to, le32_to_cpu(ex->ee_block), ee_len);
2549	return 0;
2550}
2551
2552
2553/*
 * ext4_ext_rm_leaf() removes the extents associated with the
 * blocks appearing between "start" and "end", and splits the extent
 * if "start" and "end" appear in the same extent.
2557 *
2558 * @handle: The journal handle
2559 * @inode:  The files inode
2560 * @path:   The path to the leaf
2561 * @partial_cluster: The cluster which we'll have to free if all extents
 *                   have been released from it. It becomes negative if
 *                   the cluster is still in use.
2564 * @start:  The first block to remove
2565 * @end:   The last block to remove
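 *
 * For example, truncating from block 100 (start = 100, end =
 * EXT_MAX_BLOCKS - 1) removes every extent that begins at or after
 * block 100 and trims the tail of an extent that straddles block 100.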
2566 */
2567static int
2568ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2569		 struct ext4_ext_path *path,
2570		 long long *partial_cluster,
2571		 ext4_lblk_t start, ext4_lblk_t end)
2572{
2573	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2574	int err = 0, correct_index = 0;
2575	int depth = ext_depth(inode), credits;
2576	struct ext4_extent_header *eh;
2577	ext4_lblk_t a, b;
2578	unsigned num;
2579	ext4_lblk_t ex_ee_block;
2580	unsigned short ex_ee_len;
2581	unsigned unwritten = 0;
2582	struct ext4_extent *ex;
2583	ext4_fsblk_t pblk;
2584
	/* the header must have been checked already in ext4_ext_remove_space() */
2586	ext_debug("truncate since %u in leaf to %u\n", start, end);
2587	if (!path[depth].p_hdr)
2588		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2589	eh = path[depth].p_hdr;
2590	if (unlikely(path[depth].p_hdr == NULL)) {
2591		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2592		return -EIO;
2593	}
2594	/* find where to start removing */
2595	ex = path[depth].p_ext;
2596	if (!ex)
2597		ex = EXT_LAST_EXTENT(eh);
2598
2599	ex_ee_block = le32_to_cpu(ex->ee_block);
2600	ex_ee_len = ext4_ext_get_actual_len(ex);
2601
2602	/*
2603	 * If we're starting with an extent other than the last one in the
2604	 * node, we need to see if it shares a cluster with the extent to
2605	 * the right (towards the end of the file). If its leftmost cluster
2606	 * is this extent's rightmost cluster and it is not cluster aligned,
2607	 * we'll mark it as a partial that is not to be deallocated.
2608	 */
2609
2610	if (ex != EXT_LAST_EXTENT(eh)) {
2611		ext4_fsblk_t current_pblk, right_pblk;
2612		long long current_cluster, right_cluster;
2613
2614		current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2615		current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2616		right_pblk = ext4_ext_pblock(ex + 1);
2617		right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2618		if (current_cluster == right_cluster &&
2619			EXT4_PBLK_COFF(sbi, right_pblk))
2620			*partial_cluster = -right_cluster;
2621	}
2622
2623	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2624
2625	while (ex >= EXT_FIRST_EXTENT(eh) &&
2626			ex_ee_block + ex_ee_len > start) {
2627
2628		if (ext4_ext_is_unwritten(ex))
2629			unwritten = 1;
2630		else
2631			unwritten = 0;
2632
2633		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2634			  unwritten, ex_ee_len);
2635		path[depth].p_ext = ex;
2636
2637		a = ex_ee_block > start ? ex_ee_block : start;
2638		b = ex_ee_block+ex_ee_len - 1 < end ?
2639			ex_ee_block+ex_ee_len - 1 : end;
2640
2641		ext_debug("  border %u:%u\n", a, b);
2642
2643		/* If this extent is beyond the end of the hole, skip it */
2644		if (end < ex_ee_block) {
2645			/*
2646			 * We're going to skip this extent and move to another,
2647			 * so if this extent is not cluster aligned we have
2648			 * to mark the current cluster as used to avoid
2649			 * accidentally freeing it later on
2650			 */
2651			pblk = ext4_ext_pblock(ex);
2652			if (EXT4_PBLK_COFF(sbi, pblk))
2653				*partial_cluster =
2654					-((long long)EXT4_B2C(sbi, pblk));
2655			ex--;
2656			ex_ee_block = le32_to_cpu(ex->ee_block);
2657			ex_ee_len = ext4_ext_get_actual_len(ex);
2658			continue;
2659		} else if (b != ex_ee_block + ex_ee_len - 1) {
2660			EXT4_ERROR_INODE(inode,
2661					 "can not handle truncate %u:%u "
2662					 "on extent %u:%u",
2663					 start, end, ex_ee_block,
2664					 ex_ee_block + ex_ee_len - 1);
2665			err = -EIO;
2666			goto out;
2667		} else if (a != ex_ee_block) {
2668			/* remove tail of the extent */
2669			num = a - ex_ee_block;
2670		} else {
2671			/* remove whole extent: excellent! */
2672			num = 0;
2673		}
2674		/*
		 * 3 for leaf, sb, and inode plus 2 (bitmap and group
		 * descriptor) for each block group; assume two block
2677		 * groups plus ex_ee_len/blocks_per_block_group for
2678		 * the worst case
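		 * (e.g. with 4KiB blocks there are 32768 blocks per group,
		 * so a 65536-block extent yields 7 + 2*2 = 11 credits
		 * before the index and quota additions below)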
2679		 */
2680		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2681		if (ex == EXT_FIRST_EXTENT(eh)) {
2682			correct_index = 1;
2683			credits += (ext_depth(inode)) + 1;
2684		}
2685		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2686
2687		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2688		if (err)
2689			goto out;
2690
2691		err = ext4_ext_get_access(handle, inode, path + depth);
2692		if (err)
2693			goto out;
2694
2695		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2696					 a, b);
2697		if (err)
2698			goto out;
2699
2700		if (num == 0)
2701			/* this extent is removed; mark slot entirely unused */
2702			ext4_ext_store_pblock(ex, 0);
2703
2704		ex->ee_len = cpu_to_le16(num);
2705		/*
2706		 * Do not mark unwritten if all the blocks in the
2707		 * extent have been removed.
2708		 */
2709		if (unwritten && num)
2710			ext4_ext_mark_unwritten(ex);
2711		/*
2712		 * If the extent was completely released,
2713		 * we need to remove it from the leaf
2714		 */
2715		if (num == 0) {
2716			if (end != EXT_MAX_BLOCKS - 1) {
2717				/*
2718				 * For hole punching, we need to scoot all the
2719				 * extents up when an extent is removed so that
				 * we don't have blank extents in the middle
2721				 */
2722				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2723					sizeof(struct ext4_extent));
2724
2725				/* Now get rid of the one at the end */
2726				memset(EXT_LAST_EXTENT(eh), 0,
2727					sizeof(struct ext4_extent));
2728			}
2729			le16_add_cpu(&eh->eh_entries, -1);
2730		} else if (*partial_cluster > 0)
2731			*partial_cluster = 0;
2732
2733		err = ext4_ext_dirty(handle, inode, path + depth);
2734		if (err)
2735			goto out;
2736
2737		ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2738				ext4_ext_pblock(ex));
2739		ex--;
2740		ex_ee_block = le32_to_cpu(ex->ee_block);
2741		ex_ee_len = ext4_ext_get_actual_len(ex);
2742	}
2743
2744	if (correct_index && eh->eh_entries)
2745		err = ext4_ext_correct_indexes(handle, inode, path);
2746
2747	/*
2748	 * If there's a partial cluster and at least one extent remains in
2749	 * the leaf, free the partial cluster if it isn't shared with the
2750	 * current extent.  If there's a partial cluster and no extents
2751	 * remain in the leaf, it can't be freed here.  It can only be
2752	 * freed when it's possible to determine if it's not shared with
2753	 * any other extent - when the next leaf is processed or when space
2754	 * removal is complete.
2755	 */
2756	if (*partial_cluster > 0 && eh->eh_entries &&
2757	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2758	     *partial_cluster)) {
2759		int flags = get_default_free_blocks_flags(inode);
2760
2761		ext4_free_blocks(handle, inode, NULL,
2762				 EXT4_C2B(sbi, *partial_cluster),
2763				 sbi->s_cluster_ratio, flags);
2764		*partial_cluster = 0;
2765	}
2766
2767	/* if this leaf is free, then we should
2768	 * remove it from index block above */
2769	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2770		err = ext4_ext_rm_idx(handle, inode, path, depth);
2771
2772out:
2773	return err;
2774}
2775
2776/*
2777 * ext4_ext_more_to_rm:
2778 * returns 1 if current index has to be freed (even partial)
2779 */
2780static int
2781ext4_ext_more_to_rm(struct ext4_ext_path *path)
2782{
2783	BUG_ON(path->p_idx == NULL);
2784
2785	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2786		return 0;
2787
2788	/*
	 * if truncation on a deeper level happened, it wasn't partial,
	 * so we have to consider the current index for truncation
2791	 */
2792	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2793		return 0;
2794	return 1;
2795}
2796
2797int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2798			  ext4_lblk_t end)
2799{
2800	struct super_block *sb = inode->i_sb;
2801	int depth = ext_depth(inode);
2802	struct ext4_ext_path *path = NULL;
2803	long long partial_cluster = 0;
2804	handle_t *handle;
2805	int i = 0, err = 0;
2806
2807	ext_debug("truncate since %u to %u\n", start, end);
2808
2809	/* probably first extent we're gonna free will be last in block */
2810	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
2811	if (IS_ERR(handle))
2812		return PTR_ERR(handle);
2813
2814again:
2815	trace_ext4_ext_remove_space(inode, start, end, depth);
2816
2817	/*
	 * Check if we are removing extents from inside the extent tree. If
	 * so, we are going to punch a hole inside the extent tree, so we
	 * have to check whether the extent covering the last block to
	 * remove needs to be split, so that the part of it to be removed
	 * can be handled easily in ext4_ext_rm_leaf().
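	 *
	 * For example, punching out blocks 10..20 of an extent covering
	 * blocks 5..29 first splits it at block 21, so that blocks 10..20
	 * become the removable tail of the first half.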
2823	 */
2824	if (end < EXT_MAX_BLOCKS - 1) {
2825		struct ext4_extent *ex;
2826		ext4_lblk_t ee_block;
2827
2828		/* find extent for this block */
2829		path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2830		if (IS_ERR(path)) {
2831			ext4_journal_stop(handle);
2832			return PTR_ERR(path);
2833		}
2834		depth = ext_depth(inode);
		/* The leaf may be missing only if the inode has no blocks at all */
2836		ex = path[depth].p_ext;
2837		if (!ex) {
2838			if (depth) {
2839				EXT4_ERROR_INODE(inode,
2840						 "path[%d].p_hdr == NULL",
2841						 depth);
2842				err = -EIO;
2843			}
2844			goto out;
2845		}
2846
2847		ee_block = le32_to_cpu(ex->ee_block);
2848
2849		/*
2850		 * See if the last block is inside the extent, if so split
2851		 * the extent at 'end' block so we can easily remove the
2852		 * tail of the first part of the split extent in
2853		 * ext4_ext_rm_leaf().
2854		 */
2855		if (end >= ee_block &&
2856		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2857			int split_flag = 0;
2858
2859			if (ext4_ext_is_unwritten(ex))
2860				split_flag = EXT4_EXT_MARK_UNWRIT1 |
2861					     EXT4_EXT_MARK_UNWRIT2;
2862
2863			/*
2864			 * Split the extent in two so that 'end' is the last
			 * block in the first new extent. Also, we should not
			 * fail to remove space due to ENOSPC, so try to use
			 * reserved blocks if that happens.
2868			 */
2869			err = ext4_split_extent_at(handle, inode, path,
2870					end + 1, split_flag,
2871					EXT4_EX_NOCACHE |
2872					EXT4_GET_BLOCKS_PRE_IO |
2873					EXT4_GET_BLOCKS_METADATA_NOFAIL);
2874
2875			if (err < 0)
2876				goto out;
2877		}
2878	}
2879	/*
	 * We start scanning from the right side, freeing all the blocks
2881	 * after i_size and walking into the tree depth-wise.
2882	 */
2883	depth = ext_depth(inode);
2884	if (path) {
2885		int k = i = depth;
2886		while (--k > 0)
2887			path[k].p_block =
2888				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2889	} else {
2890		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
2891			       GFP_NOFS);
2892		if (path == NULL) {
2893			ext4_journal_stop(handle);
2894			return -ENOMEM;
2895		}
2896		path[0].p_depth = depth;
2897		path[0].p_hdr = ext_inode_hdr(inode);
2898		i = 0;
2899
2900		if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2901			err = -EIO;
2902			goto out;
2903		}
2904	}
2905	err = 0;
2906
2907	while (i >= 0 && err == 0) {
2908		if (i == depth) {
2909			/* this is leaf block */
2910			err = ext4_ext_rm_leaf(handle, inode, path,
2911					       &partial_cluster, start,
2912					       end);
2913			/* root level has p_bh == NULL, brelse() eats this */
2914			brelse(path[i].p_bh);
2915			path[i].p_bh = NULL;
2916			i--;
2917			continue;
2918		}
2919
2920		/* this is index block */
2921		if (!path[i].p_hdr) {
2922			ext_debug("initialize header\n");
2923			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2924		}
2925
2926		if (!path[i].p_idx) {
2927			/* this level hasn't been touched yet */
2928			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2929			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2930			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2931				  path[i].p_hdr,
2932				  le16_to_cpu(path[i].p_hdr->eh_entries));
2933		} else {
			/* we were already here, move on to the next index */
2935			path[i].p_idx--;
2936		}
2937
2938		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2939				i, EXT_FIRST_INDEX(path[i].p_hdr),
2940				path[i].p_idx);
2941		if (ext4_ext_more_to_rm(path + i)) {
2942			struct buffer_head *bh;
2943			/* go to the next level */
2944			ext_debug("move to level %d (block %llu)\n",
2945				  i + 1, ext4_idx_pblock(path[i].p_idx));
2946			memset(path + i + 1, 0, sizeof(*path));
2947			bh = read_extent_tree_block(inode,
2948				ext4_idx_pblock(path[i].p_idx), depth - i - 1,
2949				EXT4_EX_NOCACHE);
2950			if (IS_ERR(bh)) {
2951				/* should we reset i_size? */
2952				err = PTR_ERR(bh);
2953				break;
2954			}
2955			/* Yield here to deal with large extent trees.
2956			 * Should be a no-op if we did IO above. */
2957			cond_resched();
2958			if (WARN_ON(i + 1 > depth)) {
2959				err = -EIO;
2960				break;
2961			}
2962			path[i + 1].p_bh = bh;
2963
2964			/* save actual number of indexes since this
2965			 * number is changed at the next iteration */
2966			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2967			i++;
2968		} else {
2969			/* we finished processing this index, go up */
2970			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2971				/* index is empty, remove it;
				 * handle must already be prepared by
				 * ext4_ext_rm_leaf() */
2974				err = ext4_ext_rm_idx(handle, inode, path, i);
2975			}
2976			/* root level has p_bh == NULL, brelse() eats this */
2977			brelse(path[i].p_bh);
2978			path[i].p_bh = NULL;
2979			i--;
2980			ext_debug("return to level %d\n", i);
2981		}
2982	}
2983
2984	trace_ext4_ext_remove_space_done(inode, start, end, depth,
2985			partial_cluster, path->p_hdr->eh_entries);
2986
2987	/* If we still have something in the partial cluster and we have removed
2988	 * even the first extent, then we should free the blocks in the partial
2989	 * cluster as well. */
2990	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
2991		int flags = get_default_free_blocks_flags(inode);
2992
2993		ext4_free_blocks(handle, inode, NULL,
2994				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2995				 EXT4_SB(sb)->s_cluster_ratio, flags);
2996		partial_cluster = 0;
2997	}
2998
2999	/* TODO: flexible tree reduction should be here */
3000	if (path->p_hdr->eh_entries == 0) {
3001		/*
3002		 * truncate to zero freed all the tree,
3003		 * so we need to correct eh_depth
3004		 */
3005		err = ext4_ext_get_access(handle, inode, path);
3006		if (err == 0) {
3007			ext_inode_hdr(inode)->eh_depth = 0;
3008			ext_inode_hdr(inode)->eh_max =
3009				cpu_to_le16(ext4_ext_space_root(inode, 0));
3010			err = ext4_ext_dirty(handle, inode, path);
3011		}
3012	}
3013out:
3014	ext4_ext_drop_refs(path);
3015	kfree(path);
3016	if (err == -EAGAIN) {
3017		path = NULL;
3018		goto again;
3019	}
3020	ext4_journal_stop(handle);
3021
3022	return err;
3023}
3024
3025/*
3026 * called at mount time
3027 */
3028void ext4_ext_init(struct super_block *sb)
3029{
3030	/*
3031	 * possible initialization would be here
3032	 */
3033
3034	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
3035#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3036		printk(KERN_INFO "EXT4-fs: file extents enabled"
3037#ifdef AGGRESSIVE_TEST
3038		       ", aggressive tests"
3039#endif
3040#ifdef CHECK_BINSEARCH
3041		       ", check binsearch"
3042#endif
3043#ifdef EXTENTS_STATS
3044		       ", stats"
3045#endif
3046		       "\n");
3047#endif
3048#ifdef EXTENTS_STATS
3049		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3050		EXT4_SB(sb)->s_ext_min = 1 << 30;
3051		EXT4_SB(sb)->s_ext_max = 0;
3052#endif
3053	}
3054}
3055
3056/*
3057 * called at umount time
3058 */
3059void ext4_ext_release(struct super_block *sb)
3060{
3061	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
3062		return;
3063
3064#ifdef EXTENTS_STATS
3065	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3066		struct ext4_sb_info *sbi = EXT4_SB(sb);
3067		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3068			sbi->s_ext_blocks, sbi->s_ext_extents,
3069			sbi->s_ext_blocks / sbi->s_ext_extents);
3070		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3071			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3072	}
3073#endif
3074}
3075
3076static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3077{
3078	ext4_lblk_t  ee_block;
3079	ext4_fsblk_t ee_pblock;
3080	unsigned int ee_len;
3081
3082	ee_block  = le32_to_cpu(ex->ee_block);
3083	ee_len    = ext4_ext_get_actual_len(ex);
3084	ee_pblock = ext4_ext_pblock(ex);
3085
3086	if (ee_len == 0)
3087		return 0;
3088
3089	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3090				     EXTENT_STATUS_WRITTEN);
3091}
3092
3093/* FIXME!! we need to try to merge to left or right after zero-out  */
3094static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3095{
3096	ext4_fsblk_t ee_pblock;
3097	unsigned int ee_len;
3098	int ret;
3099
3100	ee_len    = ext4_ext_get_actual_len(ex);
3101	ee_pblock = ext4_ext_pblock(ex);
3102
3103	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
3104	if (ret > 0)
3105		ret = 0;
3106
3107	return ret;
3108}
3109
3110/*
 * ext4_split_extent_at() splits an extent at a given block.
3112 *
3113 * @handle: the journal handle
3114 * @inode: the file inode
3115 * @path: the path to the extent
 * @split: the logical block where the extent is split.
 * @split_flag: indicates whether the extent can be zeroed out if the split
 *		 fails, and the states (initialized or unwritten) of the new
 *		 extents.
 * @flags: flags used to insert the new extent into the extent tree.
 *
 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
 * states of which are determined by @split_flag.
3124 *
3125 * There are two cases:
 *   a> the extent is split into two extents.
 *   b> no split is needed, and the extent is just marked.
3128 *
3129 * return 0 on success.
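 *
 * Example: splitting the extent [100, 200) at block 150 yields
 * [100, 150) and [150, 200); the second half starts at physical block
 * ext4_ext_pblock(ex) + 50, which is what 'newblock' computes below.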
3130 */
3131static int ext4_split_extent_at(handle_t *handle,
3132			     struct inode *inode,
3133			     struct ext4_ext_path *path,
3134			     ext4_lblk_t split,
3135			     int split_flag,
3136			     int flags)
3137{
3138	ext4_fsblk_t newblock;
3139	ext4_lblk_t ee_block;
3140	struct ext4_extent *ex, newex, orig_ex, zero_ex;
3141	struct ext4_extent *ex2 = NULL;
3142	unsigned int ee_len, depth;
3143	int err = 0;
3144
3145	BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3146	       (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3147
	ext_debug("ext4_split_extent_at: inode %lu, logical "
3149		"block %llu\n", inode->i_ino, (unsigned long long)split);
3150
3151	ext4_ext_show_leaf(inode, path);
3152
3153	depth = ext_depth(inode);
3154	ex = path[depth].p_ext;
3155	ee_block = le32_to_cpu(ex->ee_block);
3156	ee_len = ext4_ext_get_actual_len(ex);
3157	newblock = split - ee_block + ext4_ext_pblock(ex);
3158
3159	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3160	BUG_ON(!ext4_ext_is_unwritten(ex) &&
3161	       split_flag & (EXT4_EXT_MAY_ZEROOUT |
3162			     EXT4_EXT_MARK_UNWRIT1 |
3163			     EXT4_EXT_MARK_UNWRIT2));
3164
3165	err = ext4_ext_get_access(handle, inode, path + depth);
3166	if (err)
3167		goto out;
3168
3169	if (split == ee_block) {
3170		/*
		 * case b: block @split is the block that the extent begins
		 * with, so we just change the state of the extent, and no
		 * splitting is needed.
3174		 */
3175		if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3176			ext4_ext_mark_unwritten(ex);
3177		else
3178			ext4_ext_mark_initialized(ex);
3179
3180		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3181			ext4_ext_try_to_merge(handle, inode, path, ex);
3182
3183		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3184		goto out;
3185	}
3186
3187	/* case a */
3188	memcpy(&orig_ex, ex, sizeof(orig_ex));
3189	ex->ee_len = cpu_to_le16(split - ee_block);
3190	if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3191		ext4_ext_mark_unwritten(ex);
3192
3193	/*
	 * the path may lead to a new leaf, and no longer to the original
	 * leaf, after ext4_ext_insert_extent() returns
3196	 */
3197	err = ext4_ext_dirty(handle, inode, path + depth);
3198	if (err)
3199		goto fix_extent_len;
3200
3201	ex2 = &newex;
3202	ex2->ee_block = cpu_to_le32(split);
3203	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3204	ext4_ext_store_pblock(ex2, newblock);
3205	if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3206		ext4_ext_mark_unwritten(ex2);
3207
3208	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3209	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3210		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3211			if (split_flag & EXT4_EXT_DATA_VALID1) {
3212				err = ext4_ext_zeroout(inode, ex2);
3213				zero_ex.ee_block = ex2->ee_block;
3214				zero_ex.ee_len = cpu_to_le16(
3215						ext4_ext_get_actual_len(ex2));
3216				ext4_ext_store_pblock(&zero_ex,
3217						      ext4_ext_pblock(ex2));
3218			} else {
3219				err = ext4_ext_zeroout(inode, ex);
3220				zero_ex.ee_block = ex->ee_block;
3221				zero_ex.ee_len = cpu_to_le16(
3222						ext4_ext_get_actual_len(ex));
3223				ext4_ext_store_pblock(&zero_ex,
3224						      ext4_ext_pblock(ex));
3225			}
3226		} else {
3227			err = ext4_ext_zeroout(inode, &orig_ex);
3228			zero_ex.ee_block = orig_ex.ee_block;
3229			zero_ex.ee_len = cpu_to_le16(
3230						ext4_ext_get_actual_len(&orig_ex));
3231			ext4_ext_store_pblock(&zero_ex,
3232					      ext4_ext_pblock(&orig_ex));
3233		}
3234
3235		if (err)
3236			goto fix_extent_len;
3237		/* update the extent length and mark as initialized */
3238		ex->ee_len = cpu_to_le16(ee_len);
3239		ext4_ext_try_to_merge(handle, inode, path, ex);
3240		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3241		if (err)
3242			goto fix_extent_len;
3243
3244		/* update extent status tree */
3245		err = ext4_zeroout_es(inode, &zero_ex);
3246
3247		goto out;
3248	} else if (err)
3249		goto fix_extent_len;
3250
3251out:
3252	ext4_ext_show_leaf(inode, path);
3253	return err;
3254
3255fix_extent_len:
3256	ex->ee_len = orig_ex.ee_len;
3257	ext4_ext_dirty(handle, inode, path + depth);
3258	return err;
3259}
3260
3261/*
 * ext4_split_extent() splits an extent and marks the part which is covered
 * by @map as @split_flag indicates.
3264 *
3265 * It may result in splitting the extent into multiple extents (up to three)
3266 * There are three possibilities:
3267 *   a> There is no split required
3268 *   b> Splits in two extents: Split is happening at either end of the extent
 *   c> Splits in three extents: Someone is splitting in the middle of the extent
3270 *
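 * For example, splitting the range [120, 140) out of an extent
 * [100, 200) first splits at block 140 and then at block 120, leaving
 * three extents: [100, 120), [120, 140) and [140, 200).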
3271 */
3272static int ext4_split_extent(handle_t *handle,
3273			      struct inode *inode,
3274			      struct ext4_ext_path *path,
3275			      struct ext4_map_blocks *map,
3276			      int split_flag,
3277			      int flags)
3278{
3279	ext4_lblk_t ee_block;
3280	struct ext4_extent *ex;
3281	unsigned int ee_len, depth;
3282	int err = 0;
3283	int unwritten;
3284	int split_flag1, flags1;
3285	int allocated = map->m_len;
3286
3287	depth = ext_depth(inode);
3288	ex = path[depth].p_ext;
3289	ee_block = le32_to_cpu(ex->ee_block);
3290	ee_len = ext4_ext_get_actual_len(ex);
3291	unwritten = ext4_ext_is_unwritten(ex);
3292
3293	if (map->m_lblk + map->m_len < ee_block + ee_len) {
3294		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3295		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3296		if (unwritten)
3297			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3298				       EXT4_EXT_MARK_UNWRIT2;
3299		if (split_flag & EXT4_EXT_DATA_VALID2)
3300			split_flag1 |= EXT4_EXT_DATA_VALID1;
3301		err = ext4_split_extent_at(handle, inode, path,
3302				map->m_lblk + map->m_len, split_flag1, flags1);
3303		if (err)
3304			goto out;
3305	} else {
3306		allocated = ee_len - (map->m_lblk - ee_block);
3307	}
3308	/*
	 * An updated path is required because the previous
	 * ext4_split_extent_at() may have split the original leaf or zeroed
	 * out the extent.
3311	 */
3312	ext4_ext_drop_refs(path);
3313	path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3314	if (IS_ERR(path))
3315		return PTR_ERR(path);
3316	depth = ext_depth(inode);
3317	ex = path[depth].p_ext;
3318	if (!ex) {
3319		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3320				 (unsigned long) map->m_lblk);
3321		return -EIO;
3322	}
3323	unwritten = ext4_ext_is_unwritten(ex);
3324	split_flag1 = 0;
3325
3326	if (map->m_lblk >= ee_block) {
3327		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3328		if (unwritten) {
3329			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3330			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3331						     EXT4_EXT_MARK_UNWRIT2);
3332		}
3333		err = ext4_split_extent_at(handle, inode, path,
3334				map->m_lblk, split_flag1, flags);
3335		if (err)
3336			goto out;
3337	}
3338
3339	ext4_ext_show_leaf(inode, path);
3340out:
3341	return err ? err : allocated;
3342}
3343
3344/*
3345 * This function is called by ext4_ext_map_blocks() if someone tries to write
3346 * to an unwritten extent. It may result in splitting the unwritten
3347 * extent into multiple extents (up to three - one initialized and two
3348 * unwritten).
3349 * There are three possibilities:
3350 *   a> There is no split required: Entire extent should be initialized
3351 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in the middle of the extent
3353 *
3354 * Pre-conditions:
3355 *  - The extent pointed to by 'path' is unwritten.
3356 *  - The extent pointed to by 'path' contains a superset
3357 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3358 *
3359 * Post-conditions on success:
 *  - the returned value is the number of blocks beyond map->m_lblk
3361 *    that are allocated and initialized.
3362 *    It is guaranteed to be >= map->m_len.
3363 */
3364static int ext4_ext_convert_to_initialized(handle_t *handle,
3365					   struct inode *inode,
3366					   struct ext4_map_blocks *map,
3367					   struct ext4_ext_path *path,
3368					   int flags)
3369{
3370	struct ext4_sb_info *sbi;
3371	struct ext4_extent_header *eh;
3372	struct ext4_map_blocks split_map;
3373	struct ext4_extent zero_ex;
3374	struct ext4_extent *ex, *abut_ex;
3375	ext4_lblk_t ee_block, eof_block;
3376	unsigned int ee_len, depth, map_len = map->m_len;
3377	int allocated = 0, max_zeroout = 0;
3378	int err = 0;
3379	int split_flag = 0;
3380
	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
3382		"block %llu, max_blocks %u\n", inode->i_ino,
3383		(unsigned long long)map->m_lblk, map_len);
3384
3385	sbi = EXT4_SB(inode->i_sb);
3386	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3387		inode->i_sb->s_blocksize_bits;
3388	if (eof_block < map->m_lblk + map_len)
3389		eof_block = map->m_lblk + map_len;
3390
3391	depth = ext_depth(inode);
3392	eh = path[depth].p_hdr;
3393	ex = path[depth].p_ext;
3394	ee_block = le32_to_cpu(ex->ee_block);
3395	ee_len = ext4_ext_get_actual_len(ex);
3396	zero_ex.ee_len = 0;
3397
3398	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3399
3400	/* Pre-conditions */
3401	BUG_ON(!ext4_ext_is_unwritten(ex));
3402	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3403
3404	/*
3405	 * Attempt to transfer newly initialized blocks from the currently
3406	 * unwritten extent to its neighbor. This is much cheaper
3407	 * than an insertion followed by a merge as those involve costly
3408	 * memmove() calls. Transferring to the left is the common case in
3409	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3410	 * followed by append writes.
3411	 *
3412	 * Limitations of the current logic:
3413	 *  - L1: we do not deal with writes covering the whole extent.
3414	 *    This would require removing the extent if the transfer
3415	 *    is possible.
3416	 *  - L2: we only attempt to merge with an extent stored in the
3417	 *    same extent tree node.
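	 *
	 * Sketch of the left-merge fast path: given an initialized extent
	 * abut_ex covering blocks [90, 100) and an unwritten, physically
	 * contiguous extent ex covering [100, 110), a write to [100, 104)
	 * shrinks ex to [104, 110) and extends abut_ex to [90, 104); no
	 * insert or memmove is needed.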
3418	 */
3419	if ((map->m_lblk == ee_block) &&
3420		/* See if we can merge left */
3421		(map_len < ee_len) &&		/*L1*/
3422		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
3423		ext4_lblk_t prev_lblk;
3424		ext4_fsblk_t prev_pblk, ee_pblk;
3425		unsigned int prev_len;
3426
3427		abut_ex = ex - 1;
3428		prev_lblk = le32_to_cpu(abut_ex->ee_block);
3429		prev_len = ext4_ext_get_actual_len(abut_ex);
3430		prev_pblk = ext4_ext_pblock(abut_ex);
3431		ee_pblk = ext4_ext_pblock(ex);
3432
3433		/*
3434		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3435		 * upon those conditions:
3436		 * - C1: abut_ex is initialized,
3437		 * - C2: abut_ex is logically abutting ex,
3438		 * - C3: abut_ex is physically abutting ex,
3439		 * - C4: abut_ex can receive the additional blocks without
3440		 *   overflowing the (initialized) length limit.
3441		 */
3442		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
3443			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
3444			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
3445			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
3446			err = ext4_ext_get_access(handle, inode, path + depth);
3447			if (err)
3448				goto out;
3449
3450			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3451				map, ex, abut_ex);
3452
3453			/* Shift the start of ex by 'map_len' blocks */
3454			ex->ee_block = cpu_to_le32(ee_block + map_len);
3455			ext4_ext_store_pblock(ex, ee_pblk + map_len);
3456			ex->ee_len = cpu_to_le16(ee_len - map_len);
3457			ext4_ext_mark_unwritten(ex); /* Restore the flag */
3458
3459			/* Extend abut_ex by 'map_len' blocks */
3460			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3461
3462			/* Result: number of initialized blocks past m_lblk */
3463			allocated = map_len;
3464		}
3465	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3466		   (map_len < ee_len) &&	/*L1*/
3467		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
3468		/* See if we can merge right */
3469		ext4_lblk_t next_lblk;
3470		ext4_fsblk_t next_pblk, ee_pblk;
3471		unsigned int next_len;
3472
3473		abut_ex = ex + 1;
3474		next_lblk = le32_to_cpu(abut_ex->ee_block);
3475		next_len = ext4_ext_get_actual_len(abut_ex);
3476		next_pblk = ext4_ext_pblock(abut_ex);
3477		ee_pblk = ext4_ext_pblock(ex);
3478
3479		/*
3480		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3481		 * upon those conditions:
3482		 * - C1: abut_ex is initialized,
3483		 * - C2: abut_ex is logically abutting ex,
3484		 * - C3: abut_ex is physically abutting ex,
3485		 * - C4: abut_ex can receive the additional blocks without
3486		 *   overflowing the (initialized) length limit.
3487		 */
3488		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
3489		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
3490		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
3491		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
3492			err = ext4_ext_get_access(handle, inode, path + depth);
3493			if (err)
3494				goto out;
3495
3496			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3497				map, ex, abut_ex);
3498
3499			/* Shift the start of abut_ex by 'map_len' blocks */
3500			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3501			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3502			ex->ee_len = cpu_to_le16(ee_len - map_len);
3503			ext4_ext_mark_unwritten(ex); /* Restore the flag */
3504
3505			/* Extend abut_ex by 'map_len' blocks */
3506			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3507
3508			/* Result: number of initialized blocks past m_lblk */
3509			allocated = map_len;
3510		}
3511	}
3512	if (allocated) {
3513		/* Mark the block containing both extents as dirty */
3514		ext4_ext_dirty(handle, inode, path + depth);
3515
3516		/* Update path to point to the right extent */
3517		path[depth].p_ext = abut_ex;
3518		goto out;
3519	} else
3520		allocated = ee_len - (map->m_lblk - ee_block);
3521
3522	WARN_ON(map->m_lblk < ee_block);
3523	/*
3524	 * It is safe to convert extent to initialized via explicit
3525	 * zeroout only if extent is fully inside i_size or new_size.
3526	 */
3527	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3528
3529	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3530		max_zeroout = sbi->s_extent_max_zeroout_kb >>
3531			(inode->i_sb->s_blocksize_bits - 10);
3532
	/* If the extent is smaller than s_extent_max_zeroout_kb, zero it out directly */
3534	if (max_zeroout && (ee_len <= max_zeroout)) {
3535		err = ext4_ext_zeroout(inode, ex);
3536		if (err)
3537			goto out;
3538		zero_ex.ee_block = ex->ee_block;
3539		zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
3540		ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
3541
3542		err = ext4_ext_get_access(handle, inode, path + depth);
3543		if (err)
3544			goto out;
3545		ext4_ext_mark_initialized(ex);
3546		ext4_ext_try_to_merge(handle, inode, path, ex);
3547		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3548		goto out;
3549	}
3550
3551	/*
3552	 * four cases:
3553	 * 1. split the extent into three extents.
3554	 * 2. split the extent into two extents, zeroout the first half.
3555	 * 3. split the extent into two extents, zeroout the second half.
	 * 4. split the extent into two extents without zeroout.
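	 *
	 * For example, a write into the middle of a large unwritten
	 * extent with zeroout disabled ends up in case 1: the
	 * ext4_split_extent() call below splits it into three extents.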
3557	 */
3558	split_map.m_lblk = map->m_lblk;
3559	split_map.m_len = map->m_len;
3560
3561	if (max_zeroout && (allocated > map->m_len)) {
3562		if (allocated <= max_zeroout) {
3563			/* case 3 */
3564			zero_ex.ee_block =
3565					 cpu_to_le32(map->m_lblk);
3566			zero_ex.ee_len = cpu_to_le16(allocated);
3567			ext4_ext_store_pblock(&zero_ex,
3568				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3569			err = ext4_ext_zeroout(inode, &zero_ex);
3570			if (err)
3571				goto out;
3572			split_map.m_lblk = map->m_lblk;
3573			split_map.m_len = allocated;
3574		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
3575			/* case 2 */
3576			if (map->m_lblk != ee_block) {
3577				zero_ex.ee_block = ex->ee_block;
3578				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3579							ee_block);
3580				ext4_ext_store_pblock(&zero_ex,
3581						      ext4_ext_pblock(ex));
3582				err = ext4_ext_zeroout(inode, &zero_ex);
3583				if (err)
3584					goto out;
3585			}
3586
3587			split_map.m_lblk = ee_block;
3588			split_map.m_len = map->m_lblk - ee_block + map->m_len;
3589			allocated = map->m_len;
3590		}
3591	}
3592
3593	allocated = ext4_split_extent(handle, inode, path,
3594				      &split_map, split_flag, flags);
3595	if (allocated < 0)
3596		err = allocated;
3597
3598out:
3599	/* If we have gotten a failure, don't zero out status tree */
3600	if (!err)
3601		err = ext4_zeroout_es(inode, &zero_ex);
3602	return err ? err : allocated;
3603}
3604
3605/*
3606 * This function is called by ext4_ext_map_blocks() from
 * ext4_get_blocks_dio_write() when DIO is used to write
 * to an unwritten extent.
3609 *
3610 * Writing to an unwritten extent may result in splitting the unwritten
3611 * extent into multiple initialized/unwritten extents (up to three)
3612 * There are three possibilities:
3613 *   a> There is no split required: Entire extent should be unwritten
3614 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in the middle of the extent
3616 *
3617 * This works the same way in the case of initialized -> unwritten conversion.
3618 *
 * One or more index blocks may be needed if the extent tree grows after
 * the unwritten extent is split. To prevent ENOSPC from occurring at IO
 * completion, we need to split the unwritten extent before DIO submits
 * the IO. The unwritten extent in question will be split into (at most)
 * three unwritten extents. After IO completes, the part that was filled
 * will be converted to initialized by the end_io callback
 * via ext4_convert_unwritten_extents().
3626 *
3627 * Returns the size of unwritten extent to be written on success.
3628 */
3629static int ext4_split_convert_extents(handle_t *handle,
3630					struct inode *inode,
3631					struct ext4_map_blocks *map,
3632					struct ext4_ext_path *path,
3633					int flags)
3634{
3635	ext4_lblk_t eof_block;
3636	ext4_lblk_t ee_block;
3637	struct ext4_extent *ex;
3638	unsigned int ee_len;
3639	int split_flag = 0, depth;
3640
3641	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3642		  __func__, inode->i_ino,
3643		  (unsigned long long)map->m_lblk, map->m_len);
3644
3645	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3646		inode->i_sb->s_blocksize_bits;
3647	if (eof_block < map->m_lblk + map->m_len)
3648		eof_block = map->m_lblk + map->m_len;
3649	/*
	 * It is safe to convert an extent to initialized via explicit
	 * zeroout only if the extent is fully inside i_size or new_size.
3652	 */
3653	depth = ext_depth(inode);
3654	ex = path[depth].p_ext;
3655	ee_block = le32_to_cpu(ex->ee_block);
3656	ee_len = ext4_ext_get_actual_len(ex);
3657
3658	/* Convert to unwritten */
3659	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3660		split_flag |= EXT4_EXT_DATA_VALID1;
3661	/* Convert to initialized */
3662	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3663		split_flag |= ee_block + ee_len <= eof_block ?
3664			      EXT4_EXT_MAY_ZEROOUT : 0;
3665		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3666	}
3667	flags |= EXT4_GET_BLOCKS_PRE_IO;
3668	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3669}
3670
3671static int ext4_convert_initialized_extents(handle_t *handle,
3672					    struct inode *inode,
3673					    struct ext4_map_blocks *map,
3674					    struct ext4_ext_path *path)
3675{
3676	struct ext4_extent *ex;
3677	ext4_lblk_t ee_block;
3678	unsigned int ee_len;
3679	int depth;
3680	int err = 0;
3681
3682	depth = ext_depth(inode);
3683	ex = path[depth].p_ext;
3684	ee_block = le32_to_cpu(ex->ee_block);
3685	ee_len = ext4_ext_get_actual_len(ex);
3686
3687	ext_debug("%s: inode %lu, logical "
3688		"block %llu, max_blocks %u\n", __func__, inode->i_ino,
3689		  (unsigned long long)ee_block, ee_len);
3690
3691	if (ee_block != map->m_lblk || ee_len > map->m_len) {
3692		err = ext4_split_convert_extents(handle, inode, map, path,
3693				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3694		if (err < 0)
3695			goto out;
3696		ext4_ext_drop_refs(path);
3697		path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3698		if (IS_ERR(path)) {
3699			err = PTR_ERR(path);
3700			goto out;
3701		}
3702		depth = ext_depth(inode);
3703		ex = path[depth].p_ext;
3704		if (!ex) {
3705			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3706					 (unsigned long) map->m_lblk);
3707			err = -EIO;
3708			goto out;
3709		}
3710	}
3711
3712	err = ext4_ext_get_access(handle, inode, path + depth);
3713	if (err)
3714		goto out;
3715	/* first mark the extent as unwritten */
3716	ext4_ext_mark_unwritten(ex);
3717
3718	/* note: ext4_ext_correct_indexes() isn't needed here because
3719	 * borders are not changed
3720	 */
3721	ext4_ext_try_to_merge(handle, inode, path, ex);
3722
3723	/* Mark modified extent as dirty */
3724	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3725out:
3726	ext4_ext_show_leaf(inode, path);
3727	return err;
3728}
3729
3730
3731static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3732						struct inode *inode,
3733						struct ext4_map_blocks *map,
3734						struct ext4_ext_path *path)
3735{
3736	struct ext4_extent *ex;
3737	ext4_lblk_t ee_block;
3738	unsigned int ee_len;
3739	int depth;
3740	int err = 0;
3741
3742	depth = ext_depth(inode);
3743	ex = path[depth].p_ext;
3744	ee_block = le32_to_cpu(ex->ee_block);
3745	ee_len = ext4_ext_get_actual_len(ex);
3746
3747	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3748		"block %llu, max_blocks %u\n", inode->i_ino,
3749		  (unsigned long long)ee_block, ee_len);
3750
3751	/* If the extent is larger than requested, it is a clear sign that we
3752	 * still have some extent state machine issues left. So extent_split
3753	 * is still required.
3754	 * TODO: Once all related issues are fixed, this situation should be
3755	 * treated as illegal.
3756	 */
3757	if (ee_block != map->m_lblk || ee_len > map->m_len) {
3758#ifdef EXT4_DEBUG
3759		ext4_warning("Inode (%lu) finished: extent logical block %llu,"
3760			     " len %u; IO logical block %llu, len %u\n",
3761			     inode->i_ino, (unsigned long long)ee_block, ee_len,
3762			     (unsigned long long)map->m_lblk, map->m_len);
3763#endif
3764		err = ext4_split_convert_extents(handle, inode, map, path,
3765						 EXT4_GET_BLOCKS_CONVERT);
3766		if (err < 0)
3767			goto out;
3768		ext4_ext_drop_refs(path);
3769		path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3770		if (IS_ERR(path)) {
3771			err = PTR_ERR(path);
3772			goto out;
3773		}
3774		depth = ext_depth(inode);
3775		ex = path[depth].p_ext;
3776	}
3777
3778	err = ext4_ext_get_access(handle, inode, path + depth);
3779	if (err)
3780		goto out;
3781	/* first mark the extent as initialized */
3782	ext4_ext_mark_initialized(ex);
3783
3784	/* note: ext4_ext_correct_indexes() isn't needed here because
3785	 * borders are not changed
3786	 */
3787	ext4_ext_try_to_merge(handle, inode, path, ex);
3788
3789	/* Mark modified extent as dirty */
3790	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3791out:
3792	ext4_ext_show_leaf(inode, path);
3793	return err;
3794}
3795
3796static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3797			sector_t block, int count)
3798{
3799	int i;
3800	for (i = 0; i < count; i++)
3801		unmap_underlying_metadata(bdev, block + i);
3802}
3803
3804/*
3805 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3806 */
3807static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3808			      ext4_lblk_t lblk,
3809			      struct ext4_ext_path *path,
3810			      unsigned int len)
3811{
3812	int i, depth;
3813	struct ext4_extent_header *eh;
3814	struct ext4_extent *last_ex;
3815
3816	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3817		return 0;
3818
3819	depth = ext_depth(inode);
3820	eh = path[depth].p_hdr;
3821
3822	/*
3823	 * We're going to remove EOFBLOCKS_FL entirely in the future, so we
3824	 * do not care about this case anymore. Simply remove the flag
3825	 * if there are no extents.
3826	 */
3827	if (unlikely(!eh->eh_entries))
3828		goto out;
3829	last_ex = EXT_LAST_EXTENT(eh);
3830	/*
3831	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3832	 * last block in the last extent in the file.  We test this by
3833	 * first checking to see if the caller to
3834	 * ext4_ext_get_blocks() was interested in the last block (or
3835	 * a block beyond the last block) in the current extent.  If
3836	 * this turns out to be false, we can bail out from this
3837	 * function immediately.
3838	 */
3839	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3840	    ext4_ext_get_actual_len(last_ex))
3841		return 0;
3842	/*
3843	 * If the caller does appear to be planning to write at or
3844	 * beyond the end of the current extent, we then test to see
3845	 * if the current extent is the last extent in the file, by
3846	 * checking to make sure it was reached via the rightmost node
3847	 * at each level of the tree.
3848	 */
3849	for (i = depth-1; i >= 0; i--)
3850		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3851			return 0;
3852out:
3853	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3854	return ext4_mark_inode_dirty(handle, inode);
3855}
3856
3857/**
3858 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3859 *
3860 * Return 1 if there is a delalloc block in the range, otherwise 0.
3861 */
3862int ext4_find_delalloc_range(struct inode *inode,
3863			     ext4_lblk_t lblk_start,
3864			     ext4_lblk_t lblk_end)
3865{
3866	struct extent_status es;
3867
3868	ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
3869	if (es.es_len == 0)
3870		return 0; /* there is no delayed extent in this tree */
3871	else if (es.es_lblk <= lblk_start &&
3872		 lblk_start < es.es_lblk + es.es_len)
3873		return 1;
3874	else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
3875		return 1;
3876	else
3877		return 0;
3878}
3879
3880int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3881{
3882	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3883	ext4_lblk_t lblk_start, lblk_end;
3884	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
3885	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3886
3887	return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3888}
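
/*
 * Worked example for the two helpers above (hypothetical numbers): with
 * s_cluster_ratio == 4, lblk == 10 gives EXT4_LBLK_CMASK(sbi, 10) == 8,
 * so the delalloc scan covers logical blocks 8..11, i.e. the whole
 * cluster containing block 10.
 */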
3889
3890/**
3891 * Determines how many complete clusters (out of those specified by the 'map')
3892 * are under delalloc and had quota reserved for them.
3893 * This function is called when we are writing out the blocks that were
3894 * originally written with their allocation delayed, but then the space was
3895 * allocated using fallocate() before the delayed allocation could be resolved.
3896 * The cases to look for are:
3897 * ('=' indicates delayed allocated blocks
3898 *  '-' indicates non-delayed allocated blocks)
3899 * (a) partial clusters towards beginning and/or end outside of allocated range
3900 *     are not delalloc'ed.
3901 *	Ex:
3902 *	|----c---=|====c====|====c====|===-c----|
3903 *	         |++++++ allocated ++++++|
3904 *	==> 4 complete clusters in above example
3905 *
3906 * (b) partial cluster (outside of allocated range) towards either end is
3907 *     marked for delayed allocation. In this case, we will exclude that
3908 *     cluster.
3909 *	Ex:
3910 *	|----====c========|========c========|
3911 *	     |++++++ allocated ++++++|
3912 *	==> 1 complete cluster in above example
3913 *
3914 *	Ex:
3915 *	|================c================|
3916 *            |++++++ allocated ++++++|
3917 *	==> 0 complete clusters in above example
3918 *
3919 * ext4_da_update_reserve_space() will be called only if we
3920 * determine here that there were some "entire" clusters that span
3921 * this 'allocated' range.
3922 * In the non-bigalloc case, this function will just end up returning num_blks
3923 * without ever calling ext4_find_delalloc_range.
3924 */
3925static unsigned int
3926get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3927			   unsigned int num_blks)
3928{
3929	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3930	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3931	ext4_lblk_t lblk_from, lblk_to, c_offset;
3932	unsigned int allocated_clusters = 0;
3933
3934	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3935	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3936
3937	/* max possible clusters for this allocation */
3938	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3939
3940	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3941
3942	/* Check towards left side */
3943	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
3944	if (c_offset) {
3945		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
3946		lblk_to = lblk_from + c_offset - 1;
3947
3948		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3949			allocated_clusters--;
3950	}
3951
3952	/* Now check towards right. */
3953	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
3954	if (allocated_clusters && c_offset) {
3955		lblk_from = lblk_start + num_blks;
3956		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3957
3958		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3959			allocated_clusters--;
3960	}
3961
3962	return allocated_clusters;
3963}
3964
3965static int
3966ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
3967			struct ext4_map_blocks *map,
3968			struct ext4_ext_path *path, int flags,
3969			unsigned int allocated, ext4_fsblk_t newblock)
3970{
3971	int ret = 0;
3972	int err = 0;
3973
3974	/*
3975	 * Make sure that the extent is no bigger than we support with
3976	 * an unwritten extent
3977	 */
3978	if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3979		map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3980
3981	ret = ext4_convert_initialized_extents(handle, inode, map,
3982						path);
3983	if (ret >= 0) {
3984		ext4_update_inode_fsync_trans(handle, inode, 1);
3985		err = check_eofblocks_fl(handle, inode, map->m_lblk,
3986					 path, map->m_len);
3987	} else
3988		err = ret;
3989	map->m_flags |= EXT4_MAP_UNWRITTEN;
3990	if (allocated > map->m_len)
3991		allocated = map->m_len;
3992	map->m_len = allocated;
3993
3994	return err ? err : allocated;
3995}
3996
3997static int
3998ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
3999			struct ext4_map_blocks *map,
4000			struct ext4_ext_path *path, int flags,
4001			unsigned int allocated, ext4_fsblk_t newblock)
4002{
4003	int ret = 0;
4004	int err = 0;
4005	ext4_io_end_t *io = ext4_inode_aio(inode);
4006
4007	ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4008		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
4009		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
4010		  flags, allocated);
4011	ext4_ext_show_leaf(inode, path);
4012
4013	/*
4014	 * When writing into unwritten space, we should not fail to
4015	 * allocate metadata blocks for the new extent block if needed.
4016	 */
4017	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
4018
4019	trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
4020						    allocated, newblock);
4021
4022	/* get_block() before submitting the IO, split the extent */
4023	if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4024		ret = ext4_split_convert_extents(handle, inode, map,
4025					 path, flags | EXT4_GET_BLOCKS_CONVERT);
4026		if (ret <= 0)
4027			goto out;
4028		/*
4029		 * Flag the inode (non-AIO case) or the end_io struct (AIO case)
4030		 * so that this IO gets converted to written when the IO is
4031		 * completed
4032		 */
4033		if (io)
4034			ext4_set_io_unwritten_flag(inode, io);
4035		else
4036			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
4037		map->m_flags |= EXT4_MAP_UNWRITTEN;
4038		goto out;
4039	}
4040	/* IO end_io complete, convert the filled extent to written */
4041	if (flags & EXT4_GET_BLOCKS_CONVERT) {
4042		ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4043							path);
4044		if (ret >= 0) {
4045			ext4_update_inode_fsync_trans(handle, inode, 1);
4046			err = check_eofblocks_fl(handle, inode, map->m_lblk,
4047						 path, map->m_len);
4048		} else
4049			err = ret;
4050		map->m_flags |= EXT4_MAP_MAPPED;
4051		map->m_pblk = newblock;
4052		if (allocated > map->m_len)
4053			allocated = map->m_len;
4054		map->m_len = allocated;
4055		goto out2;
4056	}
4057	/* buffered IO case */
4058	/*
4059	 * repeated fallocate creation request:
4060	 * we already have an unwritten extent
4061	 */
4062	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4063		map->m_flags |= EXT4_MAP_UNWRITTEN;
4064		goto map_out;
4065	}
4066
4067	/* buffered READ or buffered write_begin() lookup */
4068	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4069		/*
4070		 * We have blocks reserved already.  We
4071		 * return allocated blocks so that delalloc
4072		 * won't do block reservation for us.  But
4073		 * the buffer head will be unmapped so that
4074		 * a read from the block returns 0s.
4075		 */
4076		map->m_flags |= EXT4_MAP_UNWRITTEN;
4077		goto out1;
4078	}
4079
4080	/* buffered write, writepage time, convert */
4081	ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
4082	if (ret >= 0)
4083		ext4_update_inode_fsync_trans(handle, inode, 1);
4084out:
4085	if (ret <= 0) {
4086		err = ret;
4087		goto out2;
4088	} else
4089		allocated = ret;
4090	map->m_flags |= EXT4_MAP_NEW;
4091	/*
4092	 * if we allocated more blocks than requested
4093	 * we need to make sure we unmap the extra blocks
4094	 * allocated. The actually needed block will get
4095	 * unmapped later when we find the buffer_head marked
4096	 * new.
4097	 */
4098	if (allocated > map->m_len) {
4099		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
4100					newblock + map->m_len,
4101					allocated - map->m_len);
4102		allocated = map->m_len;
4103	}
4104	map->m_len = allocated;
4105
4106	/*
4107	 * If we have done fallocate with the offset that is already
4108	 * delayed allocated, we would have block reservation
4109	 * and quota reservation done in the delayed write path.
4110	 * But fallocate would have already updated quota and block
4111	 * count for this offset. So cancel these reservations.
4112	 */
4113	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4114		unsigned int reserved_clusters;
4115		reserved_clusters = get_reserved_cluster_alloc(inode,
4116				map->m_lblk, map->m_len);
4117		if (reserved_clusters)
4118			ext4_da_update_reserve_space(inode,
4119						     reserved_clusters,
4120						     0);
4121	}
4122
4123map_out:
4124	map->m_flags |= EXT4_MAP_MAPPED;
4125	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4126		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4127					 map->m_len);
4128		if (err < 0)
4129			goto out2;
4130	}
4131out1:
4132	if (allocated > map->m_len)
4133		allocated = map->m_len;
4134	ext4_ext_show_leaf(inode, path);
4135	map->m_pblk = newblock;
4136	map->m_len = allocated;
4137out2:
4138	return err ? err : allocated;
4139}
4140
4141/*
4142 * get_implied_cluster_alloc - check to see if the requested
4143 * allocation (in the map structure) overlaps with a cluster already
4144 * allocated in an extent.
4145 *	@sb	The filesystem superblock structure
4146 *	@map	The requested lblk->pblk mapping
4147 *	@ex	The extent structure which might contain an implied
4148 *			cluster allocation
4149 *
4150 * This function is called by ext4_ext_map_blocks() after we failed to
4151 * find blocks that were already in the inode's extent tree.  Hence,
4152 * we know that the beginning of the requested region cannot overlap
4153 * the extent from the inode's extent tree.  There are three cases we
4154 * want to catch.  The first is this case:
4155 *
4156 *		 |--- cluster # N--|
4157 *    |--- extent ---|	|---- requested region ---|
4158 *			|==========|
4159 *
4160 * The second case that we need to test for is this one:
4161 *
4162 *   |--------- cluster # N ----------------|
4163 *	   |--- requested region --|   |------- extent ----|
4164 *	   |=======================|
4165 *
4166 * The third case is when the requested region lies between two extents
4167 * within the same cluster:
4168 *          |------------- cluster # N-------------|
4169 * |----- ex -----|                  |---- ex_right ----|
4170 *                  |------ requested region ------|
4171 *                  |================|
4172 *
4173 * In each of the above cases, we need to set the map->m_pblk and
4174 * map->m_len so they correspond to the extent labelled as
4175 * "|====|" from cluster #N, since it is already in use for data in
4176 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
4177 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4178 * as a new "allocated" block region.  Otherwise, we will return 0 and
4179 * ext4_ext_map_blocks() will then allocate one or more new clusters
4180 * by calling ext4_mb_new_blocks().
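 *
 * Worked example (hypothetical numbers, not taken from the code): with
 * s_cluster_ratio = 4, an extent covering logical blocks 0-1 at physical
 * blocks 100-101, and a request at logical block 2, c_offset = 2 and the
 * request lands in the extent's last cluster; assuming physical block
 * 100 is cluster-aligned, m_pblk becomes 102 and m_len is capped at 2
 * (and further by the next allocated block, if any), so the mapping
 * stays inside cluster #0.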
4181 */
4182static int get_implied_cluster_alloc(struct super_block *sb,
4183				     struct ext4_map_blocks *map,
4184				     struct ext4_extent *ex,
4185				     struct ext4_ext_path *path)
4186{
4187	struct ext4_sb_info *sbi = EXT4_SB(sb);
4188	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4189	ext4_lblk_t ex_cluster_start, ex_cluster_end;
4190	ext4_lblk_t rr_cluster_start;
4191	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4192	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4193	unsigned short ee_len = ext4_ext_get_actual_len(ex);
4194
4195	/* The extent passed in that we are trying to match */
4196	ex_cluster_start = EXT4_B2C(sbi, ee_block);
4197	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4198
4199	/* The requested region passed into ext4_map_blocks() */
4200	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4201
4202	if ((rr_cluster_start == ex_cluster_end) ||
4203	    (rr_cluster_start == ex_cluster_start)) {
4204		if (rr_cluster_start == ex_cluster_end)
4205			ee_start += ee_len - 1;
4206		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4207		map->m_len = min(map->m_len,
4208				 (unsigned) sbi->s_cluster_ratio - c_offset);
4209		/*
4210		 * Check for and handle this case:
4211		 *
4212		 *   |--------- cluster # N-------------|
4213		 *		       |------- extent ----|
4214		 *	   |--- requested region ---|
4215		 *	   |===========|
4216		 */
4217
4218		if (map->m_lblk < ee_block)
4219			map->m_len = min(map->m_len, ee_block - map->m_lblk);
4220
4221		/*
4222		 * Check for the case where there is already another allocated
4223		 * block to the right of 'ex' but before the end of the cluster.
4224		 *
4225		 *          |------------- cluster # N-------------|
4226		 * |----- ex -----|                  |---- ex_right ----|
4227		 *                  |------ requested region ------|
4228		 *                  |================|
4229		 */
4230		if (map->m_lblk > ee_block) {
4231			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4232			map->m_len = min(map->m_len, next - map->m_lblk);
4233		}
4234
4235		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4236		return 1;
4237	}
4238
4239	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4240	return 0;
4241}
4242
4243
4244/*
4245 * Block allocation/map/preallocation routine for extents based files
4246 *
4247 *
4248 * Need to be called with
4249 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4250 * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4251 *
4252 * return > 0, number of blocks already mapped/allocated
4253 *          if create == 0 and these are pre-allocated blocks
4254 *          	buffer head is unmapped
4255 *          otherwise blocks are mapped
4256 *
4257 * return = 0, if plain look up failed (blocks have not been allocated)
4258 *          buffer head is unmapped
4259 *
4260 * return < 0, error case.
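 *
 * A minimal call sketch (illustrative only, not taken from a real
 * caller; lblk, len and handle are assumed inputs):
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };
 *	down_read(&EXT4_I(inode)->i_data_sem);
 *	ret = ext4_ext_map_blocks(handle, inode, &map, 0);
 *	up_read(&EXT4_I(inode)->i_data_sem);
 *	if (ret > 0)
 *		... map.m_pblk holds the first mapped physical block and
 *		    ret blocks are mapped ...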
4261 */
4262int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4263			struct ext4_map_blocks *map, int flags)
4264{
4265	struct ext4_ext_path *path = NULL;
4266	struct ext4_extent newex, *ex, *ex2;
4267	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4268	ext4_fsblk_t newblock = 0;
4269	int free_on_err = 0, err = 0, depth, ret;
4270	unsigned int allocated = 0, offset = 0;
4271	unsigned int allocated_clusters = 0;
4272	struct ext4_allocation_request ar;
4273	ext4_io_end_t *io = ext4_inode_aio(inode);
4274	ext4_lblk_t cluster_offset;
4275	int set_unwritten = 0;
4276
4277	ext_debug("blocks %u/%u requested for inode %lu\n",
4278		  map->m_lblk, map->m_len, inode->i_ino);
4279	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4280
4281	/* find extent for this block */
4282	path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
4283	if (IS_ERR(path)) {
4284		err = PTR_ERR(path);
4285		path = NULL;
4286		goto out2;
4287	}
4288
4289	depth = ext_depth(inode);
4290
4291	/*
4292	 * consistent leaf must not be empty;
4293	 * this situation is possible, though, _during_ tree modification;
4294	 * this is why assert can't be put in ext4_ext_find_extent()
4295	 */
4296	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4297		EXT4_ERROR_INODE(inode, "bad extent address "
4298				 "lblock: %lu, depth: %d pblock %lld",
4299				 (unsigned long) map->m_lblk, depth,
4300				 path[depth].p_block);
4301		err = -EIO;
4302		goto out2;
4303	}
4304
4305	ex = path[depth].p_ext;
4306	if (ex) {
4307		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4308		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4309		unsigned short ee_len;
4310
4311
4312		/*
4313		 * unwritten extents are treated as holes, except that
4314		 * we split out initialized portions during a write.
4315		 */
4316		ee_len = ext4_ext_get_actual_len(ex);
4317
4318		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4319
4320		/* if found extent covers block, simply return it */
4321		if (in_range(map->m_lblk, ee_block, ee_len)) {
4322			newblock = map->m_lblk - ee_block + ee_start;
4323			/* number of remaining blocks in the extent */
4324			allocated = ee_len - (map->m_lblk - ee_block);
4325			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4326				  ee_block, ee_len, newblock);
4327
4328			/*
4329			 * If the extent is initialized check whether the
4330			 * caller wants to convert it to unwritten.
4331			 */
4332			if ((!ext4_ext_is_unwritten(ex)) &&
4333			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4334				allocated = ext4_ext_convert_initialized_extent(
4335						handle, inode, map, path, flags,
4336						allocated, newblock);
4337				goto out2;
4338			} else if (!ext4_ext_is_unwritten(ex))
4339				goto out;
4340
4341			ret = ext4_ext_handle_unwritten_extents(
4342				handle, inode, map, path, flags,
4343				allocated, newblock);
4344			if (ret < 0)
4345				err = ret;
4346			else
4347				allocated = ret;
4348			goto out2;
4349		}
4350	}
4351
4352	if ((sbi->s_cluster_ratio > 1) &&
4353	    ext4_find_delalloc_cluster(inode, map->m_lblk))
4354		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4355
4356	/*
4357	 * requested block isn't allocated yet;
4358	 * we can't create blocks if the create flag is zero
4359	 */
4360	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4361		/*
4362		 * put just found gap into cache to speed up
4363		 * subsequent requests
4364		 */
4365		if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
4366			ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4367		goto out2;
4368	}
4369
4370	/*
4371	 * Okay, we need to do block allocation.
4372	 */
4373	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
4374	newex.ee_block = cpu_to_le32(map->m_lblk);
4375	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4376
4377	/*
4378	 * If we are doing bigalloc, check to see if the extent returned
4379	 * by ext4_ext_find_extent() implies a cluster we can use.
4380	 */
4381	if (cluster_offset && ex &&
4382	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4383		ar.len = allocated = map->m_len;
4384		newblock = map->m_pblk;
4385		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4386		goto got_allocated_blocks;
4387	}
4388
4389	/* find neighbour allocated blocks */
4390	ar.lleft = map->m_lblk;
4391	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4392	if (err)
4393		goto out2;
4394	ar.lright = map->m_lblk;
4395	ex2 = NULL;
4396	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4397	if (err)
4398		goto out2;
4399
4400	/* Check if the extent after searching to the right implies a
4401	 * cluster we can use. */
4402	if ((sbi->s_cluster_ratio > 1) && ex2 &&
4403	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4404		ar.len = allocated = map->m_len;
4405		newblock = map->m_pblk;
4406		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4407		goto got_allocated_blocks;
4408	}
4409
4410	/*
4411	 * See if request is beyond maximum number of blocks we can have in
4412	 * a single extent. For an initialized extent this limit is
4413	 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4414	 * EXT_UNWRITTEN_MAX_LEN.
4415	 */
4416	if (map->m_len > EXT_INIT_MAX_LEN &&
4417	    !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4418		map->m_len = EXT_INIT_MAX_LEN;
4419	else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4420		 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4421		map->m_len = EXT_UNWRITTEN_MAX_LEN;
4422
4423	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4424	newex.ee_len = cpu_to_le16(map->m_len);
4425	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4426	if (err)
4427		allocated = ext4_ext_get_actual_len(&newex);
4428	else
4429		allocated = map->m_len;
4430
4431	/* allocate new block */
4432	ar.inode = inode;
4433	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4434	ar.logical = map->m_lblk;
4435	/*
4436	 * We calculate the offset from the beginning of the cluster
4437	 * for the logical block number, since when we allocate a
4438	 * physical cluster, the physical block should start at the
4439	 * same offset from the beginning of the cluster.  This is
4440	 * needed so that future calls to get_implied_cluster_alloc()
4441	 * work correctly.
4442	 */
4443	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4444	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4445	ar.goal -= offset;
4446	ar.logical -= offset;
4447	if (S_ISREG(inode->i_mode))
4448		ar.flags = EXT4_MB_HINT_DATA;
4449	else
4450		/* disable in-core preallocation for non-regular files */
4451		ar.flags = 0;
4452	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4453		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4454	newblock = ext4_mb_new_blocks(handle, &ar, &err);
4455	if (!newblock)
4456		goto out2;
4457	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4458		  ar.goal, newblock, allocated);
4459	free_on_err = 1;
4460	allocated_clusters = ar.len;
4461	ar.len = EXT4_C2B(sbi, ar.len) - offset;
4462	if (ar.len > allocated)
4463		ar.len = allocated;
4464
4465got_allocated_blocks:
4466	/* try to insert new extent into found leaf and return */
4467	ext4_ext_store_pblock(&newex, newblock + offset);
4468	newex.ee_len = cpu_to_le16(ar.len);
4469	/* Mark unwritten */
4470	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4471		ext4_ext_mark_unwritten(&newex);
4472		map->m_flags |= EXT4_MAP_UNWRITTEN;
4473		/*
4474		 * io_end structure was created for every IO write to an
4475		 * unwritten extent. To avoid unnecessary conversion,
4476		 * here we flag the IO that really needs the conversion.
4477		 * For the non-async direct IO case, flag the inode state
4478		 * so that we perform the conversion when the IO is done.
4479		 */
4480		if (flags & EXT4_GET_BLOCKS_PRE_IO)
4481			set_unwritten = 1;
4482	}
4483
4484	err = 0;
4485	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4486		err = check_eofblocks_fl(handle, inode, map->m_lblk,
4487					 path, ar.len);
4488	if (!err)
4489		err = ext4_ext_insert_extent(handle, inode, path,
4490					     &newex, flags);
4491
4492	if (!err && set_unwritten) {
4493		if (io)
4494			ext4_set_io_unwritten_flag(inode, io);
4495		else
4496			ext4_set_inode_state(inode,
4497					     EXT4_STATE_DIO_UNWRITTEN);
4498	}
4499
4500	if (err && free_on_err) {
4501		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4502			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4503		/* free data blocks we just allocated */
4504		/* not a good idea to call discard here directly,
4505		 * but otherwise we'd need to call it on every free() */
4506		ext4_discard_preallocations(inode);
4507		ext4_free_blocks(handle, inode, NULL, newblock,
4508				 EXT4_C2B(sbi, allocated_clusters), fb_flags);
4509		goto out2;
4510	}
4511
4512	/* previous routine could use block we allocated */
4513	newblock = ext4_ext_pblock(&newex);
4514	allocated = ext4_ext_get_actual_len(&newex);
4515	if (allocated > map->m_len)
4516		allocated = map->m_len;
4517	map->m_flags |= EXT4_MAP_NEW;
4518
4519	/*
4520	 * Update reserved blocks/metadata blocks after successful
4521	 * block allocation which had been deferred till now.
4522	 */
4523	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4524		unsigned int reserved_clusters;
4525		/*
4526		 * Check how many clusters we had reserved for this allocated range
4527		 */
4528		reserved_clusters = get_reserved_cluster_alloc(inode,
4529						map->m_lblk, allocated);
4530		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4531			if (reserved_clusters) {
4532				/*
4533				 * We have clusters reserved for this range.
4534				 * But since we are not doing actual allocation
4535				 * and are simply using blocks from previously
4536				 * allocated cluster, we should release the
4537				 * reservation and not claim quota.
4538				 */
4539				ext4_da_update_reserve_space(inode,
4540						reserved_clusters, 0);
4541			}
4542		} else {
4543			BUG_ON(allocated_clusters < reserved_clusters);
4544			if (reserved_clusters < allocated_clusters) {
4545				struct ext4_inode_info *ei = EXT4_I(inode);
4546				int reservation = allocated_clusters -
4547						  reserved_clusters;
4548				/*
4549				 * It seems we claimed some clusters outside of
4550				 * the range of this allocation. We should give
4551				 * it back to the reservation pool. This can
4552				 * happen in the following case:
4553				 *
4554				 * * Suppose s_cluster_ratio is 4 (i.e., each
4555				 *   cluster has 4 blocks). Thus, the clusters
4556				 *   are [0-3],[4-7],[8-11]...
4557				 * * First comes delayed allocation write for
4558				 *   logical blocks 10 & 11. Since there were no
4559				 *   previous delayed allocated blocks in the
4560				 *   range [8-11], we would reserve 1 cluster
4561				 *   for this write.
4562				 * * Next comes write for logical blocks 3 to 8.
4563				 *   In this case, we will reserve 2 clusters
4564				 *   (for [0-3] and [4-7]) and not for [8-11], as
4565				 *   that range has delayed allocated blocks.
4566				 *   Thus total reserved clusters now becomes 3.
4567				 * * Now, during the delayed allocation writeout
4568				 *   time, we will first write blocks [3-8] and
4569				 *   allocate 3 clusters for writing these
4570				 *   blocks. Also, we would claim all these
4571				 *   three clusters above.
4572				 * * Now when we come here to writeout the
4573				 *   blocks [10-11], we would expect to claim
4574				 *   the reservation of 1 cluster we had made
4575				 *   (and we would claim it since there are no
4576				 *   more delayed allocated blocks in the range
4577				 *   [8-11]). But our reserved cluster count had
4578				 *   already gone to 0.
4579				 *
4580				 *   Thus, at step 4 above, when we determine
4581				 *   that there are still some unwritten delayed
4582				 *   allocated blocks outside of our current
4583				 *   block range, we should increment the
4584				 *   reserved clusters count so that when the
4585				 *   remaining blocks finally get written, we
4586				 *   could claim them.
4587				 */
4588				dquot_reserve_block(inode,
4589						EXT4_C2B(sbi, reservation));
4590				spin_lock(&ei->i_block_reservation_lock);
4591				ei->i_reserved_data_blocks += reservation;
4592				spin_unlock(&ei->i_block_reservation_lock);
4593			}
4594			/*
4595			 * We will claim quota for all newly allocated blocks.
4596			 * We're updating the reserved space *after* the
4597			 * correction above so we do not accidentally free
4598			 * all the metadata reservation because we might
4599			 * actually need it later on.
4600			 */
4601			ext4_da_update_reserve_space(inode, allocated_clusters,
4602							1);
4603		}
4604	}
4605
4606	/*
4607	 * Cache the extent and update transaction to commit on fdatasync only
4608	 * when it is _not_ an unwritten extent.
4609	 */
4610	if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4611		ext4_update_inode_fsync_trans(handle, inode, 1);
4612	else
4613		ext4_update_inode_fsync_trans(handle, inode, 0);
4614out:
4615	if (allocated > map->m_len)
4616		allocated = map->m_len;
4617	ext4_ext_show_leaf(inode, path);
4618	map->m_flags |= EXT4_MAP_MAPPED;
4619	map->m_pblk = newblock;
4620	map->m_len = allocated;
4621out2:
4622	if (path) {
4623		ext4_ext_drop_refs(path);
4624		kfree(path);
4625	}
4626
4627	trace_ext4_ext_map_blocks_exit(inode, flags, map,
4628				       err ? err : allocated);
4629	ext4_es_lru_add(inode);
4630	return err ? err : allocated;
4631}
4632
4633void ext4_ext_truncate(handle_t *handle, struct inode *inode)
4634{
4635	struct super_block *sb = inode->i_sb;
4636	ext4_lblk_t last_block;
4637	int err = 0;
4638
4639	/*
4640	 * TODO: optimization is possible here.
4641	 * Probably we need not scan at all,
4642	 * because page truncation is enough.
4643	 */
4644
4645	/* we have to know where to truncate from in crash case */
4646	EXT4_I(inode)->i_disksize = inode->i_size;
4647	ext4_mark_inode_dirty(handle, inode);
4648
4649	last_block = (inode->i_size + sb->s_blocksize - 1)
4650			>> EXT4_BLOCK_SIZE_BITS(sb);
4651retry:
4652	err = ext4_es_remove_extent(inode, last_block,
4653				    EXT_MAX_BLOCKS - last_block);
4654	if (err == -ENOMEM) {
4655		cond_resched();
4656		congestion_wait(BLK_RW_ASYNC, HZ/50);
4657		goto retry;
4658	}
4659	if (err) {
4660		ext4_std_error(inode->i_sb, err);
4661		return;
4662	}
4663	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4664	ext4_std_error(inode->i_sb, err);
4665}
4666
4667static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4668				  ext4_lblk_t len, int flags, int mode)
4669{
4670	struct inode *inode = file_inode(file);
4671	handle_t *handle;
4672	int ret = 0;
4673	int ret2 = 0;
4674	int retries = 0;
4675	struct ext4_map_blocks map;
4676	unsigned int credits;
4677
4678	map.m_lblk = offset;
4679	/*
4680	 * Don't normalize the request if it can fit in one extent so
4681	 * that it doesn't get unnecessarily split into multiple
4682	 * extents.
4683	 */
4684	if (len <= EXT_UNWRITTEN_MAX_LEN)
4685		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4686
4687	/*
4688	 * credits to insert 1 extent into extent tree
4689	 */
4690	credits = ext4_chunk_trans_blocks(inode, len);
4691
4692retry:
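	/*
	 * Retry pattern: ext4_map_blocks() may map fewer blocks than
	 * requested (a single extent is capped at EXT_UNWRITTEN_MAX_LEN),
	 * so each pass advances map.m_lblk by the blocks mapped so far and
	 * shrinks len accordingly until the whole range is allocated.
	 */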
4693	while (ret >= 0 && ret < len) {
4694		map.m_lblk = map.m_lblk + ret;
4695		map.m_len = len = len - ret;
4696		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4697					    credits);
4698		if (IS_ERR(handle)) {
4699			ret = PTR_ERR(handle);
4700			break;
4701		}
4702		ret = ext4_map_blocks(handle, inode, &map, flags);
4703		if (ret <= 0) {
4704			ext4_debug("inode #%lu: block %u: len %u: "
4705				   "ext4_ext_map_blocks returned %d",
4706				   inode->i_ino, map.m_lblk,
4707				   map.m_len, ret);
4708			ext4_mark_inode_dirty(handle, inode);
4709			ret2 = ext4_journal_stop(handle);
4710			break;
4711		}
4712		ret2 = ext4_journal_stop(handle);
4713		if (ret2)
4714			break;
4715	}
4716	if (ret == -ENOSPC &&
4717			ext4_should_retry_alloc(inode->i_sb, &retries)) {
4718		ret = 0;
4719		goto retry;
4720	}
4721
4722	return ret > 0 ? ret2 : ret;
4723}
4724
4725static long ext4_zero_range(struct file *file, loff_t offset,
4726			    loff_t len, int mode)
4727{
4728	struct inode *inode = file_inode(file);
4729	handle_t *handle = NULL;
4730	unsigned int max_blocks;
4731	loff_t new_size = 0;
4732	int ret = 0;
4733	int flags;
4734	int partial;
4735	loff_t start, end;
4736	ext4_lblk_t lblk;
4737	struct address_space *mapping = inode->i_mapping;
4738	unsigned int blkbits = inode->i_blkbits;
4739
4740	trace_ext4_zero_range(inode, offset, len, mode);
4741
4742	if (!S_ISREG(inode->i_mode))
4743		return -EINVAL;
4744
4745	/* Call ext4_force_commit to flush all data in case of data=journal. */
4746	if (ext4_should_journal_data(inode)) {
4747		ret = ext4_force_commit(inode->i_sb);
4748		if (ret)
4749			return ret;
4750	}
4751
4752	/*
4753	 * Write out all dirty pages to avoid race conditions,
4754	 * then release them.
4755	 */
4756	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4757		ret = filemap_write_and_wait_range(mapping, offset,
4758						   offset + len - 1);
4759		if (ret)
4760			return ret;
4761	}
4762
4763	/*
4764	 * Round up offset. This is not fallocate; we need to zero out
4765	 * blocks, so convert the interior block-aligned part of the range to
4766	 * unwritten and possibly manually zero out unaligned parts of the
4767	 * range.
4768	 */
4769	start = round_up(offset, 1 << blkbits);
4770	end = round_down((offset + len), 1 << blkbits);
4771
4772	if (start < offset || end > offset + len)
4773		return -EINVAL;
4774	partial = (offset + len) & ((1 << blkbits) - 1);
4775
4776	lblk = start >> blkbits;
4777	max_blocks = (end >> blkbits);
4778	if (max_blocks < lblk)
4779		max_blocks = 0;
4780	else
4781		max_blocks -= lblk;
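
	/*
	 * Worked example (hypothetical numbers): with blkbits = 12,
	 * offset = 1000 and len = 10000, start = 4096 and end = 8192, so
	 * lblk = 1, max_blocks = 1 and partial = 11000 & 4095 = 2808; only
	 * the one fully-contained block is converted to unwritten, and the
	 * unaligned edges are zeroed out manually further down.
	 */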
4782
4783	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
4784		EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
4785	if (mode & FALLOC_FL_KEEP_SIZE)
4786		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4787
4788	mutex_lock(&inode->i_mutex);
4789
4790	/*
4791	 * Indirect files do not support unwritten extents
4792	 */
4793	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4794		ret = -EOPNOTSUPP;
4795		goto out_mutex;
4796	}
4797
4798	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4799	     offset + len > i_size_read(inode)) {
4800		new_size = offset + len;
4801		ret = inode_newsize_ok(inode, new_size);
4802		if (ret)
4803			goto out_mutex;
4804		/*
4805		 * If we have a partial block after EOF we have to allocate
4806		 * the entire block.
4807		 */
4808		if (partial)
4809			max_blocks += 1;
4810	}
4811
4812	if (max_blocks > 0) {
4813
4814		/* Now release the pages and zero the block-aligned part of the pages */
4815		truncate_pagecache_range(inode, start, end - 1);
4816
4817		/* Wait for all existing DIO workers; newcomers will block on i_mutex */
4818		ext4_inode_block_unlocked_dio(inode);
4819		inode_dio_wait(inode);
4820
4821		/*
4822		 * Remove entire range from the extent status tree.
4823		 */
4824		ret = ext4_es_remove_extent(inode, lblk, max_blocks);
4825		if (ret)
4826			goto out_dio;
4827
4828		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
4829					     mode);
4830		if (ret)
4831			goto out_dio;
4832	}
4833
4834	handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
4835	if (IS_ERR(handle)) {
4836		ret = PTR_ERR(handle);
4837		ext4_std_error(inode->i_sb, ret);
4838		goto out_dio;
4839	}
4840
4841	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4842
4843	if (new_size) {
4844		if (new_size > i_size_read(inode))
4845			i_size_write(inode, new_size);
4846		if (new_size > EXT4_I(inode)->i_disksize)
4847			ext4_update_i_disksize(inode, new_size);
4848	} else {
4849		/*
4850		 * Mark that we allocate beyond EOF so the subsequent truncate
4851		 * can proceed even if the new size is the same as i_size.
4852		*/
4853		if ((offset + len) > i_size_read(inode))
4854			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4855	}
4856
4857	ext4_mark_inode_dirty(handle, inode);
4858
4859	/* Zero out partial block at the edges of the range */
4860	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4861
4862	if (file->f_flags & O_SYNC)
4863		ext4_handle_sync(handle);
4864
4865	ext4_journal_stop(handle);
4866out_dio:
4867	ext4_inode_resume_unlocked_dio(inode);
4868out_mutex:
4869	mutex_unlock(&inode->i_mutex);
4870	return ret;
4871}
4872
4873/*
4874 * preallocate space for a file. This implements ext4's fallocate file
4875 * operation, which gets called from the sys_fallocate system call.
4876 * For block-mapped files, posix_fallocate should fall back to the method
4877 * of writing zeroes to the required new blocks (the same behavior which is
4878 * expected for file systems which do not support fallocate() system call).
4879 */
4880long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4881{
4882	struct inode *inode = file_inode(file);
4883	handle_t *handle;
4884	loff_t new_size = 0;
4885	unsigned int max_blocks;
4886	int ret = 0;
4887	int flags;
4888	ext4_lblk_t lblk;
4889	struct timespec tv;
4890	unsigned int blkbits = inode->i_blkbits;
4891
4892	/* Return error if mode is not supported */
4893	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4894		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
4895		return -EOPNOTSUPP;
4896
4897	if (mode & FALLOC_FL_PUNCH_HOLE)
4898		return ext4_punch_hole(inode, offset, len);
4899
4900	ret = ext4_convert_inline_data(inode);
4901	if (ret)
4902		return ret;
4903
4904	/*
4905	 * currently supporting (pre)allocate mode for extent-based
4906	 * files _only_
4907	 */
4908	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4909		return -EOPNOTSUPP;
4910
4911	if (mode & FALLOC_FL_COLLAPSE_RANGE)
4912		return ext4_collapse_range(inode, offset, len);
4913
4914	if (mode & FALLOC_FL_ZERO_RANGE)
4915		return ext4_zero_range(file, offset, len, mode);
4916
4917	trace_ext4_fallocate_enter(inode, offset, len, mode);
4918	lblk = offset >> blkbits;
4919	/*
4920	 * We can't just convert len to max_blocks, because the range may
4921	 * straddle block boundaries: e.g. blocksize = 4096, offset = 3072, len = 2048
4922	 */
4923	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4924		- lblk;
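
	/*
	 * Worked example for the computation above: with blocksize = 4096,
	 * offset = 3072 and len = 2048, the byte range [3072, 5120) touches
	 * two blocks even though len is smaller than one block, so lblk = 0
	 * and max_blocks = 2 rather than len >> blkbits = 0.
	 */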
4925
4926	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4927	if (mode & FALLOC_FL_KEEP_SIZE)
4928		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4929
4930	mutex_lock(&inode->i_mutex);
4931
4932	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4933	     offset + len > i_size_read(inode)) {
4934		new_size = offset + len;
4935		ret = inode_newsize_ok(inode, new_size);
4936		if (ret)
4937			goto out;
4938	}
4939
4940	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
4941	if (ret)
4942		goto out;
4943
4944	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4945	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
4946		goto out;
	}
4947
4948	tv = inode->i_ctime = ext4_current_time(inode);
4949
4950	if (new_size) {
4951		if (new_size > i_size_read(inode)) {
4952			i_size_write(inode, new_size);
4953			inode->i_mtime = tv;
4954		}
4955		if (new_size > EXT4_I(inode)->i_disksize)
4956			ext4_update_i_disksize(inode, new_size);
4957	} else {
4958		/*
4959		 * Mark that we allocate beyond EOF so the subsequent truncate
4960		 * can proceed even if the new size is the same as i_size.
4961		*/
4962		if ((offset + len) > i_size_read(inode))
4963			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4964	}
4965	ext4_mark_inode_dirty(handle, inode);
4966	if (file->f_flags & O_SYNC)
4967		ext4_handle_sync(handle);
4968
4969	ext4_journal_stop(handle);
4970out:
4971	mutex_unlock(&inode->i_mutex);
4972	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4973	return ret;
4974}
4975
4976/*
4977 * This function converts a range of blocks to written extents.
4978 * The caller of this function will pass the start offset and the size;
4979 * all unwritten extents within this range will be converted to
4980 * written extents.
4981 *
4982 * This function is called from the direct IO end_io callback
4983 * function, to convert the fallocated extents after IO is completed.
4984 * Returns 0 on success.
4985 */
4986int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4987				   loff_t offset, ssize_t len)
4988{
4989	unsigned int max_blocks;
4990	int ret = 0;
4991	int ret2 = 0;
4992	struct ext4_map_blocks map;
4993	unsigned int credits, blkbits = inode->i_blkbits;
4994
4995	map.m_lblk = offset >> blkbits;
4996	/*
4997	 * We can't just convert len to max_blocks, because the range may
4998	 * straddle block boundaries: e.g. blocksize = 4096, offset = 3072, len = 2048
4999	 */
5000	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
5001		      map.m_lblk);
5002	/*
5003	 * This is somewhat ugly but the idea is clear: When a transaction is
5004	 * reserved, everything goes into it. Otherwise we rather start several
5005	 * smaller transactions for conversion of each extent separately.
5006	 */
5007	if (handle) {
5008		handle = ext4_journal_start_reserved(handle,
5009						     EXT4_HT_EXT_CONVERT);
5010		if (IS_ERR(handle))
5011			return PTR_ERR(handle);
5012		credits = 0;
5013	} else {
5014		/*
5015		 * credits to insert 1 extent into extent tree
5016		 */
5017		credits = ext4_chunk_trans_blocks(inode, max_blocks);
5018	}
5019	while (ret >= 0 && ret < max_blocks) {
5020		map.m_lblk += ret;
5021		map.m_len = (max_blocks -= ret);
5022		if (credits) {
5023			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
5024						    credits);
5025			if (IS_ERR(handle)) {
5026				ret = PTR_ERR(handle);
5027				break;
5028			}
5029		}
5030		ret = ext4_map_blocks(handle, inode, &map,
5031				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
5032		if (ret <= 0)
5033			ext4_warning(inode->i_sb,
5034				     "inode #%lu: block %u: len %u: "
5035				     "ext4_ext_map_blocks returned %d",
5036				     inode->i_ino, map.m_lblk,
5037				     map.m_len, ret);
5038		ext4_mark_inode_dirty(handle, inode);
5039		if (credits)
5040			ret2 = ext4_journal_stop(handle);
5041		if (ret <= 0 || ret2)
5042			break;
5043	}
5044	if (!credits)
5045		ret2 = ext4_journal_stop(handle);
5046	return ret > 0 ? ret2 : ret;
5047}
5048
5049/*
5050 * If newes is not an existing extent (newes->es_pblk equals zero), find
5051 * the delayed extent at the start of newes, update newes accordingly and
5052 * return the start of the next delayed extent.
5053 *
5054 * If newes is an existing extent (newes->es_pblk is not equal to zero),
5055 * return the start of the next delayed extent, or EXT_MAX_BLOCKS if no
5056 * delayed extent is found. Leave newes unmodified.
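 *
 * Example (hypothetical numbers): for a query at lblk = 100, len = 10
 * that sits in a hole, with the next delayed extent starting at
 * lblk = 105, newes->es_len is trimmed to 5 and 0 is returned; for a
 * query inside a delayed extent, the start of the following delayed
 * extent (or EXT_MAX_BLOCKS) is returned instead.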
5057 */
5058static int ext4_find_delayed_extent(struct inode *inode,
5059				    struct extent_status *newes)
5060{
5061	struct extent_status es;
5062	ext4_lblk_t block, next_del;
5063
5064	if (newes->es_pblk == 0) {
5065		ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
5066				newes->es_lblk + newes->es_len - 1, &es);
5067
5068		/*
5069		 * If no extent in the extent tree contains block @newes->es_lblk,
5070		 * then the block may sit in 1) a hole or 2) a delayed extent.
5071		 */
5072		if (es.es_len == 0)
5073			/* A hole found. */
5074			return 0;
5075
5076		if (es.es_lblk > newes->es_lblk) {
5077			/* A hole found. */
5078			newes->es_len = min(es.es_lblk - newes->es_lblk,
5079					    newes->es_len);
5080			return 0;
5081		}
5082
5083		newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
5084	}
5085
5086	block = newes->es_lblk + newes->es_len;
5087	ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
5088	if (es.es_len == 0)
5089		next_del = EXT_MAX_BLOCKS;
5090	else
5091		next_del = es.es_lblk;
5092
5093	return next_del;
5094}
5095/* fiemap flags we can handle are specified here */
5096#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
5097
5098static int ext4_xattr_fiemap(struct inode *inode,
5099				struct fiemap_extent_info *fieinfo)
5100{
5101	__u64 physical = 0;
5102	__u64 length;
5103	__u32 flags = FIEMAP_EXTENT_LAST;
5104	int blockbits = inode->i_sb->s_blocksize_bits;
5105	int error = 0;
5106
5107	/* in-inode? */
5108	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
5109		struct ext4_iloc iloc;
5110		int offset;	/* offset of xattr in inode */
5111
5112		error = ext4_get_inode_loc(inode, &iloc);
5113		if (error)
5114			return error;
5115		physical = (__u64)iloc.bh->b_blocknr << blockbits;
5116		offset = EXT4_GOOD_OLD_INODE_SIZE +
5117				EXT4_I(inode)->i_extra_isize;
5118		physical += offset;
5119		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
5120		flags |= FIEMAP_EXTENT_DATA_INLINE;
5121		brelse(iloc.bh);
5122	} else { /* external block */
5123		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
5124		length = inode->i_sb->s_blocksize;
5125	}
5126
5127	if (physical)
5128		error = fiemap_fill_next_extent(fieinfo, 0, physical,
5129						length, flags);
5130	return (error < 0 ? error : 0);
5131}
5132
5133int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5134		__u64 start, __u64 len)
5135{
5136	ext4_lblk_t start_blk;
5137	int error = 0;
5138
5139	if (ext4_has_inline_data(inode)) {
5140		int has_inline = 1;
5141
5142		error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
5143
5144		if (has_inline)
5145			return error;
5146	}
5147
5148	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5149		error = ext4_ext_precache(inode);
5150		if (error)
5151			return error;
5152	}
5153
5154	/* fall back to generic here if not in extents format */
5155	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5156		return generic_block_fiemap(inode, fieinfo, start, len,
5157			ext4_get_block);
5158
5159	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5160		return -EBADR;
5161
5162	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5163		error = ext4_xattr_fiemap(inode, fieinfo);
5164	} else {
5165		ext4_lblk_t len_blks;
5166		__u64 last_blk;
5167
5168		start_blk = start >> inode->i_sb->s_blocksize_bits;
5169		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5170		if (last_blk >= EXT_MAX_BLOCKS)
5171			last_blk = EXT_MAX_BLOCKS-1;
5172		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5173
5174		/*
5175		 * Walk the extent tree gathering extent information
5176		 * and pushing extents back to the user.
5177		 */
5178		error = ext4_fill_fiemap_extents(inode, start_blk,
5179						 len_blks, fieinfo);
5180	}
5181	ext4_es_lru_add(inode);
5182	return error;
5183}
5184
5185/*
5186 * ext4_access_path:
5187 * Function to access the path buffer for marking it dirty.
5188 * It also checks if there are sufficient credits left in the journal handle
5189 * to update path.
5190 */
5191static int
5192ext4_access_path(handle_t *handle, struct inode *inode,
5193		struct ext4_ext_path *path)
5194{
5195	int credits, err;
5196
5197	if (!ext4_handle_valid(handle))
5198		return 0;
5199
5200	/*
5201	 * Check if we need to extend journal credits:
5202	 * 3 for leaf, sb, and inode plus 2 (bmap and group
5203	 * descriptor) for each block group; assuming two block
5204	 * groups gives 3 + 2 * 2 = 7 credits.
5205	 */
5206	if (handle->h_buffer_credits < 7) {
5207		credits = ext4_writepage_trans_blocks(inode);
5208		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5209		/* EAGAIN is success */
5210		if (err && err != -EAGAIN)
5211			return err;
5212	}
5213
5214	err = ext4_ext_get_access(handle, inode, path);
5215	return err;
5216}
5217
5218/*
5219 * ext4_ext_shift_path_extents:
5220 * Shift the extents of a path structure lying between path[depth].p_ext
5221 * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
5222 * from the starting block of each extent.
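 *
 * For example (hypothetical numbers): with shift = 2, an extent whose
 * ee_block is 7 is rewritten to start at logical block 5, and the index
 * entries above it are pulled down by the same two blocks.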
5223 */
5224static int
5225ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5226			    struct inode *inode, handle_t *handle,
5227			    ext4_lblk_t *start)
5228{
5229	int depth, err = 0;
5230	struct ext4_extent *ex_start, *ex_last;
5231	bool update = false;
5232	depth = path->p_depth;
5233
5234	while (depth >= 0) {
5235		if (depth == path->p_depth) {
5236			ex_start = path[depth].p_ext;
5237			if (!ex_start)
5238				return -EIO;
5239
5240			ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5241			if (!ex_last)
5242				return -EIO;
5243
5244			err = ext4_access_path(handle, inode, path + depth);
5245			if (err)
5246				goto out;
5247
5248			if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5249				update = 1;
5250
5251			*start = le32_to_cpu(ex_last->ee_block) +
5252				ext4_ext_get_actual_len(ex_last);
5253
5254			while (ex_start <= ex_last) {
5255				le32_add_cpu(&ex_start->ee_block, -shift);
5256				/* Try to merge to the left. */
5257				if ((ex_start >
5258				     EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
5259				    ext4_ext_try_to_merge_right(inode,
5260							path, ex_start - 1))
5261					ex_last--;
5262				else
5263					ex_start++;
5264			}
5265			err = ext4_ext_dirty(handle, inode, path + depth);
5266			if (err)
5267				goto out;
5268
5269			if (--depth < 0 || !update)
5270				break;
5271		}
5272
5273		/* Update index too */
5274		err = ext4_access_path(handle, inode, path + depth);
5275		if (err)
5276			goto out;
5277
5278		le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5279		err = ext4_ext_dirty(handle, inode, path + depth);
5280		if (err)
5281			goto out;
5282
5283		/* we are done if current index is not a starting index */
5284		if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5285			break;
5286
5287		depth--;
5288	}
5289
5290out:
5291	return err;
5292}
5293
5294/*
5295 * ext4_ext_shift_extents:
5296 * All the extents which lie in the range from start to the last allocated
5297 * block for the file are shifted downwards by shift blocks.
5298 * On success, 0 is returned, error otherwise.
5299 */
5300static int
5301ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5302		       ext4_lblk_t start, ext4_lblk_t shift)
5303{
5304	struct ext4_ext_path *path;
5305	int ret = 0, depth;
5306	struct ext4_extent *extent;
5307	ext4_lblk_t stop_block, current_block;
5308	ext4_lblk_t ex_start, ex_end;
5309
5310	/* Let path point to the last extent */
5311	path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5312	if (IS_ERR(path))
5313		return PTR_ERR(path);
5314
5315	depth = path->p_depth;
5316	extent = path[depth].p_ext;
5317	if (!extent) {
5318		ext4_ext_drop_refs(path);
5319		kfree(path);
5320		return ret;
5321	}
5322
5323	stop_block = le32_to_cpu(extent->ee_block) +
5324			ext4_ext_get_actual_len(extent);
5325	ext4_ext_drop_refs(path);
5326	kfree(path);
5327
5328	/* Nothing to shift if the hole is at the end of the file */
5329	if (start >= stop_block)
5330		return ret;
5331
5332	/*
5333	 * Don't start shifting extents until we make sure the hole is big
5334	 * enough to accommodate the shift.
5335	 */
5336	path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
5337	if (IS_ERR(path))
5338		return PTR_ERR(path);
5339	depth = path->p_depth;
5340	extent =  path[depth].p_ext;
5341	if (extent) {
5342		ex_start = le32_to_cpu(extent->ee_block);
5343		ex_end = le32_to_cpu(extent->ee_block) +
5344			ext4_ext_get_actual_len(extent);
5345	} else {
5346		ex_start = 0;
5347		ex_end = 0;
5348	}
5349	ext4_ext_drop_refs(path);
5350	kfree(path);
5351
5352	if ((start == ex_start && shift > ex_start) ||
5353	    (shift > start - ex_end))
5354		return -EINVAL;
5355
5356	/* It's safe to start updating extents */
5357	while (start < stop_block) {
5358		path = ext4_ext_find_extent(inode, start, NULL, 0);
5359		if (IS_ERR(path))
5360			return PTR_ERR(path);
5361		depth = path->p_depth;
5362		extent = path[depth].p_ext;
5363		if (!extent) {
5364			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5365					 (unsigned long) start);
5366			return -EIO;
5367		}
5368
5369		current_block = le32_to_cpu(extent->ee_block);
5370		if (start > current_block) {
5371			/* Hole, move to the next extent */
5372			ret = mext_next_extent(inode, path, &extent);
5373			if (ret != 0) {
5374				ext4_ext_drop_refs(path);
5375				kfree(path);
5376				if (ret == 1)
5377					ret = 0;
5378				break;
5379			}
5380		}
5381		ret = ext4_ext_shift_path_extents(path, shift, inode,
5382				handle, &start);
5383		ext4_ext_drop_refs(path);
5384		kfree(path);
5385		if (ret)
5386			break;
5387	}
5388
5389	return ret;
5390}
5391
5392/*
5393 * ext4_collapse_range:
5394 * This implements fallocate's collapse range functionality for ext4.
5395 * Returns 0 on success, non-zero on error.
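 *
 * Illustrative example (hypothetical numbers): with a 4k block size,
 * collapsing offset = 4096, len = 8192 removes logical blocks 1 and 2
 * and shifts every extent from block 3 onwards down by two blocks;
 * i_size shrinks by len.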
5396 */
5397int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5398{
5399	struct super_block *sb = inode->i_sb;
5400	ext4_lblk_t punch_start, punch_stop;
5401	handle_t *handle;
5402	unsigned int credits;
5403	loff_t new_size, ioffset;
5404	int ret;
5405
5406	/* Collapse range works only on fs block size aligned offsets. */
5407	if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
5408	    len & (EXT4_BLOCK_SIZE(sb) - 1))
5409		return -EINVAL;
5410
5411	if (!S_ISREG(inode->i_mode))
5412		return -EINVAL;
5413
5414	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
5415		return -EOPNOTSUPP;
5416
5417	trace_ext4_collapse_range(inode, offset, len);
5418
5419	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5420	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5421
5422	/* Call ext4_force_commit to flush all data in case of data=journal. */
5423	if (ext4_should_journal_data(inode)) {
5424		ret = ext4_force_commit(inode->i_sb);
5425		if (ret)
5426			return ret;
5427	}
5428
5429	/*
5430	 * Need to round down the offset to be aligned with the page size
5431	 * boundary, for the case where page size > block size.
5432	 */
5433	ioffset = round_down(offset, PAGE_SIZE);
5434
5435	/* Write out all dirty pages */
5436	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5437					   LLONG_MAX);
5438	if (ret)
5439		return ret;
5440
5441	/* Take mutex lock */
5442	mutex_lock(&inode->i_mutex);
5443
5444	/*
5445	 * The collapse range should not reach or overlap EOF; in that case
5446	 * it would effectively be a truncate operation, so reject it.
5447	 */
5448	if (offset + len >= i_size_read(inode)) {
5449		ret = -EINVAL;
5450		goto out_mutex;
5451	}
5452
5453	/* Currently just for extent based files */
5454	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5455		ret = -EOPNOTSUPP;
5456		goto out_mutex;
5457	}
5458
5459	truncate_pagecache(inode, ioffset);
5460
5461	/* Wait for existing dio to complete */
5462	ext4_inode_block_unlocked_dio(inode);
5463	inode_dio_wait(inode);
5464
5465	credits = ext4_writepage_trans_blocks(inode);
5466	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5467	if (IS_ERR(handle)) {
5468		ret = PTR_ERR(handle);
5469		goto out_dio;
5470	}
5471
5472	down_write(&EXT4_I(inode)->i_data_sem);
5473	ext4_discard_preallocations(inode);
5474
5475	ret = ext4_es_remove_extent(inode, punch_start,
5476				    EXT_MAX_BLOCKS - punch_start);
5477	if (ret) {
5478		up_write(&EXT4_I(inode)->i_data_sem);
5479		goto out_stop;
5480	}
5481
5482	ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5483	if (ret) {
5484		up_write(&EXT4_I(inode)->i_data_sem);
5485		goto out_stop;
5486	}
5487	ext4_discard_preallocations(inode);
5488
5489	ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5490				     punch_stop - punch_start);
5491	if (ret) {
5492		up_write(&EXT4_I(inode)->i_data_sem);
5493		goto out_stop;
5494	}
5495
5496	new_size = i_size_read(inode) - len;
5497	i_size_write(inode, new_size);
5498	EXT4_I(inode)->i_disksize = new_size;
5499
5500	up_write(&EXT4_I(inode)->i_data_sem);
5501	if (IS_SYNC(inode))
5502		ext4_handle_sync(handle);
5503	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5504	ext4_mark_inode_dirty(handle, inode);
5505
5506out_stop:
5507	ext4_journal_stop(handle);
5508out_dio:
5509	ext4_inode_resume_unlocked_dio(inode);
5510out_mutex:
5511	mutex_unlock(&inode->i_mutex);
5512	return ret;
5513}
5514