extents.c revision 81fdbb4a8d34242f0ed048395c4ddc910f1dffbe
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
44#include "ext4_jbd2.h"
45
46#include <trace/events/ext4.h>
47
48static int ext4_split_extent(handle_t *handle,
49				struct inode *inode,
50				struct ext4_ext_path *path,
51				struct ext4_map_blocks *map,
52				int split_flag,
53				int flags);
54
55static int ext4_ext_truncate_extend_restart(handle_t *handle,
56					    struct inode *inode,
57					    int needed)
58{
59	int err;
60
61	if (!ext4_handle_valid(handle))
62		return 0;
63	if (handle->h_buffer_credits > needed)
64		return 0;
65	err = ext4_journal_extend(handle, needed);
66	if (err <= 0)
67		return err;
68	err = ext4_truncate_restart_trans(handle, inode, needed);
69	if (err == 0)
70		err = -EAGAIN;
71
72	return err;
73}
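/*
 * Note on the return convention above: 0 means the handle already has
 * enough credits (or was extended in place); -EAGAIN means the transaction
 * had to be restarted, so the caller should drop and re-read any extent
 * path it cached under the old transaction before retrying; any other
 * negative value is a hard error.
 */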
74
75/*
76 * could return:
77 *  - EROFS
78 *  - ENOMEM
79 */
80static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
81				struct ext4_ext_path *path)
82{
83	if (path->p_bh) {
84		/* path points to block */
85		return ext4_journal_get_write_access(handle, path->p_bh);
86	}
87	/* path points to leaf/index in inode body */
88	/* we use in-core data, no need to protect them */
89	return 0;
90}
91
92/*
93 * could return:
94 *  - EROFS
95 *  - ENOMEM
96 *  - EIO
97 */
98#define ext4_ext_dirty(handle, inode, path) \
99		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
100static int __ext4_ext_dirty(const char *where, unsigned int line,
101			    handle_t *handle, struct inode *inode,
102			    struct ext4_ext_path *path)
103{
104	int err;
105	if (path->p_bh) {
106		/* path points to block */
107		err = __ext4_handle_dirty_metadata(where, line, handle,
108						   inode, path->p_bh);
109	} else {
110		/* path points to leaf/index in inode body */
111		err = ext4_mark_inode_dirty(handle, inode);
112	}
113	return err;
114}
115
116static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
117			      struct ext4_ext_path *path,
118			      ext4_lblk_t block)
119{
120	if (path) {
121		int depth = path->p_depth;
122		struct ext4_extent *ex;
123
124		/*
125		 * Try to predict block placement assuming that we are
126		 * filling in a file which will eventually be
127		 * non-sparse --- i.e., in the case of libbfd writing
128		 * an ELF object's sections out-of-order but in a way
129		 * that eventually results in a contiguous object or
130		 * executable file, or some database extending a table
131		 * space file.  However, this is actually somewhat
132		 * non-ideal if we are writing a sparse file such as
133		 * qemu or KVM writing a raw image file that is going
134		 * to stay fairly sparse, since it will end up
135		 * fragmenting the file system's free space.  Maybe we
136		 * should have some heuristics or some way to allow
137		 * userspace to pass a hint to the file system,
138		 * especially if the latter case turns out to be
139		 * common.
140		 */
141		ex = path[depth].p_ext;
142		if (ex) {
143			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
144			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
145
146			if (block > ext_block)
147				return ext_pblk + (block - ext_block);
148			else
149				return ext_pblk - (ext_block - block);
150		}
151
152		/* it looks like index is empty;
153		 * try to find starting block from index itself */
154		if (path[depth].p_bh)
155			return path[depth].p_bh->b_blocknr;
156	}
157
158	/* OK. use inode's group */
159	return ext4_inode_to_goal_block(inode);
160}
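/*
 * For illustration: if the extent found in the path maps logical block 100
 * to physical block 5000, a request for logical block 112 yields a goal of
 * physical block 5012, and a request for logical block 96 yields 4996, so
 * logically nearby blocks are given physically nearby goals.
 */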
161
162/*
163 * Allocation for a meta data block
164 */
165static ext4_fsblk_t
166ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
167			struct ext4_ext_path *path,
168			struct ext4_extent *ex, int *err, unsigned int flags)
169{
170	ext4_fsblk_t goal, newblock;
171
172	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
173	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
174					NULL, err);
175	return newblock;
176}
177
178static inline int ext4_ext_space_block(struct inode *inode, int check)
179{
180	int size;
181
182	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
183			/ sizeof(struct ext4_extent);
184	if (!check) {
185#ifdef AGGRESSIVE_TEST
186		if (size > 6)
187			size = 6;
188#endif
189	}
190	return size;
191}
192
193static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
194{
195	int size;
196
197	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
198			/ sizeof(struct ext4_extent_idx);
199	if (!check) {
200#ifdef AGGRESSIVE_TEST
201		if (size > 5)
202			size = 5;
203#endif
204	}
205	return size;
206}
207
208static inline int ext4_ext_space_root(struct inode *inode, int check)
209{
210	int size;
211
212	size = sizeof(EXT4_I(inode)->i_data);
213	size -= sizeof(struct ext4_extent_header);
214	size /= sizeof(struct ext4_extent);
215	if (!check) {
216#ifdef AGGRESSIVE_TEST
217		if (size > 3)
218			size = 3;
219#endif
220	}
221	return size;
222}
223
224static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
225{
226	int size;
227
228	size = sizeof(EXT4_I(inode)->i_data);
229	size -= sizeof(struct ext4_extent_header);
230	size /= sizeof(struct ext4_extent_idx);
231	if (!check) {
232#ifdef AGGRESSIVE_TEST
233		if (size > 4)
234			size = 4;
235#endif
236	}
237	return size;
238}
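/*
 * For a sense of scale (assuming the standard 12-byte on-disk extent,
 * extent_idx and extent_header structures, and the 60-byte i_data array):
 * with a 4 KiB block size the helpers above give (4096 - 12) / 12 = 340
 * entries per tree block and (60 - 12) / 12 = 4 entries in the inode root,
 * whether the entries are extents or indexes.  AGGRESSIVE_TEST artificially
 * shrinks these limits so the split/grow paths are exercised by small files.
 */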
239
240/*
241 * Calculate the number of metadata blocks needed
242 * to allocate @blocks
243 * Worst case is one block per extent
244 */
245int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
246{
247	struct ext4_inode_info *ei = EXT4_I(inode);
248	int idxs;
249
250	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
251		/ sizeof(struct ext4_extent_idx));
252
253	/*
254	 * If the new delayed allocation block is contiguous with the
255	 * previous da block, it can share index blocks with the
256	 * previous block, so we only need to allocate a new index
257	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
258	 * an additional index block, and at idxs**3 blocks, yet
259	 * another index block.
260	 */
261	if (ei->i_da_metadata_calc_len &&
262	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
263		int num = 0;
264
265		if ((ei->i_da_metadata_calc_len % idxs) == 0)
266			num++;
267		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
268			num++;
269		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
270			num++;
271			ei->i_da_metadata_calc_len = 0;
272		} else
273			ei->i_da_metadata_calc_len++;
274		ei->i_da_metadata_calc_last_lblock++;
275		return num;
276	}
277
278	/*
279	 * In the worst case we need a new set of index blocks at
280	 * every level of the inode's extent tree.
281	 */
282	ei->i_da_metadata_calc_len = 1;
283	ei->i_da_metadata_calc_last_lblock = lblock;
284	return ext_depth(inode) + 1;
285}
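/*
 * Worked example for the estimate above (assuming a 4 KiB block size and
 * 12-byte header/index entries, so idxs = (4096 - 12) / 12 = 340): the
 * first delayed-allocation block of a run is charged the worst case of
 * ext_depth(inode) + 1 metadata blocks; each following contiguous block is
 * normally charged 0, with one extra index block charged every 340 blocks,
 * another every 340^2 blocks, and so on, matching the modulo checks above.
 */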
286
287static int
288ext4_ext_max_entries(struct inode *inode, int depth)
289{
290	int max;
291
292	if (depth == ext_depth(inode)) {
293		if (depth == 0)
294			max = ext4_ext_space_root(inode, 1);
295		else
296			max = ext4_ext_space_root_idx(inode, 1);
297	} else {
298		if (depth == 0)
299			max = ext4_ext_space_block(inode, 1);
300		else
301			max = ext4_ext_space_block_idx(inode, 1);
302	}
303
304	return max;
305}
306
307static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
308{
309	ext4_fsblk_t block = ext4_ext_pblock(ext);
310	int len = ext4_ext_get_actual_len(ext);
311
312	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
313}
314
315static int ext4_valid_extent_idx(struct inode *inode,
316				struct ext4_extent_idx *ext_idx)
317{
318	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
319
320	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
321}
322
323static int ext4_valid_extent_entries(struct inode *inode,
324				struct ext4_extent_header *eh,
325				int depth)
326{
327	unsigned short entries;
328	if (eh->eh_entries == 0)
329		return 1;
330
331	entries = le16_to_cpu(eh->eh_entries);
332
333	if (depth == 0) {
334		/* leaf entries */
335		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
336		while (entries) {
337			if (!ext4_valid_extent(inode, ext))
338				return 0;
339			ext++;
340			entries--;
341		}
342	} else {
343		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
344		while (entries) {
345			if (!ext4_valid_extent_idx(inode, ext_idx))
346				return 0;
347			ext_idx++;
348			entries--;
349		}
350	}
351	return 1;
352}
353
354static int __ext4_ext_check(const char *function, unsigned int line,
355			    struct inode *inode, struct ext4_extent_header *eh,
356			    int depth)
357{
358	const char *error_msg;
359	int max = 0;
360
361	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
362		error_msg = "invalid magic";
363		goto corrupted;
364	}
365	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
366		error_msg = "unexpected eh_depth";
367		goto corrupted;
368	}
369	if (unlikely(eh->eh_max == 0)) {
370		error_msg = "invalid eh_max";
371		goto corrupted;
372	}
373	max = ext4_ext_max_entries(inode, depth);
374	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
375		error_msg = "too large eh_max";
376		goto corrupted;
377	}
378	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
379		error_msg = "invalid eh_entries";
380		goto corrupted;
381	}
382	if (!ext4_valid_extent_entries(inode, eh, depth)) {
383		error_msg = "invalid extent entries";
384		goto corrupted;
385	}
386	return 0;
387
388corrupted:
389	ext4_error_inode(inode, function, line, 0,
390			"bad header/extent: %s - magic %x, "
391			"entries %u, max %u(%u), depth %u(%u)",
392			error_msg, le16_to_cpu(eh->eh_magic),
393			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
394			max, le16_to_cpu(eh->eh_depth), depth);
395
396	return -EIO;
397}
398
399#define ext4_ext_check(inode, eh, depth)	\
400	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
401
402int ext4_ext_check_inode(struct inode *inode)
403{
404	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
405}
406
407#ifdef EXT_DEBUG
408static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
409{
410	int k, l = path->p_depth;
411
412	ext_debug("path:");
413	for (k = 0; k <= l; k++, path++) {
414		if (path->p_idx) {
415		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
416			    ext4_idx_pblock(path->p_idx));
417		} else if (path->p_ext) {
418			ext_debug("  %d:[%d]%d:%llu ",
419				  le32_to_cpu(path->p_ext->ee_block),
420				  ext4_ext_is_uninitialized(path->p_ext),
421				  ext4_ext_get_actual_len(path->p_ext),
422				  ext4_ext_pblock(path->p_ext));
423		} else
424			ext_debug("  []");
425	}
426	ext_debug("\n");
427}
428
429static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
430{
431	int depth = ext_depth(inode);
432	struct ext4_extent_header *eh;
433	struct ext4_extent *ex;
434	int i;
435
436	if (!path)
437		return;
438
439	eh = path[depth].p_hdr;
440	ex = EXT_FIRST_EXTENT(eh);
441
442	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
443
444	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
445		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
446			  ext4_ext_is_uninitialized(ex),
447			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
448	}
449	ext_debug("\n");
450}
451
452static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
453			ext4_fsblk_t newblock, int level)
454{
455	int depth = ext_depth(inode);
456	struct ext4_extent *ex;
457
458	if (depth != level) {
459		struct ext4_extent_idx *idx;
460		idx = path[level].p_idx;
461		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
462			ext_debug("%d: move %d:%llu in new index %llu\n", level,
463					le32_to_cpu(idx->ei_block),
464					ext4_idx_pblock(idx),
465					newblock);
466			idx++;
467		}
468
469		return;
470	}
471
472	ex = path[depth].p_ext;
473	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
474		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
475				le32_to_cpu(ex->ee_block),
476				ext4_ext_pblock(ex),
477				ext4_ext_is_uninitialized(ex),
478				ext4_ext_get_actual_len(ex),
479				newblock);
480		ex++;
481	}
482}
483
484#else
485#define ext4_ext_show_path(inode, path)
486#define ext4_ext_show_leaf(inode, path)
487#define ext4_ext_show_move(inode, path, newblock, level)
488#endif
489
490void ext4_ext_drop_refs(struct ext4_ext_path *path)
491{
492	int depth = path->p_depth;
493	int i;
494
495	for (i = 0; i <= depth; i++, path++)
496		if (path->p_bh) {
497			brelse(path->p_bh);
498			path->p_bh = NULL;
499		}
500}
501
502/*
503 * ext4_ext_binsearch_idx:
504 * binary search for the closest index of the given block
505 * the header must be checked before calling this
506 */
507static void
508ext4_ext_binsearch_idx(struct inode *inode,
509			struct ext4_ext_path *path, ext4_lblk_t block)
510{
511	struct ext4_extent_header *eh = path->p_hdr;
512	struct ext4_extent_idx *r, *l, *m;
513
514
515	ext_debug("binsearch for %u(idx):  ", block);
516
517	l = EXT_FIRST_INDEX(eh) + 1;
518	r = EXT_LAST_INDEX(eh);
519	while (l <= r) {
520		m = l + (r - l) / 2;
521		if (block < le32_to_cpu(m->ei_block))
522			r = m - 1;
523		else
524			l = m + 1;
525		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
526				m, le32_to_cpu(m->ei_block),
527				r, le32_to_cpu(r->ei_block));
528	}
529
530	path->p_idx = l - 1;
531	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
532		  ext4_idx_pblock(path->p_idx));
533
534#ifdef CHECK_BINSEARCH
535	{
536		struct ext4_extent_idx *chix, *ix;
537		int k;
538
539		chix = ix = EXT_FIRST_INDEX(eh);
540		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
541		  if (k != 0 &&
542		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
543				printk(KERN_DEBUG "k=%d, ix=0x%p, "
544				       "first=0x%p\n", k,
545				       ix, EXT_FIRST_INDEX(eh));
546				printk(KERN_DEBUG "%u <= %u\n",
547				       le32_to_cpu(ix->ei_block),
548				       le32_to_cpu(ix[-1].ei_block));
549			}
550			BUG_ON(k && le32_to_cpu(ix->ei_block)
551					   <= le32_to_cpu(ix[-1].ei_block));
552			if (block < le32_to_cpu(ix->ei_block))
553				break;
554			chix = ix;
555		}
556		BUG_ON(chix != path->p_idx);
557	}
558#endif
559
560}
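/*
 * Both this helper and ext4_ext_binsearch() below converge on the last
 * entry whose starting block is <= the requested block (falling back to
 * the first entry when the block precedes them all).  For example, with
 * index entries starting at logical blocks 0, 100 and 200, a lookup of
 * block 150 ends with l pointing at the 200 entry, so p_idx is set to
 * l - 1, i.e. the entry starting at 100.
 */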
561
562/*
563 * ext4_ext_binsearch:
564 * binary search for closest extent of the given block
565 * the header must be checked before calling this
566 */
567static void
568ext4_ext_binsearch(struct inode *inode,
569		struct ext4_ext_path *path, ext4_lblk_t block)
570{
571	struct ext4_extent_header *eh = path->p_hdr;
572	struct ext4_extent *r, *l, *m;
573
574	if (eh->eh_entries == 0) {
575		/*
576		 * this leaf is empty:
577		 * we get such a leaf in split/add case
578		 */
579		return;
580	}
581
582	ext_debug("binsearch for %u:  ", block);
583
584	l = EXT_FIRST_EXTENT(eh) + 1;
585	r = EXT_LAST_EXTENT(eh);
586
587	while (l <= r) {
588		m = l + (r - l) / 2;
589		if (block < le32_to_cpu(m->ee_block))
590			r = m - 1;
591		else
592			l = m + 1;
593		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
594				m, le32_to_cpu(m->ee_block),
595				r, le32_to_cpu(r->ee_block));
596	}
597
598	path->p_ext = l - 1;
599	ext_debug("  -> %d:%llu:[%d]%d ",
600			le32_to_cpu(path->p_ext->ee_block),
601			ext4_ext_pblock(path->p_ext),
602			ext4_ext_is_uninitialized(path->p_ext),
603			ext4_ext_get_actual_len(path->p_ext));
604
605#ifdef CHECK_BINSEARCH
606	{
607		struct ext4_extent *chex, *ex;
608		int k;
609
610		chex = ex = EXT_FIRST_EXTENT(eh);
611		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
612			BUG_ON(k && le32_to_cpu(ex->ee_block)
613					  <= le32_to_cpu(ex[-1].ee_block));
614			if (block < le32_to_cpu(ex->ee_block))
615				break;
616			chex = ex;
617		}
618		BUG_ON(chex != path->p_ext);
619	}
620#endif
621
622}
623
624int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
625{
626	struct ext4_extent_header *eh;
627
628	eh = ext_inode_hdr(inode);
629	eh->eh_depth = 0;
630	eh->eh_entries = 0;
631	eh->eh_magic = EXT4_EXT_MAGIC;
632	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
633	ext4_mark_inode_dirty(handle, inode);
634	ext4_ext_invalidate_cache(inode);
635	return 0;
636}
637
638struct ext4_ext_path *
639ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
640					struct ext4_ext_path *path)
641{
642	struct ext4_extent_header *eh;
643	struct buffer_head *bh;
644	short int depth, i, ppos = 0, alloc = 0;
645
646	eh = ext_inode_hdr(inode);
647	depth = ext_depth(inode);
648
649	/* account possible depth increase */
650	if (!path) {
651		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
652				GFP_NOFS);
653		if (!path)
654			return ERR_PTR(-ENOMEM);
655		alloc = 1;
656	}
657	path[0].p_hdr = eh;
658	path[0].p_bh = NULL;
659
660	i = depth;
661	/* walk through the tree */
662	while (i) {
663		int need_to_validate = 0;
664
665		ext_debug("depth %d: num %d, max %d\n",
666			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
667
668		ext4_ext_binsearch_idx(inode, path + ppos, block);
669		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
670		path[ppos].p_depth = i;
671		path[ppos].p_ext = NULL;
672
673		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
674		if (unlikely(!bh))
675			goto err;
676		if (!bh_uptodate_or_lock(bh)) {
677			trace_ext4_ext_load_extent(inode, block,
678						path[ppos].p_block);
679			if (bh_submit_read(bh) < 0) {
680				put_bh(bh);
681				goto err;
682			}
683			/* validate the extent entries */
684			need_to_validate = 1;
685		}
686		eh = ext_block_hdr(bh);
687		ppos++;
688		if (unlikely(ppos > depth)) {
689			put_bh(bh);
690			EXT4_ERROR_INODE(inode,
691					 "ppos %d > depth %d", ppos, depth);
692			goto err;
693		}
694		path[ppos].p_bh = bh;
695		path[ppos].p_hdr = eh;
696		i--;
697
698		if (need_to_validate && ext4_ext_check(inode, eh, i))
699			goto err;
700	}
701
702	path[ppos].p_depth = i;
703	path[ppos].p_ext = NULL;
704	path[ppos].p_idx = NULL;
705
706	/* find extent */
707	ext4_ext_binsearch(inode, path + ppos, block);
708	/* if not an empty leaf */
709	if (path[ppos].p_ext)
710		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
711
712	ext4_ext_show_path(inode, path);
713
714	return path;
715
716err:
717	ext4_ext_drop_refs(path);
718	if (alloc)
719		kfree(path);
720	return ERR_PTR(-EIO);
721}
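/*
 * Typical calling pattern (condensed sketch of what callers in this file,
 * e.g. ext4_ext_walk_space(), do; error handling trimmed):
 *
 *	path = ext4_ext_find_extent(inode, lblk, NULL);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);
 *	ex = path[ext_depth(inode)].p_ext;	// may be NULL for an empty leaf
 *	...
 *	ext4_ext_drop_refs(path);
 *	kfree(path);
 */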
722
723/*
724 * ext4_ext_insert_index:
725 * insert new index [@logical;@ptr] into the block at @curp;
726 * check where to insert: before @curp or after @curp
727 */
728static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
729				 struct ext4_ext_path *curp,
730				 int logical, ext4_fsblk_t ptr)
731{
732	struct ext4_extent_idx *ix;
733	int len, err;
734
735	err = ext4_ext_get_access(handle, inode, curp);
736	if (err)
737		return err;
738
739	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
740		EXT4_ERROR_INODE(inode,
741				 "logical %d == ei_block %d!",
742				 logical, le32_to_cpu(curp->p_idx->ei_block));
743		return -EIO;
744	}
745
746	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
747			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
748		EXT4_ERROR_INODE(inode,
749				 "eh_entries %d >= eh_max %d!",
750				 le16_to_cpu(curp->p_hdr->eh_entries),
751				 le16_to_cpu(curp->p_hdr->eh_max));
752		return -EIO;
753	}
754
755	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
756		/* insert after */
757		ext_debug("insert new index %d after: %llu\n", logical, ptr);
758		ix = curp->p_idx + 1;
759	} else {
760		/* insert before */
761		ext_debug("insert new index %d before: %llu\n", logical, ptr);
762		ix = curp->p_idx;
763	}
764
765	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
766	BUG_ON(len < 0);
767	if (len > 0) {
768		ext_debug("insert new index %d: "
769				"move %d indices from 0x%p to 0x%p\n",
770				logical, len, ix, ix + 1);
771		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
772	}
773
774	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
775		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
776		return -EIO;
777	}
778
779	ix->ei_block = cpu_to_le32(logical);
780	ext4_idx_store_pblock(ix, ptr);
781	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
782
783	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
784		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
785		return -EIO;
786	}
787
788	err = ext4_ext_dirty(handle, inode, curp);
789	ext4_std_error(inode->i_sb, err);
790
791	return err;
792}
793
794/*
795 * ext4_ext_split:
796 * inserts new subtree into the path, using free index entry
797 * at depth @at:
798 * - allocates all needed blocks (new leaf and all intermediate index blocks)
799 * - makes decision where to split
800 * - moves remaining extents and index entries (right to the split point)
801 *   into the newly allocated blocks
802 * - initializes subtree
803 */
804static int ext4_ext_split(handle_t *handle, struct inode *inode,
805			  unsigned int flags,
806			  struct ext4_ext_path *path,
807			  struct ext4_extent *newext, int at)
808{
809	struct buffer_head *bh = NULL;
810	int depth = ext_depth(inode);
811	struct ext4_extent_header *neh;
812	struct ext4_extent_idx *fidx;
813	int i = at, k, m, a;
814	ext4_fsblk_t newblock, oldblock;
815	__le32 border;
816	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
817	int err = 0;
818
819	/* make decision: where to split? */
820	/* FIXME: now decision is simplest: at current extent */
821
822	/* if current leaf will be split, then we should use
823	 * border from split point */
824	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
825		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
826		return -EIO;
827	}
828	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
829		border = path[depth].p_ext[1].ee_block;
830		ext_debug("leaf will be split."
831				" next leaf starts at %d\n",
832				  le32_to_cpu(border));
833	} else {
834		border = newext->ee_block;
835		ext_debug("leaf will be added."
836				" next leaf starts at %d\n",
837				le32_to_cpu(border));
838	}
839
840	/*
841	 * If an error occurs, we stop processing
842	 * and mark the filesystem read-only. The index won't
843	 * be inserted and the tree will stay in a consistent
844	 * state. The next mount will repair the buffers too.
845	 */
846
847	/*
848	 * Get an array to track all allocated blocks.
849	 * We need this to handle errors and to free the
850	 * blocks on failure.
851	 */
852	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
853	if (!ablocks)
854		return -ENOMEM;
855
856	/* allocate all needed blocks */
857	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
858	for (a = 0; a < depth - at; a++) {
859		newblock = ext4_ext_new_meta_block(handle, inode, path,
860						   newext, &err, flags);
861		if (newblock == 0)
862			goto cleanup;
863		ablocks[a] = newblock;
864	}
865
866	/* initialize new leaf */
867	newblock = ablocks[--a];
868	if (unlikely(newblock == 0)) {
869		EXT4_ERROR_INODE(inode, "newblock == 0!");
870		err = -EIO;
871		goto cleanup;
872	}
873	bh = sb_getblk(inode->i_sb, newblock);
874	if (!bh) {
875		err = -EIO;
876		goto cleanup;
877	}
878	lock_buffer(bh);
879
880	err = ext4_journal_get_create_access(handle, bh);
881	if (err)
882		goto cleanup;
883
884	neh = ext_block_hdr(bh);
885	neh->eh_entries = 0;
886	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
887	neh->eh_magic = EXT4_EXT_MAGIC;
888	neh->eh_depth = 0;
889
890	/* move remainder of path[depth] to the new leaf */
891	if (unlikely(path[depth].p_hdr->eh_entries !=
892		     path[depth].p_hdr->eh_max)) {
893		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
894				 path[depth].p_hdr->eh_entries,
895				 path[depth].p_hdr->eh_max);
896		err = -EIO;
897		goto cleanup;
898	}
899	/* start copy from next extent */
900	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
901	ext4_ext_show_move(inode, path, newblock, depth);
902	if (m) {
903		struct ext4_extent *ex;
904		ex = EXT_FIRST_EXTENT(neh);
905		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
906		le16_add_cpu(&neh->eh_entries, m);
907	}
908
909	set_buffer_uptodate(bh);
910	unlock_buffer(bh);
911
912	err = ext4_handle_dirty_metadata(handle, inode, bh);
913	if (err)
914		goto cleanup;
915	brelse(bh);
916	bh = NULL;
917
918	/* correct old leaf */
919	if (m) {
920		err = ext4_ext_get_access(handle, inode, path + depth);
921		if (err)
922			goto cleanup;
923		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
924		err = ext4_ext_dirty(handle, inode, path + depth);
925		if (err)
926			goto cleanup;
927
928	}
929
930	/* create intermediate indexes */
931	k = depth - at - 1;
932	if (unlikely(k < 0)) {
933		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
934		err = -EIO;
935		goto cleanup;
936	}
937	if (k)
938		ext_debug("create %d intermediate indices\n", k);
939	/* insert new index into current index block */
940	/* current depth stored in i var */
941	i = depth - 1;
942	while (k--) {
943		oldblock = newblock;
944		newblock = ablocks[--a];
945		bh = sb_getblk(inode->i_sb, newblock);
946		if (!bh) {
947			err = -EIO;
948			goto cleanup;
949		}
950		lock_buffer(bh);
951
952		err = ext4_journal_get_create_access(handle, bh);
953		if (err)
954			goto cleanup;
955
956		neh = ext_block_hdr(bh);
957		neh->eh_entries = cpu_to_le16(1);
958		neh->eh_magic = EXT4_EXT_MAGIC;
959		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
960		neh->eh_depth = cpu_to_le16(depth - i);
961		fidx = EXT_FIRST_INDEX(neh);
962		fidx->ei_block = border;
963		ext4_idx_store_pblock(fidx, oldblock);
964
965		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
966				i, newblock, le32_to_cpu(border), oldblock);
967
968		/* move remainder of path[i] to the new index block */
969		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
970					EXT_LAST_INDEX(path[i].p_hdr))) {
971			EXT4_ERROR_INODE(inode,
972					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
973					 le32_to_cpu(path[i].p_ext->ee_block));
974			err = -EIO;
975			goto cleanup;
976		}
977		/* start copy indexes */
978		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
979		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
980				EXT_MAX_INDEX(path[i].p_hdr));
981		ext4_ext_show_move(inode, path, newblock, i);
982		if (m) {
983			memmove(++fidx, path[i].p_idx,
984				sizeof(struct ext4_extent_idx) * m);
985			le16_add_cpu(&neh->eh_entries, m);
986		}
987		set_buffer_uptodate(bh);
988		unlock_buffer(bh);
989
990		err = ext4_handle_dirty_metadata(handle, inode, bh);
991		if (err)
992			goto cleanup;
993		brelse(bh);
994		bh = NULL;
995
996		/* correct old index */
997		if (m) {
998			err = ext4_ext_get_access(handle, inode, path + i);
999			if (err)
1000				goto cleanup;
1001			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1002			err = ext4_ext_dirty(handle, inode, path + i);
1003			if (err)
1004				goto cleanup;
1005		}
1006
1007		i--;
1008	}
1009
1010	/* insert new index */
1011	err = ext4_ext_insert_index(handle, inode, path + at,
1012				    le32_to_cpu(border), newblock);
1013
1014cleanup:
1015	if (bh) {
1016		if (buffer_locked(bh))
1017			unlock_buffer(bh);
1018		brelse(bh);
1019	}
1020
1021	if (err) {
1022		/* free all allocated blocks in error case */
1023		for (i = 0; i < depth; i++) {
1024			if (!ablocks[i])
1025				continue;
1026			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1027					 EXT4_FREE_BLOCKS_METADATA);
1028		}
1029	}
1030	kfree(ablocks);
1031
1032	return err;
1033}
1034
1035/*
1036 * ext4_ext_grow_indepth:
1037 * implements tree growing procedure:
1038 * - allocates new block
1039 * - moves top-level data (index block or leaf) into the new block
1040 * - initializes new top-level, creating index that points to the
1041 *   just created block
1042 */
1043static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1044				 unsigned int flags,
1045				 struct ext4_extent *newext)
1046{
1047	struct ext4_extent_header *neh;
1048	struct buffer_head *bh;
1049	ext4_fsblk_t newblock;
1050	int err = 0;
1051
1052	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1053		newext, &err, flags);
1054	if (newblock == 0)
1055		return err;
1056
1057	bh = sb_getblk(inode->i_sb, newblock);
1058	if (!bh) {
1059		err = -EIO;
1060		ext4_std_error(inode->i_sb, err);
1061		return err;
1062	}
1063	lock_buffer(bh);
1064
1065	err = ext4_journal_get_create_access(handle, bh);
1066	if (err) {
1067		unlock_buffer(bh);
1068		goto out;
1069	}
1070
1071	/* move top-level index/leaf into new block */
1072	memmove(bh->b_data, EXT4_I(inode)->i_data,
1073		sizeof(EXT4_I(inode)->i_data));
1074
1075	/* set size of new block */
1076	neh = ext_block_hdr(bh);
1077	/* old root could have indexes or leaves,
1078	 * so calculate eh_max the right way */
1079	if (ext_depth(inode))
1080		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1081	else
1082		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1083	neh->eh_magic = EXT4_EXT_MAGIC;
1084	set_buffer_uptodate(bh);
1085	unlock_buffer(bh);
1086
1087	err = ext4_handle_dirty_metadata(handle, inode, bh);
1088	if (err)
1089		goto out;
1090
1091	/* Update top-level index: num,max,pointer */
1092	neh = ext_inode_hdr(inode);
1093	neh->eh_entries = cpu_to_le16(1);
1094	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1095	if (neh->eh_depth == 0) {
1096		/* Root extent block becomes index block */
1097		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1098		EXT_FIRST_INDEX(neh)->ei_block =
1099			EXT_FIRST_EXTENT(neh)->ee_block;
1100	}
1101	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1102		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1103		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1104		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1105
1106	neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);
1107	ext4_mark_inode_dirty(handle, inode);
1108out:
1109	brelse(bh);
1110
1111	return err;
1112}
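/*
 * Concretely (with a 4 KiB block size and the 60-byte i_data root): growing
 * from depth 0 to 1 copies the at-most-4-entry root leaf out of i_data into
 * a freshly allocated block that can hold ~340 extents, and leaves a single
 * index entry in i_data pointing at that block.  Each later call adds one
 * more index level on top in the same way.
 */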
1113
1114/*
1115 * ext4_ext_create_new_leaf:
1116 * finds empty index and adds new leaf.
1117 * if no free index is found, then it requests in-depth growing.
1118 */
1119static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1120				    unsigned int flags,
1121				    struct ext4_ext_path *path,
1122				    struct ext4_extent *newext)
1123{
1124	struct ext4_ext_path *curp;
1125	int depth, i, err = 0;
1126
1127repeat:
1128	i = depth = ext_depth(inode);
1129
1130	/* walk up the tree and look for a free index entry */
1131	curp = path + depth;
1132	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1133		i--;
1134		curp--;
1135	}
1136
1137	/* we use already allocated block for index block,
1138	 * so subsequent data blocks should be contiguous */
1139	if (EXT_HAS_FREE_INDEX(curp)) {
1140		/* if we found index with free entry, then use that
1141		 * entry: create all needed subtree and add new leaf */
1142		err = ext4_ext_split(handle, inode, flags, path, newext, i);
1143		if (err)
1144			goto out;
1145
1146		/* refill path */
1147		ext4_ext_drop_refs(path);
1148		path = ext4_ext_find_extent(inode,
1149				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1150				    path);
1151		if (IS_ERR(path))
1152			err = PTR_ERR(path);
1153	} else {
1154		/* tree is full, time to grow in depth */
1155		err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1156		if (err)
1157			goto out;
1158
1159		/* refill path */
1160		ext4_ext_drop_refs(path);
1161		path = ext4_ext_find_extent(inode,
1162				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1163				    path);
1164		if (IS_ERR(path)) {
1165			err = PTR_ERR(path);
1166			goto out;
1167		}
1168
1169		/*
1170		 * only first (depth 0 -> 1) produces free space;
1171		 * in all other cases we have to split the grown tree
1172		 */
1173		depth = ext_depth(inode);
1174		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1175			/* now we need to split */
1176			goto repeat;
1177		}
1178	}
1179
1180out:
1181	return err;
1182}
1183
1184/*
1185 * Search for the closest allocated block to the left of *logical,
1186 * returning it at @logical together with its physical address at @phys.
1187 * If *logical is the smallest allocated block, the function
1188 * returns 0 at @phys.
1189 * The return value is 0 (success) or an error code.
1190 */
1191static int ext4_ext_search_left(struct inode *inode,
1192				struct ext4_ext_path *path,
1193				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1194{
1195	struct ext4_extent_idx *ix;
1196	struct ext4_extent *ex;
1197	int depth, ee_len;
1198
1199	if (unlikely(path == NULL)) {
1200		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1201		return -EIO;
1202	}
1203	depth = path->p_depth;
1204	*phys = 0;
1205
1206	if (depth == 0 && path->p_ext == NULL)
1207		return 0;
1208
1209	/* usually extent in the path covers blocks smaller
1210	 * than *logical, but it can be that extent is the
1211	 * first one in the file */
1212
1213	ex = path[depth].p_ext;
1214	ee_len = ext4_ext_get_actual_len(ex);
1215	if (*logical < le32_to_cpu(ex->ee_block)) {
1216		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1217			EXT4_ERROR_INODE(inode,
1218					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1219					 *logical, le32_to_cpu(ex->ee_block));
1220			return -EIO;
1221		}
1222		while (--depth >= 0) {
1223			ix = path[depth].p_idx;
1224			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1225				EXT4_ERROR_INODE(inode,
1226				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1227				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1228				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1229		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1230				  depth);
1231				return -EIO;
1232			}
1233		}
1234		return 0;
1235	}
1236
1237	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1238		EXT4_ERROR_INODE(inode,
1239				 "logical %d < ee_block %d + ee_len %d!",
1240				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1241		return -EIO;
1242	}
1243
1244	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1245	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1246	return 0;
1247}
1248
1249/*
1250 * Search for the closest allocated block to the right of *logical,
1251 * returning it at @logical together with its physical address at @phys.
1252 * If *logical is the largest allocated block, the function
1253 * returns 0 at @phys.
1254 * The return value is 0 (success) or an error code.
1255 */
1256static int ext4_ext_search_right(struct inode *inode,
1257				 struct ext4_ext_path *path,
1258				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1259				 struct ext4_extent **ret_ex)
1260{
1261	struct buffer_head *bh = NULL;
1262	struct ext4_extent_header *eh;
1263	struct ext4_extent_idx *ix;
1264	struct ext4_extent *ex;
1265	ext4_fsblk_t block;
1266	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1267	int ee_len;
1268
1269	if (unlikely(path == NULL)) {
1270		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1271		return -EIO;
1272	}
1273	depth = path->p_depth;
1274	*phys = 0;
1275
1276	if (depth == 0 && path->p_ext == NULL)
1277		return 0;
1278
1279	/* usually extent in the path covers blocks smaller
1280	 * than *logical, but it can be that extent is the
1281	 * first one in the file */
1282
1283	ex = path[depth].p_ext;
1284	ee_len = ext4_ext_get_actual_len(ex);
1285	if (*logical < le32_to_cpu(ex->ee_block)) {
1286		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1287			EXT4_ERROR_INODE(inode,
1288					 "first_extent(path[%d].p_hdr) != ex",
1289					 depth);
1290			return -EIO;
1291		}
1292		while (--depth >= 0) {
1293			ix = path[depth].p_idx;
1294			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1295				EXT4_ERROR_INODE(inode,
1296						 "ix != EXT_FIRST_INDEX *logical %d!",
1297						 *logical);
1298				return -EIO;
1299			}
1300		}
1301		goto found_extent;
1302	}
1303
1304	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1305		EXT4_ERROR_INODE(inode,
1306				 "logical %d < ee_block %d + ee_len %d!",
1307				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1308		return -EIO;
1309	}
1310
1311	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1312		/* next allocated block in this leaf */
1313		ex++;
1314		goto found_extent;
1315	}
1316
1317	/* go up and search for index to the right */
1318	while (--depth >= 0) {
1319		ix = path[depth].p_idx;
1320		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1321			goto got_index;
1322	}
1323
1324	/* we've gone up to the root and found no index to the right */
1325	return 0;
1326
1327got_index:
1328	/* we've found index to the right, let's
1329	 * follow it and find the closest allocated
1330	 * block to the right */
1331	ix++;
1332	block = ext4_idx_pblock(ix);
1333	while (++depth < path->p_depth) {
1334		bh = sb_bread(inode->i_sb, block);
1335		if (bh == NULL)
1336			return -EIO;
1337		eh = ext_block_hdr(bh);
1338		/* subtract from p_depth to get proper eh_depth */
1339		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1340			put_bh(bh);
1341			return -EIO;
1342		}
1343		ix = EXT_FIRST_INDEX(eh);
1344		block = ext4_idx_pblock(ix);
1345		put_bh(bh);
1346	}
1347
1348	bh = sb_bread(inode->i_sb, block);
1349	if (bh == NULL)
1350		return -EIO;
1351	eh = ext_block_hdr(bh);
1352	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1353		put_bh(bh);
1354		return -EIO;
1355	}
1356	ex = EXT_FIRST_EXTENT(eh);
1357found_extent:
1358	*logical = le32_to_cpu(ex->ee_block);
1359	*phys = ext4_ext_pblock(ex);
1360	*ret_ex = ex;
1361	if (bh)
1362		put_bh(bh);
1363	return 0;
1364}
1365
1366/*
1367 * ext4_ext_next_allocated_block:
1368 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1369 * NOTE: it considers block number from index entry as
1370 * allocated block. Thus, index entries have to be consistent
1371 * with leaves.
1372 */
1373static ext4_lblk_t
1374ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1375{
1376	int depth;
1377
1378	BUG_ON(path == NULL);
1379	depth = path->p_depth;
1380
1381	if (depth == 0 && path->p_ext == NULL)
1382		return EXT_MAX_BLOCKS;
1383
1384	while (depth >= 0) {
1385		if (depth == path->p_depth) {
1386			/* leaf */
1387			if (path[depth].p_ext &&
1388				path[depth].p_ext !=
1389					EXT_LAST_EXTENT(path[depth].p_hdr))
1390			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1391		} else {
1392			/* index */
1393			if (path[depth].p_idx !=
1394					EXT_LAST_INDEX(path[depth].p_hdr))
1395			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1396		}
1397		depth--;
1398	}
1399
1400	return EXT_MAX_BLOCKS;
1401}
1402
1403/*
1404 * ext4_ext_next_leaf_block:
1405 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1406 */
1407static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1408{
1409	int depth;
1410
1411	BUG_ON(path == NULL);
1412	depth = path->p_depth;
1413
1414	/* zero-tree has no leaf blocks at all */
1415	if (depth == 0)
1416		return EXT_MAX_BLOCKS;
1417
1418	/* go to index block */
1419	depth--;
1420
1421	while (depth >= 0) {
1422		if (path[depth].p_idx !=
1423				EXT_LAST_INDEX(path[depth].p_hdr))
1424			return (ext4_lblk_t)
1425				le32_to_cpu(path[depth].p_idx[1].ei_block);
1426		depth--;
1427	}
1428
1429	return EXT_MAX_BLOCKS;
1430}
1431
1432/*
1433 * ext4_ext_correct_indexes:
1434 * if leaf gets modified and modified extent is first in the leaf,
1435 * then we have to correct all indexes above.
1436 * TODO: do we need to correct tree in all cases?
1437 */
1438static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1439				struct ext4_ext_path *path)
1440{
1441	struct ext4_extent_header *eh;
1442	int depth = ext_depth(inode);
1443	struct ext4_extent *ex;
1444	__le32 border;
1445	int k, err = 0;
1446
1447	eh = path[depth].p_hdr;
1448	ex = path[depth].p_ext;
1449
1450	if (unlikely(ex == NULL || eh == NULL)) {
1451		EXT4_ERROR_INODE(inode,
1452				 "ex %p == NULL or eh %p == NULL", ex, eh);
1453		return -EIO;
1454	}
1455
1456	if (depth == 0) {
1457		/* there is no tree at all */
1458		return 0;
1459	}
1460
1461	if (ex != EXT_FIRST_EXTENT(eh)) {
1462		/* we only correct the tree if the first extent in the leaf was modified */
1463		return 0;
1464	}
1465
1466	/*
1467	 * TODO: we need correction if border is smaller than current one
1468	 */
1469	k = depth - 1;
1470	border = path[depth].p_ext->ee_block;
1471	err = ext4_ext_get_access(handle, inode, path + k);
1472	if (err)
1473		return err;
1474	path[k].p_idx->ei_block = border;
1475	err = ext4_ext_dirty(handle, inode, path + k);
1476	if (err)
1477		return err;
1478
1479	while (k--) {
1480		/* change all left-side indexes */
1481		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1482			break;
1483		err = ext4_ext_get_access(handle, inode, path + k);
1484		if (err)
1485			break;
1486		path[k].p_idx->ei_block = border;
1487		err = ext4_ext_dirty(handle, inode, path + k);
1488		if (err)
1489			break;
1490	}
1491
1492	return err;
1493}
1494
1495int
1496ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1497				struct ext4_extent *ex2)
1498{
1499	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1500
1501	/*
1502	 * Make sure that either both extents are uninitialized, or
1503	 * both are _not_.
1504	 */
1505	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1506		return 0;
1507
1508	if (ext4_ext_is_uninitialized(ex1))
1509		max_len = EXT_UNINIT_MAX_LEN;
1510	else
1511		max_len = EXT_INIT_MAX_LEN;
1512
1513	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1514	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1515
1516	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1517			le32_to_cpu(ex2->ee_block))
1518		return 0;
1519
1520	/*
1521	 * To allow future support for preallocated extents to be added
1522 * as an RO_COMPAT feature, refuse to merge two extents if
1523	 * this can result in the top bit of ee_len being set.
1524	 */
1525	if (ext1_ee_len + ext2_ee_len > max_len)
1526		return 0;
1527#ifdef AGGRESSIVE_TEST
1528	if (ext1_ee_len >= 4)
1529		return 0;
1530#endif
1531
1532	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1533		return 1;
1534	return 0;
1535}
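/*
 * For example: extents [ee_block=100, len=10, pblk=500] and
 * [ee_block=110, len=5, pblk=510] are both logically and physically
 * contiguous, so they can be merged into [100, len=15, pblk=500], as long
 * as both share the same (un)initialized state and the combined length
 * stays within the EXT_INIT_MAX_LEN / EXT_UNINIT_MAX_LEN limit checked
 * above.
 */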
1536
1537/*
1538 * This function tries to merge the "ex" extent to the next extent in the tree.
1539 * It always tries to merge towards right. If you want to merge towards
1540 * left, pass "ex - 1" as argument instead of "ex".
1541 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1542 * 1 if they got merged.
1543 */
1544static int ext4_ext_try_to_merge_right(struct inode *inode,
1545				 struct ext4_ext_path *path,
1546				 struct ext4_extent *ex)
1547{
1548	struct ext4_extent_header *eh;
1549	unsigned int depth, len;
1550	int merge_done = 0;
1551	int uninitialized = 0;
1552
1553	depth = ext_depth(inode);
1554	BUG_ON(path[depth].p_hdr == NULL);
1555	eh = path[depth].p_hdr;
1556
1557	while (ex < EXT_LAST_EXTENT(eh)) {
1558		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1559			break;
1560		/* merge with next extent! */
1561		if (ext4_ext_is_uninitialized(ex))
1562			uninitialized = 1;
1563		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1564				+ ext4_ext_get_actual_len(ex + 1));
1565		if (uninitialized)
1566			ext4_ext_mark_uninitialized(ex);
1567
1568		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1569			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1570				* sizeof(struct ext4_extent);
1571			memmove(ex + 1, ex + 2, len);
1572		}
1573		le16_add_cpu(&eh->eh_entries, -1);
1574		merge_done = 1;
1575		WARN_ON(eh->eh_entries == 0);
1576		if (!eh->eh_entries)
1577			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1578	}
1579
1580	return merge_done;
1581}
1582
1583/*
1584 * This function tries to merge the @ex extent with its neighbours in the tree.
1585 * Returns 1 if @ex absorbed the extent to its right (and no left merge occurred), 0 otherwise.
1586 */
1587static int ext4_ext_try_to_merge(struct inode *inode,
1588				  struct ext4_ext_path *path,
1589				  struct ext4_extent *ex) {
1590	struct ext4_extent_header *eh;
1591	unsigned int depth;
1592	int merge_done = 0;
1593	int ret = 0;
1594
1595	depth = ext_depth(inode);
1596	BUG_ON(path[depth].p_hdr == NULL);
1597	eh = path[depth].p_hdr;
1598
1599	if (ex > EXT_FIRST_EXTENT(eh))
1600		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1601
1602	if (!merge_done)
1603		ret = ext4_ext_try_to_merge_right(inode, path, ex);
1604
1605	return ret;
1606}
1607
1608/*
1609 * check if a portion of the "newext" extent overlaps with an
1610 * existing extent.
1611 *
1612 * If there is an overlap discovered, it updates the length of the newext
1613 * such that there will be no overlap, and then returns 1.
1614 * If there is no overlap found, it returns 0.
1615 */
1616static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1617					   struct inode *inode,
1618					   struct ext4_extent *newext,
1619					   struct ext4_ext_path *path)
1620{
1621	ext4_lblk_t b1, b2;
1622	unsigned int depth, len1;
1623	unsigned int ret = 0;
1624
1625	b1 = le32_to_cpu(newext->ee_block);
1626	len1 = ext4_ext_get_actual_len(newext);
1627	depth = ext_depth(inode);
1628	if (!path[depth].p_ext)
1629		goto out;
1630	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1631	b2 &= ~(sbi->s_cluster_ratio - 1);
1632
1633	/*
1634	 * get the next allocated block if the extent in the path
1635	 * is before the requested block(s)
1636	 */
1637	if (b2 < b1) {
1638		b2 = ext4_ext_next_allocated_block(path);
1639		if (b2 == EXT_MAX_BLOCKS)
1640			goto out;
1641		b2 &= ~(sbi->s_cluster_ratio - 1);
1642	}
1643
1644	/* check for wrap through zero on extent logical start block*/
1645	if (b1 + len1 < b1) {
1646		len1 = EXT_MAX_BLOCKS - b1;
1647		newext->ee_len = cpu_to_le16(len1);
1648		ret = 1;
1649	}
1650
1651	/* check for overlap */
1652	if (b1 + len1 > b2) {
1653		newext->ee_len = cpu_to_le16(b2 - b1);
1654		ret = 1;
1655	}
1656out:
1657	return ret;
1658}
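/*
 * For example (ignoring cluster rounding, i.e. s_cluster_ratio == 1): if
 * newext covers logical blocks 100..149 and the next allocated extent
 * starts at block 120, the function trims newext->ee_len to 20 so that
 * newext now covers 100..119, and returns 1.
 */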
1659
1660/*
1661 * ext4_ext_insert_extent:
1662 * tries to merge the requested extent into the existing extent or
1663 * inserts the requested extent as a new one into the tree,
1664 * creating new leaf in the no-space case.
1665 */
1666int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1667				struct ext4_ext_path *path,
1668				struct ext4_extent *newext, int flag)
1669{
1670	struct ext4_extent_header *eh;
1671	struct ext4_extent *ex, *fex;
1672	struct ext4_extent *nearex; /* nearest extent */
1673	struct ext4_ext_path *npath = NULL;
1674	int depth, len, err;
1675	ext4_lblk_t next;
1676	unsigned uninitialized = 0;
1677	int flags = 0;
1678
1679	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1680		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1681		return -EIO;
1682	}
1683	depth = ext_depth(inode);
1684	ex = path[depth].p_ext;
1685	if (unlikely(path[depth].p_hdr == NULL)) {
1686		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1687		return -EIO;
1688	}
1689
1690	/* try to insert block into found extent and return */
1691	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1692		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1693		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1694			  ext4_ext_is_uninitialized(newext),
1695			  ext4_ext_get_actual_len(newext),
1696			  le32_to_cpu(ex->ee_block),
1697			  ext4_ext_is_uninitialized(ex),
1698			  ext4_ext_get_actual_len(ex),
1699			  ext4_ext_pblock(ex));
1700		err = ext4_ext_get_access(handle, inode, path + depth);
1701		if (err)
1702			return err;
1703
1704		/*
1705		 * ext4_can_extents_be_merged should have checked that either
1706		 * both extents are uninitialized, or both aren't. Thus we
1707		 * need to check only one of them here.
1708		 */
1709		if (ext4_ext_is_uninitialized(ex))
1710			uninitialized = 1;
1711		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1712					+ ext4_ext_get_actual_len(newext));
1713		if (uninitialized)
1714			ext4_ext_mark_uninitialized(ex);
1715		eh = path[depth].p_hdr;
1716		nearex = ex;
1717		goto merge;
1718	}
1719
1720	depth = ext_depth(inode);
1721	eh = path[depth].p_hdr;
1722	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1723		goto has_space;
1724
1725	/* probably next leaf has space for us? */
1726	fex = EXT_LAST_EXTENT(eh);
1727	next = EXT_MAX_BLOCKS;
1728	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1729		next = ext4_ext_next_leaf_block(path);
1730	if (next != EXT_MAX_BLOCKS) {
1731		ext_debug("next leaf block - %d\n", next);
1732		BUG_ON(npath != NULL);
1733		npath = ext4_ext_find_extent(inode, next, NULL);
1734		if (IS_ERR(npath))
1735			return PTR_ERR(npath);
1736		BUG_ON(npath->p_depth != path->p_depth);
1737		eh = npath[depth].p_hdr;
1738		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1739			ext_debug("next leaf isn't full(%d)\n",
1740				  le16_to_cpu(eh->eh_entries));
1741			path = npath;
1742			goto has_space;
1743		}
1744		ext_debug("next leaf has no free space(%d,%d)\n",
1745			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1746	}
1747
1748	/*
1749	 * There is no free space in the found leaf.
1750	 * We're gonna add a new leaf in the tree.
1751	 */
1752	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1753		flags = EXT4_MB_USE_ROOT_BLOCKS;
1754	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1755	if (err)
1756		goto cleanup;
1757	depth = ext_depth(inode);
1758	eh = path[depth].p_hdr;
1759
1760has_space:
1761	nearex = path[depth].p_ext;
1762
1763	err = ext4_ext_get_access(handle, inode, path + depth);
1764	if (err)
1765		goto cleanup;
1766
1767	if (!nearex) {
1768		/* there is no extent in this leaf, create first one */
1769		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1770				le32_to_cpu(newext->ee_block),
1771				ext4_ext_pblock(newext),
1772				ext4_ext_is_uninitialized(newext),
1773				ext4_ext_get_actual_len(newext));
1774		nearex = EXT_FIRST_EXTENT(eh);
1775	} else {
1776		if (le32_to_cpu(newext->ee_block)
1777			   > le32_to_cpu(nearex->ee_block)) {
1778			/* Insert after */
1779			ext_debug("insert %d:%llu:[%d]%d before: "
1780					"nearest 0x%p\n",
1781					le32_to_cpu(newext->ee_block),
1782					ext4_ext_pblock(newext),
1783					ext4_ext_is_uninitialized(newext),
1784					ext4_ext_get_actual_len(newext),
1785					nearex);
1786			nearex++;
1787		} else {
1788			/* Insert before */
1789			BUG_ON(newext->ee_block == nearex->ee_block);
1790			ext_debug("insert %d:%llu:[%d]%d after: "
1791					"nearest 0x%p\n",
1792					le32_to_cpu(newext->ee_block),
1793					ext4_ext_pblock(newext),
1794					ext4_ext_is_uninitialized(newext),
1795					ext4_ext_get_actual_len(newext),
1796					nearex);
1797		}
1798		len = EXT_LAST_EXTENT(eh) - nearex + 1;
1799		if (len > 0) {
1800			ext_debug("insert %d:%llu:[%d]%d: "
1801					"move %d extents from 0x%p to 0x%p\n",
1802					le32_to_cpu(newext->ee_block),
1803					ext4_ext_pblock(newext),
1804					ext4_ext_is_uninitialized(newext),
1805					ext4_ext_get_actual_len(newext),
1806					len, nearex, nearex + 1);
1807			memmove(nearex + 1, nearex,
1808				len * sizeof(struct ext4_extent));
1809		}
1810	}
1811
1812	le16_add_cpu(&eh->eh_entries, 1);
1813	path[depth].p_ext = nearex;
1814	nearex->ee_block = newext->ee_block;
1815	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1816	nearex->ee_len = newext->ee_len;
1817
1818merge:
1819	/* try to merge extents to the right */
1820	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1821		ext4_ext_try_to_merge(inode, path, nearex);
1822
1823	/* try to merge extents to the left */
1824
1825	/* time to correct all indexes above */
1826	err = ext4_ext_correct_indexes(handle, inode, path);
1827	if (err)
1828		goto cleanup;
1829
1830	err = ext4_ext_dirty(handle, inode, path + depth);
1831
1832cleanup:
1833	if (npath) {
1834		ext4_ext_drop_refs(npath);
1835		kfree(npath);
1836	}
1837	ext4_ext_invalidate_cache(inode);
1838	return err;
1839}
1840
1841static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1842			       ext4_lblk_t num, ext_prepare_callback func,
1843			       void *cbdata)
1844{
1845	struct ext4_ext_path *path = NULL;
1846	struct ext4_ext_cache cbex;
1847	struct ext4_extent *ex;
1848	ext4_lblk_t next, start = 0, end = 0;
1849	ext4_lblk_t last = block + num;
1850	int depth, exists, err = 0;
1851
1852	BUG_ON(func == NULL);
1853	BUG_ON(inode == NULL);
1854
1855	while (block < last && block != EXT_MAX_BLOCKS) {
1856		num = last - block;
1857		/* find extent for this block */
1858		down_read(&EXT4_I(inode)->i_data_sem);
1859		path = ext4_ext_find_extent(inode, block, path);
1860		up_read(&EXT4_I(inode)->i_data_sem);
1861		if (IS_ERR(path)) {
1862			err = PTR_ERR(path);
1863			path = NULL;
1864			break;
1865		}
1866
1867		depth = ext_depth(inode);
1868		if (unlikely(path[depth].p_hdr == NULL)) {
1869			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1870			err = -EIO;
1871			break;
1872		}
1873		ex = path[depth].p_ext;
1874		next = ext4_ext_next_allocated_block(path);
1875
1876		exists = 0;
1877		if (!ex) {
1878			/* there is no extent yet, so try to allocate
1879			 * all requested space */
1880			start = block;
1881			end = block + num;
1882		} else if (le32_to_cpu(ex->ee_block) > block) {
1883			/* need to allocate space before found extent */
1884			start = block;
1885			end = le32_to_cpu(ex->ee_block);
1886			if (block + num < end)
1887				end = block + num;
1888		} else if (block >= le32_to_cpu(ex->ee_block)
1889					+ ext4_ext_get_actual_len(ex)) {
1890			/* need to allocate space after found extent */
1891			start = block;
1892			end = block + num;
1893			if (end >= next)
1894				end = next;
1895		} else if (block >= le32_to_cpu(ex->ee_block)) {
1896			/*
1897			 * some part of requested space is covered
1898			 * by found extent
1899			 */
1900			start = block;
1901			end = le32_to_cpu(ex->ee_block)
1902				+ ext4_ext_get_actual_len(ex);
1903			if (block + num < end)
1904				end = block + num;
1905			exists = 1;
1906		} else {
1907			BUG();
1908		}
1909		BUG_ON(end <= start);
1910
1911		if (!exists) {
1912			cbex.ec_block = start;
1913			cbex.ec_len = end - start;
1914			cbex.ec_start = 0;
1915		} else {
1916			cbex.ec_block = le32_to_cpu(ex->ee_block);
1917			cbex.ec_len = ext4_ext_get_actual_len(ex);
1918			cbex.ec_start = ext4_ext_pblock(ex);
1919		}
1920
1921		if (unlikely(cbex.ec_len == 0)) {
1922			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1923			err = -EIO;
1924			break;
1925		}
1926		err = func(inode, next, &cbex, ex, cbdata);
1927		ext4_ext_drop_refs(path);
1928
1929		if (err < 0)
1930			break;
1931
1932		if (err == EXT_REPEAT)
1933			continue;
1934		else if (err == EXT_BREAK) {
1935			err = 0;
1936			break;
1937		}
1938
1939		if (ext_depth(inode) != depth) {
1940			/* depth was changed. we have to realloc path */
1941			kfree(path);
1942			path = NULL;
1943		}
1944
1945		block = cbex.ec_block + cbex.ec_len;
1946	}
1947
1948	if (path) {
1949		ext4_ext_drop_refs(path);
1950		kfree(path);
1951	}
1952
1953	return err;
1954}
1955
1956static void
1957ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1958			__u32 len, ext4_fsblk_t start)
1959{
1960	struct ext4_ext_cache *cex;
1961	BUG_ON(len == 0);
1962	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1963	trace_ext4_ext_put_in_cache(inode, block, len, start);
1964	cex = &EXT4_I(inode)->i_cached_extent;
1965	cex->ec_block = block;
1966	cex->ec_len = len;
1967	cex->ec_start = start;
1968	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1969}
1970
1971/*
1972 * ext4_ext_put_gap_in_cache:
1973 * calculate boundaries of the gap that the requested block fits into
1974 * and cache this gap
1975 */
1976static void
1977ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1978				ext4_lblk_t block)
1979{
1980	int depth = ext_depth(inode);
1981	unsigned long len;
1982	ext4_lblk_t lblock;
1983	struct ext4_extent *ex;
1984
1985	ex = path[depth].p_ext;
1986	if (ex == NULL) {
1987		/* there is no extent yet, so gap is [0;-] */
1988		lblock = 0;
1989		len = EXT_MAX_BLOCKS;
1990		ext_debug("cache gap(whole file):");
1991	} else if (block < le32_to_cpu(ex->ee_block)) {
1992		lblock = block;
1993		len = le32_to_cpu(ex->ee_block) - block;
1994		ext_debug("cache gap(before): %u [%u:%u]",
1995				block,
1996				le32_to_cpu(ex->ee_block),
1997				 ext4_ext_get_actual_len(ex));
1998	} else if (block >= le32_to_cpu(ex->ee_block)
1999			+ ext4_ext_get_actual_len(ex)) {
2000		ext4_lblk_t next;
2001		lblock = le32_to_cpu(ex->ee_block)
2002			+ ext4_ext_get_actual_len(ex);
2003
2004		next = ext4_ext_next_allocated_block(path);
2005		ext_debug("cache gap(after): [%u:%u] %u",
2006				le32_to_cpu(ex->ee_block),
2007				ext4_ext_get_actual_len(ex),
2008				block);
2009		BUG_ON(next == lblock);
2010		len = next - lblock;
2011	} else {
2012		lblock = len = 0;
2013		BUG();
2014	}
2015
2016	ext_debug(" -> %u:%lu\n", lblock, len);
2017	ext4_ext_put_in_cache(inode, lblock, len, 0);
2018}
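
/*
 * Illustrative example (not part of the original file): how the gap
 * boundaries above work out for a leaf whose only extent maps logical
 * blocks [100, 107].  All numbers are hypothetical.
 */
#if 0
static void example_gap_arithmetic(void)
{
	/* A lookup at block 40, before the extent, caches the gap from the
	 * requested block up to the extent: lblock = 40, len = 100 - 40. */
	ext4_lblk_t lblock_before = 40;
	unsigned long len_before = 100 - 40;

	/* A lookup at block 200, after the extent with no further extent,
	 * caches the gap up to EXT_MAX_BLOCKS: lblock = 108. */
	ext4_lblk_t lblock_after = 100 + 8;
	unsigned long len_after = EXT_MAX_BLOCKS - lblock_after;

	(void)lblock_before; (void)len_before;
	(void)lblock_after; (void)len_after;
}
#endif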
2019
2020/*
2021 * ext4_ext_check_cache()
2022 * Checks to see if the given block is in the cache.
2023 * If it is, the cached extent is stored in the given
2024 * cache extent pointer.  If the cached extent is a hole,
2025 * this routine should be used instead of
2026 * ext4_ext_in_cache if the calling function needs to
2027 * know the size of the hole.
2028 *
2029 * @inode: The file's inode
2030 * @block: The block to look for in the cache
2031 * @ex:    Pointer where the cached extent will be stored
2032 *         if it contains block
2033 *
2034 * Return 0 if cache is invalid; 1 if the cache is valid
2035 */
2036static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2037	struct ext4_ext_cache *ex)
{
2038	struct ext4_ext_cache *cex;
2039	struct ext4_sb_info *sbi;
2040	int ret = 0;
2041
2042	/*
2043	 * We borrow i_block_reservation_lock to protect i_cached_extent
2044	 */
2045	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2046	cex = &EXT4_I(inode)->i_cached_extent;
2047	sbi = EXT4_SB(inode->i_sb);
2048
2049	/* has cache valid data? */
2050	if (cex->ec_len == 0)
2051		goto errout;
2052
2053	if (in_range(block, cex->ec_block, cex->ec_len)) {
2054		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2055		ext_debug("%u cached by %u:%u:%llu\n",
2056				block,
2057				cex->ec_block, cex->ec_len, cex->ec_start);
2058		ret = 1;
2059	}
2060errout:
2061	if (!ret)
2062		sbi->extent_cache_misses++;
2063	else
2064		sbi->extent_cache_hits++;
2065	trace_ext4_ext_in_cache(inode, block, ret);
2066	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2067	return ret;
2068}
2069
2070/*
2071 * ext4_ext_in_cache()
2072 * Checks to see if the given block is in the cache.
2073 * If it is, the cached extent is stored in the given
2074 * extent pointer.
2075 *
2076 * @inode: The file's inode
2077 * @block: The block to look for in the cache
2078 * @ex:    Pointer where the cached extent will be stored
2079 *         if it contains block
2080 *
2081 * Return 0 if cache is invalid; 1 if the cache is valid
2082 */
2083static int
2084ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2085			struct ext4_extent *ex)
2086{
2087	struct ext4_ext_cache cex;
2088	int ret = 0;
2089
2090	if (ext4_ext_check_cache(inode, block, &cex)) {
2091		ex->ee_block = cpu_to_le32(cex.ec_block);
2092		ext4_ext_store_pblock(ex, cex.ec_start);
2093		ex->ee_len = cpu_to_le16(cex.ec_len);
2094		ret = 1;
2095	}
2096
2097	return ret;
2098}
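
/*
 * Illustrative sketch (not part of the original file): typical use of the
 * single-entry extent cache from a mapping path.  A cached entry whose
 * physical start is zero represents a hole cached by
 * ext4_ext_put_gap_in_cache(), so it must not be treated as a mapping.
 * The helper name is hypothetical.
 */
#if 0
static void example_cache_lookup(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_extent cached;

	if (ext4_ext_in_cache(inode, lblk, &cached)) {
		if (ext4_ext_pblock(&cached) == 0) {
			/* cached hole: no blocks allocated at lblk */
		} else {
			/* lblk maps inside the cached extent */
		}
	} else {
		/* cache miss: fall back to walking the extent tree */
	}
}
#endif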
2099
2100
2101/*
2102 * ext4_ext_rm_idx:
2103 * removes index from the index block.
2104 */
2105static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2106			struct ext4_ext_path *path)
2107{
2108	int err;
2109	ext4_fsblk_t leaf;
2110
2111	/* free index block */
2112	path--;
2113	leaf = ext4_idx_pblock(path->p_idx);
2114	if (unlikely(path->p_hdr->eh_entries == 0)) {
2115		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2116		return -EIO;
2117	}
2118	err = ext4_ext_get_access(handle, inode, path);
2119	if (err)
2120		return err;
2121
2122	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2123		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2124		len *= sizeof(struct ext4_extent_idx);
2125		memmove(path->p_idx, path->p_idx + 1, len);
2126	}
2127
2128	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2129	err = ext4_ext_dirty(handle, inode, path);
2130	if (err)
2131		return err;
2132	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2133	trace_ext4_ext_rm_idx(inode, leaf);
2134
2135	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2136			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2137	return err;
2138}
2139
2140/*
2141 * ext4_ext_calc_credits_for_single_extent:
2142 * This routine returns the maximum number of credits needed to insert
2143 * an extent into the extent tree.
2144 * When the actual path is passed, the caller should calculate credits
2145 * under i_data_sem.
2146 */
2147int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2148						struct ext4_ext_path *path)
2149{
2150	if (path) {
2151		int depth = ext_depth(inode);
2152		int ret = 0;
2153
2154		/* probably there is space in leaf? */
2155		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2156				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2157
2158			/*
2159			 *  There is some space left in the leaf, so
2160			 *  no leaf block credit needs to be accounted;
2161			 *
2162			 *  bitmaps, block group descriptor blocks
2163			 *  and other metadata blocks still need to
2164			 *  be accounted for.
2165			 */
2166			/* 1 bitmap, 1 block group descriptor */
2167			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2168			return ret;
2169		}
2170	}
2171
2172	return ext4_chunk_trans_blocks(inode, nrblocks);
2173}
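
/*
 * Illustrative sketch (not part of the original file): how the credit
 * estimate above is typically consumed.  With a path passed in and free
 * slots in the leaf, the cost is 2 (bitmap + group descriptor) plus
 * EXT4_META_TRANS_BLOCKS(sb); without a path, ext4_chunk_trans_blocks()
 * prices a worst-case chunk allocation.  The helper name is hypothetical.
 */
#if 0
static int example_insert_credits(struct inode *inode,
				  struct ext4_ext_path *path)
{
	/* caller holds i_data_sem, as required when a path is passed */
	return ext4_ext_calc_credits_for_single_extent(inode, 1, path);
}
#endif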
2174
2175/*
2176 * How many index/leaf blocks need to change/allocate to modify nrblocks?
2177 *
2178 * If nrblocks fit in a single extent (chunk flag is 1), then in the
2179 * worst case each index/leaf block at every tree level needs to be
2180 * changed; if the tree splits due to inserting a new extent, then the
2181 * old tree's index/leaf blocks need to be updated too.
2182 *
2183 * If the nrblocks are discontiguous, they could cause
2184 * the whole tree to split more than once, but this is really rare.
2185 */
2186int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2187{
2188	int index;
2189	int depth = ext_depth(inode);
2190
2191	if (chunk)
2192		index = depth * 2;
2193	else
2194		index = depth * 3;
2195
2196	return index;
2197}
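
/*
 * Illustrative example (not part of the original file): the arithmetic
 * above for a tree of depth 2 (hypothetical).
 */
#if 0
static void example_index_trans_blocks(struct inode *inode)
{
	/* contiguous chunk: 2 * depth = 4 index/leaf blocks */
	int contiguous = ext4_ext_index_trans_blocks(inode, 16, 1);
	/* discontiguous blocks: 3 * depth = 6, allowing for an extra split */
	int scattered  = ext4_ext_index_trans_blocks(inode, 16, 0);

	(void)contiguous; (void)scattered;
}
#endif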
2198
2199static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2200			      struct ext4_extent *ex,
2201			      ext4_fsblk_t *partial_cluster,
2202			      ext4_lblk_t from, ext4_lblk_t to)
2203{
2204	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2205	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2206	ext4_fsblk_t pblk;
2207	int flags = EXT4_FREE_BLOCKS_FORGET;
2208
2209	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2210		flags |= EXT4_FREE_BLOCKS_METADATA;
2211	/*
2212	 * For bigalloc file systems, we never free a partial cluster
2213	 * at the beginning of the extent.  Instead, we make a note
2214	 * that we tried freeing the cluster, and check to see if we
2215	 * need to free it on a subsequent call to ext4_remove_blocks,
2216	 * or at the end of the ext4_truncate() operation.
2217	 */
2218	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2219
2220	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2221	/*
2222	 * If we have a partial cluster, and it's different from the
2223	 * cluster of the last block, we need to explicitly free the
2224	 * partial cluster here.
2225	 */
2226	pblk = ext4_ext_pblock(ex) + ee_len - 1;
2227	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2228		ext4_free_blocks(handle, inode, NULL,
2229				 EXT4_C2B(sbi, *partial_cluster),
2230				 sbi->s_cluster_ratio, flags);
2231		*partial_cluster = 0;
2232	}
2233
2234#ifdef EXTENTS_STATS
2235	{
2236		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2237		spin_lock(&sbi->s_ext_stats_lock);
2238		sbi->s_ext_blocks += ee_len;
2239		sbi->s_ext_extents++;
2240		if (ee_len < sbi->s_ext_min)
2241			sbi->s_ext_min = ee_len;
2242		if (ee_len > sbi->s_ext_max)
2243			sbi->s_ext_max = ee_len;
2244		if (ext_depth(inode) > sbi->s_depth_max)
2245			sbi->s_depth_max = ext_depth(inode);
2246		spin_unlock(&sbi->s_ext_stats_lock);
2247	}
2248#endif
2249	if (from >= le32_to_cpu(ex->ee_block)
2250	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2251		/* tail removal */
2252		ext4_lblk_t num;
2253
2254		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2255		pblk = ext4_ext_pblock(ex) + ee_len - num;
2256		ext_debug("free last %u blocks starting %llu\n", num, pblk);
2257		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2258		/*
2259		 * If the block range to be freed didn't start at the
2260		 * beginning of a cluster, and we removed the entire
2261		 * extent, save the partial cluster here, since we
2262		 * might need to delete it if we determine that the
2263		 * truncate operation has removed all of the blocks in
2264		 * the cluster.
2265		 */
2266		if (pblk & (sbi->s_cluster_ratio - 1) &&
2267		    (ee_len == num))
2268			*partial_cluster = EXT4_B2C(sbi, pblk);
2269		else
2270			*partial_cluster = 0;
2271	} else if (from == le32_to_cpu(ex->ee_block)
2272		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2273		/* head removal */
2274		ext4_lblk_t num;
2275		ext4_fsblk_t start;
2276
2277		num = to - from;
2278		start = ext4_ext_pblock(ex);
2279
2280		ext_debug("free first %u blocks starting %llu\n", num, start);
2281		ext4_free_blocks(handle, inode, NULL, start, num, flags);
2282
2283	} else {
2284		printk(KERN_INFO "strange request: removal(2) "
2285				"%u-%u from %u:%u\n",
2286				from, to, le32_to_cpu(ex->ee_block), ee_len);
2287	}
2288	return 0;
2289}
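
/*
 * Illustrative example (not part of the original file): the tail-removal
 * and partial-cluster bookkeeping above, for a hypothetical extent mapping
 * logical blocks [100, 105] to physical blocks [1002, 1007] on a bigalloc
 * filesystem with s_cluster_ratio == 16.
 */
#if 0
static void example_remove_blocks_arithmetic(void)
{
	/* Removing [100, 105] (the whole extent, ee_len == 6):
	 *   num  = 100 + 6 - 100 = 6
	 *   pblk = 1002 + 6 - 6  = 1002
	 * 1002 & (16 - 1) = 10, so the freed range does not start on a
	 * cluster boundary; since ee_len == num, cluster 1002 / 16 = 62 is
	 * recorded in *partial_cluster instead of being freed here. */
	ext4_lblk_t num = 100 + 6 - 100;
	ext4_fsblk_t pblk = 1002 + 6 - num;

	(void)pblk;
}
#endif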
2290
2291
2292/*
2293 * ext4_ext_rm_leaf() Removes the extents associated with the
2294 * blocks appearing between "start" and "end", and splits the extents
2295 * if "start" and "end" appear in the same extent
2296 *
2297 * @handle: The journal handle
2298 * @inode:  The file's inode
2299 * @path:   The path to the leaf
2300 * @start:  The first block to remove
2301 * @end:   The last block to remove
2302 */
2303static int
2304ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2305		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
2306		 ext4_lblk_t start, ext4_lblk_t end)
2307{
2308	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2309	int err = 0, correct_index = 0;
2310	int depth = ext_depth(inode), credits;
2311	struct ext4_extent_header *eh;
2312	ext4_lblk_t a, b;
2313	unsigned num;
2314	ext4_lblk_t ex_ee_block;
2315	unsigned short ex_ee_len;
2316	unsigned uninitialized = 0;
2317	struct ext4_extent *ex;
2318
2319	/* the header must be checked already in ext4_ext_remove_space() */
2320	ext_debug("truncate since %u in leaf\n", start);
2321	if (!path[depth].p_hdr)
2322		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2323	eh = path[depth].p_hdr;
2324	if (unlikely(path[depth].p_hdr == NULL)) {
2325		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2326		return -EIO;
2327	}
2328	/* find where to start removing */
2329	ex = EXT_LAST_EXTENT(eh);
2330
2331	ex_ee_block = le32_to_cpu(ex->ee_block);
2332	ex_ee_len = ext4_ext_get_actual_len(ex);
2333
2334	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2335
2336	while (ex >= EXT_FIRST_EXTENT(eh) &&
2337			ex_ee_block + ex_ee_len > start) {
2338
2339		if (ext4_ext_is_uninitialized(ex))
2340			uninitialized = 1;
2341		else
2342			uninitialized = 0;
2343
2344		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2345			 uninitialized, ex_ee_len);
2346		path[depth].p_ext = ex;
2347
2348		a = ex_ee_block > start ? ex_ee_block : start;
2349		b = ex_ee_block+ex_ee_len - 1 < end ?
2350			ex_ee_block+ex_ee_len - 1 : end;
2351
2352		ext_debug("  border %u:%u\n", a, b);
2353
2354		/* If this extent is beyond the end of the hole, skip it */
2355		if (end <= ex_ee_block) {
2356			ex--;
2357			ex_ee_block = le32_to_cpu(ex->ee_block);
2358			ex_ee_len = ext4_ext_get_actual_len(ex);
2359			continue;
2360		} else if (b != ex_ee_block + ex_ee_len - 1) {
2361			EXT4_ERROR_INODE(inode, "  bad truncate %u:%u\n",
2362					 start, end);
2363			err = -EIO;
2364			goto out;
2365		} else if (a != ex_ee_block) {
2366			/* remove tail of the extent */
2367			num = a - ex_ee_block;
2368		} else {
2369			/* remove whole extent: excellent! */
2370			num = 0;
2371		}
2372		/*
2373		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2374		 * descriptor) for each block group; assume two block
2375		 * groups plus ex_ee_len/blocks_per_block_group for
2376		 * the worst case
2377		 */
2378		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2379		if (ex == EXT_FIRST_EXTENT(eh)) {
2380			correct_index = 1;
2381			credits += (ext_depth(inode)) + 1;
2382		}
2383		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2384
2385		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2386		if (err)
2387			goto out;
2388
2389		err = ext4_ext_get_access(handle, inode, path + depth);
2390		if (err)
2391			goto out;
2392
2393		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2394					 a, b);
2395		if (err)
2396			goto out;
2397
2398		if (num == 0)
2399			/* this extent is removed; mark slot entirely unused */
2400			ext4_ext_store_pblock(ex, 0);
2401
2402		ex->ee_len = cpu_to_le16(num);
2403		/*
2404		 * Do not mark uninitialized if all the blocks in the
2405		 * extent have been removed.
2406		 */
2407		if (uninitialized && num)
2408			ext4_ext_mark_uninitialized(ex);
2409		/*
2410		 * If the extent was completely released,
2411		 * we need to remove it from the leaf
2412		 */
2413		if (num == 0) {
2414			if (end != EXT_MAX_BLOCKS - 1) {
2415				/*
2416				 * For hole punching, we need to scoot all the
2417				 * extents up when an extent is removed so that
2418				 * we don't have blank extents in the middle
2419				 */
2420				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2421					sizeof(struct ext4_extent));
2422
2423				/* Now get rid of the one at the end */
2424				memset(EXT_LAST_EXTENT(eh), 0,
2425					sizeof(struct ext4_extent));
2426			}
2427			le16_add_cpu(&eh->eh_entries, -1);
2428		} else
2429			*partial_cluster = 0;
2430
2431		err = ext4_ext_dirty(handle, inode, path + depth);
2432		if (err)
2433			goto out;
2434
2435		ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2436				ext4_ext_pblock(ex));
2437		ex--;
2438		ex_ee_block = le32_to_cpu(ex->ee_block);
2439		ex_ee_len = ext4_ext_get_actual_len(ex);
2440	}
2441
2442	if (correct_index && eh->eh_entries)
2443		err = ext4_ext_correct_indexes(handle, inode, path);
2444
2445	/*
2446	 * If there is still an entry in the leaf node, check to see if
2447	 * it references the partial cluster.  This is the only place
2448	 * where it could; if it doesn't, we can free the cluster.
2449	 */
2450	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
2451	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2452	     *partial_cluster)) {
2453		int flags = EXT4_FREE_BLOCKS_FORGET;
2454
2455		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2456			flags |= EXT4_FREE_BLOCKS_METADATA;
2457
2458		ext4_free_blocks(handle, inode, NULL,
2459				 EXT4_C2B(sbi, *partial_cluster),
2460				 sbi->s_cluster_ratio, flags);
2461		*partial_cluster = 0;
2462	}
2463
2464	/* if this leaf is free, then we should
2465	 * remove it from index block above */
2466	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2467		err = ext4_ext_rm_idx(handle, inode, path + depth);
2468
2469out:
2470	return err;
2471}
2472
2473/*
2474 * ext4_ext_more_to_rm:
2475 * returns 1 if current index has to be freed (even partial)
2476 */
2477static int
2478ext4_ext_more_to_rm(struct ext4_ext_path *path)
2479{
2480	BUG_ON(path->p_idx == NULL);
2481
2482	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2483		return 0;
2484
2485	/*
2486	 * if truncate on deeper level happened, it wasn't partial,
2487	 * so we have to consider current index for truncation
2488	 */
2489	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2490		return 0;
2491	return 1;
2492}
2493
2494static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2495{
2496	struct super_block *sb = inode->i_sb;
2497	int depth = ext_depth(inode);
2498	struct ext4_ext_path *path;
2499	ext4_fsblk_t partial_cluster = 0;
2500	handle_t *handle;
2501	int i, err;
2502
2503	ext_debug("truncate since %u\n", start);
2504
2505	/* the first extent we are going to free will probably be the last in the block */
2506	handle = ext4_journal_start(inode, depth + 1);
2507	if (IS_ERR(handle))
2508		return PTR_ERR(handle);
2509
2510again:
2511	ext4_ext_invalidate_cache(inode);
2512
2513	trace_ext4_ext_remove_space(inode, start, depth);
2514
2515	/*
2516	 * We start scanning from right side, freeing all the blocks
2517	 * after i_size and walking into the tree depth-wise.
2518	 */
2519	depth = ext_depth(inode);
2520	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2521	if (path == NULL) {
2522		ext4_journal_stop(handle);
2523		return -ENOMEM;
2524	}
2525	path[0].p_depth = depth;
2526	path[0].p_hdr = ext_inode_hdr(inode);
2527	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2528		err = -EIO;
2529		goto out;
2530	}
2531	i = err = 0;
2532
2533	while (i >= 0 && err == 0) {
2534		if (i == depth) {
2535			/* this is leaf block */
2536			err = ext4_ext_rm_leaf(handle, inode, path,
2537					       &partial_cluster, start,
2538					       EXT_MAX_BLOCKS - 1);
2539			/* root level has p_bh == NULL, brelse() eats this */
2540			brelse(path[i].p_bh);
2541			path[i].p_bh = NULL;
2542			i--;
2543			continue;
2544		}
2545
2546		/* this is index block */
2547		if (!path[i].p_hdr) {
2548			ext_debug("initialize header\n");
2549			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2550		}
2551
2552		if (!path[i].p_idx) {
2553			/* this level hasn't been touched yet */
2554			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2555			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2556			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2557				  path[i].p_hdr,
2558				  le16_to_cpu(path[i].p_hdr->eh_entries));
2559		} else {
2560			/* we were already here, see at next index */
2561			path[i].p_idx--;
2562		}
2563
2564		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2565				i, EXT_FIRST_INDEX(path[i].p_hdr),
2566				path[i].p_idx);
2567		if (ext4_ext_more_to_rm(path + i)) {
2568			struct buffer_head *bh;
2569			/* go to the next level */
2570			ext_debug("move to level %d (block %llu)\n",
2571				  i + 1, ext4_idx_pblock(path[i].p_idx));
2572			memset(path + i + 1, 0, sizeof(*path));
2573			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2574			if (!bh) {
2575				/* should we reset i_size? */
2576				err = -EIO;
2577				break;
2578			}
2579			if (WARN_ON(i + 1 > depth)) {
2580				err = -EIO;
2581				break;
2582			}
2583			if (ext4_ext_check(inode, ext_block_hdr(bh),
2584							depth - i - 1)) {
2585				err = -EIO;
2586				break;
2587			}
2588			path[i + 1].p_bh = bh;
2589
2590			/* save actual number of indexes since this
2591			 * number is changed at the next iteration */
2592			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2593			i++;
2594		} else {
2595			/* we finished processing this index, go up */
2596			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2597				/* index is empty, remove it;
2598				 * handle must already be prepared by
2599				 * ext4_ext_rm_leaf() */
2600				err = ext4_ext_rm_idx(handle, inode, path + i);
2601			}
2602			/* root level has p_bh == NULL, brelse() eats this */
2603			brelse(path[i].p_bh);
2604			path[i].p_bh = NULL;
2605			i--;
2606			ext_debug("return to level %d\n", i);
2607		}
2608	}
2609
2610	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
2611			path->p_hdr->eh_entries);
2612
2613	/* If we still have something in the partial cluster and we have removed
2614	 * even the first extent, then we should free the blocks in the partial
2615	 * cluster as well. */
2616	if (partial_cluster && path->p_hdr->eh_entries == 0) {
2617		int flags = EXT4_FREE_BLOCKS_FORGET;
2618
2619		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2620			flags |= EXT4_FREE_BLOCKS_METADATA;
2621
2622		ext4_free_blocks(handle, inode, NULL,
2623				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2624				 EXT4_SB(sb)->s_cluster_ratio, flags);
2625		partial_cluster = 0;
2626	}
2627
2628	/* TODO: flexible tree reduction should be here */
2629	if (path->p_hdr->eh_entries == 0) {
2630		/*
2631		 * truncate to zero freed all the tree,
2632		 * so we need to correct eh_depth
2633		 */
2634		err = ext4_ext_get_access(handle, inode, path);
2635		if (err == 0) {
2636			ext_inode_hdr(inode)->eh_depth = 0;
2637			ext_inode_hdr(inode)->eh_max =
2638				cpu_to_le16(ext4_ext_space_root(inode, 0));
2639			err = ext4_ext_dirty(handle, inode, path);
2640		}
2641	}
2642out:
2643	ext4_ext_drop_refs(path);
2644	kfree(path);
2645	if (err == -EAGAIN)
2646		goto again;
2647	ext4_journal_stop(handle);
2648
2649	return err;
2650}
2651
2652/*
2653 * called at mount time
2654 */
2655void ext4_ext_init(struct super_block *sb)
2656{
2657	/*
2658	 * possible initialization would be here
2659	 */
2660
2661	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2662#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2663		printk(KERN_INFO "EXT4-fs: file extents enabled");
2664#ifdef AGGRESSIVE_TEST
2665		printk(", aggressive tests");
2666#endif
2667#ifdef CHECK_BINSEARCH
2668		printk(", check binsearch");
2669#endif
2670#ifdef EXTENTS_STATS
2671		printk(", stats");
2672#endif
2673		printk("\n");
2674#endif
2675#ifdef EXTENTS_STATS
2676		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2677		EXT4_SB(sb)->s_ext_min = 1 << 30;
2678		EXT4_SB(sb)->s_ext_max = 0;
2679#endif
2680	}
2681}
2682
2683/*
2684 * called at umount time
2685 */
2686void ext4_ext_release(struct super_block *sb)
2687{
2688	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2689		return;
2690
2691#ifdef EXTENTS_STATS
2692	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2693		struct ext4_sb_info *sbi = EXT4_SB(sb);
2694		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2695			sbi->s_ext_blocks, sbi->s_ext_extents,
2696			sbi->s_ext_blocks / sbi->s_ext_extents);
2697		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2698			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2699	}
2700#endif
2701}
2702
2703/* FIXME!! we need to try to merge to left or right after zero-out  */
2704static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2705{
2706	ext4_fsblk_t ee_pblock;
2707	unsigned int ee_len;
2708	int ret;
2709
2710	ee_len    = ext4_ext_get_actual_len(ex);
2711	ee_pblock = ext4_ext_pblock(ex);
2712
2713	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2714	if (ret > 0)
2715		ret = 0;
2716
2717	return ret;
2718}
2719
2720/*
2721 * used by extent splitting.
2722 */
2723#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
2724					due to ENOSPC */
2725#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
2726#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
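
/*
 * Illustrative sketch (not part of the original file): a typical
 * combination of the flags above, used when carving a piece out of an
 * uninitialized extent while permitting a zeroout fallback on ENOSPC.
 */
#if 0
static const int example_split_flag = EXT4_EXT_MAY_ZEROOUT |
				      EXT4_EXT_MARK_UNINIT1 |
				      EXT4_EXT_MARK_UNINIT2;
#endif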
2727
2728/*
2729 * ext4_split_extent_at() splits an extent at given block.
2730 *
2731 * @handle: the journal handle
2732 * @inode: the file inode
2733 * @path: the path to the extent
2734 * @split: the logical block where the extent is split.
2735 * @split_flag: indicates if the extent could be zeroed out if the split
2736 *		 fails, and the states (init or uninit) of the new extents.
2737 * @flags: flags used to insert new extent to extent tree.
2738 *
2739 *
2740 * Splits extent [a, b] into two extents [a, @split) and [@split, b], whose
2741 * states are determined by split_flag.
2742 *
2743 * There are two cases:
2744 *  a> the extent is split into two extents.
2745 *  b> no split is needed, and we just mark the extent.
2746 *
2747 * return 0 on success.
2748 */
2749static int ext4_split_extent_at(handle_t *handle,
2750			     struct inode *inode,
2751			     struct ext4_ext_path *path,
2752			     ext4_lblk_t split,
2753			     int split_flag,
2754			     int flags)
2755{
2756	ext4_fsblk_t newblock;
2757	ext4_lblk_t ee_block;
2758	struct ext4_extent *ex, newex, orig_ex;
2759	struct ext4_extent *ex2 = NULL;
2760	unsigned int ee_len, depth;
2761	int err = 0;
2762
2763	ext_debug("ext4_split_extent_at: inode %lu, logical"
2764		"block %llu\n", inode->i_ino, (unsigned long long)split);
2765
2766	ext4_ext_show_leaf(inode, path);
2767
2768	depth = ext_depth(inode);
2769	ex = path[depth].p_ext;
2770	ee_block = le32_to_cpu(ex->ee_block);
2771	ee_len = ext4_ext_get_actual_len(ex);
2772	newblock = split - ee_block + ext4_ext_pblock(ex);
2773
2774	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2775
2776	err = ext4_ext_get_access(handle, inode, path + depth);
2777	if (err)
2778		goto out;
2779
2780	if (split == ee_block) {
2781		/*
2782		 * case b: block @split is the block that the extent begins with
2783		 * then we just change the state of the extent, and splitting
2784		 * is not needed.
2785		 */
2786		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2787			ext4_ext_mark_uninitialized(ex);
2788		else
2789			ext4_ext_mark_initialized(ex);
2790
2791		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2792			ext4_ext_try_to_merge(inode, path, ex);
2793
2794		err = ext4_ext_dirty(handle, inode, path + depth);
2795		goto out;
2796	}
2797
2798	/* case a */
2799	memcpy(&orig_ex, ex, sizeof(orig_ex));
2800	ex->ee_len = cpu_to_le16(split - ee_block);
2801	if (split_flag & EXT4_EXT_MARK_UNINIT1)
2802		ext4_ext_mark_uninitialized(ex);
2803
2804	/*
2805	 * path may lead to a new leaf, not to the original leaf any more
2806	 * after ext4_ext_insert_extent() returns.
2807	 */
2808	err = ext4_ext_dirty(handle, inode, path + depth);
2809	if (err)
2810		goto fix_extent_len;
2811
2812	ex2 = &newex;
2813	ex2->ee_block = cpu_to_le32(split);
2814	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
2815	ext4_ext_store_pblock(ex2, newblock);
2816	if (split_flag & EXT4_EXT_MARK_UNINIT2)
2817		ext4_ext_mark_uninitialized(ex2);
2818
2819	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2820	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2821		err = ext4_ext_zeroout(inode, &orig_ex);
2822		if (err)
2823			goto fix_extent_len;
2824		/* update the extent length and mark as initialized */
2825		ex->ee_len = cpu_to_le16(ee_len);
2826		ext4_ext_try_to_merge(inode, path, ex);
2827		err = ext4_ext_dirty(handle, inode, path + depth);
2828		goto out;
2829	} else if (err)
2830		goto fix_extent_len;
2831
2832out:
2833	ext4_ext_show_leaf(inode, path);
2834	return err;
2835
2836fix_extent_len:
2837	ex->ee_len = orig_ex.ee_len;
2838	ext4_ext_dirty(handle, inode, path + depth);
2839	return err;
2840}
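
/*
 * Illustrative sketch (not part of the original file): splitting an
 * uninitialized extent at logical block @cut, keeping both halves
 * uninitialized and allowing a zeroout fallback if the insert hits
 * ENOSPC.  The helper name is hypothetical.
 */
#if 0
static int example_split_at(handle_t *handle, struct inode *inode,
			    struct ext4_ext_path *path, ext4_lblk_t cut)
{
	int split_flag = EXT4_EXT_MAY_ZEROOUT |
			 EXT4_EXT_MARK_UNINIT1 |
			 EXT4_EXT_MARK_UNINIT2;

	/* @cut must lie strictly inside the extent found by @path */
	return ext4_split_extent_at(handle, inode, path, cut,
				    split_flag, EXT4_GET_BLOCKS_PRE_IO);
}
#endif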
2841
2842/*
2843 * ext4_split_extent() splits an extent and marks the extent covered
2844 * by @map as @split_flag indicates.
2845 *
2846 * It may result in splitting the extent into multiple extents (up to three).
2847 * There are three possibilities:
2848 *   a> There is no split required
2849 *   b> Splits in two extents: Split is happening at either end of the extent
2850 *   c> Splits in three extents: Someone is splitting in the middle of the extent
2851 *
2852 */
2853static int ext4_split_extent(handle_t *handle,
2854			      struct inode *inode,
2855			      struct ext4_ext_path *path,
2856			      struct ext4_map_blocks *map,
2857			      int split_flag,
2858			      int flags)
2859{
2860	ext4_lblk_t ee_block;
2861	struct ext4_extent *ex;
2862	unsigned int ee_len, depth;
2863	int err = 0;
2864	int uninitialized;
2865	int split_flag1, flags1;
2866
2867	depth = ext_depth(inode);
2868	ex = path[depth].p_ext;
2869	ee_block = le32_to_cpu(ex->ee_block);
2870	ee_len = ext4_ext_get_actual_len(ex);
2871	uninitialized = ext4_ext_is_uninitialized(ex);
2872
2873	if (map->m_lblk + map->m_len < ee_block + ee_len) {
2874		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2875			      EXT4_EXT_MAY_ZEROOUT : 0;
2876		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2877		if (uninitialized)
2878			split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2879				       EXT4_EXT_MARK_UNINIT2;
2880		err = ext4_split_extent_at(handle, inode, path,
2881				map->m_lblk + map->m_len, split_flag1, flags1);
2882		if (err)
2883			goto out;
2884	}
2885
2886	ext4_ext_drop_refs(path);
2887	path = ext4_ext_find_extent(inode, map->m_lblk, path);
2888	if (IS_ERR(path))
2889		return PTR_ERR(path);
2890
2891	if (map->m_lblk >= ee_block) {
2892		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2893			      EXT4_EXT_MAY_ZEROOUT : 0;
2894		if (uninitialized)
2895			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2896		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2897			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2898		err = ext4_split_extent_at(handle, inode, path,
2899				map->m_lblk, split_flag1, flags);
2900		if (err)
2901			goto out;
2902	}
2903
2904	ext4_ext_show_leaf(inode, path);
2905out:
2906	return err ? err : map->m_len;
2907}
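
/*
 * Illustrative sketch (not part of the original file): which of the three
 * possibilities above a request hits, assuming an extent covering logical
 * blocks [100, 131] (hypothetical numbers).
 */
#if 0
static void example_split_cases(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	/* case c: [110, 119] lies strictly inside the extent, so the code
	 * above first splits at 120 (the right edge of the map) and then,
	 * after re-finding the path, splits again at 110.  A map covering
	 * [100, 131] would need no split (case a), and [100, 115] would
	 * need a single split at 116 (case b). */
	struct ext4_map_blocks map = { .m_lblk = 110, .m_len = 10 };

	ext4_split_extent(handle, inode, path, &map, EXT4_EXT_MAY_ZEROOUT, 0);
}
#endif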
2908
2909#define EXT4_EXT_ZERO_LEN 7
2910/*
2911 * This function is called by ext4_ext_map_blocks() if someone tries to write
2912 * to an uninitialized extent. It may result in splitting the uninitialized
2913 * extent into multiple extents (up to three - one initialized and two
2914 * uninitialized).
2915 * There are three possibilities:
2916 *   a> There is no split required: Entire extent should be initialized
2917 *   b> Splits in two extents: Write is happening at either end of the extent
2918 *   c> Splits in three extents: Someone is writing in the middle of the extent
2919 *
2920 * Pre-conditions:
2921 *  - The extent pointed to by 'path' is uninitialized.
2922 *  - The extent pointed to by 'path' contains a superset
2923 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
2924 *
2925 * Post-conditions on success:
2926 *  - the returned value is the number of blocks beyond map->m_lblk
2927 *    that are allocated and initialized.
2928 *    It is guaranteed to be >= map->m_len.
2929 */
2930static int ext4_ext_convert_to_initialized(handle_t *handle,
2931					   struct inode *inode,
2932					   struct ext4_map_blocks *map,
2933					   struct ext4_ext_path *path)
2934{
2935	struct ext4_extent_header *eh;
2936	struct ext4_map_blocks split_map;
2937	struct ext4_extent zero_ex;
2938	struct ext4_extent *ex;
2939	ext4_lblk_t ee_block, eof_block;
2940	unsigned int ee_len, depth;
2941	int allocated;
2942	int err = 0;
2943	int split_flag = 0;
2944
2945	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2946		"block %llu, max_blocks %u\n", inode->i_ino,
2947		(unsigned long long)map->m_lblk, map->m_len);
2948
2949	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2950		inode->i_sb->s_blocksize_bits;
2951	if (eof_block < map->m_lblk + map->m_len)
2952		eof_block = map->m_lblk + map->m_len;
2953
2954	depth = ext_depth(inode);
2955	eh = path[depth].p_hdr;
2956	ex = path[depth].p_ext;
2957	ee_block = le32_to_cpu(ex->ee_block);
2958	ee_len = ext4_ext_get_actual_len(ex);
2959	allocated = ee_len - (map->m_lblk - ee_block);
2960
2961	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
2962
2963	/* Pre-conditions */
2964	BUG_ON(!ext4_ext_is_uninitialized(ex));
2965	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
2966	BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
2967
2968	/*
2969	 * Attempt to transfer newly initialized blocks from the currently
2970	 * uninitialized extent to its left neighbor. This is much cheaper
2971	 * than an insertion followed by a merge as those involve costly
2972	 * memmove() calls. This is the common case in steady state for
2973	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
2974	 * writes.
2975	 *
2976	 * Limitations of the current logic:
2977	 *  - L1: we only deal with writes at the start of the extent.
2978	 *    The approach could be extended to writes at the end
2979	 *    of the extent but this scenario was deemed less common.
2980	 *  - L2: we do not deal with writes covering the whole extent.
2981	 *    This would require removing the extent if the transfer
2982	 *    is possible.
2983	 *  - L3: we only attempt to merge with an extent stored in the
2984	 *    same extent tree node.
2985	 */
2986	if ((map->m_lblk == ee_block) &&	/*L1*/
2987		(map->m_len < ee_len) &&	/*L2*/
2988		(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
2989		struct ext4_extent *prev_ex;
2990		ext4_lblk_t prev_lblk;
2991		ext4_fsblk_t prev_pblk, ee_pblk;
2992		unsigned int prev_len, write_len;
2993
2994		prev_ex = ex - 1;
2995		prev_lblk = le32_to_cpu(prev_ex->ee_block);
2996		prev_len = ext4_ext_get_actual_len(prev_ex);
2997		prev_pblk = ext4_ext_pblock(prev_ex);
2998		ee_pblk = ext4_ext_pblock(ex);
2999		write_len = map->m_len;
3000
3001		/*
3002		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
3003		 * upon those conditions:
3004		 * - C1: prev_ex is initialized,
3005		 * - C2: prev_ex is logically abutting ex,
3006		 * - C3: prev_ex is physically abutting ex,
3007		 * - C4: prev_ex can receive the additional blocks without
3008		 *   overflowing the (initialized) length limit.
3009		 */
3010		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
3011			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
3012			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
3013			(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
3014			err = ext4_ext_get_access(handle, inode, path + depth);
3015			if (err)
3016				goto out;
3017
3018			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3019				map, ex, prev_ex);
3020
3021			/* Shift the start of ex by 'write_len' blocks */
3022			ex->ee_block = cpu_to_le32(ee_block + write_len);
3023			ext4_ext_store_pblock(ex, ee_pblk + write_len);
3024			ex->ee_len = cpu_to_le16(ee_len - write_len);
3025			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3026
3027			/* Extend prev_ex by 'write_len' blocks */
3028			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
3029
3030			/* Mark the block containing both extents as dirty */
3031			ext4_ext_dirty(handle, inode, path + depth);
3032
3033			/* Update path to point to the right extent */
3034			path[depth].p_ext = prev_ex;
3035
3036			/* Result: number of initialized blocks past m_lblk */
3037			allocated = write_len;
3038			goto out;
3039		}
3040	}
3041
3042	WARN_ON(map->m_lblk < ee_block);
3043	/*
3044	 * It is safe to convert extent to initialized via explicit
3045	 * zeroout only if extent is fully inside i_size or new_size.
3046	 */
3047	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3048
3049	/* If the extent is no longer than 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
3050	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
3051	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3052		err = ext4_ext_zeroout(inode, ex);
3053		if (err)
3054			goto out;
3055
3056		err = ext4_ext_get_access(handle, inode, path + depth);
3057		if (err)
3058			goto out;
3059		ext4_ext_mark_initialized(ex);
3060		ext4_ext_try_to_merge(inode, path, ex);
3061		err = ext4_ext_dirty(handle, inode, path + depth);
3062		goto out;
3063	}
3064
3065	/*
3066	 * four cases:
3067	 * 1. split the extent into three extents.
3068	 * 2. split the extent into two extents, zeroout the first half.
3069	 * 3. split the extent into two extents, zeroout the second half.
3070	 * 4. split the extent into two extents without zeroout.
3071	 */
3072	split_map.m_lblk = map->m_lblk;
3073	split_map.m_len = map->m_len;
3074
3075	if (allocated > map->m_len) {
3076		if (allocated <= EXT4_EXT_ZERO_LEN &&
3077		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3078			/* case 3 */
3079			zero_ex.ee_block =
3080					 cpu_to_le32(map->m_lblk);
3081			zero_ex.ee_len = cpu_to_le16(allocated);
3082			ext4_ext_store_pblock(&zero_ex,
3083				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3084			err = ext4_ext_zeroout(inode, &zero_ex);
3085			if (err)
3086				goto out;
3087			split_map.m_lblk = map->m_lblk;
3088			split_map.m_len = allocated;
3089		} else if ((map->m_lblk - ee_block + map->m_len <
3090			   EXT4_EXT_ZERO_LEN) &&
3091			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3092			/* case 2 */
3093			if (map->m_lblk != ee_block) {
3094				zero_ex.ee_block = ex->ee_block;
3095				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3096							ee_block);
3097				ext4_ext_store_pblock(&zero_ex,
3098						      ext4_ext_pblock(ex));
3099				err = ext4_ext_zeroout(inode, &zero_ex);
3100				if (err)
3101					goto out;
3102			}
3103
3104			split_map.m_lblk = ee_block;
3105			split_map.m_len = map->m_lblk - ee_block + map->m_len;
3106			allocated = map->m_len;
3107		}
3108	}
3109
3110	allocated = ext4_split_extent(handle, inode, path,
3111				       &split_map, split_flag, 0);
3112	if (allocated < 0)
3113		err = allocated;
3114
3115out:
3116	return err ? err : allocated;
3117}
3118
3119/*
3120 * This function is called by ext4_ext_map_blocks() from
3121 * ext4_get_blocks_dio_write() when DIO is used to write
3122 * to an uninitialized extent.
3123 *
3124 * Writing to an uninitialized extent may result in splitting the uninitialized
3125 * extent into multiple initialized/uninitialized extents (up to three).
3126 * There are three possibilities:
3127 *   a> There is no split required: Entire extent should be uninitialized
3128 *   b> Splits in two extents: Write is happening at either end of the extent
3129 *   c> Splits in three extents: Someone is writing in the middle of the extent
3130 *
3131 * One or more index blocks may be needed if the extent tree grows after
3132 * the uninitialized extent is split. To prevent ENOSPC from occurring when
3133 * the IO completes, we need to split the uninitialized extent before DIO
3134 * submits the IO. The uninitialized extent handled at this time will be
3135 * split into three uninitialized extents (at most). After the IO completes,
3136 * the part being filled will be converted to initialized by the end_io
3137 * callback via ext4_convert_unwritten_extents().
3138 *
3139 * Returns the size of uninitialized extent to be written on success.
3140 */
3141static int ext4_split_unwritten_extents(handle_t *handle,
3142					struct inode *inode,
3143					struct ext4_map_blocks *map,
3144					struct ext4_ext_path *path,
3145					int flags)
3146{
3147	ext4_lblk_t eof_block;
3148	ext4_lblk_t ee_block;
3149	struct ext4_extent *ex;
3150	unsigned int ee_len;
3151	int split_flag = 0, depth;
3152
3153	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
3154		"block %llu, max_blocks %u\n", inode->i_ino,
3155		(unsigned long long)map->m_lblk, map->m_len);
3156
3157	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3158		inode->i_sb->s_blocksize_bits;
3159	if (eof_block < map->m_lblk + map->m_len)
3160		eof_block = map->m_lblk + map->m_len;
3161	/*
3162	 * It is safe to convert extent to initialized via explicit
3163	 * zeroout only if extent is fully inside i_size or new_size.
3164	 */
3165	depth = ext_depth(inode);
3166	ex = path[depth].p_ext;
3167	ee_block = le32_to_cpu(ex->ee_block);
3168	ee_len = ext4_ext_get_actual_len(ex);
3169
3170	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3171	split_flag |= EXT4_EXT_MARK_UNINIT2;
3172
3173	flags |= EXT4_GET_BLOCKS_PRE_IO;
3174	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3175}
3176
3177static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3178					      struct inode *inode,
3179					      struct ext4_ext_path *path)
3180{
3181	struct ext4_extent *ex;
3182	int depth;
3183	int err = 0;
3184
3185	depth = ext_depth(inode);
3186	ex = path[depth].p_ext;
3187
3188	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3189		"block %llu, max_blocks %u\n", inode->i_ino,
3190		(unsigned long long)le32_to_cpu(ex->ee_block),
3191		ext4_ext_get_actual_len(ex));
3192
3193	err = ext4_ext_get_access(handle, inode, path + depth);
3194	if (err)
3195		goto out;
3196	/* first mark the extent as initialized */
3197	ext4_ext_mark_initialized(ex);
3198
3199	/* note: ext4_ext_correct_indexes() isn't needed here because
3200	 * borders are not changed
3201	 */
3202	ext4_ext_try_to_merge(inode, path, ex);
3203
3204	/* Mark modified extent as dirty */
3205	err = ext4_ext_dirty(handle, inode, path + depth);
3206out:
3207	ext4_ext_show_leaf(inode, path);
3208	return err;
3209}
3210
3211static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3212			sector_t block, int count)
3213{
3214	int i;
3215	for (i = 0; i < count; i++)
3216		unmap_underlying_metadata(bdev, block + i);
3217}
3218
3219/*
3220 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3221 */
3222static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3223			      ext4_lblk_t lblk,
3224			      struct ext4_ext_path *path,
3225			      unsigned int len)
3226{
3227	int i, depth;
3228	struct ext4_extent_header *eh;
3229	struct ext4_extent *last_ex;
3230
3231	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3232		return 0;
3233
3234	depth = ext_depth(inode);
3235	eh = path[depth].p_hdr;
3236
3237	if (unlikely(!eh->eh_entries)) {
3238		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3239				 "EOFBLOCKS_FL set");
3240		return -EIO;
3241	}
3242	last_ex = EXT_LAST_EXTENT(eh);
3243	/*
3244	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3245	 * last block in the last extent in the file.  We test this by
3246	 * first checking to see if the caller to
3247	 * ext4_ext_get_blocks() was interested in the last block (or
3248	 * a block beyond the last block) in the current extent.  If
3249	 * this turns out to be false, we can bail out from this
3250	 * function immediately.
3251	 */
3252	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3253	    ext4_ext_get_actual_len(last_ex))
3254		return 0;
3255	/*
3256	 * If the caller does appear to be planning to write at or
3257	 * beyond the end of the current extent, we then test to see
3258	 * if the current extent is the last extent in the file, by
3259	 * checking to make sure it was reached via the rightmost node
3260	 * at each level of the tree.
3261	 */
3262	for (i = depth-1; i >= 0; i--)
3263		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3264			return 0;
3265	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3266	return ext4_mark_inode_dirty(handle, inode);
3267}
3268
3269/**
3270 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3271 *
3272 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
3273 * whether there are any buffers marked for delayed allocation. It returns '1'
3274 * on the first delalloc'ed buffer head found. If no buffer head in the given
3275 * range is marked for delalloc, it returns 0.
3276 * lblk_start should always be <= lblk_end.
3277 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3278 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3279 * block sooner). This is useful when blocks are truncated sequentially from
3280 * lblk_start towards lblk_end.
3281 */
3282static int ext4_find_delalloc_range(struct inode *inode,
3283				    ext4_lblk_t lblk_start,
3284				    ext4_lblk_t lblk_end,
3285				    int search_hint_reverse)
3286{
3287	struct address_space *mapping = inode->i_mapping;
3288	struct buffer_head *head, *bh = NULL;
3289	struct page *page;
3290	ext4_lblk_t i, pg_lblk;
3291	pgoff_t index;
3292
3293	/* reverse search won't work if fs block size is less than page size */
3294	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3295		search_hint_reverse = 0;
3296
3297	if (search_hint_reverse)
3298		i = lblk_end;
3299	else
3300		i = lblk_start;
3301
3302	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3303
3304	while ((i >= lblk_start) && (i <= lblk_end)) {
3305		page = find_get_page(mapping, index);
3306		if (!page)
3307			goto nextpage;
3308
3309		if (!page_has_buffers(page))
3310			goto nextpage;
3311
3312		head = page_buffers(page);
3313		if (!head)
3314			goto nextpage;
3315
3316		bh = head;
3317		pg_lblk = index << (PAGE_CACHE_SHIFT -
3318						inode->i_blkbits);
3319		do {
3320			if (unlikely(pg_lblk < lblk_start)) {
3321				/*
3322				 * This is possible when fs block size is less
3323				 * than page size and our cluster starts/ends in
3324				 * middle of the page. So we need to skip the
3325				 * initial few blocks till we reach the 'lblk'
3326				 */
3327				pg_lblk++;
3328				continue;
3329			}
3330
3331			/* Check if the buffer is delayed allocated and that it
3332			 * is not yet mapped. (when da-buffers are mapped during
3333			 * their writeout, their da_mapped bit is set.)
3334			 */
3335			if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3336				page_cache_release(page);
3337				trace_ext4_find_delalloc_range(inode,
3338						lblk_start, lblk_end,
3339						search_hint_reverse,
3340						1, i);
3341				return 1;
3342			}
3343			if (search_hint_reverse)
3344				i--;
3345			else
3346				i++;
3347		} while ((i >= lblk_start) && (i <= lblk_end) &&
3348				((bh = bh->b_this_page) != head));
3349nextpage:
3350		if (page)
3351			page_cache_release(page);
3352		/*
3353		 * Move to next page. 'i' will be the first lblk in the next
3354		 * page.
3355		 */
3356		if (search_hint_reverse)
3357			index--;
3358		else
3359			index++;
3360		i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3361	}
3362
3363	trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3364					search_hint_reverse, 0, 0);
3365	return 0;
3366}
3367
3368int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
3369			       int search_hint_reverse)
3370{
3371	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3372	ext4_lblk_t lblk_start, lblk_end;
3373	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3374	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3375
3376	return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3377					search_hint_reverse);
3378}
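
/*
 * Illustrative example (not part of the original file): the cluster
 * boundary arithmetic above for s_cluster_ratio == 16 and lblk == 37
 * (hypothetical numbers).
 */
#if 0
static void example_cluster_bounds(void)
{
	/* 37 & ~(16 - 1) = 32, so the containing cluster spans logical
	 * blocks [32, 47], and that whole range is scanned for
	 * delayed-allocated buffers. */
	ext4_lblk_t lblk_start = 37 & ~(16 - 1);
	ext4_lblk_t lblk_end = lblk_start + 16 - 1;

	(void)lblk_end;
}
#endif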
3379
3380/**
3381 * Determines how many complete clusters (out of those specified by the 'map')
3382 * are under delalloc and were reserved quota for.
3383 * This function is called when we are writing out the blocks that were
3384 * originally written with their allocation delayed, but then the space was
3385 * allocated using fallocate() before the delayed allocation could be resolved.
3386 * The cases to look for are:
3387 * ('=' indicates delayed allocated blocks
3388 *  '-' indicates non-delayed allocated blocks)
3389 * (a) partial clusters towards beginning and/or end outside of allocated range
3390 *     are not delalloc'ed.
3391 *	Ex:
3392 *	|----c---=|====c====|====c====|===-c----|
3393 *	         |++++++ allocated ++++++|
3394 *	==> 4 complete clusters in above example
3395 *
3396 * (b) partial cluster (outside of allocated range) towards either end is
3397 *     marked for delayed allocation. In this case, we will exclude that
3398 *     cluster.
3399 *	Ex:
3400 *	|----====c========|========c========|
3401 *	     |++++++ allocated ++++++|
3402 *	==> 1 complete cluster in above example
3403 *
3404 *	Ex:
3405 *	|================c================|
3406 *            |++++++ allocated ++++++|
3407 *	==> 0 complete clusters in above example
3408 *
3409 * The ext4_da_update_reserve_space will be called only if we
3410 * determine here that there were some "entire" clusters that span
3411 * this 'allocated' range.
3412 * In the non-bigalloc case, this function will just end up returning num_blks
3413 * without ever calling ext4_find_delalloc_range.
3414 */
3415static unsigned int
3416get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3417			   unsigned int num_blks)
3418{
3419	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3420	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3421	ext4_lblk_t lblk_from, lblk_to, c_offset;
3422	unsigned int allocated_clusters = 0;
3423
3424	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3425	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3426
3427	/* max possible clusters for this allocation */
3428	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3429
3430	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3431
3432	/* Check towards left side */
3433	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3434	if (c_offset) {
3435		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3436		lblk_to = lblk_from + c_offset - 1;
3437
3438		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3439			allocated_clusters--;
3440	}
3441
3442	/* Now check towards right. */
3443	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3444	if (allocated_clusters && c_offset) {
3445		lblk_from = lblk_start + num_blks;
3446		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3447
3448		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3449			allocated_clusters--;
3450	}
3451
3452	return allocated_clusters;
3453}
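
/*
 * Illustrative example (not part of the original file): the partial
 * cluster accounting above for s_cluster_ratio == 4, lblk_start == 6 and
 * num_blks == 7 (hypothetical numbers), i.e. blocks [6, 12].
 */
#if 0
static void example_reserved_cluster_count(void)
{
	/* clusters 1..3 are spanned: 3 - 1 + 1 = 3 candidates */
	unsigned int allocated_clusters = (12 / 4) - (6 / 4) + 1;

	/* left: 6 & 3 == 2, so blocks [4, 5] are checked; if either is
	 * delalloc'ed, the leftmost cluster is excluded.
	 * right: (6 + 7) & 3 == 1, so blocks [13, 15] are checked; if any
	 * is delalloc'ed, the rightmost cluster is excluded as well. */

	(void)allocated_clusters;
}
#endif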
3454
3455static int
3456ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3457			struct ext4_map_blocks *map,
3458			struct ext4_ext_path *path, int flags,
3459			unsigned int allocated, ext4_fsblk_t newblock)
3460{
3461	int ret = 0;
3462	int err = 0;
3463	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3464
3465	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3466		  "block %llu, max_blocks %u, flags %d, allocated %u",
3467		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3468		  flags, allocated);
3469	ext4_ext_show_leaf(inode, path);
3470
3471	trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
3472						    newblock);
3473
3474	/* get_block() before submit the IO, split the extent */
3475	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3476		ret = ext4_split_unwritten_extents(handle, inode, map,
3477						   path, flags);
3478		/*
3479		 * Flag the inode (non-AIO case) or end_io struct (AIO case)
3480		 * that this IO needs conversion to written when the IO is
3481		 * completed
3482		 */
3483		if (io) {
3484			if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
3485				io->flag = EXT4_IO_END_UNWRITTEN;
3486				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3487			}
3488		} else
3489			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3490		if (ext4_should_dioread_nolock(inode))
3491			map->m_flags |= EXT4_MAP_UNINIT;
3492		goto out;
3493	}
3494	/* IO end_io complete, convert the filled extent to written */
3495	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3496		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3497							path);
3498		if (ret >= 0) {
3499			ext4_update_inode_fsync_trans(handle, inode, 1);
3500			err = check_eofblocks_fl(handle, inode, map->m_lblk,
3501						 path, map->m_len);
3502		} else
3503			err = ret;
3504		goto out2;
3505	}
3506	/* buffered IO case */
3507	/*
3508	 * repeat fallocate creation request
3509	 * we already have an unwritten extent
3510	 */
3511	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3512		goto map_out;
3513
3514	/* buffered READ or buffered write_begin() lookup */
3515	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3516		/*
3517		 * We have blocks reserved already.  We
3518		 * return allocated blocks so that delalloc
3519		 * won't do block reservation for us.  But
3520		 * the buffer head will be unmapped so that
3521		 * a read from the block returns 0s.
3522		 */
3523		map->m_flags |= EXT4_MAP_UNWRITTEN;
3524		goto out1;
3525	}
3526
3527	/* buffered write, writepage time, convert*/
3528	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3529	if (ret >= 0)
3530		ext4_update_inode_fsync_trans(handle, inode, 1);
3531out:
3532	if (ret <= 0) {
3533		err = ret;
3534		goto out2;
3535	} else
3536		allocated = ret;
3537	map->m_flags |= EXT4_MAP_NEW;
3538	/*
3539	 * if we allocated more blocks than requested
3540	 * we need to make sure we unmap the extra blocks
3541	 * allocated. The actually needed blocks will get
3542	 * unmapped later when we find the buffer_head marked
3543	 * new.
3544	 */
3545	if (allocated > map->m_len) {
3546		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3547					newblock + map->m_len,
3548					allocated - map->m_len);
3549		allocated = map->m_len;
3550	}
3551
3552	/*
3553	 * If we have done fallocate with the offset that is already
3554	 * delayed allocated, we would have block reservation
3555	 * and quota reservation done in the delayed write path.
3556	 * But fallocate would have already updated quota and block
3557	 * count for this offset. So cancel these reservations.
3558	 */
3559	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3560		unsigned int reserved_clusters;
3561		reserved_clusters = get_reserved_cluster_alloc(inode,
3562				map->m_lblk, map->m_len);
3563		if (reserved_clusters)
3564			ext4_da_update_reserve_space(inode,
3565						     reserved_clusters,
3566						     0);
3567	}
3568
3569map_out:
3570	map->m_flags |= EXT4_MAP_MAPPED;
3571	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
3572		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3573					 map->m_len);
3574		if (err < 0)
3575			goto out2;
3576	}
3577out1:
3578	if (allocated > map->m_len)
3579		allocated = map->m_len;
3580	ext4_ext_show_leaf(inode, path);
3581	map->m_pblk = newblock;
3582	map->m_len = allocated;
3583out2:
3584	if (path) {
3585		ext4_ext_drop_refs(path);
3586		kfree(path);
3587	}
3588	return err ? err : allocated;
3589}
3590
3591/*
3592 * get_implied_cluster_alloc - check to see if the requested
3593 * allocation (in the map structure) overlaps with a cluster already
3594 * allocated in an extent.
3595 *	@sb	The filesystem superblock structure
3596 *	@map	The requested lblk->pblk mapping
3597 *	@ex	The extent structure which might contain an implied
3598 *			cluster allocation
3599 *
3600 * This function is called by ext4_ext_map_blocks() after we failed to
3601 * find blocks that were already in the inode's extent tree.  Hence,
3602 * we know that the beginning of the requested region cannot overlap
3603 * the extent from the inode's extent tree.  There are three cases we
3604 * want to catch.  The first is this case:
3605 *
3606 *		 |--- cluster # N--|
3607 *    |--- extent ---|	|---- requested region ---|
3608 *			|==========|
3609 *
3610 * The second case that we need to test for is this one:
3611 *
3612 *   |--------- cluster # N ----------------|
3613 *	   |--- requested region --|   |------- extent ----|
3614 *	   |=======================|
3615 *
3616 * The third case is when the requested region lies between two extents
3617 * within the same cluster:
3618 *          |------------- cluster # N-------------|
3619 * |----- ex -----|                  |---- ex_right ----|
3620 *                  |------ requested region ------|
3621 *                  |================|
3622 *
3623 * In each of the above cases, we need to set the map->m_pblk and
3624 * map->m_len so that they correspond to the extent labelled as
3625 * "|====|" from cluster #N, since it is already in use for data in
3626 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
3627 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3628 * as a new "allocated" block region.  Otherwise, we will return 0 and
3629 * ext4_ext_map_blocks() will then allocate one or more new clusters
3630 * by calling ext4_mb_new_blocks().
3631 */
3632static int get_implied_cluster_alloc(struct super_block *sb,
3633				     struct ext4_map_blocks *map,
3634				     struct ext4_extent *ex,
3635				     struct ext4_ext_path *path)
3636{
3637	struct ext4_sb_info *sbi = EXT4_SB(sb);
3638	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3639	ext4_lblk_t ex_cluster_start, ex_cluster_end;
3640	ext4_lblk_t rr_cluster_start, rr_cluster_end;
3641	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3642	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3643	unsigned short ee_len = ext4_ext_get_actual_len(ex);
3644
3645	/* The extent passed in that we are trying to match */
3646	ex_cluster_start = EXT4_B2C(sbi, ee_block);
3647	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3648
3649	/* The requested region passed into ext4_map_blocks() */
3650	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3651	rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
3652
3653	if ((rr_cluster_start == ex_cluster_end) ||
3654	    (rr_cluster_start == ex_cluster_start)) {
3655		if (rr_cluster_start == ex_cluster_end)
3656			ee_start += ee_len - 1;
3657		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3658			c_offset;
3659		map->m_len = min(map->m_len,
3660				 (unsigned) sbi->s_cluster_ratio - c_offset);
3661		/*
3662		 * Check for and handle this case:
3663		 *
3664		 *   |--------- cluster # N-------------|
3665		 *		       |------- extent ----|
3666		 *	   |--- requested region ---|
3667		 *	   |===========|
3668		 */
3669
3670		if (map->m_lblk < ee_block)
3671			map->m_len = min(map->m_len, ee_block - map->m_lblk);
3672
3673		/*
3674		 * Check for the case where there is already another allocated
3675		 * block to the right of 'ex' but before the end of the cluster.
3676		 *
3677		 *          |------------- cluster # N-------------|
3678		 * |----- ex -----|                  |---- ex_right ----|
3679		 *                  |------ requested region ------|
3680		 *                  |================|
3681		 */
3682		if (map->m_lblk > ee_block) {
3683			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3684			map->m_len = min(map->m_len, next - map->m_lblk);
3685		}
3686
3687		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3688		return 1;
3689	}
3690
3691	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3692	return 0;
3693}
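
/*
 * Worked example of the first case above (illustrative numbers only, not
 * taken from any particular filesystem): with s_cluster_ratio = 16, suppose
 * 'ex' maps logical blocks 36..39 to physical blocks 500..503 and the
 * request is m_lblk = 40, m_len = 20.  Then rr_cluster_start ==
 * ex_cluster_end == 2, c_offset = 40 & 15 = 8, ee_start becomes 503, so
 * m_pblk = (503 & ~15) + 8 = 504 and m_len is clamped to 16 - 8 = 8 (and
 * further by the next allocated block, if any).  Blocks 40..47 are thus
 * mapped into the tail of the physical cluster 496..511 that the extent
 * already uses, and the function returns 1 without allocating a new cluster.
 */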
3694
3695
3696/*
3697 * Block allocation/map/preallocation routine for extents based files
3698 *
3700 * Need to be called with
3701 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3702 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3703 *
3704 * return > 0, number of blocks already mapped/allocated;
3705 *          if create == 0 and these are pre-allocated blocks,
3706 *          	the buffer head is unmapped,
3707 *          otherwise blocks are mapped
3708 *
3709 * return = 0, if plain lookup failed (blocks have not been allocated);
3710 *          buffer head is unmapped
3711 *
3712 * return < 0, error case.
3713 */
3714int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3715			struct ext4_map_blocks *map, int flags)
3716{
3717	struct ext4_ext_path *path = NULL;
3718	struct ext4_extent newex, *ex, *ex2;
3719	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3720	ext4_fsblk_t newblock = 0;
3721	int free_on_err = 0, err = 0, depth, ret;
3722	unsigned int allocated = 0, offset = 0;
3723	unsigned int allocated_clusters = 0;
3724	unsigned int punched_out = 0;
3725	unsigned int result = 0;
3726	struct ext4_allocation_request ar;
3727	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3728	ext4_lblk_t cluster_offset;
3729
3730	ext_debug("blocks %u/%u requested for inode %lu\n",
3731		  map->m_lblk, map->m_len, inode->i_ino);
3732	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3733
3734	/* check in cache */
3735	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3736		ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3737		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3738			if ((sbi->s_cluster_ratio > 1) &&
3739			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3740				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3741
3742			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3743				/*
3744				 * block isn't allocated yet and
3745				 * user doesn't want to allocate it
3746				 */
3747				goto out2;
3748			}
3749			/* we should allocate requested block */
3750		} else {
3751			/* block is already allocated */
3752			if (sbi->s_cluster_ratio > 1)
3753				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3754			newblock = map->m_lblk
3755				   - le32_to_cpu(newex.ee_block)
3756				   + ext4_ext_pblock(&newex);
3757			/* number of remaining blocks in the extent */
3758			allocated = ext4_ext_get_actual_len(&newex) -
3759				(map->m_lblk - le32_to_cpu(newex.ee_block));
3760			goto out;
3761		}
3762	}
3763
3764	/* find extent for this block */
3765	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3766	if (IS_ERR(path)) {
3767		err = PTR_ERR(path);
3768		path = NULL;
3769		goto out2;
3770	}
3771
3772	depth = ext_depth(inode);
3773
3774	/*
3775	 * consistent leaf must not be empty;
3776	 * this situation is possible, though, _during_ tree modification;
3777	 * this is why assert can't be put in ext4_ext_find_extent()
3778	 */
3779	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3780		EXT4_ERROR_INODE(inode, "bad extent address "
3781				 "lblock: %lu, depth: %d pblock %lld",
3782				 (unsigned long) map->m_lblk, depth,
3783				 path[depth].p_block);
3784		err = -EIO;
3785		goto out2;
3786	}
3787
3788	ex = path[depth].p_ext;
3789	if (ex) {
3790		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3791		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3792		unsigned short ee_len;
3793
3794		/*
3795		 * Uninitialized extents are treated as holes, except that
3796		 * we split out initialized portions during a write.
3797		 */
3798		ee_len = ext4_ext_get_actual_len(ex);
3799
3800		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
3801
3802		/* if found extent covers block, simply return it */
3803		if (in_range(map->m_lblk, ee_block, ee_len)) {
3804			struct ext4_map_blocks punch_map;
3805			ext4_fsblk_t partial_cluster = 0;
3806
3807			newblock = map->m_lblk - ee_block + ee_start;
3808			/* number of remaining blocks in the extent */
3809			allocated = ee_len - (map->m_lblk - ee_block);
3810			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3811				  ee_block, ee_len, newblock);
3812
3813			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3814				/*
3815				 * Do not put uninitialized extent
3816				 * in the cache
3817				 */
3818				if (!ext4_ext_is_uninitialized(ex)) {
3819					ext4_ext_put_in_cache(inode, ee_block,
3820						ee_len, ee_start);
3821					goto out;
3822				}
3823				ret = ext4_ext_handle_uninitialized_extents(
3824					handle, inode, map, path, flags,
3825					allocated, newblock);
3826				return ret;
3827			}
3828
3829			/*
3830			 * Punch out the map length, but only to the
3831			 * end of the extent
3832			 */
3833			punched_out = allocated < map->m_len ?
3834				allocated : map->m_len;
3835
3836			/*
3837			 * Since the extents are being converted to
3838			 * uninitialized, they must fit in an
3839			 * uninitialized extent
3840			 */
3841			if (punched_out > EXT_UNINIT_MAX_LEN)
3842				punched_out = EXT_UNINIT_MAX_LEN;
3843
3844			punch_map.m_lblk = map->m_lblk;
3845			punch_map.m_pblk = newblock;
3846			punch_map.m_len = punched_out;
3847			punch_map.m_flags = 0;
3848
3849			/* Check to see if the extent needs to be split */
3850			if (punch_map.m_len != ee_len ||
3851				punch_map.m_lblk != ee_block) {
3852
3853				ret = ext4_split_extent(handle, inode,
3854				path, &punch_map, 0,
3855				EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3856				EXT4_GET_BLOCKS_PRE_IO);
3857
3858				if (ret < 0) {
3859					err = ret;
3860					goto out2;
3861				}
3862				/*
3863				 * find extent for the block at
3864				 * the start of the hole
3865				 */
3866				ext4_ext_drop_refs(path);
3867				kfree(path);
3868
3869				path = ext4_ext_find_extent(inode,
3870				map->m_lblk, NULL);
3871				if (IS_ERR(path)) {
3872					err = PTR_ERR(path);
3873					path = NULL;
3874					goto out2;
3875				}
3876
3877				depth = ext_depth(inode);
3878				ex = path[depth].p_ext;
3879				ee_len = ext4_ext_get_actual_len(ex);
3880				ee_block = le32_to_cpu(ex->ee_block);
3881				ee_start = ext4_ext_pblock(ex);
3882
3883			}
3884
3885			ext4_ext_mark_uninitialized(ex);
3886
3887			ext4_ext_invalidate_cache(inode);
3888
3889			err = ext4_ext_rm_leaf(handle, inode, path,
3890					       &partial_cluster, map->m_lblk,
3891					       map->m_lblk + punched_out);
3892
3893			if (!err && path->p_hdr->eh_entries == 0) {
3894				/*
3895				 * Punch hole freed all of this subtree,
3896				 * so we need to correct eh_depth
3897				 */
3898				err = ext4_ext_get_access(handle, inode, path);
3899				if (err == 0) {
3900					ext_inode_hdr(inode)->eh_depth = 0;
3901					ext_inode_hdr(inode)->eh_max =
3902					cpu_to_le16(ext4_ext_space_root(
3903						inode, 0));
3904
3905					err = ext4_ext_dirty(
3906						handle, inode, path);
3907				}
3908			}
3909
3910			goto out2;
3911		}
3912	}
3913
3914	if ((sbi->s_cluster_ratio > 1) &&
3915	    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3916		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3917
3918	/*
3919	 * requested block isn't allocated yet;
3920	 * we cannot create the block if the create flag is zero
3921	 */
3922	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3923		/*
3924		 * put the gap we just found into the cache to speed up
3925		 * subsequent requests
3926		 */
3927		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3928		goto out2;
3929	}
3930
3931	/*
3932	 * Okay, we need to do block allocation.
3933	 */
3934	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
3935	newex.ee_block = cpu_to_le32(map->m_lblk);
3936	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3937
3938	/*
3939	 * If we are doing bigalloc, check to see if the extent returned
3940	 * by ext4_ext_find_extent() implies a cluster we can use.
3941	 */
3942	if (cluster_offset && ex &&
3943	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
3944		ar.len = allocated = map->m_len;
3945		newblock = map->m_pblk;
3946		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3947		goto got_allocated_blocks;
3948	}
3949
3950	/* find neighbour allocated blocks */
3951	ar.lleft = map->m_lblk;
3952	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3953	if (err)
3954		goto out2;
3955	ar.lright = map->m_lblk;
3956	ex2 = NULL;
3957	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
3958	if (err)
3959		goto out2;
3960
3961	/* Check if the extent after searching to the right implies a
3962	 * cluster we can use. */
3963	if ((sbi->s_cluster_ratio > 1) && ex2 &&
3964	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
3965		ar.len = allocated = map->m_len;
3966		newblock = map->m_pblk;
3967		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3968		goto got_allocated_blocks;
3969	}
3970
3971	/*
3972	 * See if request is beyond maximum number of blocks we can have in
3973	 * a single extent. For an initialized extent this limit is
3974	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3975	 * EXT_UNINIT_MAX_LEN.
3976	 */
3977	if (map->m_len > EXT_INIT_MAX_LEN &&
3978	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3979		map->m_len = EXT_INIT_MAX_LEN;
3980	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3981		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3982		map->m_len = EXT_UNINIT_MAX_LEN;
3983
3984	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3985	newex.ee_len = cpu_to_le16(map->m_len);
3986	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
3987	if (err)
3988		allocated = ext4_ext_get_actual_len(&newex);
3989	else
3990		allocated = map->m_len;
3991
3992	/* allocate new block */
3993	ar.inode = inode;
3994	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3995	ar.logical = map->m_lblk;
3996	/*
3997	 * We calculate the offset from the beginning of the cluster
3998	 * for the logical block number, since when we allocate a
3999	 * physical cluster, the physical block should start at the
4000	 * same offset from the beginning of the cluster.  This is
4001	 * needed so that future calls to get_implied_cluster_alloc()
4002	 * work correctly.
4003	 */
4004	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
4005	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4006	ar.goal -= offset;
4007	ar.logical -= offset;
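	/*
	 * Worked example (illustrative numbers only): with s_cluster_ratio = 4,
	 * m_lblk = 10 and allocated = 6, offset = 10 & 3 = 2, so we ask the
	 * allocator for EXT4_NUM_B2C(sbi, 2 + 6) = 2 whole clusters and shift
	 * ar.goal and ar.logical back by 2 blocks to the cluster boundary at
	 * logical block 8.
	 */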
4008	if (S_ISREG(inode->i_mode))
4009		ar.flags = EXT4_MB_HINT_DATA;
4010	else
4011		/* disable in-core preallocation for non-regular files */
4012		ar.flags = 0;
4013	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4014		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4015	newblock = ext4_mb_new_blocks(handle, &ar, &err);
4016	if (!newblock)
4017		goto out2;
4018	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4019		  ar.goal, newblock, allocated);
4020	free_on_err = 1;
4021	allocated_clusters = ar.len;
4022	ar.len = EXT4_C2B(sbi, ar.len) - offset;
4023	if (ar.len > allocated)
4024		ar.len = allocated;
4025
4026got_allocated_blocks:
4027	/* try to insert new extent into found leaf and return */
4028	ext4_ext_store_pblock(&newex, newblock + offset);
4029	newex.ee_len = cpu_to_le16(ar.len);
4030	/* Mark uninitialized */
4031	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
4032		ext4_ext_mark_uninitialized(&newex);
4033		/*
4034		 * An io_end structure was created for every IO write to an
4035		 * uninitialized extent. To avoid unnecessary conversion,
4036		 * here we flag only the IO that really needs the conversion.
4037		 * For the non-async direct IO case, flag the inode state
4038		 * so that we know to perform the conversion when IO is done.
4039		 */
4040		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
4041			if (io) {
4042				if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
4043					io->flag = EXT4_IO_END_UNWRITTEN;
4044					atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
4045				}
4046			} else
4047				ext4_set_inode_state(inode,
4048						     EXT4_STATE_DIO_UNWRITTEN);
4049		}
4050		if (ext4_should_dioread_nolock(inode))
4051			map->m_flags |= EXT4_MAP_UNINIT;
4052	}
4053
4054	err = 0;
4055	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4056		err = check_eofblocks_fl(handle, inode, map->m_lblk,
4057					 path, ar.len);
4058	if (!err)
4059		err = ext4_ext_insert_extent(handle, inode, path,
4060					     &newex, flags);
4061	if (err && free_on_err) {
4062		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4063			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4064		/* free data blocks we just allocated */
4065		/* not a good idea to call discard here directly,
4066		 * but otherwise we'd need to call it on every free() */
4067		ext4_discard_preallocations(inode);
4068		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
4069				 ext4_ext_get_actual_len(&newex), fb_flags);
4070		goto out2;
4071	}
4072
4073	/* previous routine could use block we allocated */
4074	newblock = ext4_ext_pblock(&newex);
4075	allocated = ext4_ext_get_actual_len(&newex);
4076	if (allocated > map->m_len)
4077		allocated = map->m_len;
4078	map->m_flags |= EXT4_MAP_NEW;
4079
4080	/*
4081	 * Update reserved blocks/metadata blocks after successful
4082	 * block allocation which had been deferred till now.
4083	 */
4084	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4085		unsigned int reserved_clusters;
4086		/*
4087		 * Check how many clusters we had reserved this allocated range
4088		 * Check how many clusters we had reserved for this allocated range
4089		reserved_clusters = get_reserved_cluster_alloc(inode,
4090						map->m_lblk, allocated);
4091		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4092			if (reserved_clusters) {
4093				/*
4094				 * We have clusters reserved for this range.
4095				 * But since we are not doing actual allocation
4096				 * and are simply using blocks from previously
4097				 * allocated cluster, we should release the
4098				 * reservation and not claim quota.
4099				 */
4100				ext4_da_update_reserve_space(inode,
4101						reserved_clusters, 0);
4102			}
4103		} else {
4104			BUG_ON(allocated_clusters < reserved_clusters);
4105			/* We will claim quota for all newly allocated blocks.*/
4106			ext4_da_update_reserve_space(inode, allocated_clusters,
4107							1);
4108			if (reserved_clusters < allocated_clusters) {
4109				struct ext4_inode_info *ei = EXT4_I(inode);
4110				int reservation = allocated_clusters -
4111						  reserved_clusters;
4112				/*
4113				 * It seems we claimed some clusters outside of
4114				 * the range of this allocation. We should give
4115				 * them back to the reservation pool. This can
4116				 * happen in the following case:
4117				 *
4118				 * * Suppose s_cluster_ratio is 4 (i.e., each
4119				 *   cluster has 4 blocks). Thus, the clusters
4120				 *   are [0-3],[4-7],[8-11]...
4121				 * * First comes a delayed allocation write for
4122				 *   logical blocks 10 & 11. Since there were no
4123				 *   previous delayed allocated blocks in the
4124				 *   range [8-11], we would reserve 1 cluster
4125				 *   for this write.
4126				 * * Next comes a write for logical blocks 3 to 8.
4127				 *   In this case, we will reserve 2 clusters
4128				 *   (for [0-3] and [4-7]; and not for [8-11] as
4129				 *   that range already has delayed allocated blocks).
4130				 *   Thus total reserved clusters now becomes 3.
4131				 * * Now, during the delayed allocation writeout
4132				 *   time, we will first write blocks [3-8] and
4133				 *   allocate 3 clusters for writing these
4134				 *   blocks. We would also claim all three of
4135				 *   these clusters.
4136				 * * Now when we come here to write out the
4137				 *   blocks [10-11], we would expect to claim
4138				 *   the reservation of 1 cluster we had made
4139				 *   (and we would claim it since there are no
4140				 *   more delayed allocated blocks in the range
4141				 *   [8-11]). But our reserved cluster count had
4142				 *   already gone to 0.
4143				 *
4144				 *   Thus, in the last step above, when we determine
4145				 *   that there are still some unwritten delayed
4146				 *   allocated blocks outside of our current
4147				 *   block range, we should increment the
4148				 *   reserved cluster count so that when the
4149				 *   remaining blocks finally get written, we
4150				 *   can claim them.
4151				 */
4152				dquot_reserve_block(inode,
4153						EXT4_C2B(sbi, reservation));
4154				spin_lock(&ei->i_block_reservation_lock);
4155				ei->i_reserved_data_blocks += reservation;
4156				spin_unlock(&ei->i_block_reservation_lock);
4157			}
4158		}
4159	}
4160
4161	/*
4162	 * Cache the extent and update transaction to commit on fdatasync only
4163	 * when it is _not_ an uninitialized extent.
4164	 */
4165	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
4166		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
4167		ext4_update_inode_fsync_trans(handle, inode, 1);
4168	} else
4169		ext4_update_inode_fsync_trans(handle, inode, 0);
4170out:
4171	if (allocated > map->m_len)
4172		allocated = map->m_len;
4173	ext4_ext_show_leaf(inode, path);
4174	map->m_flags |= EXT4_MAP_MAPPED;
4175	map->m_pblk = newblock;
4176	map->m_len = allocated;
4177out2:
4178	if (path) {
4179		ext4_ext_drop_refs(path);
4180		kfree(path);
4181	}
4182	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4183		newblock, map->m_len, err ? err : allocated);
4184
4185	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
4186			punched_out : allocated;
4187
4188	return err ? err : result;
4189}
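
/*
 * Illustrative sketch only (not compiled, not part of ext4): how a caller
 * might consume the return-value contract of ext4_ext_map_blocks()
 * documented above.  The helper name and the flag choice are hypothetical.
 */
#if 0
static int example_map_one_range(handle_t *handle, struct inode *inode,
				 struct ext4_map_blocks *map)
{
	int ret = ext4_ext_map_blocks(handle, inode, map,
				      EXT4_GET_BLOCKS_CREATE);

	if (ret < 0)		/* error case */
		return ret;
	if (ret == 0)		/* hole: no blocks were mapped */
		return 0;
	return ret;		/* number of blocks mapped starting at map->m_pblk */
}
#endif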
4190
4191void ext4_ext_truncate(struct inode *inode)
4192{
4193	struct address_space *mapping = inode->i_mapping;
4194	struct super_block *sb = inode->i_sb;
4195	ext4_lblk_t last_block;
4196	handle_t *handle;
4197	loff_t page_len;
4198	int err = 0;
4199
4200	/*
4201	 * finish any pending end_io work so we won't run the risk of
4202	 * converting any truncated blocks to initialized later
4203	 */
4204	ext4_flush_completed_IO(inode);
4205
4206	/*
4207	 * probably the first extent we are going to free will be the last in the block
4208	 */
4209	err = ext4_writepage_trans_blocks(inode);
4210	handle = ext4_journal_start(inode, err);
4211	if (IS_ERR(handle))
4212		return;
4213
4214	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
4215		page_len = PAGE_CACHE_SIZE -
4216			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4217
4218		err = ext4_discard_partial_page_buffers(handle,
4219			mapping, inode->i_size, page_len, 0);
4220
4221		if (err)
4222			goto out_stop;
4223	}
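	/*
	 * Worked example (illustrative numbers only): with i_size = 5000 and
	 * PAGE_CACHE_SIZE = 4096, page_len = 4096 - (5000 & 4095) = 3192, so
	 * bytes 5000..8191 of the last page are zeroed and their buffer heads
	 * unmapped before the blocks beyond i_size are freed.
	 */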
4224
4225	if (ext4_orphan_add(handle, inode))
4226		goto out_stop;
4227
4228	down_write(&EXT4_I(inode)->i_data_sem);
4229	ext4_ext_invalidate_cache(inode);
4230
4231	ext4_discard_preallocations(inode);
4232
4233	/*
4234	 * TODO: optimization is possible here.
4235	 * Probably we need not scan at all,
4236	 * because page truncation is enough.
4237	 */
4238
4239	/* we have to know where to truncate from in crash case */
4240	EXT4_I(inode)->i_disksize = inode->i_size;
4241	ext4_mark_inode_dirty(handle, inode);
4242
4243	last_block = (inode->i_size + sb->s_blocksize - 1)
4244			>> EXT4_BLOCK_SIZE_BITS(sb);
4245	err = ext4_ext_remove_space(inode, last_block);
4246
4247	/* In a multi-transaction truncate, we only make the final
4248	 * transaction synchronous.
4249	 */
4250	if (IS_SYNC(inode))
4251		ext4_handle_sync(handle);
4252
4253	up_write(&EXT4_I(inode)->i_data_sem);
4254
4255out_stop:
4256	/*
4257	 * If this was a simple ftruncate() and the file will remain alive,
4258	 * then we need to clear up the orphan record which we created above.
4259	 * However, if this was a real unlink then we were called by
4260	 * ext4_delete_inode(), and we allow that function to clean up the
4261	 * orphan info for us.
4262	 */
4263	if (inode->i_nlink)
4264		ext4_orphan_del(handle, inode);
4265
4266	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4267	ext4_mark_inode_dirty(handle, inode);
4268	ext4_journal_stop(handle);
4269}
4270
4271static void ext4_falloc_update_inode(struct inode *inode,
4272				int mode, loff_t new_size, int update_ctime)
4273{
4274	struct timespec now;
4275
4276	if (update_ctime) {
4277		now = current_fs_time(inode->i_sb);
4278		if (!timespec_equal(&inode->i_ctime, &now))
4279			inode->i_ctime = now;
4280	}
4281	/*
4282	 * Update only when preallocation was requested beyond
4283	 * the file size.
4284	 */
4285	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
4286		if (new_size > i_size_read(inode))
4287			i_size_write(inode, new_size);
4288		if (new_size > EXT4_I(inode)->i_disksize)
4289			ext4_update_i_disksize(inode, new_size);
4290	} else {
4291		/*
4292		 * Mark that we allocate beyond EOF so the subsequent truncate
4293		 * can proceed even if the new size is the same as i_size.
4294		 */
4295		if (new_size > i_size_read(inode))
4296			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4297	}
4298
4299}
4300
4301/*
4302 * preallocate space for a file. This implements ext4's fallocate file
4303 * operation, which gets called from the sys_fallocate system call.
4304 * For block-mapped files, posix_fallocate should fall back to the method
4305 * of writing zeroes to the required new blocks (the same behavior that is
4306 * expected of file systems which do not support the fallocate() system call).
4307 */
4308long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4309{
4310	struct inode *inode = file->f_path.dentry->d_inode;
4311	handle_t *handle;
4312	loff_t new_size;
4313	unsigned int max_blocks;
4314	int ret = 0;
4315	int ret2 = 0;
4316	int retries = 0;
4317	int flags;
4318	struct ext4_map_blocks map;
4319	unsigned int credits, blkbits = inode->i_blkbits;
4320
4321	/*
4322	 * currently supporting (pre)allocate mode for extent-based
4323	 * files _only_
4324	 */
4325	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4326		return -EOPNOTSUPP;
4327
4328	/* Return error if mode is not supported */
4329	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
4330		return -EOPNOTSUPP;
4331
4332	if (mode & FALLOC_FL_PUNCH_HOLE)
4333		return ext4_punch_hole(file, offset, len);
4334
4335	trace_ext4_fallocate_enter(inode, offset, len, mode);
4336	map.m_lblk = offset >> blkbits;
4337	/*
4338	 * We can't just convert len to max_blocks: e.g. with blocksize = 4096,
4339	 * offset = 3072 and len = 2048 the request still spans two blocks.
4340	 */
4341	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4342		- map.m_lblk;
4343	/*
4344	 * credits to insert 1 extent into extent tree
4345	 */
4346	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4347	mutex_lock(&inode->i_mutex);
4348	ret = inode_newsize_ok(inode, (len + offset));
4349	if (ret) {
4350		mutex_unlock(&inode->i_mutex);
4351		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4352		return ret;
4353	}
4354	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
4355		EXT4_GET_BLOCKS_NO_NORMALIZE;
4356	if (mode & FALLOC_FL_KEEP_SIZE)
4357		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4358retry:
4359	while (ret >= 0 && ret < max_blocks) {
4360		map.m_lblk = map.m_lblk + ret;
4361		map.m_len = max_blocks = max_blocks - ret;
4362		handle = ext4_journal_start(inode, credits);
4363		if (IS_ERR(handle)) {
4364			ret = PTR_ERR(handle);
4365			break;
4366		}
4367		ret = ext4_map_blocks(handle, inode, &map, flags);
4368		if (ret <= 0) {
4369#ifdef EXT4FS_DEBUG
4370			WARN_ON(ret <= 0);
4371			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4372				    "returned error inode#%lu, block=%u, "
4373				    "max_blocks=%u\n", __func__,
4374				    inode->i_ino, map.m_lblk, max_blocks);
4375#endif
4376			ext4_mark_inode_dirty(handle, inode);
4377			ret2 = ext4_journal_stop(handle);
4378			break;
4379		}
4380		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4381						blkbits) >> blkbits))
4382			new_size = offset + len;
4383		else
4384			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4385
4386		ext4_falloc_update_inode(inode, mode, new_size,
4387					 (map.m_flags & EXT4_MAP_NEW));
4388		ext4_mark_inode_dirty(handle, inode);
4389		ret2 = ext4_journal_stop(handle);
4390		if (ret2)
4391			break;
4392	}
4393	if (ret == -ENOSPC &&
4394			ext4_should_retry_alloc(inode->i_sb, &retries)) {
4395		ret = 0;
4396		goto retry;
4397	}
4398	mutex_unlock(&inode->i_mutex);
4399	trace_ext4_fallocate_exit(inode, offset, max_blocks,
4400				ret > 0 ? ret2 : ret);
4401	return ret > 0 ? ret2 : ret;
4402}
4403
4404/*
4405 * This function converts a range of blocks to written extents.
4406 * The caller of this function will pass the start offset and the size.
4407 * All unwritten extents within this range will be converted to
4408 * written extents.
4409 *
4410 * This function is called from the direct IO end io callback
4411 * function, to convert the fallocated extents after IO is completed.
4412 * Returns 0 on success.
4413 */
4414int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4415				    ssize_t len)
4416{
4417	handle_t *handle;
4418	unsigned int max_blocks;
4419	int ret = 0;
4420	int ret2 = 0;
4421	struct ext4_map_blocks map;
4422	unsigned int credits, blkbits = inode->i_blkbits;
4423
4424	map.m_lblk = offset >> blkbits;
4425	/*
4426	 * We can't just convert len to max_blocks: e.g. with blocksize = 4096,
4427	 * offset = 3072 and len = 2048 the request still spans two blocks.
4428	 */
4429	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
4430		      map.m_lblk);
4431	/*
4432	 * credits to insert 1 extent into extent tree
4433	 */
4434	credits = ext4_chunk_trans_blocks(inode, max_blocks);
4435	while (ret >= 0 && ret < max_blocks) {
4436		map.m_lblk += ret;
4437		map.m_len = (max_blocks -= ret);
4438		handle = ext4_journal_start(inode, credits);
4439		if (IS_ERR(handle)) {
4440			ret = PTR_ERR(handle);
4441			break;
4442		}
4443		ret = ext4_map_blocks(handle, inode, &map,
4444				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4445		if (ret <= 0) {
4446			WARN_ON(ret <= 0);
4447			printk(KERN_ERR "%s: ext4_ext_map_blocks "
4448				    "returned error inode#%lu, block=%u, "
4449				    "max_blocks=%u\n", __func__,
4450				    inode->i_ino, map.m_lblk, map.m_len);
4451		}
4452		ext4_mark_inode_dirty(handle, inode);
4453		ret2 = ext4_journal_stop(handle);
4454		if (ret <= 0 || ret2)
4455			break;
4456	}
4457	return ret > 0 ? ret2 : ret;
4458}
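
/*
 * Illustrative sketch only (not compiled, not part of ext4): a direct IO
 * completion path that knows the byte offset and size of the finished write
 * could convert the covered unwritten extents like this.  The wrapper name
 * is hypothetical.
 */
#if 0
static int example_convert_after_dio(struct inode *inode, loff_t offset,
				     ssize_t size)
{
	/* convert every unwritten extent in [offset, offset + size) */
	return ext4_convert_unwritten_extents(inode, offset, size);
}
#endif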
4459
4460/*
4461 * Callback function called for each extent to gather FIEMAP information.
4462 */
4463static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
4464		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
4465		       void *data)
4466{
4467	__u64	logical;
4468	__u64	physical;
4469	__u64	length;
4470	__u32	flags = 0;
4471	int		ret = 0;
4472	struct fiemap_extent_info *fieinfo = data;
4473	unsigned char blksize_bits;
4474
4475	blksize_bits = inode->i_sb->s_blocksize_bits;
4476	logical = (__u64)newex->ec_block << blksize_bits;
4477
4478	if (newex->ec_start == 0) {
4479		/*
4480		 * No extent in the extent tree contains block @newex->ec_start,
4481		 * so the block may lie in 1) a hole or 2) a delayed extent.
4482		 *
4483		 * Holes or delayed extents are processed as follows:
4484		 * 1. look up dirty pages with the specified range in the pagecache.
4485		 *    If no page is found, there is no delayed extent and we
4486		 *    return with EXT_CONTINUE.
4487		 * 2. find the 1st mapped buffer,
4488		 * 3. check whether the mapped buffer is both in the request range
4489		 *    and a delayed buffer. If not, there is no delayed extent,
4490		 *    so return.
4491		 * 4. a delayed extent is found; the extent will be collected.
4492		 */
4493		ext4_lblk_t	end = 0;
4494		pgoff_t		last_offset;
4495		pgoff_t		offset;
4496		pgoff_t		index;
4497		pgoff_t		start_index = 0;
4498		struct page	**pages = NULL;
4499		struct buffer_head *bh = NULL;
4500		struct buffer_head *head = NULL;
4501		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
4502
4503		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4504		if (pages == NULL)
4505			return -ENOMEM;
4506
4507		offset = logical >> PAGE_SHIFT;
4508repeat:
4509		last_offset = offset;
4510		head = NULL;
4511		ret = find_get_pages_tag(inode->i_mapping, &offset,
4512					PAGECACHE_TAG_DIRTY, nr_pages, pages);
4513
4514		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4515			/* First time, try to find a mapped buffer. */
4516			if (ret == 0) {
4517out:
4518				for (index = 0; index < ret; index++)
4519					page_cache_release(pages[index]);
4520				/* just a hole. */
4521				kfree(pages);
4522				return EXT_CONTINUE;
4523			}
4524			index = 0;
4525
4526next_page:
4527			/* Try to find the 1st mapped buffer. */
4528			end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
4529				  blksize_bits;
4530			if (!page_has_buffers(pages[index]))
4531				goto out;
4532			head = page_buffers(pages[index]);
4533			if (!head)
4534				goto out;
4535
4536			index++;
4537			bh = head;
4538			do {
4539				if (end >= newex->ec_block +
4540					newex->ec_len)
4541					/* The buffer is out of
4542					 * the request range.
4543					 */
4544					goto out;
4545
4546				if (buffer_mapped(bh) &&
4547				    end >= newex->ec_block) {
4548					start_index = index - 1;
4549					/* get the 1st mapped buffer. */
4550					goto found_mapped_buffer;
4551				}
4552
4553				bh = bh->b_this_page;
4554				end++;
4555			} while (bh != head);
4556
4557			/* No mapped buffer in the range was found in this page,
4558			 * so we need to look up the next page.
4559			 */
4560			if (index >= ret) {
4561				/* There is no page left, but we need to limit
4562				 * newex->ec_len.
4563				 */
4564				newex->ec_len = end - newex->ec_block;
4565				goto out;
4566			}
4567			goto next_page;
4568		} else {
4569			/* Find contiguous delayed buffers. */
4570			if (ret > 0 && pages[0]->index == last_offset)
4571				head = page_buffers(pages[0]);
4572			bh = head;
4573			index = 1;
4574			start_index = 0;
4575		}
4576
4577found_mapped_buffer:
4578		if (bh != NULL && buffer_delay(bh)) {
4579			/* 1st or contiguous delayed buffer found. */
4580			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4581				/*
4582				 * 1st delayed buffer found, record
4583				 * the start of extent.
4584				 */
4585				flags |= FIEMAP_EXTENT_DELALLOC;
4586				newex->ec_block = end;
4587				logical = (__u64)end << blksize_bits;
4588			}
4589			/* Find contiguous delayed buffers. */
4590			do {
4591				if (!buffer_delay(bh))
4592					goto found_delayed_extent;
4593				bh = bh->b_this_page;
4594				end++;
4595			} while (bh != head);
4596
4597			for (; index < ret; index++) {
4598				if (!page_has_buffers(pages[index])) {
4599					bh = NULL;
4600					break;
4601				}
4602				head = page_buffers(pages[index]);
4603				if (!head) {
4604					bh = NULL;
4605					break;
4606				}
4607
4608				if (pages[index]->index !=
4609				    pages[start_index]->index + index
4610				    - start_index) {
4611					/* Blocks are not contiguous. */
4612					bh = NULL;
4613					break;
4614				}
4615				bh = head;
4616				do {
4617					if (!buffer_delay(bh))
4618						/* Delayed-extent ends. */
4619						goto found_delayed_extent;
4620					bh = bh->b_this_page;
4621					end++;
4622				} while (bh != head);
4623			}
4624		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4625			/* a hole found. */
4626			goto out;
4627
4628found_delayed_extent:
4629		newex->ec_len = min(end - newex->ec_block,
4630						(ext4_lblk_t)EXT_INIT_MAX_LEN);
4631		if (ret == nr_pages && bh != NULL &&
4632			newex->ec_len < EXT_INIT_MAX_LEN &&
4633			buffer_delay(bh)) {
4634			/* Have not collected a full extent yet; continue. */
4635			for (index = 0; index < ret; index++)
4636				page_cache_release(pages[index]);
4637			goto repeat;
4638		}
4639
4640		for (index = 0; index < ret; index++)
4641			page_cache_release(pages[index]);
4642		kfree(pages);
4643	}
4644
4645	physical = (__u64)newex->ec_start << blksize_bits;
4646	length =   (__u64)newex->ec_len << blksize_bits;
4647
4648	if (ex && ext4_ext_is_uninitialized(ex))
4649		flags |= FIEMAP_EXTENT_UNWRITTEN;
4650
4651	if (next == EXT_MAX_BLOCKS)
4652		flags |= FIEMAP_EXTENT_LAST;
4653
4654	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4655					length, flags);
4656	if (ret < 0)
4657		return ret;
4658	if (ret == 1)
4659		return EXT_BREAK;
4660	return EXT_CONTINUE;
4661}

4662/* fiemap flags we can handle are specified here */
4663#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4664
4665static int ext4_xattr_fiemap(struct inode *inode,
4666				struct fiemap_extent_info *fieinfo)
4667{
4668	__u64 physical = 0;
4669	__u64 length;
4670	__u32 flags = FIEMAP_EXTENT_LAST;
4671	int blockbits = inode->i_sb->s_blocksize_bits;
4672	int error = 0;
4673
4674	/* in-inode? */
4675	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4676		struct ext4_iloc iloc;
4677		int offset;	/* offset of xattr in inode */
4678
4679		error = ext4_get_inode_loc(inode, &iloc);
4680		if (error)
4681			return error;
4682		physical = iloc.bh->b_blocknr << blockbits;
4683		offset = EXT4_GOOD_OLD_INODE_SIZE +
4684				EXT4_I(inode)->i_extra_isize;
4685		physical += offset;
4686		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4687		flags |= FIEMAP_EXTENT_DATA_INLINE;
4688		brelse(iloc.bh);
4689	} else { /* external block */
4690		physical = EXT4_I(inode)->i_file_acl << blockbits;
4691		length = inode->i_sb->s_blocksize;
4692	}
4693
4694	if (physical)
4695		error = fiemap_fill_next_extent(fieinfo, 0, physical,
4696						length, flags);
4697	return (error < 0 ? error : 0);
4698}
4699
4700/*
4701 * ext4_ext_punch_hole
4702 *
4703 * Punches a hole of "length" bytes in a file starting
4704 * at byte "offset"
4705 *
4706 * @inode:  The inode of the file to punch a hole in
4707 * @offset: The starting byte offset of the hole
4708 * @length: The length of the hole
4709 *
4710 * Returns the number of blocks removed or negative on error
4711 */
4712int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4713{
4714	struct inode *inode = file->f_path.dentry->d_inode;
4715	struct super_block *sb = inode->i_sb;
4716	struct ext4_ext_cache cache_ex;
4717	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4718	struct address_space *mapping = inode->i_mapping;
4719	struct ext4_map_blocks map;
4720	handle_t *handle;
4721	loff_t first_page, last_page, page_len;
4722	loff_t first_page_offset, last_page_offset;
4723	int ret, credits, blocks_released, err = 0;
4724
4725	/* No need to punch hole beyond i_size */
4726	if (offset >= inode->i_size)
4727		return 0;
4728
4729	/*
4730	 * If the hole extends beyond i_size, set the hole
4731	 * to end after the page that contains i_size
4732	 */
4733	if (offset + length > inode->i_size) {
4734		length = inode->i_size +
4735		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4736		   offset;
4737	}
4738
4739	first_block = (offset + sb->s_blocksize - 1) >>
4740		EXT4_BLOCK_SIZE_BITS(sb);
4741	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4742
4743	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4744	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4745
4746	first_page_offset = first_page << PAGE_CACHE_SHIFT;
4747	last_page_offset = last_page << PAGE_CACHE_SHIFT;
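	/*
	 * Worked example (illustrative numbers only, assuming 4096-byte blocks
	 * and pages): offset = 5000 and length = 12000 give first_block = 2,
	 * last_block = 4, first_page = 2 and last_page = 4, so pages covering
	 * bytes 8192..16383 are truncated, blocks 2 and 3 are punched out, and
	 * the partial ranges 5000..8191 and 16384..16999 are zeroed below via
	 * ext4_discard_partial_page_buffers().
	 */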
4748
4749	/*
4750	 * Write out all dirty pages to avoid race conditions,
4751	 * then release them.
4752	 */
4753	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4754		err = filemap_write_and_wait_range(mapping,
4755			offset, offset + length - 1);
4756
4757		if (err)
4758			return err;
4759	}
4760
4761	/* Now release the pages */
4762	if (last_page_offset > first_page_offset) {
4763		truncate_inode_pages_range(mapping, first_page_offset,
4764					   last_page_offset-1);
4765	}
4766
4767	/* finish any pending end_io work */
4768	ext4_flush_completed_IO(inode);
4769
4770	credits = ext4_writepage_trans_blocks(inode);
4771	handle = ext4_journal_start(inode, credits);
4772	if (IS_ERR(handle))
4773		return PTR_ERR(handle);
4774
4775	err = ext4_orphan_add(handle, inode);
4776	if (err)
4777		goto out;
4778
4779	/*
4780	 * Now we need to zero out the non-page-aligned data in the
4781	 * pages at the start and tail of the hole, and unmap the buffer
4782	 * heads for the block aligned regions of the page that were
4783	 * completely zeroed.
4784	 */
4785	if (first_page > last_page) {
4786		/*
4787		 * If the file space being truncated is contained within a page,
4788		 * just zero out and unmap the middle of that page
4789		 */
4790		err = ext4_discard_partial_page_buffers(handle,
4791			mapping, offset, length, 0);
4792
4793		if (err)
4794			goto out;
4795	} else {
4796		/*
4797		 * zero out and unmap the partial page that contains
4798		 * the start of the hole
4799		 */
4800		page_len  = first_page_offset - offset;
4801		if (page_len > 0) {
4802			err = ext4_discard_partial_page_buffers(handle, mapping,
4803						   offset, page_len, 0);
4804			if (err)
4805				goto out;
4806		}
4807
4808		/*
4809		 * zero out and unmap the partial page that contains
4810		 * the end of the hole
4811		 */
4812		page_len = offset + length - last_page_offset;
4813		if (page_len > 0) {
4814			err = ext4_discard_partial_page_buffers(handle, mapping,
4815					last_page_offset, page_len, 0);
4816			if (err)
4817				goto out;
4818		}
4819	}
4820
4821
4822	/*
4823	 * If i_size is contained in the last page, we need to
4824	 * unmap and zero the partial page after i_size
4825	 */
4826	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4827	   inode->i_size % PAGE_CACHE_SIZE != 0) {
4828
4829		page_len = PAGE_CACHE_SIZE -
4830			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4831
4832		if (page_len > 0) {
4833			err = ext4_discard_partial_page_buffers(handle,
4834			  mapping, inode->i_size, page_len, 0);
4835
4836			if (err)
4837				goto out;
4838		}
4839	}
4840
4841	/* If there are no blocks to remove, return now */
4842	if (first_block >= last_block)
4843		goto out;
4844
4845	down_write(&EXT4_I(inode)->i_data_sem);
4846	ext4_ext_invalidate_cache(inode);
4847	ext4_discard_preallocations(inode);
4848
4849	/*
4850	 * Loop over all the blocks and identify blocks
4851	 * that need to be punched out
4852	 */
4853	iblock = first_block;
4854	blocks_released = 0;
4855	while (iblock < last_block) {
4856		max_blocks = last_block - iblock;
4857		num_blocks = 1;
4858		memset(&map, 0, sizeof(map));
4859		map.m_lblk = iblock;
4860		map.m_len = max_blocks;
4861		ret = ext4_ext_map_blocks(handle, inode, &map,
4862			EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4863
4864		if (ret > 0) {
4865			blocks_released += ret;
4866			num_blocks = ret;
4867		} else if (ret == 0) {
4868			/*
4869			 * If map blocks could not find the block,
4870			 * then it is in a hole.  If the hole was
4871			 * not already cached, then map blocks should
4872			 * put it in the cache, so we can get the hole
4873			 * out of the cache.
4874			 */
4875			memset(&cache_ex, 0, sizeof(cache_ex));
4876			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4877				!cache_ex.ec_start) {
4878
4879				/* The hole is cached */
4880				num_blocks = cache_ex.ec_block +
4881				cache_ex.ec_len - iblock;
4882
4883			} else {
4884				/* The block could not be identified */
4885				err = -EIO;
4886				break;
4887			}
4888		} else {
4889			/* Map blocks error */
4890			err = ret;
4891			break;
4892		}
4893
4894		if (num_blocks == 0) {
4895			/* This condition should never happen */
4896			ext_debug("Block lookup failed");
4897			err = -EIO;
4898			break;
4899		}
4900
4901		iblock += num_blocks;
4902	}
4903
4904	if (blocks_released > 0) {
4905		ext4_ext_invalidate_cache(inode);
4906		ext4_discard_preallocations(inode);
4907	}
4908
4909	if (IS_SYNC(inode))
4910		ext4_handle_sync(handle);
4911
4912	up_write(&EXT4_I(inode)->i_data_sem);
4913
4914out:
4915	ext4_orphan_del(handle, inode);
4916	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4917	ext4_mark_inode_dirty(handle, inode);
4918	ext4_journal_stop(handle);
4919	return err;
4920}

4921int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4922		__u64 start, __u64 len)
4923{
4924	ext4_lblk_t start_blk;
4925	int error = 0;
4926
4927	/* fallback to generic here if not in extents fmt */
4928	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4929		return generic_block_fiemap(inode, fieinfo, start, len,
4930			ext4_get_block);
4931
4932	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
4933		return -EBADR;
4934
4935	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4936		error = ext4_xattr_fiemap(inode, fieinfo);
4937	} else {
4938		ext4_lblk_t len_blks;
4939		__u64 last_blk;
4940
4941		start_blk = start >> inode->i_sb->s_blocksize_bits;
4942		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4943		if (last_blk >= EXT_MAX_BLOCKS)
4944			last_blk = EXT_MAX_BLOCKS-1;
4945		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
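		/*
		 * Worked example (illustrative numbers only, 4096-byte blocks):
		 * start = 10000 and len = 50000 give start_blk = 2,
		 * last_blk = 59999 >> 12 = 14 and len_blks = 13, so the walk
		 * below covers logical blocks 2..14.
		 */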
4946
4947		/*
4948		 * Walk the extent tree gathering extent information.
4949		 * ext4_ext_fiemap_cb will push extents back to user.
4950		 */
4951		error = ext4_ext_walk_space(inode, start_blk, len_blks,
4952					  ext4_ext_fiemap_cb, fieinfo);
4953	}
4954
4955	return error;
4956}
4957