extents.c revision 58590b06d79f7ce5ab64ff3b6d537180fa50dc84
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
44#include "ext4_jbd2.h"
45#include "ext4_extents.h"
46
47
48/*
49 * ext_pblock:
50 * combine low and high parts of physical block number into ext4_fsblk_t
51 */
52ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
53{
54	ext4_fsblk_t block;
55
56	block = le32_to_cpu(ex->ee_start_lo);
57	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
58	return block;
59}
60
61/*
62 * idx_pblock:
63 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
64 */
65ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
66{
67	ext4_fsblk_t block;
68
69	block = le32_to_cpu(ix->ei_leaf_lo);
70	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
71	return block;
72}
73
74/*
75 * ext4_ext_store_pblock:
76 * stores a large physical block number into an extent struct,
77 * breaking it into parts
78 */
79void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
80{
81	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
82	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
83}
84
85/*
86 * ext4_idx_store_pblock:
87 * stores a large physical block number into an index struct,
88 * breaking it into parts
89 */
90static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91{
92	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
93	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94}
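
/*
 * Example (illustrative): how a 48-bit physical block number
 * round-trips through the lo/hi split implemented above:
 *
 *	ext4_fsblk_t pb = 0x123456789aULL;	(needs more than 32 bits)
 *	ext4_ext_store_pblock(ex, pb);
 *	ee_start_lo == 0x3456789a, ee_start_hi == 0x0012, and
 *	ext_pblock(ex) == (0x0012ULL << 32) | 0x3456789a == pb
 *
 * The "<< 31) << 1" two-step shift keeps the expression well defined
 * even if the block type were only 32 bits wide, where a single
 * shift by 32 would be undefined behaviour.
 */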
95
96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97					    struct inode *inode,
98					    int needed)
99{
100	int err;
101
102	if (!ext4_handle_valid(handle))
103		return 0;
104	if (handle->h_buffer_credits > needed)
105		return 0;
106	err = ext4_journal_extend(handle, needed);
107	if (err <= 0)
108		return err;
109	err = ext4_truncate_restart_trans(handle, inode, needed);
110	if (err == 0)
111		err = -EAGAIN;
112
113	return err;
114}
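
/*
 * Sketch of the intended caller pattern for the helper above (the
 * actual callers are in the truncate path further down this file):
 * check credits before each tree modification, and treat -EAGAIN as
 * "the transaction was restarted, cached state may be stale":
 *
 *	err = ext4_ext_truncate_extend_restart(handle, inode, credits);
 *	if (err == -EAGAIN)
 *		goto again;	(redo the lookup under the new handle)
 *	else if (err)
 *		goto out;	(could neither extend nor restart)
 */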
115
116/*
117 * could return:
118 *  - EROFS
119 *  - ENOMEM
120 */
121static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
122				struct ext4_ext_path *path)
123{
124	if (path->p_bh) {
125		/* path points to block */
126		return ext4_journal_get_write_access(handle, path->p_bh);
127	}
128	/* path points to leaf/index in inode body */
129	/* we use in-core data, no need to protect them */
130	return 0;
131}
132
133/*
134 * could return:
135 *  - EROFS
136 *  - ENOMEM
137 *  - EIO
138 */
139static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
140				struct ext4_ext_path *path)
141{
142	int err;
143	if (path->p_bh) {
144		/* path points to block */
145		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
146	} else {
147		/* path points to leaf/index in inode body */
148		err = ext4_mark_inode_dirty(handle, inode);
149	}
150	return err;
151}
152
153static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
154			      struct ext4_ext_path *path,
155			      ext4_lblk_t block)
156{
157	struct ext4_inode_info *ei = EXT4_I(inode);
158	ext4_fsblk_t bg_start;
159	ext4_fsblk_t last_block;
160	ext4_grpblk_t colour;
161	ext4_group_t block_group;
162	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
163	int depth;
164
165	if (path) {
166		struct ext4_extent *ex;
167		depth = path->p_depth;
168
169		/* try to predict block placement */
170		ex = path[depth].p_ext;
171		if (ex)
172			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
173
174		/* it looks like index is empty;
175		 * try to find starting block from index itself */
176		if (path[depth].p_bh)
177			return path[depth].p_bh->b_blocknr;
178	}
179
180	/* OK. use inode's group */
181	block_group = ei->i_block_group;
182	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
183		/*
184		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
185		 * block groups per flexgroup, reserve the first block
186		 * group for directories and special files.  Regular
187		 * files will start at the second block group.  This
188		 * tends to speed up directory access and improves
189		 * fsck times.
190		 */
191		block_group &= ~(flex_size-1);
192		if (S_ISREG(inode->i_mode))
193			block_group++;
194	}
195	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
196	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
197
198	/*
199	 * If we are doing delayed allocation, we don't need to take
200	 * colour into account.
201	 */
202	if (test_opt(inode->i_sb, DELALLOC))
203		return bg_start;
204
205	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
206		colour = (current->pid % 16) *
207			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
208	else
209		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
210	return bg_start + colour + block;
211}
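
/*
 * Worked example for the goal logic above (illustrative numbers):
 * with flex_size = 16 and a regular file whose inode lives in block
 * group 21, the group is rounded down to the flexgroup start
 * (21 & ~15 = 16) and then bumped to 17, past the group reserved for
 * directories.  Without delalloc, the caller's pid then picks one of
 * 16 equal slices of the group as "colour", so concurrent writers
 * tend to spread out instead of racing for the same blocks.
 */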
212
213/*
214 * Allocation for a metadata block
215 */
216static ext4_fsblk_t
217ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
218			struct ext4_ext_path *path,
219			struct ext4_extent *ex, int *err)
220{
221	ext4_fsblk_t goal, newblock;
222
223	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
224	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
225	return newblock;
226}
227
228static inline int ext4_ext_space_block(struct inode *inode, int check)
229{
230	int size;
231
232	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
233			/ sizeof(struct ext4_extent);
234	if (!check) {
235#ifdef AGGRESSIVE_TEST
236		if (size > 6)
237			size = 6;
238#endif
239	}
240	return size;
241}
242
243static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
244{
245	int size;
246
247	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
248			/ sizeof(struct ext4_extent_idx);
249	if (!check) {
250#ifdef AGGRESSIVE_TEST
251		if (size > 5)
252			size = 5;
253#endif
254	}
255	return size;
256}
257
258static inline int ext4_ext_space_root(struct inode *inode, int check)
259{
260	int size;
261
262	size = sizeof(EXT4_I(inode)->i_data);
263	size -= sizeof(struct ext4_extent_header);
264	size /= sizeof(struct ext4_extent);
265	if (!check) {
266#ifdef AGGRESSIVE_TEST
267		if (size > 3)
268			size = 3;
269#endif
270	}
271	return size;
272}
273
274static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
275{
276	int size;
277
278	size = sizeof(EXT4_I(inode)->i_data);
279	size -= sizeof(struct ext4_extent_header);
280	size /= sizeof(struct ext4_extent_idx);
281	if (!check) {
282#ifdef AGGRESSIVE_TEST
283		if (size > 4)
284			size = 4;
285#endif
286	}
287	return size;
288}
289
290/*
291 * Calculate the number of metadata blocks needed
292 * to allocate the block at @lblock
293 * Worst case is one block per extent
294 */
295int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
296{
297	struct ext4_inode_info *ei = EXT4_I(inode);
298	int idxs, num = 0;
299
300	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
301		/ sizeof(struct ext4_extent_idx));
302
303	/*
304	 * If the new delayed allocation block is contiguous with the
305	 * previous da block, it can share index blocks with the
306	 * previous block, so we only need to allocate a new index
307	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
308	 * an additional index block, and at idxs**3 blocks, yet
309	 * another index block.
310	 */
311	if (ei->i_da_metadata_calc_len &&
312	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
313		if ((ei->i_da_metadata_calc_len % idxs) == 0)
314			num++;
315		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
316			num++;
317		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
318			num++;
319			ei->i_da_metadata_calc_len = 0;
320		} else
321			ei->i_da_metadata_calc_len++;
322		ei->i_da_metadata_calc_last_lblock++;
323		return num;
324	}
325
326	/*
327	 * In the worst case we need a new set of index blocks at
328	 * every level of the inode's extent tree.
329	 */
330	ei->i_da_metadata_calc_len = 1;
331	ei->i_da_metadata_calc_last_lblock = lblock;
332	return ext_depth(inode) + 1;
333}
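
/*
 * Worked numbers for the estimate above, assuming a 4K block size:
 * struct ext4_extent_header and struct ext4_extent_idx are 12 bytes
 * each, so idxs = (4096 - 12) / 12 = 340.  A contiguous delalloc run
 * is therefore charged one extra index block every 340 blocks, a
 * second one every 340^2 blocks, and so on, while a discontiguous
 * block pays the full worst case of ext_depth(inode) + 1 blocks.
 */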
334
335static int
336ext4_ext_max_entries(struct inode *inode, int depth)
337{
338	int max;
339
340	if (depth == ext_depth(inode)) {
341		if (depth == 0)
342			max = ext4_ext_space_root(inode, 1);
343		else
344			max = ext4_ext_space_root_idx(inode, 1);
345	} else {
346		if (depth == 0)
347			max = ext4_ext_space_block(inode, 1);
348		else
349			max = ext4_ext_space_block_idx(inode, 1);
350	}
351
352	return max;
353}
354
355static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
356{
357	ext4_fsblk_t block = ext_pblock(ext);
358	int len = ext4_ext_get_actual_len(ext);
359
360	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
361}
362
363static int ext4_valid_extent_idx(struct inode *inode,
364				struct ext4_extent_idx *ext_idx)
365{
366	ext4_fsblk_t block = idx_pblock(ext_idx);
367
368	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
369}
370
371static int ext4_valid_extent_entries(struct inode *inode,
372				struct ext4_extent_header *eh,
373				int depth)
374{
375	struct ext4_extent *ext;
376	struct ext4_extent_idx *ext_idx;
377	unsigned short entries;
378	if (eh->eh_entries == 0)
379		return 1;
380
381	entries = le16_to_cpu(eh->eh_entries);
382
383	if (depth == 0) {
384		/* leaf entries */
385		ext = EXT_FIRST_EXTENT(eh);
386		while (entries) {
387			if (!ext4_valid_extent(inode, ext))
388				return 0;
389			ext++;
390			entries--;
391		}
392	} else {
393		ext_idx = EXT_FIRST_INDEX(eh);
394		while (entries) {
395			if (!ext4_valid_extent_idx(inode, ext_idx))
396				return 0;
397			ext_idx++;
398			entries--;
399		}
400	}
401	return 1;
402}
403
404static int __ext4_ext_check(const char *function, unsigned int line,
405			    struct inode *inode, struct ext4_extent_header *eh,
406			    int depth)
407{
408	const char *error_msg;
409	int max = 0;
410
411	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
412		error_msg = "invalid magic";
413		goto corrupted;
414	}
415	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
416		error_msg = "unexpected eh_depth";
417		goto corrupted;
418	}
419	if (unlikely(eh->eh_max == 0)) {
420		error_msg = "invalid eh_max";
421		goto corrupted;
422	}
423	max = ext4_ext_max_entries(inode, depth);
424	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
425		error_msg = "too large eh_max";
426		goto corrupted;
427	}
428	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
429		error_msg = "invalid eh_entries";
430		goto corrupted;
431	}
432	if (!ext4_valid_extent_entries(inode, eh, depth)) {
433		error_msg = "invalid extent entries";
434		goto corrupted;
435	}
436	return 0;
437
438corrupted:
439	ext4_error_inode(inode, function, line, 0,
440			"bad header/extent: %s - magic %x, "
441			"entries %u, max %u(%u), depth %u(%u)",
442			error_msg, le16_to_cpu(eh->eh_magic),
443			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
444			max, le16_to_cpu(eh->eh_depth), depth);
445
446	return -EIO;
447}
448
449#define ext4_ext_check(inode, eh, depth)	\
450	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
451
452int ext4_ext_check_inode(struct inode *inode)
453{
454	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
455}
456
457#ifdef EXT_DEBUG
458static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
459{
460	int k, l = path->p_depth;
461
462	ext_debug("path:");
463	for (k = 0; k <= l; k++, path++) {
464		if (path->p_idx) {
465			ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
466				  idx_pblock(path->p_idx));
467		} else if (path->p_ext) {
468			ext_debug("  %d:[%d]%d:%llu ",
469				  le32_to_cpu(path->p_ext->ee_block),
470				  ext4_ext_is_uninitialized(path->p_ext),
471				  ext4_ext_get_actual_len(path->p_ext),
472				  ext_pblock(path->p_ext));
473		} else
474			ext_debug("  []");
475	}
476	ext_debug("\n");
477}
478
479static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
480{
481	int depth = ext_depth(inode);
482	struct ext4_extent_header *eh;
483	struct ext4_extent *ex;
484	int i;
485
486	if (!path)
487		return;
488
489	eh = path[depth].p_hdr;
490	ex = EXT_FIRST_EXTENT(eh);
491
492	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
493
494	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
495		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
496			  ext4_ext_is_uninitialized(ex),
497			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
498	}
499	ext_debug("\n");
500}
501#else
502#define ext4_ext_show_path(inode, path)
503#define ext4_ext_show_leaf(inode, path)
504#endif
505
506void ext4_ext_drop_refs(struct ext4_ext_path *path)
507{
508	int depth = path->p_depth;
509	int i;
510
511	for (i = 0; i <= depth; i++, path++)
512		if (path->p_bh) {
513			brelse(path->p_bh);
514			path->p_bh = NULL;
515		}
516}
517
518/*
519 * ext4_ext_binsearch_idx:
520 * binary search for the closest index of the given block
521 * the header must be checked before calling this
522 */
523static void
524ext4_ext_binsearch_idx(struct inode *inode,
525			struct ext4_ext_path *path, ext4_lblk_t block)
526{
527	struct ext4_extent_header *eh = path->p_hdr;
528	struct ext4_extent_idx *r, *l, *m;
529
530
531	ext_debug("binsearch for %u(idx):  ", block);
532
533	l = EXT_FIRST_INDEX(eh) + 1;
534	r = EXT_LAST_INDEX(eh);
535	while (l <= r) {
536		m = l + (r - l) / 2;
537		if (block < le32_to_cpu(m->ei_block))
538			r = m - 1;
539		else
540			l = m + 1;
541		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
542				m, le32_to_cpu(m->ei_block),
543				r, le32_to_cpu(r->ei_block));
544	}
545
546	path->p_idx = l - 1;
547	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
548		  idx_pblock(path->p_idx));
549
550#ifdef CHECK_BINSEARCH
551	{
552		struct ext4_extent_idx *chix, *ix;
553		int k;
554
555		chix = ix = EXT_FIRST_INDEX(eh);
556		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
557			if (k != 0 &&
558			    le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
559				printk(KERN_DEBUG "k=%d, ix=0x%p, "
560				       "first=0x%p\n", k,
561				       ix, EXT_FIRST_INDEX(eh));
562				printk(KERN_DEBUG "%u <= %u\n",
563				       le32_to_cpu(ix->ei_block),
564				       le32_to_cpu(ix[-1].ei_block));
565			}
566			BUG_ON(k && le32_to_cpu(ix->ei_block)
567					   <= le32_to_cpu(ix[-1].ei_block));
568			if (block < le32_to_cpu(ix->ei_block))
569				break;
570			chix = ix;
571		}
572		BUG_ON(chix != path->p_idx);
573	}
574#endif
575
576}
577
578/*
579 * ext4_ext_binsearch:
580 * binary search for closest extent of the given block
581 * the header must be checked before calling this
582 */
583static void
584ext4_ext_binsearch(struct inode *inode,
585		struct ext4_ext_path *path, ext4_lblk_t block)
586{
587	struct ext4_extent_header *eh = path->p_hdr;
588	struct ext4_extent *r, *l, *m;
589
590	if (eh->eh_entries == 0) {
591		/*
592		 * this leaf is empty:
593		 * we get such a leaf in split/add case
594		 */
595		return;
596	}
597
598	ext_debug("binsearch for %u:  ", block);
599
600	l = EXT_FIRST_EXTENT(eh) + 1;
601	r = EXT_LAST_EXTENT(eh);
602
603	while (l <= r) {
604		m = l + (r - l) / 2;
605		if (block < le32_to_cpu(m->ee_block))
606			r = m - 1;
607		else
608			l = m + 1;
609		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
610				m, le32_to_cpu(m->ee_block),
611				r, le32_to_cpu(r->ee_block));
612	}
613
614	path->p_ext = l - 1;
615	ext_debug("  -> %d:%llu:[%d]%d ",
616			le32_to_cpu(path->p_ext->ee_block),
617			ext_pblock(path->p_ext),
618			ext4_ext_is_uninitialized(path->p_ext),
619			ext4_ext_get_actual_len(path->p_ext));
620
621#ifdef CHECK_BINSEARCH
622	{
623		struct ext4_extent *chex, *ex;
624		int k;
625
626		chex = ex = EXT_FIRST_EXTENT(eh);
627		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
628			BUG_ON(k && le32_to_cpu(ex->ee_block)
629					  <= le32_to_cpu(ex[-1].ee_block));
630			if (block < le32_to_cpu(ex->ee_block))
631				break;
632			chex = ex;
633		}
634		BUG_ON(chex != path->p_ext);
635	}
636#endif
637
638}
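
/*
 * Example of the convention shared by both binary searches above:
 * for a leaf with extents starting at logical blocks 0, 100 and 200,
 * a lookup of block 150 leaves p_ext on the extent starting at 100,
 * i.e. the last entry whose ee_block is <= the requested block.  A
 * block smaller than every entry still lands on the first extent,
 * which is why callers must check coverage afterwards.
 */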
639
640int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
641{
642	struct ext4_extent_header *eh;
643
644	eh = ext_inode_hdr(inode);
645	eh->eh_depth = 0;
646	eh->eh_entries = 0;
647	eh->eh_magic = EXT4_EXT_MAGIC;
648	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
649	ext4_mark_inode_dirty(handle, inode);
650	ext4_ext_invalidate_cache(inode);
651	return 0;
652}
653
654struct ext4_ext_path *
655ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
656					struct ext4_ext_path *path)
657{
658	struct ext4_extent_header *eh;
659	struct buffer_head *bh;
660	short int depth, i, ppos = 0, alloc = 0;
661
662	eh = ext_inode_hdr(inode);
663	depth = ext_depth(inode);
664
665	/* account possible depth increase */
666	if (!path) {
667		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
668				GFP_NOFS);
669		if (!path)
670			return ERR_PTR(-ENOMEM);
671		alloc = 1;
672	}
673	path[0].p_hdr = eh;
674	path[0].p_bh = NULL;
675
676	i = depth;
677	/* walk through the tree */
678	while (i) {
679		int need_to_validate = 0;
680
681		ext_debug("depth %d: num %d, max %d\n",
682			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
683
684		ext4_ext_binsearch_idx(inode, path + ppos, block);
685		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
686		path[ppos].p_depth = i;
687		path[ppos].p_ext = NULL;
688
689		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
690		if (unlikely(!bh))
691			goto err;
692		if (!bh_uptodate_or_lock(bh)) {
693			if (bh_submit_read(bh) < 0) {
694				put_bh(bh);
695				goto err;
696			}
697			/* validate the extent entries */
698			need_to_validate = 1;
699		}
700		eh = ext_block_hdr(bh);
701		ppos++;
702		if (unlikely(ppos > depth)) {
703			put_bh(bh);
704			EXT4_ERROR_INODE(inode,
705					 "ppos %d > depth %d", ppos, depth);
706			goto err;
707		}
708		path[ppos].p_bh = bh;
709		path[ppos].p_hdr = eh;
710		i--;
711
712		if (need_to_validate && ext4_ext_check(inode, eh, i))
713			goto err;
714	}
715
716	path[ppos].p_depth = i;
717	path[ppos].p_ext = NULL;
718	path[ppos].p_idx = NULL;
719
720	/* find extent */
721	ext4_ext_binsearch(inode, path + ppos, block);
722	/* if not an empty leaf */
723	if (path[ppos].p_ext)
724		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
725
726	ext4_ext_show_path(inode, path);
727
728	return path;
729
730err:
731	ext4_ext_drop_refs(path);
732	if (alloc)
733		kfree(path);
734	return ERR_PTR(-EIO);
735}
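
/*
 * Typical usage of the lookup above, as at the call sites later in
 * this file; the caller owns the returned path array and must both
 * drop the buffer references and free it:
 *
 *	path = ext4_ext_find_extent(inode, block, NULL);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);
 *	ex = path[ext_depth(inode)].p_ext;	(may be NULL: empty leaf)
 *	...
 *	ext4_ext_drop_refs(path);
 *	kfree(path);
 */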
736
737/*
738 * ext4_ext_insert_index:
739 * insert new index [@logical;@ptr] into the block at @curp;
740 * check where to insert: before @curp or after @curp
741 */
742int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
743				struct ext4_ext_path *curp,
744				int logical, ext4_fsblk_t ptr)
745{
746	struct ext4_extent_idx *ix;
747	int len, err;
748
749	err = ext4_ext_get_access(handle, inode, curp);
750	if (err)
751		return err;
752
753	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
754		EXT4_ERROR_INODE(inode,
755				 "logical %d == ei_block %d!",
756				 logical, le32_to_cpu(curp->p_idx->ei_block));
757		return -EIO;
758	}
759	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
760	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
761		/* insert after */
762		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
763			len = (len - 1) * sizeof(struct ext4_extent_idx);
764			len = len < 0 ? 0 : len;
765			ext_debug("insert new index %d after: %llu. "
766					"move %d from 0x%p to 0x%p\n",
767					logical, ptr, len,
768					(curp->p_idx + 1), (curp->p_idx + 2));
769			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
770		}
771		ix = curp->p_idx + 1;
772	} else {
773		/* insert before */
774		len = len * sizeof(struct ext4_extent_idx);
775		len = len < 0 ? 0 : len;
776		ext_debug("insert new index %d before: %llu. "
777				"move %d from 0x%p to 0x%p\n",
778				logical, ptr, len,
779				curp->p_idx, (curp->p_idx + 1));
780		memmove(curp->p_idx + 1, curp->p_idx, len);
781		ix = curp->p_idx;
782	}
783
784	ix->ei_block = cpu_to_le32(logical);
785	ext4_idx_store_pblock(ix, ptr);
786	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
787
788	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
789			     > le16_to_cpu(curp->p_hdr->eh_max))) {
790		EXT4_ERROR_INODE(inode,
791				 "eh_entries %d > eh_max %d!",
792				 le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max));
793		return -EIO;
794	}
795	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
796		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
797		return -EIO;
798	}
799
800	err = ext4_ext_dirty(handle, inode, curp);
801	ext4_std_error(inode->i_sb, err);
802
803	return err;
804}
805
806/*
807 * ext4_ext_split:
808 * inserts new subtree into the path, using free index entry
809 * at depth @at:
810 * - allocates all needed blocks (new leaf and all intermediate index blocks)
811 * - makes decision where to split
812 * - moves remaining extents and index entries (right to the split point)
813 *   into the newly allocated blocks
814 * - initializes subtree
815 */
816static int ext4_ext_split(handle_t *handle, struct inode *inode,
817				struct ext4_ext_path *path,
818				struct ext4_extent *newext, int at)
819{
820	struct buffer_head *bh = NULL;
821	int depth = ext_depth(inode);
822	struct ext4_extent_header *neh;
823	struct ext4_extent_idx *fidx;
824	struct ext4_extent *ex;
825	int i = at, k, m, a;
826	ext4_fsblk_t newblock, oldblock;
827	__le32 border;
828	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
829	int err = 0;
830
831	/* make decision: where to split? */
832	/* FIXME: now decision is simplest: at current extent */
833
834	/* if the current leaf is going to be split, then we should use
835	 * the border from the split point */
836	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
837		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
838		return -EIO;
839	}
840	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
841		border = path[depth].p_ext[1].ee_block;
842		ext_debug("leaf will be split."
843				" next leaf starts at %d\n",
844				  le32_to_cpu(border));
845	} else {
846		border = newext->ee_block;
847		ext_debug("leaf will be added."
848				" next leaf starts at %d\n",
849				le32_to_cpu(border));
850	}
851
852	/*
853	 * If an error occurs, then we break processing
854	 * and mark the filesystem read-only. The index won't
855	 * be inserted and the tree will remain in a consistent
856	 * state. The next mount will repair the buffers too.
857	 */
858
859	/*
860	 * Get array to track all allocated blocks.
861	 * We need this to handle errors and free the
862	 * blocks upon failure.
863	 */
864	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
865	if (!ablocks)
866		return -ENOMEM;
867
868	/* allocate all needed blocks */
869	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
870	for (a = 0; a < depth - at; a++) {
871		newblock = ext4_ext_new_meta_block(handle, inode, path,
872						   newext, &err);
873		if (newblock == 0)
874			goto cleanup;
875		ablocks[a] = newblock;
876	}
877
878	/* initialize new leaf */
879	newblock = ablocks[--a];
880	if (unlikely(newblock == 0)) {
881		EXT4_ERROR_INODE(inode, "newblock == 0!");
882		err = -EIO;
883		goto cleanup;
884	}
885	bh = sb_getblk(inode->i_sb, newblock);
886	if (!bh) {
887		err = -EIO;
888		goto cleanup;
889	}
890	lock_buffer(bh);
891
892	err = ext4_journal_get_create_access(handle, bh);
893	if (err)
894		goto cleanup;
895
896	neh = ext_block_hdr(bh);
897	neh->eh_entries = 0;
898	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
899	neh->eh_magic = EXT4_EXT_MAGIC;
900	neh->eh_depth = 0;
901	ex = EXT_FIRST_EXTENT(neh);
902
903	/* move remainder of path[depth] to the new leaf */
904	if (unlikely(path[depth].p_hdr->eh_entries !=
905		     path[depth].p_hdr->eh_max)) {
906		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
907				 le16_to_cpu(path[depth].p_hdr->eh_entries),
908				 le16_to_cpu(path[depth].p_hdr->eh_max));
909		err = -EIO;
910		goto cleanup;
911	}
912	/* start copy from next extent */
913	/* TODO: we could do it by single memmove */
914	m = 0;
915	path[depth].p_ext++;
916	while (path[depth].p_ext <=
917			EXT_MAX_EXTENT(path[depth].p_hdr)) {
918		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
919				le32_to_cpu(path[depth].p_ext->ee_block),
920				ext_pblock(path[depth].p_ext),
921				ext4_ext_is_uninitialized(path[depth].p_ext),
922				ext4_ext_get_actual_len(path[depth].p_ext),
923				newblock);
924		/*memmove(ex++, path[depth].p_ext++,
925				sizeof(struct ext4_extent));
926		neh->eh_entries++;*/
927		path[depth].p_ext++;
928		m++;
929	}
930	if (m) {
931		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
932		le16_add_cpu(&neh->eh_entries, m);
933	}
934
935	set_buffer_uptodate(bh);
936	unlock_buffer(bh);
937
938	err = ext4_handle_dirty_metadata(handle, inode, bh);
939	if (err)
940		goto cleanup;
941	brelse(bh);
942	bh = NULL;
943
944	/* correct old leaf */
945	if (m) {
946		err = ext4_ext_get_access(handle, inode, path + depth);
947		if (err)
948			goto cleanup;
949		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
950		err = ext4_ext_dirty(handle, inode, path + depth);
951		if (err)
952			goto cleanup;
953
954	}
955
956	/* create intermediate indexes */
957	k = depth - at - 1;
958	if (unlikely(k < 0)) {
959		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
960		err = -EIO;
961		goto cleanup;
962	}
963	if (k)
964		ext_debug("create %d intermediate indices\n", k);
965	/* insert new index into current index block */
966	/* current depth stored in i var */
967	i = depth - 1;
968	while (k--) {
969		oldblock = newblock;
970		newblock = ablocks[--a];
971		bh = sb_getblk(inode->i_sb, newblock);
972		if (!bh) {
973			err = -EIO;
974			goto cleanup;
975		}
976		lock_buffer(bh);
977
978		err = ext4_journal_get_create_access(handle, bh);
979		if (err)
980			goto cleanup;
981
982		neh = ext_block_hdr(bh);
983		neh->eh_entries = cpu_to_le16(1);
984		neh->eh_magic = EXT4_EXT_MAGIC;
985		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
986		neh->eh_depth = cpu_to_le16(depth - i);
987		fidx = EXT_FIRST_INDEX(neh);
988		fidx->ei_block = border;
989		ext4_idx_store_pblock(fidx, oldblock);
990
991		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
992				i, newblock, le32_to_cpu(border), oldblock);
993		/* copy indexes */
994		m = 0;
995		path[i].p_idx++;
996
997		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
998				EXT_MAX_INDEX(path[i].p_hdr));
999		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1000					EXT_LAST_INDEX(path[i].p_hdr))) {
1001			EXT4_ERROR_INODE(inode,
1002					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1003					 le32_to_cpu(path[i].p_ext->ee_block));
1004			err = -EIO;
1005			goto cleanup;
1006		}
1007		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
1008			ext_debug("%d: move %d:%llu in new index %llu\n", i,
1009					le32_to_cpu(path[i].p_idx->ei_block),
1010					idx_pblock(path[i].p_idx),
1011					newblock);
1012			/*memmove(++fidx, path[i].p_idx++,
1013					sizeof(struct ext4_extent_idx));
1014			neh->eh_entries++;
1015			BUG_ON(neh->eh_entries > neh->eh_max);*/
1016			path[i].p_idx++;
1017			m++;
1018		}
1019		if (m) {
1020			memmove(++fidx, path[i].p_idx - m,
1021				sizeof(struct ext4_extent_idx) * m);
1022			le16_add_cpu(&neh->eh_entries, m);
1023		}
1024		set_buffer_uptodate(bh);
1025		unlock_buffer(bh);
1026
1027		err = ext4_handle_dirty_metadata(handle, inode, bh);
1028		if (err)
1029			goto cleanup;
1030		brelse(bh);
1031		bh = NULL;
1032
1033		/* correct old index */
1034		if (m) {
1035			err = ext4_ext_get_access(handle, inode, path + i);
1036			if (err)
1037				goto cleanup;
1038			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1039			err = ext4_ext_dirty(handle, inode, path + i);
1040			if (err)
1041				goto cleanup;
1042		}
1043
1044		i--;
1045	}
1046
1047	/* insert new index */
1048	err = ext4_ext_insert_index(handle, inode, path + at,
1049				    le32_to_cpu(border), newblock);
1050
1051cleanup:
1052	if (bh) {
1053		if (buffer_locked(bh))
1054			unlock_buffer(bh);
1055		brelse(bh);
1056	}
1057
1058	if (err) {
1059		/* free all allocated blocks in error case */
1060		for (i = 0; i < depth; i++) {
1061			if (!ablocks[i])
1062				continue;
1063			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
1064					 EXT4_FREE_BLOCKS_METADATA);
1065		}
1066	}
1067	kfree(ablocks);
1068
1069	return err;
1070}
1071
1072/*
1073 * ext4_ext_grow_indepth:
1074 * implements tree growing procedure:
1075 * - allocates new block
1076 * - moves top-level data (index block or leaf) into the new block
1077 * - initializes new top-level, creating index that points to the
1078 *   just created block
1079 */
1080static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1081					struct ext4_ext_path *path,
1082					struct ext4_extent *newext)
1083{
1084	struct ext4_ext_path *curp = path;
1085	struct ext4_extent_header *neh;
1086	struct buffer_head *bh;
1087	ext4_fsblk_t newblock;
1088	int err = 0;
1089
1090	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
1091	if (newblock == 0)
1092		return err;
1093
1094	bh = sb_getblk(inode->i_sb, newblock);
1095	if (!bh) {
1096		err = -EIO;
1097		ext4_std_error(inode->i_sb, err);
1098		return err;
1099	}
1100	lock_buffer(bh);
1101
1102	err = ext4_journal_get_create_access(handle, bh);
1103	if (err) {
1104		unlock_buffer(bh);
1105		goto out;
1106	}
1107
1108	/* move top-level index/leaf into new block */
1109	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
1110
1111	/* set size of new block */
1112	neh = ext_block_hdr(bh);
1113	/* the old root could have indexes or leaves
1114	 * so calculate eh_max the right way */
1115	if (ext_depth(inode))
1116		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1117	else
1118		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1119	neh->eh_magic = EXT4_EXT_MAGIC;
1120	set_buffer_uptodate(bh);
1121	unlock_buffer(bh);
1122
1123	err = ext4_handle_dirty_metadata(handle, inode, bh);
1124	if (err)
1125		goto out;
1126
1127	/* create index in new top-level index: num,max,pointer */
1128	err = ext4_ext_get_access(handle, inode, curp);
1129	if (err)
1130		goto out;
1131
1132	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1133	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1134	curp->p_hdr->eh_entries = cpu_to_le16(1);
1135	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1136
1137	if (path[0].p_hdr->eh_depth)
1138		curp->p_idx->ei_block =
1139			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1140	else
1141		curp->p_idx->ei_block =
1142			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1143	ext4_idx_store_pblock(curp->p_idx, newblock);
1144
1145	neh = ext_inode_hdr(inode);
1146	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1147		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1148		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1149		  idx_pblock(EXT_FIRST_INDEX(neh)));
1150
1151	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1152	err = ext4_ext_dirty(handle, inode, curp);
1153out:
1154	brelse(bh);
1155
1156	return err;
1157}
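
/*
 * Sketch of what growing in depth does to a tree of depth 0: the
 * root node stored in the inode body is copied verbatim into a
 * freshly allocated block, and the inode body is rewritten as a
 * one-entry index pointing at that copy:
 *
 *	before:  inode body = [leaf: ex0 ex1 ...]	(depth 0)
 *	after:   inode body = [index: -> newblock]	(depth 1)
 *		 newblock   = [leaf: ex0 ex1 ...]
 */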
1158
1159/*
1160 * ext4_ext_create_new_leaf:
1161 * finds an empty index and adds a new leaf.
1162 * if no free index is found, then it requests growing the tree in depth.
1163 */
1164static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1165					struct ext4_ext_path *path,
1166					struct ext4_extent *newext)
1167{
1168	struct ext4_ext_path *curp;
1169	int depth, i, err = 0;
1170
1171repeat:
1172	i = depth = ext_depth(inode);
1173
1174	/* walk up the tree and look for a free index entry */
1175	curp = path + depth;
1176	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1177		i--;
1178		curp--;
1179	}
1180
1181	/* we use already allocated block for index block,
1182	 * so subsequent data blocks should be contiguous */
1183	if (EXT_HAS_FREE_INDEX(curp)) {
1184		/* if we found index with free entry, then use that
1185		 * entry: create all needed subtree and add new leaf */
1186		err = ext4_ext_split(handle, inode, path, newext, i);
1187		if (err)
1188			goto out;
1189
1190		/* refill path */
1191		ext4_ext_drop_refs(path);
1192		path = ext4_ext_find_extent(inode,
1193				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1194				    path);
1195		if (IS_ERR(path))
1196			err = PTR_ERR(path);
1197	} else {
1198		/* tree is full, time to grow in depth */
1199		err = ext4_ext_grow_indepth(handle, inode, path, newext);
1200		if (err)
1201			goto out;
1202
1203		/* refill path */
1204		ext4_ext_drop_refs(path);
1205		path = ext4_ext_find_extent(inode,
1206				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1207				    path);
1208		if (IS_ERR(path)) {
1209			err = PTR_ERR(path);
1210			goto out;
1211		}
1212
1213		/*
1214		 * only first (depth 0 -> 1) produces free space;
1215		 * in all other cases we have to split the grown tree
1216		 */
1217		depth = ext_depth(inode);
1218		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1219			/* now we need to split */
1220			goto repeat;
1221		}
1222	}
1223
1224out:
1225	return err;
1226}
1227
1228/*
1229 * search for the closest allocated block to the left of *logical
1230 * and return it at @logical + its physical address at @phys
1231 * if *logical is the smallest allocated block, the function
1232 * returns 0 at @phys
1233 * return value contains 0 (success) or an error code
1234 */
1235int
1236ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1237			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1238{
1239	struct ext4_extent_idx *ix;
1240	struct ext4_extent *ex;
1241	int depth, ee_len;
1242
1243	if (unlikely(path == NULL)) {
1244		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1245		return -EIO;
1246	}
1247	depth = path->p_depth;
1248	*phys = 0;
1249
1250	if (depth == 0 && path->p_ext == NULL)
1251		return 0;
1252
1253	/* usually the extent in the path covers blocks smaller
1254	 * than *logical, but it can be that the extent is the
1255	 * first one in the file */
1256
1257	ex = path[depth].p_ext;
1258	ee_len = ext4_ext_get_actual_len(ex);
1259	if (*logical < le32_to_cpu(ex->ee_block)) {
1260		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1261			EXT4_ERROR_INODE(inode,
1262					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1263					 *logical, le32_to_cpu(ex->ee_block));
1264			return -EIO;
1265		}
1266		while (--depth >= 0) {
1267			ix = path[depth].p_idx;
1268			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1269				EXT4_ERROR_INODE(inode,
1270				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1271				  ix != NULL ? ix->ei_block : 0,
1272				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1273				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
1274				  depth);
1275				return -EIO;
1276			}
1277		}
1278		return 0;
1279	}
1280
1281	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1282		EXT4_ERROR_INODE(inode,
1283				 "logical %d < ee_block %d + ee_len %d!",
1284				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1285		return -EIO;
1286	}
1287
1288	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1289	*phys = ext_pblock(ex) + ee_len - 1;
1290	return 0;
1291}
1292
1293/*
1294 * search for the closest allocated block to the right of *logical
1295 * and return it at @logical + its physical address at @phys
1296 * if *logical is the largest allocated block, the function
1297 * returns 0 at @phys
1298 * return value contains 0 (success) or an error code
1299 */
1300int
1301ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1302			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1303{
1304	struct buffer_head *bh = NULL;
1305	struct ext4_extent_header *eh;
1306	struct ext4_extent_idx *ix;
1307	struct ext4_extent *ex;
1308	ext4_fsblk_t block;
1309	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1310	int ee_len;
1311
1312	if (unlikely(path == NULL)) {
1313		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1314		return -EIO;
1315	}
1316	depth = path->p_depth;
1317	*phys = 0;
1318
1319	if (depth == 0 && path->p_ext == NULL)
1320		return 0;
1321
1322	/* usually the extent in the path covers blocks smaller
1323	 * than *logical, but it can be that the extent is the
1324	 * first one in the file */
1325
1326	ex = path[depth].p_ext;
1327	ee_len = ext4_ext_get_actual_len(ex);
1328	if (*logical < le32_to_cpu(ex->ee_block)) {
1329		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1330			EXT4_ERROR_INODE(inode,
1331					 "first_extent(path[%d].p_hdr) != ex",
1332					 depth);
1333			return -EIO;
1334		}
1335		while (--depth >= 0) {
1336			ix = path[depth].p_idx;
1337			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1338				EXT4_ERROR_INODE(inode,
1339						 "ix != EXT_FIRST_INDEX *logical %d!",
1340						 *logical);
1341				return -EIO;
1342			}
1343		}
1344		*logical = le32_to_cpu(ex->ee_block);
1345		*phys = ext_pblock(ex);
1346		return 0;
1347	}
1348
1349	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1350		EXT4_ERROR_INODE(inode,
1351				 "logical %d < ee_block %d + ee_len %d!",
1352				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1353		return -EIO;
1354	}
1355
1356	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1357		/* next allocated block in this leaf */
1358		ex++;
1359		*logical = le32_to_cpu(ex->ee_block);
1360		*phys = ext_pblock(ex);
1361		return 0;
1362	}
1363
1364	/* go up and search for index to the right */
1365	while (--depth >= 0) {
1366		ix = path[depth].p_idx;
1367		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1368			goto got_index;
1369	}
1370
1371	/* we've gone up to the root and found no index to the right */
1372	return 0;
1373
1374got_index:
1375	/* we've found index to the right, let's
1376	 * follow it and find the closest allocated
1377	 * block to the right */
1378	ix++;
1379	block = idx_pblock(ix);
1380	while (++depth < path->p_depth) {
1381		bh = sb_bread(inode->i_sb, block);
1382		if (bh == NULL)
1383			return -EIO;
1384		eh = ext_block_hdr(bh);
1385		/* subtract from p_depth to get proper eh_depth */
1386		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1387			put_bh(bh);
1388			return -EIO;
1389		}
1390		ix = EXT_FIRST_INDEX(eh);
1391		block = idx_pblock(ix);
1392		put_bh(bh);
1393	}
1394
1395	bh = sb_bread(inode->i_sb, block);
1396	if (bh == NULL)
1397		return -EIO;
1398	eh = ext_block_hdr(bh);
1399	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1400		put_bh(bh);
1401		return -EIO;
1402	}
1403	ex = EXT_FIRST_EXTENT(eh);
1404	*logical = le32_to_cpu(ex->ee_block);
1405	*phys = ext_pblock(ex);
1406	put_bh(bh);
1407	return 0;
1408}
1409
1410/*
1411 * ext4_ext_next_allocated_block:
1412 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1413 * NOTE: it considers block number from index entry as
1414 * allocated block. Thus, index entries have to be consistent
1415 * with leaves.
1416 */
1417static ext4_lblk_t
1418ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1419{
1420	int depth;
1421
1422	BUG_ON(path == NULL);
1423	depth = path->p_depth;
1424
1425	if (depth == 0 && path->p_ext == NULL)
1426		return EXT_MAX_BLOCK;
1427
1428	while (depth >= 0) {
1429		if (depth == path->p_depth) {
1430			/* leaf */
1431			if (path[depth].p_ext !=
1432					EXT_LAST_EXTENT(path[depth].p_hdr))
1433				return le32_to_cpu(path[depth].p_ext[1].ee_block);
1434		} else {
1435			/* index */
1436			if (path[depth].p_idx !=
1437					EXT_LAST_INDEX(path[depth].p_hdr))
1438				return le32_to_cpu(path[depth].p_idx[1].ei_block);
1439		}
1440		depth--;
1441	}
1442
1443	return EXT_MAX_BLOCK;
1444}
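
/*
 * Example for the helper above: with a leaf holding extents [0, +10)
 * and [20, +5), a path positioned on the first extent yields 20, the
 * start of the next extent.  If the path sits on the last entry at
 * every level there is nothing to the right, and EXT_MAX_BLOCK is
 * returned.
 */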
1445
1446/*
1447 * ext4_ext_next_leaf_block:
1448 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1449 */
1450static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1451					struct ext4_ext_path *path)
1452{
1453	int depth;
1454
1455	BUG_ON(path == NULL);
1456	depth = path->p_depth;
1457
1458	/* zero-tree has no leaf blocks at all */
1459	if (depth == 0)
1460		return EXT_MAX_BLOCK;
1461
1462	/* go to index block */
1463	depth--;
1464
1465	while (depth >= 0) {
1466		if (path[depth].p_idx !=
1467				EXT_LAST_INDEX(path[depth].p_hdr))
1468			return (ext4_lblk_t)
1469				le32_to_cpu(path[depth].p_idx[1].ei_block);
1470		depth--;
1471	}
1472
1473	return EXT_MAX_BLOCK;
1474}
1475
1476/*
1477 * ext4_ext_correct_indexes:
1478 * if leaf gets modified and modified extent is first in the leaf,
1479 * then we have to correct all indexes above.
1480 * TODO: do we need to correct tree in all cases?
1481 */
1482static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1483				struct ext4_ext_path *path)
1484{
1485	struct ext4_extent_header *eh;
1486	int depth = ext_depth(inode);
1487	struct ext4_extent *ex;
1488	__le32 border;
1489	int k, err = 0;
1490
1491	eh = path[depth].p_hdr;
1492	ex = path[depth].p_ext;
1493
1494	if (unlikely(ex == NULL || eh == NULL)) {
1495		EXT4_ERROR_INODE(inode,
1496				 "ex %p == NULL or eh %p == NULL", ex, eh);
1497		return -EIO;
1498	}
1499
1500	if (depth == 0) {
1501		/* there is no tree at all */
1502		return 0;
1503	}
1504
1505	if (ex != EXT_FIRST_EXTENT(eh)) {
1506		/* we correct tree if first leaf got modified only */
1507		return 0;
1508	}
1509
1510	/*
1511	 * TODO: we need correction if border is smaller than current one
1512	 */
1513	k = depth - 1;
1514	border = path[depth].p_ext->ee_block;
1515	err = ext4_ext_get_access(handle, inode, path + k);
1516	if (err)
1517		return err;
1518	path[k].p_idx->ei_block = border;
1519	err = ext4_ext_dirty(handle, inode, path + k);
1520	if (err)
1521		return err;
1522
1523	while (k--) {
1524		/* change all left-side indexes */
1525		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1526			break;
1527		err = ext4_ext_get_access(handle, inode, path + k);
1528		if (err)
1529			break;
1530		path[k].p_idx->ei_block = border;
1531		err = ext4_ext_dirty(handle, inode, path + k);
1532		if (err)
1533			break;
1534	}
1535
1536	return err;
1537}
1538
1539int
1540ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1541				struct ext4_extent *ex2)
1542{
1543	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1544
1545	/*
1546	 * Make sure that either both extents are uninitialized, or
1547	 * both are _not_.
1548	 */
1549	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1550		return 0;
1551
1552	if (ext4_ext_is_uninitialized(ex1))
1553		max_len = EXT_UNINIT_MAX_LEN;
1554	else
1555		max_len = EXT_INIT_MAX_LEN;
1556
1557	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1558	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1559
1560	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1561			le32_to_cpu(ex2->ee_block))
1562		return 0;
1563
1564	/*
1565	 * To allow future support for preallocated extents to be added
1566	 * as an RO_COMPAT feature, refuse to merge two extents if
1567	 * this can result in the top bit of ee_len being set.
1568	 */
1569	if (ext1_ee_len + ext2_ee_len > max_len)
1570		return 0;
1571#ifdef AGGRESSIVE_TEST
1572	if (ext1_ee_len >= 4)
1573		return 0;
1574#endif
1575
1576	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1577		return 1;
1578	return 0;
1579}
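
/*
 * Worked example for the check above: an initialized extent covering
 * logical blocks [100, +8) at physical block 500 and one covering
 * [108, +4) at physical block 508 are mergeable: logically and
 * physically contiguous, both initialized, and the combined length
 * of 12 is well under EXT_INIT_MAX_LEN.  The same pair with only one
 * side uninitialized is rejected by the very first test.
 */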
1580
1581/*
1582 * This function tries to merge the "ex" extent to the next extent in the tree.
1583 * It always tries to merge towards right. If you want to merge towards
1584 * left, pass "ex - 1" as argument instead of "ex".
1585 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1586 * 1 if they got merged.
1587 */
1588int ext4_ext_try_to_merge(struct inode *inode,
1589			  struct ext4_ext_path *path,
1590			  struct ext4_extent *ex)
1591{
1592	struct ext4_extent_header *eh;
1593	unsigned int depth, len;
1594	int merge_done = 0;
1595	int uninitialized = 0;
1596
1597	depth = ext_depth(inode);
1598	BUG_ON(path[depth].p_hdr == NULL);
1599	eh = path[depth].p_hdr;
1600
1601	while (ex < EXT_LAST_EXTENT(eh)) {
1602		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1603			break;
1604		/* merge with next extent! */
1605		if (ext4_ext_is_uninitialized(ex))
1606			uninitialized = 1;
1607		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1608				+ ext4_ext_get_actual_len(ex + 1));
1609		if (uninitialized)
1610			ext4_ext_mark_uninitialized(ex);
1611
1612		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1613			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1614				* sizeof(struct ext4_extent);
1615			memmove(ex + 1, ex + 2, len);
1616		}
1617		le16_add_cpu(&eh->eh_entries, -1);
1618		merge_done = 1;
1619		WARN_ON(eh->eh_entries == 0);
1620		if (!eh->eh_entries)
1621			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1622	}
1623
1624	return merge_done;
1625}
1626
1627/*
1628 * check if a portion of the "newext" extent overlaps with an
1629 * existing extent.
1630 *
1631 * If there is an overlap discovered, it updates the length of the newext
1632 * such that there will be no overlap, and then returns 1.
1633 * If there is no overlap found, it returns 0.
1634 */
1635unsigned int ext4_ext_check_overlap(struct inode *inode,
1636				    struct ext4_extent *newext,
1637				    struct ext4_ext_path *path)
1638{
1639	ext4_lblk_t b1, b2;
1640	unsigned int depth, len1;
1641	unsigned int ret = 0;
1642
1643	b1 = le32_to_cpu(newext->ee_block);
1644	len1 = ext4_ext_get_actual_len(newext);
1645	depth = ext_depth(inode);
1646	if (!path[depth].p_ext)
1647		goto out;
1648	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1649
1650	/*
1651	 * get the next allocated block if the extent in the path
1652	 * is before the requested block(s)
1653	 */
1654	if (b2 < b1) {
1655		b2 = ext4_ext_next_allocated_block(path);
1656		if (b2 == EXT_MAX_BLOCK)
1657			goto out;
1658	}
1659
1660	/* check for wrap through zero on extent logical start block*/
1661	if (b1 + len1 < b1) {
1662		len1 = EXT_MAX_BLOCK - b1;
1663		newext->ee_len = cpu_to_le16(len1);
1664		ret = 1;
1665	}
1666
1667	/* check for overlap */
1668	if (b1 + len1 > b2) {
1669		newext->ee_len = cpu_to_le16(b2 - b1);
1670		ret = 1;
1671	}
1672out:
1673	return ret;
1674}
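
/*
 * Example of the trimming above: inserting newext at b1 = 100 with
 * len1 = 50 while the next allocated extent starts at b2 = 120
 * shrinks newext to ee_len = 20 (blocks [100, 120)) and returns 1,
 * so the caller maps only the non-overlapping prefix.
 */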
1675
1676/*
1677 * ext4_ext_insert_extent:
1678 * tries to merge the requested extent into the existing extent or
1679 * inserts requested extent as new one into the tree,
1680 * creating new leaf in the no-space case.
1681 */
1682int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1683				struct ext4_ext_path *path,
1684				struct ext4_extent *newext, int flag)
1685{
1686	struct ext4_extent_header *eh;
1687	struct ext4_extent *ex, *fex;
1688	struct ext4_extent *nearex; /* nearest extent */
1689	struct ext4_ext_path *npath = NULL;
1690	int depth, len, err;
1691	ext4_lblk_t next;
1692	unsigned uninitialized = 0;
1693
1694	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1695		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1696		return -EIO;
1697	}
1698	depth = ext_depth(inode);
1699	ex = path[depth].p_ext;
1700	if (unlikely(path[depth].p_hdr == NULL)) {
1701		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1702		return -EIO;
1703	}
1704
1705	/* try to insert block into found extent and return */
1706	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1707		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1708		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1709				ext4_ext_is_uninitialized(newext),
1710				ext4_ext_get_actual_len(newext),
1711				le32_to_cpu(ex->ee_block),
1712				ext4_ext_is_uninitialized(ex),
1713				ext4_ext_get_actual_len(ex), ext_pblock(ex));
1714		err = ext4_ext_get_access(handle, inode, path + depth);
1715		if (err)
1716			return err;
1717
1718		/*
1719		 * ext4_can_extents_be_merged should have checked that either
1720		 * both extents are uninitialized, or both aren't. Thus we
1721		 * need to check only one of them here.
1722		 */
1723		if (ext4_ext_is_uninitialized(ex))
1724			uninitialized = 1;
1725		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1726					+ ext4_ext_get_actual_len(newext));
1727		if (uninitialized)
1728			ext4_ext_mark_uninitialized(ex);
1729		eh = path[depth].p_hdr;
1730		nearex = ex;
1731		goto merge;
1732	}
1733
1734repeat:
1735	depth = ext_depth(inode);
1736	eh = path[depth].p_hdr;
1737	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1738		goto has_space;
1739
1740	/* probably next leaf has space for us? */
1741	fex = EXT_LAST_EXTENT(eh);
1742	next = ext4_ext_next_leaf_block(inode, path);
1743	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1744	    && next != EXT_MAX_BLOCK) {
1745		ext_debug("next leaf block - %d\n", next);
1746		BUG_ON(npath != NULL);
1747		npath = ext4_ext_find_extent(inode, next, NULL);
1748		if (IS_ERR(npath))
1749			return PTR_ERR(npath);
1750		BUG_ON(npath->p_depth != path->p_depth);
1751		eh = npath[depth].p_hdr;
1752		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1753			ext_debug("next leaf isn't full(%d)\n",
1754				  le16_to_cpu(eh->eh_entries));
1755			path = npath;
1756			goto repeat;
1757		}
1758		ext_debug("next leaf has no free space(%d,%d)\n",
1759			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1760	}
1761
1762	/*
1763	 * There is no free space in the found leaf.
1764	 * We are going to add a new leaf in the tree.
1765	 */
1766	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1767	if (err)
1768		goto cleanup;
1769	depth = ext_depth(inode);
1770	eh = path[depth].p_hdr;
1771
1772has_space:
1773	nearex = path[depth].p_ext;
1774
1775	err = ext4_ext_get_access(handle, inode, path + depth);
1776	if (err)
1777		goto cleanup;
1778
1779	if (!nearex) {
1780		/* there is no extent in this leaf, create first one */
1781		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1782				le32_to_cpu(newext->ee_block),
1783				ext_pblock(newext),
1784				ext4_ext_is_uninitialized(newext),
1785				ext4_ext_get_actual_len(newext));
1786		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1787	} else if (le32_to_cpu(newext->ee_block)
1788			   > le32_to_cpu(nearex->ee_block)) {
1789/*		BUG_ON(newext->ee_block == nearex->ee_block); */
1790		if (nearex != EXT_LAST_EXTENT(eh)) {
1791			len = EXT_MAX_EXTENT(eh) - nearex;
1792			len = (len - 1) * sizeof(struct ext4_extent);
1793			len = len < 0 ? 0 : len;
1794			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1795					"move %d from 0x%p to 0x%p\n",
1796					le32_to_cpu(newext->ee_block),
1797					ext_pblock(newext),
1798					ext4_ext_is_uninitialized(newext),
1799					ext4_ext_get_actual_len(newext),
1800					nearex, len, nearex + 1, nearex + 2);
1801			memmove(nearex + 2, nearex + 1, len);
1802		}
1803		path[depth].p_ext = nearex + 1;
1804	} else {
1805		BUG_ON(newext->ee_block == nearex->ee_block);
1806		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1807		len = len < 0 ? 0 : len;
1808		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1809				"move %d from 0x%p to 0x%p\n",
1810				le32_to_cpu(newext->ee_block),
1811				ext_pblock(newext),
1812				ext4_ext_is_uninitialized(newext),
1813				ext4_ext_get_actual_len(newext),
1814				nearex, len, nearex + 1, nearex + 2);
1815		memmove(nearex + 1, nearex, len);
1816		path[depth].p_ext = nearex;
1817	}
1818
1819	le16_add_cpu(&eh->eh_entries, 1);
1820	nearex = path[depth].p_ext;
1821	nearex->ee_block = newext->ee_block;
1822	ext4_ext_store_pblock(nearex, ext_pblock(newext));
1823	nearex->ee_len = newext->ee_len;
1824
1825merge:
1826	/* try to merge extents to the right */
1827	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1828		ext4_ext_try_to_merge(inode, path, nearex);
1829
1830	/* try to merge extents to the left */
1831
1832	/* time to correct all indexes above */
1833	err = ext4_ext_correct_indexes(handle, inode, path);
1834	if (err)
1835		goto cleanup;
1836
1837	err = ext4_ext_dirty(handle, inode, path + depth);
1838
1839cleanup:
1840	if (npath) {
1841		ext4_ext_drop_refs(npath);
1842		kfree(npath);
1843	}
1844	ext4_ext_invalidate_cache(inode);
1845	return err;
1846}
1847
1848int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1849			ext4_lblk_t num, ext_prepare_callback func,
1850			void *cbdata)
1851{
1852	struct ext4_ext_path *path = NULL;
1853	struct ext4_ext_cache cbex;
1854	struct ext4_extent *ex;
1855	ext4_lblk_t next, start = 0, end = 0;
1856	ext4_lblk_t last = block + num;
1857	int depth, exists, err = 0;
1858
1859	BUG_ON(func == NULL);
1860	BUG_ON(inode == NULL);
1861
1862	while (block < last && block != EXT_MAX_BLOCK) {
1863		num = last - block;
1864		/* find extent for this block */
1865		down_read(&EXT4_I(inode)->i_data_sem);
1866		path = ext4_ext_find_extent(inode, block, path);
1867		up_read(&EXT4_I(inode)->i_data_sem);
1868		if (IS_ERR(path)) {
1869			err = PTR_ERR(path);
1870			path = NULL;
1871			break;
1872		}
1873
1874		depth = ext_depth(inode);
1875		if (unlikely(path[depth].p_hdr == NULL)) {
1876			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1877			err = -EIO;
1878			break;
1879		}
1880		ex = path[depth].p_ext;
1881		next = ext4_ext_next_allocated_block(path);
1882
1883		exists = 0;
1884		if (!ex) {
1885			/* there is no extent yet, so try to allocate
1886			 * all requested space */
1887			start = block;
1888			end = block + num;
1889		} else if (le32_to_cpu(ex->ee_block) > block) {
1890			/* need to allocate space before found extent */
1891			start = block;
1892			end = le32_to_cpu(ex->ee_block);
1893			if (block + num < end)
1894				end = block + num;
1895		} else if (block >= le32_to_cpu(ex->ee_block)
1896					+ ext4_ext_get_actual_len(ex)) {
1897			/* need to allocate space after found extent */
1898			start = block;
1899			end = block + num;
1900			if (end >= next)
1901				end = next;
1902		} else if (block >= le32_to_cpu(ex->ee_block)) {
1903			/*
1904			 * some part of requested space is covered
1905			 * by found extent
1906			 */
1907			start = block;
1908			end = le32_to_cpu(ex->ee_block)
1909				+ ext4_ext_get_actual_len(ex);
1910			if (block + num < end)
1911				end = block + num;
1912			exists = 1;
1913		} else {
1914			BUG();
1915		}
1916		BUG_ON(end <= start);
1917
1918		if (!exists) {
1919			cbex.ec_block = start;
1920			cbex.ec_len = end - start;
1921			cbex.ec_start = 0;
1922			cbex.ec_type = EXT4_EXT_CACHE_GAP;
1923		} else {
1924			cbex.ec_block = le32_to_cpu(ex->ee_block);
1925			cbex.ec_len = ext4_ext_get_actual_len(ex);
1926			cbex.ec_start = ext_pblock(ex);
1927			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1928		}
1929
1930		if (unlikely(cbex.ec_len == 0)) {
1931			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1932			err = -EIO;
1933			break;
1934		}
1935		err = func(inode, path, &cbex, ex, cbdata);
1936		ext4_ext_drop_refs(path);
1937
1938		if (err < 0)
1939			break;
1940
1941		if (err == EXT_REPEAT)
1942			continue;
1943		else if (err == EXT_BREAK) {
1944			err = 0;
1945			break;
1946		}
1947
1948		if (ext_depth(inode) != depth) {
1949			/* depth was changed. we have to realloc path */
1950			kfree(path);
1951			path = NULL;
1952		}
1953
1954		block = cbex.ec_block + cbex.ec_len;
1955	}
1956
1957	if (path) {
1958		ext4_ext_drop_refs(path);
1959		kfree(path);
1960	}
1961
1962	return err;
1963}
1964
1965static void
1966ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1967			__u32 len, ext4_fsblk_t start, int type)
1968{
1969	struct ext4_ext_cache *cex;
1970	BUG_ON(len == 0);
1971	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1972	cex = &EXT4_I(inode)->i_cached_extent;
1973	cex->ec_type = type;
1974	cex->ec_block = block;
1975	cex->ec_len = len;
1976	cex->ec_start = start;
1977	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1978}
1979
1980/*
1981 * ext4_ext_put_gap_in_cache:
1982 * calculate boundaries of the gap that the requested block fits into
1983 * and cache this gap
1984 */
1985static void
1986ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1987				ext4_lblk_t block)
1988{
1989	int depth = ext_depth(inode);
1990	unsigned long len;
1991	ext4_lblk_t lblock;
1992	struct ext4_extent *ex;
1993
1994	ex = path[depth].p_ext;
1995	if (ex == NULL) {
1996		/* there is no extent yet, so gap is [0;-] */
1997		lblock = 0;
1998		len = EXT_MAX_BLOCK;
1999		ext_debug("cache gap(whole file):");
2000	} else if (block < le32_to_cpu(ex->ee_block)) {
2001		lblock = block;
2002		len = le32_to_cpu(ex->ee_block) - block;
2003		ext_debug("cache gap(before): %u [%u:%u]",
2004				block,
2005				le32_to_cpu(ex->ee_block),
2006				 ext4_ext_get_actual_len(ex));
2007	} else if (block >= le32_to_cpu(ex->ee_block)
2008			+ ext4_ext_get_actual_len(ex)) {
2009		ext4_lblk_t next;
2010		lblock = le32_to_cpu(ex->ee_block)
2011			+ ext4_ext_get_actual_len(ex);
2012
2013		next = ext4_ext_next_allocated_block(path);
2014		ext_debug("cache gap(after): [%u:%u] %u",
2015				le32_to_cpu(ex->ee_block),
2016				ext4_ext_get_actual_len(ex),
2017				block);
2018		BUG_ON(next == lblock);
2019		len = next - lblock;
2020	} else {
2021		lblock = len = 0;
2022		BUG();
2023	}
2024
2025	ext_debug(" -> %u:%lu\n", lblock, len);
2026	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
2027}
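/*
 * Worked example (illustrative) for the "gap before" case above: if the
 * found extent starts at logical block 100 and the lookup was for block
 * 40, the cached gap is [40, 100), i.e. lblock = 40 and len = 60.
 */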
2028
2029static int
2030ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2031			struct ext4_extent *ex)
2032{
2033	struct ext4_ext_cache *cex;
2034	int ret = EXT4_EXT_CACHE_NO;
2035
2036	/*
2037	 * We borrow i_block_reservation_lock to protect i_cached_extent
2038	 */
2039	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2040	cex = &EXT4_I(inode)->i_cached_extent;
2041
2042	/* does the cache contain valid data? */
2043	if (cex->ec_type == EXT4_EXT_CACHE_NO)
2044		goto errout;
2045
2046	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
2047			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
2048	if (in_range(block, cex->ec_block, cex->ec_len)) {
2049		ex->ee_block = cpu_to_le32(cex->ec_block);
2050		ext4_ext_store_pblock(ex, cex->ec_start);
2051		ex->ee_len = cpu_to_le16(cex->ec_len);
2052		ext_debug("%u cached by %u:%u:%llu\n",
2053				block,
2054				cex->ec_block, cex->ec_len, cex->ec_start);
2055		ret = cex->ec_type;
2056	}
2057errout:
2058	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2059	return ret;
2060}
2061
2062/*
2063 * ext4_ext_rm_idx:
2064 * removes index from the index block.
2065 * It's used in truncate case only, thus all requests are for
2066 * last index in the block only.
2067 */
2068static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2069			struct ext4_ext_path *path)
2070{
2071	int err;
2072	ext4_fsblk_t leaf;
2073
2074	/* free index block */
2075	path--;
2076	leaf = idx_pblock(path->p_idx);
2077	if (unlikely(path->p_hdr->eh_entries == 0)) {
2078		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2079		return -EIO;
2080	}
2081	err = ext4_ext_get_access(handle, inode, path);
2082	if (err)
2083		return err;
2084	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2085	err = ext4_ext_dirty(handle, inode, path);
2086	if (err)
2087		return err;
2088	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2089	ext4_free_blocks(handle, inode, 0, leaf, 1,
2090			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2091	return err;
2092}
2093
2094/*
2095 * ext4_ext_calc_credits_for_single_extent:
2096 * This routine returns max. credits that needed to insert an extent
2097 * This routine returns the max. credits needed to insert an extent
2098 * into the extent tree.
2099 * When passing the actual path, the caller should calculate the credits
2100 */
2101int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2102						struct ext4_ext_path *path)
2103{
2104	if (path) {
2105		int depth = ext_depth(inode);
2106		int ret = 0;
2107
2108		/* perhaps there is space in the leaf? */
2109		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2110				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2111
2112			/*
2113			 *  There is some space in the leaf, so there is no
2114			 *  need to account for the leaf block credit.
2115			 *
2116			 *  Bitmaps and block group descriptor blocks
2117			 *  and other metadata blocks still need to be
2118			 *  accounted for.
2119			 */
2120			/* 1 bitmap, 1 block group descriptor */
2121			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2122			return ret;
2123		}
2124	}
2125
2126	return ext4_chunk_trans_blocks(inode, nrblocks);
2127}
2128
2129/*
2130 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
2131 *
2132 * If nrblocks fit in a single extent (chunk flag is 1), then in the
2133 * worst case each tree level's index/leaf needs to be changed, and
2134 * if the tree splits due to inserting a new extent, the old
2135 * index/leaf blocks need to be updated too.
2136 *
2137 * If the nrblocks are discontiguous, they could cause
2138 * the whole tree to split more than once, but this is really rare.
2139 */
2140int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2141{
2142	int index;
2143	int depth = ext_depth(inode);
2144
2145	if (chunk)
2146		index = depth * 2;
2147	else
2148		index = depth * 3;
2149
2150	return index;
2151}
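/*
 * Worked example (illustrative): with a tree of depth 2, a contiguous
 * chunk (chunk == 1) is charged 2 * 2 = 4 index/leaf blocks for the
 * worst-case per-level update, while a discontiguous request
 * (chunk == 0) is charged 2 * 3 = 6 to allow for an extra split at
 * each level.
 */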
2152
2153static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2154				struct ext4_extent *ex,
2155				ext4_lblk_t from, ext4_lblk_t to)
2156{
2157	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2158	int flags = EXT4_FREE_BLOCKS_FORGET;
2159
2160	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2161		flags |= EXT4_FREE_BLOCKS_METADATA;
2162#ifdef EXTENTS_STATS
2163	{
2164		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2165		spin_lock(&sbi->s_ext_stats_lock);
2166		sbi->s_ext_blocks += ee_len;
2167		sbi->s_ext_extents++;
2168		if (ee_len < sbi->s_ext_min)
2169			sbi->s_ext_min = ee_len;
2170		if (ee_len > sbi->s_ext_max)
2171			sbi->s_ext_max = ee_len;
2172		if (ext_depth(inode) > sbi->s_depth_max)
2173			sbi->s_depth_max = ext_depth(inode);
2174		spin_unlock(&sbi->s_ext_stats_lock);
2175	}
2176#endif
2177	if (from >= le32_to_cpu(ex->ee_block)
2178	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2179		/* tail removal */
2180		ext4_lblk_t num;
2181		ext4_fsblk_t start;
2182
2183		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2184		start = ext_pblock(ex) + ee_len - num;
2185		ext_debug("free last %u blocks starting %llu\n", num, start);
2186		ext4_free_blocks(handle, inode, 0, start, num, flags);
2187	} else if (from == le32_to_cpu(ex->ee_block)
2188		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2189		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
2190			from, to, le32_to_cpu(ex->ee_block), ee_len);
2191	} else {
2192		printk(KERN_INFO "strange request: removal(2) "
2193				"%u-%u from %u:%u\n",
2194				from, to, le32_to_cpu(ex->ee_block), ee_len);
2195	}
2196	return 0;
2197}
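/*
 * Worked example (illustrative) for the tail-removal path above:
 * with ee_block = 100, ee_len = 10, from = 105 and to = 109,
 * num = 100 + 10 - 105 = 5 and start = ext_pblock(ex) + 10 - 5,
 * so the last five blocks of the extent are freed.
 */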
2198
2199static int
2200ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2201		struct ext4_ext_path *path, ext4_lblk_t start)
2202{
2203	int err = 0, correct_index = 0;
2204	int depth = ext_depth(inode), credits;
2205	struct ext4_extent_header *eh;
2206	ext4_lblk_t a, b, block;
2207	unsigned num;
2208	ext4_lblk_t ex_ee_block;
2209	unsigned short ex_ee_len;
2210	unsigned uninitialized = 0;
2211	struct ext4_extent *ex;
2212
2213	/* the header must be checked already in ext4_ext_remove_space() */
2214	ext_debug("truncate since %u in leaf\n", start);
2215	if (!path[depth].p_hdr)
2216		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2217	eh = path[depth].p_hdr;
2218	if (unlikely(path[depth].p_hdr == NULL)) {
2219		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2220		return -EIO;
2221	}
2222	/* find where to start removing */
2223	ex = EXT_LAST_EXTENT(eh);
2224
2225	ex_ee_block = le32_to_cpu(ex->ee_block);
2226	ex_ee_len = ext4_ext_get_actual_len(ex);
2227
2228	while (ex >= EXT_FIRST_EXTENT(eh) &&
2229			ex_ee_block + ex_ee_len > start) {
2230
2231		if (ext4_ext_is_uninitialized(ex))
2232			uninitialized = 1;
2233		else
2234			uninitialized = 0;
2235
2236		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2237			 uninitialized, ex_ee_len);
2238		path[depth].p_ext = ex;
2239
2240		a = ex_ee_block > start ? ex_ee_block : start;
2241		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
2242			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
2243
2244		ext_debug("  border %u:%u\n", a, b);
2245
2246		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
2247			block = 0;
2248			num = 0;
2249			BUG();
2250		} else if (a != ex_ee_block) {
2251			/* remove tail of the extent */
2252			block = ex_ee_block;
2253			num = a - block;
2254		} else if (b != ex_ee_block + ex_ee_len - 1) {
2255			/* remove head of the extent */
2256			block = a;
2257			num = b - a;
2258			/* there is no "make a hole" API yet */
2259			BUG();
2260		} else {
2261			/* remove whole extent: excellent! */
2262			block = ex_ee_block;
2263			num = 0;
2264			BUG_ON(a != ex_ee_block);
2265			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
2266		}
2267
2268		/*
2269		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2270		 * descriptor) for each block group; assume two block
2271		 * groups plus ex_ee_len/blocks_per_block_group for
2272		 * the worst case
2273		 */
2274		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2275		if (ex == EXT_FIRST_EXTENT(eh)) {
2276			correct_index = 1;
2277			credits += (ext_depth(inode)) + 1;
2278		}
2279		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2280
2281		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2282		if (err)
2283			goto out;
2284
2285		err = ext4_ext_get_access(handle, inode, path + depth);
2286		if (err)
2287			goto out;
2288
2289		err = ext4_remove_blocks(handle, inode, ex, a, b);
2290		if (err)
2291			goto out;
2292
2293		if (num == 0) {
2294			/* this extent is removed; mark slot entirely unused */
2295			ext4_ext_store_pblock(ex, 0);
2296			le16_add_cpu(&eh->eh_entries, -1);
2297		}
2298
2299		ex->ee_block = cpu_to_le32(block);
2300		ex->ee_len = cpu_to_le16(num);
2301		/*
2302		 * Do not mark uninitialized if all the blocks in the
2303		 * extent have been removed.
2304		 */
2305		if (uninitialized && num)
2306			ext4_ext_mark_uninitialized(ex);
2307
2308		err = ext4_ext_dirty(handle, inode, path + depth);
2309		if (err)
2310			goto out;
2311
2312		ext_debug("new extent: %u:%u:%llu\n", block, num,
2313				ext_pblock(ex));
2314		ex--;
2315		ex_ee_block = le32_to_cpu(ex->ee_block);
2316		ex_ee_len = ext4_ext_get_actual_len(ex);
2317	}
2318
2319	if (correct_index && eh->eh_entries)
2320		err = ext4_ext_correct_indexes(handle, inode, path);
2321
2322	/* if this leaf is free, then we should
2323	 * remove it from index block above */
2324	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2325		err = ext4_ext_rm_idx(handle, inode, path + depth);
2326
2327out:
2328	return err;
2329}
2330
2331/*
2332 * ext4_ext_more_to_rm:
2333 * returns 1 if current index has to be freed (even partial)
2334 */
2335static int
2336ext4_ext_more_to_rm(struct ext4_ext_path *path)
2337{
2338	BUG_ON(path->p_idx == NULL);
2339
2340	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2341		return 0;
2342
2343	/*
2344	 * if truncate on deeper level happened, it wasn't partial,
2345	 * so we have to consider current index for truncation
2346	 */
2347	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2348		return 0;
2349	return 1;
2350}
2351
2352static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2353{
2354	struct super_block *sb = inode->i_sb;
2355	int depth = ext_depth(inode);
2356	struct ext4_ext_path *path;
2357	handle_t *handle;
2358	int i, err;
2359
2360	ext_debug("truncate since %u\n", start);
2361
2362	/* probably first extent we're gonna free will be last in block */
2363	handle = ext4_journal_start(inode, depth + 1);
2364	if (IS_ERR(handle))
2365		return PTR_ERR(handle);
2366
2367again:
2368	ext4_ext_invalidate_cache(inode);
2369
2370	/*
2371	 * We start scanning from right side, freeing all the blocks
2372	 * after i_size and walking into the tree depth-wise.
2373	 */
2374	depth = ext_depth(inode);
2375	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2376	if (path == NULL) {
2377		ext4_journal_stop(handle);
2378		return -ENOMEM;
2379	}
2380	path[0].p_depth = depth;
2381	path[0].p_hdr = ext_inode_hdr(inode);
2382	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2383		err = -EIO;
2384		goto out;
2385	}
2386	i = err = 0;
2387
2388	while (i >= 0 && err == 0) {
2389		if (i == depth) {
2390			/* this is leaf block */
2391			err = ext4_ext_rm_leaf(handle, inode, path, start);
2392			/* root level has p_bh == NULL, brelse() eats this */
2393			brelse(path[i].p_bh);
2394			path[i].p_bh = NULL;
2395			i--;
2396			continue;
2397		}
2398
2399		/* this is index block */
2400		if (!path[i].p_hdr) {
2401			ext_debug("initialize header\n");
2402			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2403		}
2404
2405		if (!path[i].p_idx) {
2406			/* this level hasn't been touched yet */
2407			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2408			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2409			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2410				  path[i].p_hdr,
2411				  le16_to_cpu(path[i].p_hdr->eh_entries));
2412		} else {
2413			/* we were already here, look at the next index */
2414			path[i].p_idx--;
2415		}
2416
2417		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2418				i, EXT_FIRST_INDEX(path[i].p_hdr),
2419				path[i].p_idx);
2420		if (ext4_ext_more_to_rm(path + i)) {
2421			struct buffer_head *bh;
2422			/* go to the next level */
2423			ext_debug("move to level %d (block %llu)\n",
2424				  i + 1, idx_pblock(path[i].p_idx));
2425			memset(path + i + 1, 0, sizeof(*path));
2426			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
2427			if (!bh) {
2428				/* should we reset i_size? */
2429				err = -EIO;
2430				break;
2431			}
2432			if (WARN_ON(i + 1 > depth)) {
2433				err = -EIO;
2434				break;
2435			}
2436			if (ext4_ext_check(inode, ext_block_hdr(bh),
2437							depth - i - 1)) {
2438				err = -EIO;
2439				break;
2440			}
2441			path[i + 1].p_bh = bh;
2442
2443			/* save actual number of indexes since this
2444			 * number is changed at the next iteration */
2445			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2446			i++;
2447		} else {
2448			/* we finished processing this index, go up */
2449			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2450				/* index is empty, remove it;
2451				 * handle must already be prepared by
2452				 * ext4_ext_rm_leaf() */
2453				err = ext4_ext_rm_idx(handle, inode, path + i);
2454			}
2455			/* root level has p_bh == NULL, brelse() eats this */
2456			brelse(path[i].p_bh);
2457			path[i].p_bh = NULL;
2458			i--;
2459			ext_debug("return to level %d\n", i);
2460		}
2461	}
2462
2463	/* TODO: flexible tree reduction should be here */
2464	if (path->p_hdr->eh_entries == 0) {
2465		/*
2466		 * truncate to zero freed all the tree,
2467		 * so we need to correct eh_depth
2468		 */
2469		err = ext4_ext_get_access(handle, inode, path);
2470		if (err == 0) {
2471			ext_inode_hdr(inode)->eh_depth = 0;
2472			ext_inode_hdr(inode)->eh_max =
2473				cpu_to_le16(ext4_ext_space_root(inode, 0));
2474			err = ext4_ext_dirty(handle, inode, path);
2475		}
2476	}
2477out:
2478	ext4_ext_drop_refs(path);
2479	kfree(path);
2480	if (err == -EAGAIN)
2481		goto again;
2482	ext4_journal_stop(handle);
2483
2484	return err;
2485}
2486
2487/*
2488 * called at mount time
2489 */
2490void ext4_ext_init(struct super_block *sb)
2491{
2492	/*
2493	 * possible initialization would be here
2494	 */
2495
2496	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2497#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2498		printk(KERN_INFO "EXT4-fs: file extents enabled");
2499#ifdef AGGRESSIVE_TEST
2500		printk(", aggressive tests");
2501#endif
2502#ifdef CHECK_BINSEARCH
2503		printk(", check binsearch");
2504#endif
2505#ifdef EXTENTS_STATS
2506		printk(", stats");
2507#endif
2508		printk("\n");
2509#endif
2510#ifdef EXTENTS_STATS
2511		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2512		EXT4_SB(sb)->s_ext_min = 1 << 30;
2513		EXT4_SB(sb)->s_ext_max = 0;
2514#endif
2515	}
2516}
2517
2518/*
2519 * called at umount time
2520 */
2521void ext4_ext_release(struct super_block *sb)
2522{
2523	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2524		return;
2525
2526#ifdef EXTENTS_STATS
2527	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2528		struct ext4_sb_info *sbi = EXT4_SB(sb);
2529		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2530			sbi->s_ext_blocks, sbi->s_ext_extents,
2531			sbi->s_ext_blocks / sbi->s_ext_extents);
2532		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2533			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2534	}
2535#endif
2536}
2537
2538static void bi_complete(struct bio *bio, int error)
2539{
2540	complete((struct completion *)bio->bi_private);
2541}
2542
2543/* FIXME!! we need to try to merge to left or right after zero-out  */
2544static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2545{
2546	int ret;
2547	struct bio *bio;
2548	int blkbits, blocksize;
2549	sector_t ee_pblock;
2550	struct completion event;
2551	unsigned int ee_len, len, done, offset;
2552
2553
2554	blkbits   = inode->i_blkbits;
2555	blocksize = inode->i_sb->s_blocksize;
2556	ee_len    = ext4_ext_get_actual_len(ex);
2557	ee_pblock = ext_pblock(ex);
2558
2559	/* convert ee_pblock to 512 byte sectors */
2560	ee_pblock = ee_pblock << (blkbits - 9);
2561
2562	while (ee_len > 0) {
2563
2564		if (ee_len > BIO_MAX_PAGES)
2565			len = BIO_MAX_PAGES;
2566		else
2567			len = ee_len;
2568
2569		bio = bio_alloc(GFP_NOIO, len);
2570		if (!bio)
2571			return -ENOMEM;
2572
2573		bio->bi_sector = ee_pblock;
2574		bio->bi_bdev   = inode->i_sb->s_bdev;
2575
2576		done = 0;
2577		offset = 0;
2578		while (done < len) {
2579			ret = bio_add_page(bio, ZERO_PAGE(0),
2580							blocksize, offset);
2581			if (ret != blocksize) {
2582				/*
2583				 * We can't add any more pages because of
2584				 * hardware limitations.  Start a new bio.
2585				 */
2586				break;
2587			}
2588			done++;
2589			offset += blocksize;
2590			if (offset >= PAGE_CACHE_SIZE)
2591				offset = 0;
2592		}
2593
2594		init_completion(&event);
2595		bio->bi_private = &event;
2596		bio->bi_end_io = bi_complete;
2597		submit_bio(WRITE, bio);
2598		wait_for_completion(&event);
2599
2600		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2601			bio_put(bio);
2602			return -EIO;
2603		}
2604		bio_put(bio);
2605		ee_len    -= done;
2606		ee_pblock += done  << (blkbits - 9);
2607	}
2608	return 0;
2609}
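/*
 * Worked example (illustrative): with 4KiB blocks, blkbits = 12, so the
 * block-to-sector conversion above shifts by 12 - 9 = 3, i.e. each
 * filesystem block covers 8 512-byte sectors of the zero-out bio.
 */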
2610
2611#define EXT4_EXT_ZERO_LEN 7
2612/*
2613 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (up to three: one initialized and two
2616 * uninitialized).
2617 * There are three possibilities:
2618 *   a> No split required: the entire extent should be initialized
2619 *   b> Split into two extents: the write happens at either end of the extent
2620 *   c> Split into three extents: someone is writing in the middle of the extent
2621 */
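/*
 * Illustrative layout for case c> above (a write into the middle of an
 * uninitialized extent); ex1/ex2/ex3 follow the naming used in the
 * function below:
 *
 *   ee_block         m_lblk     m_lblk + m_len     ee_block + ee_len
 *      |---- ex1 ------|------ ex2 ------|-------- ex3 --------|
 *       uninitialized     initialized         uninitialized
 */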
2622static int ext4_ext_convert_to_initialized(handle_t *handle,
2623					   struct inode *inode,
2624					   struct ext4_map_blocks *map,
2625					   struct ext4_ext_path *path)
2626{
2627	struct ext4_extent *ex, newex, orig_ex;
2628	struct ext4_extent *ex1 = NULL;
2629	struct ext4_extent *ex2 = NULL;
2630	struct ext4_extent *ex3 = NULL;
2631	struct ext4_extent_header *eh;
2632	ext4_lblk_t ee_block, eof_block;
2633	unsigned int allocated, ee_len, depth;
2634	ext4_fsblk_t newblock;
2635	int err = 0;
2636	int ret = 0;
2637	int may_zeroout;
2638
2639	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
2640		"block %llu, max_blocks %u\n", inode->i_ino,
2641		(unsigned long long)map->m_lblk, map->m_len);
2642
2643	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2644		inode->i_sb->s_blocksize_bits;
2645	if (eof_block < map->m_lblk + map->m_len)
2646		eof_block = map->m_lblk + map->m_len;
2647
2648	depth = ext_depth(inode);
2649	eh = path[depth].p_hdr;
2650	ex = path[depth].p_ext;
2651	ee_block = le32_to_cpu(ex->ee_block);
2652	ee_len = ext4_ext_get_actual_len(ex);
2653	allocated = ee_len - (map->m_lblk - ee_block);
2654	newblock = map->m_lblk - ee_block + ext_pblock(ex);
2655
2656	ex2 = ex;
2657	orig_ex.ee_block = ex->ee_block;
2658	orig_ex.ee_len   = cpu_to_le16(ee_len);
2659	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2660
2661	/*
2662	 * It is safe to convert extent to initialized via explicit
2663	 * zeroout only if the extent is fully inside i_size or new_size.
2664	 */
2665	may_zeroout = ee_block + ee_len <= eof_block;
2666
2667	err = ext4_ext_get_access(handle, inode, path + depth);
2668	if (err)
2669		goto out;
2670	/* if the extent has < 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2671	if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2672		err =  ext4_ext_zeroout(inode, &orig_ex);
2673		if (err)
2674			goto fix_extent_len;
2675		/* update the extent length and mark as initialized */
2676		ex->ee_block = orig_ex.ee_block;
2677		ex->ee_len   = orig_ex.ee_len;
2678		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2679		ext4_ext_dirty(handle, inode, path + depth);
2680		/* zeroed the full extent */
2681		return allocated;
2682	}
2683
2684	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2685	if (map->m_lblk > ee_block) {
2686		ex1 = ex;
2687		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2688		ext4_ext_mark_uninitialized(ex1);
2689		ex2 = &newex;
2690	}
2691	/*
2692	 * for sanity, update the length of the ex2 extent before
2693	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2694	 * overlap of blocks.
2695	 */
2696	if (!ex1 && allocated > map->m_len)
2697		ex2->ee_len = cpu_to_le16(map->m_len);
2698	/* ex3: to ee_block + ee_len : uninitialised */
2699	if (allocated > map->m_len) {
2700		unsigned int newdepth;
2701		/* if the extent has < EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2702		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2703			/*
2704			 * map->m_lblk == ee_block is handled by the zeroout
2705			 * at the beginning.
2706			 * Mark first half uninitialized.
2707			 * Mark second half initialized and zero out the
2708			 * initialized extent
2709			 */
2710			ex->ee_block = orig_ex.ee_block;
2711			ex->ee_len   = cpu_to_le16(ee_len - allocated);
2712			ext4_ext_mark_uninitialized(ex);
2713			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2714			ext4_ext_dirty(handle, inode, path + depth);
2715
2716			ex3 = &newex;
2717			ex3->ee_block = cpu_to_le32(map->m_lblk);
2718			ext4_ext_store_pblock(ex3, newblock);
2719			ex3->ee_len = cpu_to_le16(allocated);
2720			err = ext4_ext_insert_extent(handle, inode, path,
2721							ex3, 0);
2722			if (err == -ENOSPC) {
2723				err =  ext4_ext_zeroout(inode, &orig_ex);
2724				if (err)
2725					goto fix_extent_len;
2726				ex->ee_block = orig_ex.ee_block;
2727				ex->ee_len   = orig_ex.ee_len;
2728				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2729				ext4_ext_dirty(handle, inode, path + depth);
2730				/* blocks available from map->m_lblk */
2731				return allocated;
2732
2733			} else if (err)
2734				goto fix_extent_len;
2735
2736			/*
2737			 * We need to zero out the second half because
2738			 * a fallocate request can update the file size, and
2739			 * converting the second half to an initialized extent
2740			 * implies that we could leak junk data to user
2741			 * space.
2742			 */
2743			err =  ext4_ext_zeroout(inode, ex3);
2744			if (err) {
2745				/*
2746				 * We should actually mark the
2747				 * second half as uninitialized and return an error;
2748				 * the insert would have changed the extent
2749				 */
2750				depth = ext_depth(inode);
2751				ext4_ext_drop_refs(path);
2752				path = ext4_ext_find_extent(inode, map->m_lblk,
2753							    path);
2754				if (IS_ERR(path)) {
2755					err = PTR_ERR(path);
2756					return err;
2757				}
2758				/* get the second half extent details */
2759				ex = path[depth].p_ext;
2760				err = ext4_ext_get_access(handle, inode,
2761								path + depth);
2762				if (err)
2763					return err;
2764				ext4_ext_mark_uninitialized(ex);
2765				ext4_ext_dirty(handle, inode, path + depth);
2766				return err;
2767			}
2768
2769			/* zeroed the second half */
2770			return allocated;
2771		}
2772		ex3 = &newex;
2773		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2774		ext4_ext_store_pblock(ex3, newblock + map->m_len);
2775		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2776		ext4_ext_mark_uninitialized(ex3);
2777		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2778		if (err == -ENOSPC && may_zeroout) {
2779			err =  ext4_ext_zeroout(inode, &orig_ex);
2780			if (err)
2781				goto fix_extent_len;
2782			/* update the extent length and mark as initialized */
2783			ex->ee_block = orig_ex.ee_block;
2784			ex->ee_len   = orig_ex.ee_len;
2785			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2786			ext4_ext_dirty(handle, inode, path + depth);
2787			/* zeroed the full extent */
2788			/* blocks available from map->m_lblk */
2789			return allocated;
2790
2791		} else if (err)
2792			goto fix_extent_len;
2793		/*
2794		 * The depth, and hence eh & ex might change
2795		 * as part of the insert above.
2796		 */
2797		newdepth = ext_depth(inode);
2798		/*
2799		 * update the extent length after successful insert of the
2800		 * split extent
2801		 */
2802		ee_len -= ext4_ext_get_actual_len(ex3);
2803		orig_ex.ee_len = cpu_to_le16(ee_len);
2804		may_zeroout = ee_block + ee_len <= eof_block;
2805
2806		depth = newdepth;
2807		ext4_ext_drop_refs(path);
2808		path = ext4_ext_find_extent(inode, map->m_lblk, path);
2809		if (IS_ERR(path)) {
2810			err = PTR_ERR(path);
2811			goto out;
2812		}
2813		eh = path[depth].p_hdr;
2814		ex = path[depth].p_ext;
2815		if (ex2 != &newex)
2816			ex2 = ex;
2817
2818		err = ext4_ext_get_access(handle, inode, path + depth);
2819		if (err)
2820			goto out;
2821
2822		allocated = map->m_len;
2823
2824		/* If the extent has fewer than EXT4_EXT_ZERO_LEN blocks and we
2825		 * are trying to insert an extent in the middle, zero it out directly;
2826		 * otherwise give the extent a chance to merge to the left
2827		 */
2828		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2829			map->m_lblk != ee_block && may_zeroout) {
2830			err =  ext4_ext_zeroout(inode, &orig_ex);
2831			if (err)
2832				goto fix_extent_len;
2833			/* update the extent length and mark as initialized */
2834			ex->ee_block = orig_ex.ee_block;
2835			ex->ee_len   = orig_ex.ee_len;
2836			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2837			ext4_ext_dirty(handle, inode, path + depth);
2838			/* zero out the first half */
2839			/* blocks available from map->m_lblk */
2840			return allocated;
2841		}
2842	}
2843	/*
2844	 * If there was a change of depth as part of the
2845	 * insertion of ex3 above, we need to update the length
2846	 * of the ex1 extent again here
2847	 */
2848	if (ex1 && ex1 != ex) {
2849		ex1 = ex;
2850		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2851		ext4_ext_mark_uninitialized(ex1);
2852		ex2 = &newex;
2853	}
2854	/* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2855	ex2->ee_block = cpu_to_le32(map->m_lblk);
2856	ext4_ext_store_pblock(ex2, newblock);
2857	ex2->ee_len = cpu_to_le16(allocated);
2858	if (ex2 != ex)
2859		goto insert;
2860	/*
2861	 * New (initialized) extent starts from the first block
2862	 * in the current extent. i.e., ex2 == ex
2863	 * We have to see if it can be merged with the extent
2864	 * on the left.
2865	 */
2866	if (ex2 > EXT_FIRST_EXTENT(eh)) {
2867		/*
2868		 * To merge left, pass "ex2 - 1" to try_to_merge(),
2869		 * since it merges towards right _only_.
2870		 */
2871		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2872		if (ret) {
2873			err = ext4_ext_correct_indexes(handle, inode, path);
2874			if (err)
2875				goto out;
2876			depth = ext_depth(inode);
2877			ex2--;
2878		}
2879	}
2880	/*
2881	 * Try to merge towards the right. This might be required
2882	 * only when the whole extent is being written to.
2883	 * i.e. ex2 == ex and ex3 == NULL.
2884	 */
2885	if (!ex3) {
2886		ret = ext4_ext_try_to_merge(inode, path, ex2);
2887		if (ret) {
2888			err = ext4_ext_correct_indexes(handle, inode, path);
2889			if (err)
2890				goto out;
2891		}
2892	}
2893	/* Mark modified extent as dirty */
2894	err = ext4_ext_dirty(handle, inode, path + depth);
2895	goto out;
2896insert:
2897	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2898	if (err == -ENOSPC && may_zeroout) {
2899		err =  ext4_ext_zeroout(inode, &orig_ex);
2900		if (err)
2901			goto fix_extent_len;
2902		/* update the extent length and mark as initialized */
2903		ex->ee_block = orig_ex.ee_block;
2904		ex->ee_len   = orig_ex.ee_len;
2905		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2906		ext4_ext_dirty(handle, inode, path + depth);
2907		/* zero out the first half */
2908		return allocated;
2909	} else if (err)
2910		goto fix_extent_len;
2911out:
2912	ext4_ext_show_leaf(inode, path);
2913	return err ? err : allocated;
2914
2915fix_extent_len:
2916	ex->ee_block = orig_ex.ee_block;
2917	ex->ee_len   = orig_ex.ee_len;
2918	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2919	ext4_ext_mark_uninitialized(ex);
2920	ext4_ext_dirty(handle, inode, path + depth);
2921	return err;
2922}
2923
2924/*
2925 * This function is called by ext4_ext_map_blocks() from
2926 * ext4_get_blocks_dio_write() when DIO is used to write
2927 * to an uninitialized extent.
2928 *
2929 * Writing to an uninitialized extent may result in splitting the
2930 * extent into multiple uninitialized extents (up to three).
2931 * There are three possibilities:
2932 *   a> No split required: the entire extent should stay uninitialized
2933 *   b> Split into two extents: the write happens at either end of the extent
2934 *   c> Split into three extents: someone is writing in the middle of the extent
2935 *
2936 * One or more index blocks may be needed if the extent tree grows after
2937 * the uninitialized extent is split. To prevent an ENOSPC at IO
2938 * completion, we split the uninitialized extent before the DIO is
2939 * submitted. At this point the uninitialized extent is split
2940 * into (at most) three uninitialized extents. After the IO completes, the
2941 * part being filled is converted to initialized by the end_io callback,
2942 * via ext4_convert_unwritten_extents().
2943 *
2944 * Returns the size of the uninitialized extent to be written on success.
2945 */
2946static int ext4_split_unwritten_extents(handle_t *handle,
2947					struct inode *inode,
2948					struct ext4_map_blocks *map,
2949					struct ext4_ext_path *path,
2950					int flags)
2951{
2952	struct ext4_extent *ex, newex, orig_ex;
2953	struct ext4_extent *ex1 = NULL;
2954	struct ext4_extent *ex2 = NULL;
2955	struct ext4_extent *ex3 = NULL;
2956	ext4_lblk_t ee_block, eof_block;
2957	unsigned int allocated, ee_len, depth;
2958	ext4_fsblk_t newblock;
2959	int err = 0;
2960	int may_zeroout;
2961
2962	ext_debug("ext4_split_unwritten_extents: inode %lu, logical "
2963		"block %llu, max_blocks %u\n", inode->i_ino,
2964		(unsigned long long)map->m_lblk, map->m_len);
2965
2966	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2967		inode->i_sb->s_blocksize_bits;
2968	if (eof_block < map->m_lblk + map->m_len)
2969		eof_block = map->m_lblk + map->m_len;
2970
2971	depth = ext_depth(inode);
2972	ex = path[depth].p_ext;
2973	ee_block = le32_to_cpu(ex->ee_block);
2974	ee_len = ext4_ext_get_actual_len(ex);
2975	allocated = ee_len - (map->m_lblk - ee_block);
2976	newblock = map->m_lblk - ee_block + ext_pblock(ex);
2977
2978	ex2 = ex;
2979	orig_ex.ee_block = ex->ee_block;
2980	orig_ex.ee_len   = cpu_to_le16(ee_len);
2981	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2982
2983	/*
2984	 * It is safe to convert extent to initialized via explicit
2985	 * zeroout only if the extent is fully inside i_size or new_size.
2986	 */
2987	may_zeroout = ee_block + ee_len <= eof_block;
2988
2989	/*
2990 	 * If the uninitialized extent begins at the same logical
2991 	 * block where the write begins, and the write completely
2992 	 * covers the extent, then we don't need to split it.
2993 	 */
2994	if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2995		return allocated;
2996
2997	err = ext4_ext_get_access(handle, inode, path + depth);
2998	if (err)
2999		goto out;
3000	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
3001	if (map->m_lblk > ee_block) {
3002		ex1 = ex;
3003		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3004		ext4_ext_mark_uninitialized(ex1);
3005		ex2 = &newex;
3006	}
3007	/*
3008	 * for sanity, update the length of the ex2 extent before
3009	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
3010	 * overlap of blocks.
3011	 */
3012	if (!ex1 && allocated > map->m_len)
3013		ex2->ee_len = cpu_to_le16(map->m_len);
3014	/* ex3: to ee_block + ee_len : uninitialised */
3015	if (allocated > map->m_len) {
3016		unsigned int newdepth;
3017		ex3 = &newex;
3018		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
3019		ext4_ext_store_pblock(ex3, newblock + map->m_len);
3020		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
3021		ext4_ext_mark_uninitialized(ex3);
3022		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
3023		if (err == -ENOSPC && may_zeroout) {
3024			err =  ext4_ext_zeroout(inode, &orig_ex);
3025			if (err)
3026				goto fix_extent_len;
3027			/* update the extent length and mark as initialized */
3028			ex->ee_block = orig_ex.ee_block;
3029			ex->ee_len   = orig_ex.ee_len;
3030			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3031			ext4_ext_dirty(handle, inode, path + depth);
3032			/* zeroed the full extent */
3033			/* blocks available from map->m_lblk */
3034			return allocated;
3035
3036		} else if (err)
3037			goto fix_extent_len;
3038		/*
3039		 * The depth, and hence eh & ex might change
3040		 * as part of the insert above.
3041		 */
3042		newdepth = ext_depth(inode);
3043		/*
3044		 * update the extent length after successful insert of the
3045		 * split extent
3046		 */
3047		ee_len -= ext4_ext_get_actual_len(ex3);
3048		orig_ex.ee_len = cpu_to_le16(ee_len);
3049		may_zeroout = ee_block + ee_len <= eof_block;
3050
3051		depth = newdepth;
3052		ext4_ext_drop_refs(path);
3053		path = ext4_ext_find_extent(inode, map->m_lblk, path);
3054		if (IS_ERR(path)) {
3055			err = PTR_ERR(path);
3056			goto out;
3057		}
3058		ex = path[depth].p_ext;
3059		if (ex2 != &newex)
3060			ex2 = ex;
3061
3062		err = ext4_ext_get_access(handle, inode, path + depth);
3063		if (err)
3064			goto out;
3065
3066		allocated = map->m_len;
3067	}
3068	/*
3069	 * If there was a change of depth as part of the
3070	 * insertion of ex3 above, we need to update the length
3071	 * of the ex1 extent again here
3072	 */
3073	if (ex1 && ex1 != ex) {
3074		ex1 = ex;
3075		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3076		ext4_ext_mark_uninitialized(ex1);
3077		ex2 = &newex;
3078	}
3079	/*
3080	 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3081	 * using direct I/O, uninitialised still.
3082	 */
3083	ex2->ee_block = cpu_to_le32(map->m_lblk);
3084	ext4_ext_store_pblock(ex2, newblock);
3085	ex2->ee_len = cpu_to_le16(allocated);
3086	ext4_ext_mark_uninitialized(ex2);
3087	if (ex2 != ex)
3088		goto insert;
3089	/* Mark modified extent as dirty */
3090	err = ext4_ext_dirty(handle, inode, path + depth);
3091	ext_debug("out here\n");
3092	goto out;
3093insert:
3094	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3095	if (err == -ENOSPC && may_zeroout) {
3096		err =  ext4_ext_zeroout(inode, &orig_ex);
3097		if (err)
3098			goto fix_extent_len;
3099		/* update the extent length and mark as initialized */
3100		ex->ee_block = orig_ex.ee_block;
3101		ex->ee_len   = orig_ex.ee_len;
3102		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3103		ext4_ext_dirty(handle, inode, path + depth);
3104		/* zero out the first half */
3105		return allocated;
3106	} else if (err)
3107		goto fix_extent_len;
3108out:
3109	ext4_ext_show_leaf(inode, path);
3110	return err ? err : allocated;
3111
3112fix_extent_len:
3113	ex->ee_block = orig_ex.ee_block;
3114	ex->ee_len   = orig_ex.ee_len;
3115	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3116	ext4_ext_mark_uninitialized(ex);
3117	ext4_ext_dirty(handle, inode, path + depth);
3118	return err;
3119}
3120static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3121					      struct inode *inode,
3122					      struct ext4_ext_path *path)
3123{
3124	struct ext4_extent *ex;
3125	struct ext4_extent_header *eh;
3126	int depth;
3127	int err = 0;
3128	int ret = 0;
3129
3130	depth = ext_depth(inode);
3131	eh = path[depth].p_hdr;
3132	ex = path[depth].p_ext;
3133
3134	err = ext4_ext_get_access(handle, inode, path + depth);
3135	if (err)
3136		goto out;
3137	/* first mark the extent as initialized */
3138	ext4_ext_mark_initialized(ex);
3139
3140	/*
3141	 * We have to see if it can be merged with the extent
3142	 * on the left.
3143	 */
3144	if (ex > EXT_FIRST_EXTENT(eh)) {
3145		/*
3146		 * To merge left, pass "ex - 1" to try_to_merge(),
3147		 * since it merges towards right _only_.
3148		 */
3149		ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3150		if (ret) {
3151			err = ext4_ext_correct_indexes(handle, inode, path);
3152			if (err)
3153				goto out;
3154			depth = ext_depth(inode);
3155			ex--;
3156		}
3157	}
3158	/*
3159	 * Try to merge towards the right.
3160	 */
3161	ret = ext4_ext_try_to_merge(inode, path, ex);
3162	if (ret) {
3163		err = ext4_ext_correct_indexes(handle, inode, path);
3164		if (err)
3165			goto out;
3166		depth = ext_depth(inode);
3167	}
3168	/* Mark modified extent as dirty */
3169	err = ext4_ext_dirty(handle, inode, path + depth);
3170out:
3171	ext4_ext_show_leaf(inode, path);
3172	return err;
3173}
3174
3175static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3176			sector_t block, int count)
3177{
3178	int i;
3179	for (i = 0; i < count; i++)
3180                unmap_underlying_metadata(bdev, block + i);
3181}
3182
3183/*
3184 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3185 */
3186static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3187			      struct ext4_map_blocks *map,
3188			      struct ext4_ext_path *path,
3189			      unsigned int len)
3190{
3191	int i, depth;
3192	struct ext4_extent_header *eh;
3193	struct ext4_extent *ex, *last_ex;
3194
3195	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3196		return 0;
3197
3198	depth = ext_depth(inode);
3199	eh = path[depth].p_hdr;
3200	ex = path[depth].p_ext;
3201
3202	if (unlikely(!eh->eh_entries)) {
3203		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3204				 "EOFBLOCKS_FL set");
3205		return -EIO;
3206	}
3207	last_ex = EXT_LAST_EXTENT(eh);
3208	/*
3209	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3210	 * last block in the last extent in the file.  We test this by
3211	 * first checking to see if the caller to
3212	 * ext4_ext_get_blocks() was interested in the last block (or
3213	 * a block beyond the last block) in the current extent.  If
3214	 * this turns out to be false, we can bail out from this
3215	 * function immediately.
3216	 */
3217	if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
3218	    ext4_ext_get_actual_len(last_ex))
3219		return 0;
3220	/*
3221	 * If the caller does appear to be planning to write at or
3222	 * beyond the end of the current extent, we then test to see
3223	 * if the current extent is the last extent in the file, by
3224	 * checking to make sure it was reached via the rightmost node
3225	 * at each level of the tree.
3226	 */
3227	for (i = depth-1; i >= 0; i--)
3228		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3229			return 0;
3230	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3231	return ext4_mark_inode_dirty(handle, inode);
3232}
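/*
 * Worked example (illustrative): if the last extent covers blocks
 * [100, 109] and a caller writes map->m_lblk = 108 with len = 2, then
 * m_lblk + len = 110 >= 100 + 10, so the rightmost-path check runs
 * and, if it passes, EOFBLOCKS_FL is cleared.
 */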
3233
3234static int
3235ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3236			struct ext4_map_blocks *map,
3237			struct ext4_ext_path *path, int flags,
3238			unsigned int allocated, ext4_fsblk_t newblock)
3239{
3240	int ret = 0;
3241	int err = 0;
3242	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3243
3244	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3245		  "block %llu, max_blocks %u, flags %d, allocated %u\n",
3246		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3247		  flags, allocated);
3248	ext4_ext_show_leaf(inode, path);
3249
3250	/* get_block() before submit the IO, split the extent */
3251	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3252		ret = ext4_split_unwritten_extents(handle, inode, map,
3253						   path, flags);
3254		/*
3255		 * Flag the inode (non-AIO case) or the end_io struct (AIO case)
3256		 * that this IO needs conversion to written when the IO is
3257		 * completed
3258		 */
3259		if (io)
3260			io->flag = EXT4_IO_UNWRITTEN;
3261		else
3262			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3263		if (ext4_should_dioread_nolock(inode))
3264			map->m_flags |= EXT4_MAP_UNINIT;
3265		goto out;
3266	}
3267	/* IO end_io complete, convert the filled extent to written */
3268	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3269		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3270							path);
3271		if (ret >= 0) {
3272			ext4_update_inode_fsync_trans(handle, inode, 1);
3273			err = check_eofblocks_fl(handle, inode, map, path,
3274						 map->m_len);
3275		} else
3276			err = ret;
3277		goto out2;
3278	}
3279	/* buffered IO case */
3280	/*
3281	 * repeat fallocate creation request
3282	 * we already have an unwritten extent
3283	 */
3284	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3285		goto map_out;
3286
3287	/* buffered READ or buffered write_begin() lookup */
3288	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3289		/*
3290		 * We have blocks reserved already.  We
3291		 * return allocated blocks so that delalloc
3292		 * won't do block reservation for us.  But
3293		 * the buffer head will be unmapped so that
3294		 * a read from the block returns 0s.
3295		 */
3296		map->m_flags |= EXT4_MAP_UNWRITTEN;
3297		goto out1;
3298	}
3299
3300	/* buffered write, writepage time, convert */
3301	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3302	if (ret >= 0) {
3303		ext4_update_inode_fsync_trans(handle, inode, 1);
3304		err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
3305		if (err < 0)
3306			goto out2;
3307	}
3308
3309out:
3310	if (ret <= 0) {
3311		err = ret;
3312		goto out2;
3313	} else
3314		allocated = ret;
3315	map->m_flags |= EXT4_MAP_NEW;
3316	/*
3317	 * If we allocated more blocks than requested,
3318	 * we need to make sure we unmap the extra blocks
3319	 * allocated. The blocks actually needed will get
3320	 * unmapped later, when we find the buffer_head marked
3321	 * new.
3322	 */
3323	if (allocated > map->m_len) {
3324		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3325					newblock + map->m_len,
3326					allocated - map->m_len);
3327		allocated = map->m_len;
3328	}
3329
3330	/*
3331	 * If we have done a fallocate at an offset that is already
3332	 * delayed-allocated, we would have block and quota
3333	 * reservations done in the delayed write path.
3334	 * But fallocate would have already updated the quota and block
3335	 * counts for this offset, so cancel those reservations.
3336	 */
3337	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3338		ext4_da_update_reserve_space(inode, allocated, 0);
3339
3340map_out:
3341	map->m_flags |= EXT4_MAP_MAPPED;
3342out1:
3343	if (allocated > map->m_len)
3344		allocated = map->m_len;
3345	ext4_ext_show_leaf(inode, path);
3346	map->m_pblk = newblock;
3347	map->m_len = allocated;
3348out2:
3349	if (path) {
3350		ext4_ext_drop_refs(path);
3351		kfree(path);
3352	}
3353	return err ? err : allocated;
3354}
3355
3356/*
3357 * Block allocation/map/preallocation routine for extent-based files
3358 *
3359 *
3360 * Needs to be called with
3361 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3362 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3363 *
3364 * return > 0, number of blocks already mapped/allocated
3365 *          if create == 0 and these are pre-allocated blocks
3366 *          	buffer head is unmapped
3367 *          otherwise blocks are mapped
3368 *
3369 * return = 0, if plain look up failed (blocks have not been allocated)
3370 *          buffer head is unmapped
3371 *
3372 * return < 0, error case.
3373 */
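/*
 * Illustrative call pattern (hypothetical caller, not part of this
 * file), observing the locking rules above:
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len, };
 *
 *	ret = ext4_ext_map_blocks(handle, inode, &map, 0);   (lookup only)
 *	if (ret > 0)
 *		blocks map.m_pblk .. map.m_pblk + map.m_len - 1 back the range
 *	else if (ret == 0)
 *		hole at map.m_lblk: nothing allocated there
 */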
3374int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3375			struct ext4_map_blocks *map, int flags)
3376{
3377	struct ext4_ext_path *path = NULL;
3378	struct ext4_extent_header *eh;
3379	struct ext4_extent newex, *ex;
3380	ext4_fsblk_t newblock;
3381	int err = 0, depth, ret, cache_type;
3382	unsigned int allocated = 0;
3383	struct ext4_allocation_request ar;
3384	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3385
3386	ext_debug("blocks %u/%u requested for inode %lu\n",
3387		  map->m_lblk, map->m_len, inode->i_ino);
3388
3389	/* check in cache */
3390	cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3391	if (cache_type) {
3392		if (cache_type == EXT4_EXT_CACHE_GAP) {
3393			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3394				/*
3395				 * block isn't allocated yet and
3396				 * user doesn't want to allocate it
3397				 */
3398				goto out2;
3399			}
3400			/* we should allocate requested block */
3401		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3402			/* block is already allocated */
3403			newblock = map->m_lblk
3404				   - le32_to_cpu(newex.ee_block)
3405				   + ext_pblock(&newex);
3406			/* number of remaining blocks in the extent */
3407			allocated = ext4_ext_get_actual_len(&newex) -
3408				(map->m_lblk - le32_to_cpu(newex.ee_block));
3409			goto out;
3410		} else {
3411			BUG();
3412		}
3413	}
3414
3415	/* find extent for this block */
3416	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3417	if (IS_ERR(path)) {
3418		err = PTR_ERR(path);
3419		path = NULL;
3420		goto out2;
3421	}
3422
3423	depth = ext_depth(inode);
3424
3425	/*
3426	 * consistent leaf must not be empty;
3427	 * this situation is possible, though, _during_ tree modification;
3428	 * this is why assert can't be put in ext4_ext_find_extent()
3429	 */
3430	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3431		EXT4_ERROR_INODE(inode, "bad extent address "
3432				 "lblock: %lu, depth: %d pblock %lld",
3433				 (unsigned long) map->m_lblk, depth,
3434				 path[depth].p_block);
3435		err = -EIO;
3436		goto out2;
3437	}
3438	eh = path[depth].p_hdr;
3439
3440	ex = path[depth].p_ext;
3441	if (ex) {
3442		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3443		ext4_fsblk_t ee_start = ext_pblock(ex);
3444		unsigned short ee_len;
3445
3446		/*
3447		 * Uninitialized extents are treated as holes, except that
3448		 * we split out initialized portions during a write.
3449		 */
3450		ee_len = ext4_ext_get_actual_len(ex);
3451		/* if found extent covers block, simply return it */
3452		if (in_range(map->m_lblk, ee_block, ee_len)) {
3453			newblock = map->m_lblk - ee_block + ee_start;
3454			/* number of remaining blocks in the extent */
3455			allocated = ee_len - (map->m_lblk - ee_block);
3456			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3457				  ee_block, ee_len, newblock);
3458
3459			/* Do not put uninitialized extent in the cache */
3460			if (!ext4_ext_is_uninitialized(ex)) {
3461				ext4_ext_put_in_cache(inode, ee_block,
3462							ee_len, ee_start,
3463							EXT4_EXT_CACHE_EXTENT);
3464				goto out;
3465			}
3466			ret = ext4_ext_handle_uninitialized_extents(handle,
3467					inode, map, path, flags, allocated,
3468					newblock);
3469			return ret;
3470		}
3471	}
3472
3473	/*
3474	 * requested block isn't allocated yet;
3475	 * we cannot try to create the block if the create flag is zero
3476	 */
3477	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3478		/*
3479		 * put just found gap into cache to speed up
3480		 * subsequent requests
3481		 */
3482		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3483		goto out2;
3484	}
3485	/*
3486	 * Okay, we need to do block allocation.
3487	 */
3488
3489	/* find neighbour allocated blocks */
3490	ar.lleft = map->m_lblk;
3491	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3492	if (err)
3493		goto out2;
3494	ar.lright = map->m_lblk;
3495	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3496	if (err)
3497		goto out2;
3498
3499	/*
3500	 * See if request is beyond maximum number of blocks we can have in
3501	 * a single extent. For an initialized extent this limit is
3502	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3503	 * EXT_UNINIT_MAX_LEN.
3504	 */
3505	if (map->m_len > EXT_INIT_MAX_LEN &&
3506	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3507		map->m_len = EXT_INIT_MAX_LEN;
3508	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3509		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3510		map->m_len = EXT_UNINIT_MAX_LEN;
3511
3512	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3513	newex.ee_block = cpu_to_le32(map->m_lblk);
3514	newex.ee_len = cpu_to_le16(map->m_len);
3515	err = ext4_ext_check_overlap(inode, &newex, path);
3516	if (err)
3517		allocated = ext4_ext_get_actual_len(&newex);
3518	else
3519		allocated = map->m_len;
3520
3521	/* allocate new block */
3522	ar.inode = inode;
3523	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3524	ar.logical = map->m_lblk;
3525	ar.len = allocated;
3526	if (S_ISREG(inode->i_mode))
3527		ar.flags = EXT4_MB_HINT_DATA;
3528	else
3529		/* disable in-core preallocation for non-regular files */
3530		ar.flags = 0;
3531	newblock = ext4_mb_new_blocks(handle, &ar, &err);
3532	if (!newblock)
3533		goto out2;
3534	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3535		  ar.goal, newblock, allocated);
3536
3537	/* try to insert new extent into found leaf and return */
3538	ext4_ext_store_pblock(&newex, newblock);
3539	newex.ee_len = cpu_to_le16(ar.len);
3540	/* Mark uninitialized */
3541	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
3542		ext4_ext_mark_uninitialized(&newex);
3543		/*
3544		 * An io_end structure was created for every IO write to an
3545		 * uninitialized extent. To avoid unnecessary conversion,
3546		 * here we flag the IO that really needs the conversion.
3547		 * For the non-async direct IO case, flag the inode state
3548		 * that we need to perform the conversion when the IO is done.
3549		 */
3550		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3551			if (io)
3552				io->flag = EXT4_IO_UNWRITTEN;
3553			else
3554				ext4_set_inode_state(inode,
3555						     EXT4_STATE_DIO_UNWRITTEN);
3556		}
3557		if (ext4_should_dioread_nolock(inode))
3558			map->m_flags |= EXT4_MAP_UNINIT;
3559	}
3560
3561	err = check_eofblocks_fl(handle, inode, map, path, ar.len);
3562	if (err)
3563		goto out2;
3564
3565	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3566	if (err) {
3567		/* free data blocks we just allocated */
3568		/* not a good idea to call discard here directly,
3569		 * but otherwise we'd need to call it on every free() */
3570		ext4_discard_preallocations(inode);
3571		ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
3572				 ext4_ext_get_actual_len(&newex), 0);
3573		goto out2;
3574	}
3575
3576	/* previous routine could use block we allocated */
3577	newblock = ext_pblock(&newex);
3578	allocated = ext4_ext_get_actual_len(&newex);
3579	if (allocated > map->m_len)
3580		allocated = map->m_len;
3581	map->m_flags |= EXT4_MAP_NEW;
3582
3583	/*
3584	 * Update reserved blocks/metadata blocks after successful
3585	 * block allocation which had been deferred till now.
3586	 */
3587	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3588		ext4_da_update_reserve_space(inode, allocated, 1);
3589
3590	/*
3591	 * Cache the extent and update transaction to commit on fdatasync only
3592	 * when it is _not_ an uninitialized extent.
3593	 */
3594	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3595		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3596						EXT4_EXT_CACHE_EXTENT);
3597		ext4_update_inode_fsync_trans(handle, inode, 1);
3598	} else
3599		ext4_update_inode_fsync_trans(handle, inode, 0);
3600out:
3601	if (allocated > map->m_len)
3602		allocated = map->m_len;
3603	ext4_ext_show_leaf(inode, path);
3604	map->m_flags |= EXT4_MAP_MAPPED;
3605	map->m_pblk = newblock;
3606	map->m_len = allocated;
3607out2:
3608	if (path) {
3609		ext4_ext_drop_refs(path);
3610		kfree(path);
3611	}
3612	return err ? err : allocated;
3613}
3614
3615void ext4_ext_truncate(struct inode *inode)
3616{
3617	struct address_space *mapping = inode->i_mapping;
3618	struct super_block *sb = inode->i_sb;
3619	ext4_lblk_t last_block;
3620	handle_t *handle;
3621	int err = 0;
3622
3623	/*
3624	 * probably first extent we're gonna free will be last in block
3625	 */
3626	err = ext4_writepage_trans_blocks(inode);
3627	handle = ext4_journal_start(inode, err);
3628	if (IS_ERR(handle))
3629		return;
3630
3631	if (inode->i_size & (sb->s_blocksize - 1))
3632		ext4_block_truncate_page(handle, mapping, inode->i_size);
3633
3634	if (ext4_orphan_add(handle, inode))
3635		goto out_stop;
3636
3637	down_write(&EXT4_I(inode)->i_data_sem);
3638	ext4_ext_invalidate_cache(inode);
3639
3640	ext4_discard_preallocations(inode);
3641
3642	/*
3643	 * TODO: optimization is possible here.
3644	 * Probably we need not scan at all,
3645	 * because page truncation is enough.
3646	 */
3647
3648	/* we have to know where to truncate from in crash case */
3649	EXT4_I(inode)->i_disksize = inode->i_size;
3650	ext4_mark_inode_dirty(handle, inode);
3651
3652	last_block = (inode->i_size + sb->s_blocksize - 1)
3653			>> EXT4_BLOCK_SIZE_BITS(sb);
3654	err = ext4_ext_remove_space(inode, last_block);
3655
3656	/* In a multi-transaction truncate, we only make the final
3657	 * transaction synchronous.
3658	 */
3659	if (IS_SYNC(inode))
3660		ext4_handle_sync(handle);
3661
3662out_stop:
3663	up_write(&EXT4_I(inode)->i_data_sem);
3664	/*
3665	 * If this was a simple ftruncate() and the file will remain alive,
3666	 * then we need to clear up the orphan record which we created above.
3667	 * However, if this was a real unlink then we were called by
3668	 * ext4_delete_inode(), and we allow that function to clean up the
3669	 * orphan info for us.
3670	 */
3671	if (inode->i_nlink)
3672		ext4_orphan_del(handle, inode);
3673
3674	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3675	ext4_mark_inode_dirty(handle, inode);
3676	ext4_journal_stop(handle);
3677}
3678
3679static void ext4_falloc_update_inode(struct inode *inode,
3680				int mode, loff_t new_size, int update_ctime)
3681{
3682	struct timespec now;
3683
3684	if (update_ctime) {
3685		now = current_fs_time(inode->i_sb);
3686		if (!timespec_equal(&inode->i_ctime, &now))
3687			inode->i_ctime = now;
3688	}
3689	/*
3690	 * Update only when preallocation was requested beyond
3691	 * the file size.
3692	 */
3693	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3694		if (new_size > i_size_read(inode))
3695			i_size_write(inode, new_size);
3696		if (new_size > EXT4_I(inode)->i_disksize)
3697			ext4_update_i_disksize(inode, new_size);
3698	} else {
3699		/*
3700		 * Mark that we allocate beyond EOF so the subsequent truncate
3701		 * can proceed even if the new size is the same as i_size.
3702		 */
3703		if (new_size > i_size_read(inode))
3704			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3705	}
3706
3707}
3708
3709/*
3710 * preallocate space for a file. This implements ext4's fallocate inode
3711 * operation, which gets called from sys_fallocate system call.
3712 * For block-mapped files, posix_fallocate should fall back to the method
3713 * of writing zeroes to the required new blocks (the same behavior which is
3714 * expected for file systems which do not support fallocate() system call).
3715 */
3716long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3717{
3718	handle_t *handle;
3719	loff_t new_size;
3720	unsigned int max_blocks;
3721	int ret = 0;
3722	int ret2 = 0;
3723	int retries = 0;
3724	struct ext4_map_blocks map;
3725	unsigned int credits, blkbits = inode->i_blkbits;
3726
3727	/*
3728	 * currently supporting (pre)allocate mode for extent-based
3729	 * files _only_
3730	 */
3731	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3732		return -EOPNOTSUPP;
3733
3734	/* preallocation to directories is currently not supported */
3735	if (S_ISDIR(inode->i_mode))
3736		return -ENODEV;
3737
3738	map.m_lblk = offset >> blkbits;
3739	/*
3740	 * We can't just convert len to max_blocks, because the range need
3741	 * not be block-aligned: e.g. blocksize = 4096, offset = 3072, len = 2048.
3742	 */
3743	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3744		- map.m_lblk;
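	/*
	 * Worked example for the comment above (illustrative numbers):
	 * with blkbits = 12, offset = 3072 and len = 2048, the byte range
	 * [3072, 5120) touches logical blocks 0 and 1, so
	 * max_blocks = (EXT4_BLOCK_ALIGN(5120, 12) >> 12) - 0 = 2,
	 * whereas the naive len >> blkbits would give 0.
	 */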
3745	/*
3746	 * credits needed to insert one extent into the extent tree
3747	 */
3748	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3749	mutex_lock(&inode->i_mutex);
3750	ret = inode_newsize_ok(inode, (len + offset));
3751	if (ret) {
3752		mutex_unlock(&inode->i_mutex);
3753		return ret;
3754	}
3755retry:
3756	while (ret >= 0 && ret < max_blocks) {
3757		map.m_lblk = map.m_lblk + ret;
3758		map.m_len = max_blocks = max_blocks - ret;
3759		handle = ext4_journal_start(inode, credits);
3760		if (IS_ERR(handle)) {
3761			ret = PTR_ERR(handle);
3762			break;
3763		}
3764		ret = ext4_map_blocks(handle, inode, &map,
3765				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3766		if (ret <= 0) {
3767#ifdef EXT4FS_DEBUG
3768			WARN_ON(ret <= 0);
3769			printk(KERN_ERR "%s: ext4_ext_map_blocks "
3770				    "returned error inode#%lu, block=%u, "
3771				    "max_blocks=%u", __func__,
3772				    inode->i_ino, block, max_blocks);
3773#endif
3774			ext4_mark_inode_dirty(handle, inode);
3775			ret2 = ext4_journal_stop(handle);
3776			break;
3777		}
3778		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3779						blkbits) >> blkbits))
3780			new_size = offset + len;
3781		else
3782			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
3783
3784		ext4_falloc_update_inode(inode, mode, new_size,
3785					 (map.m_flags & EXT4_MAP_NEW));
3786		ext4_mark_inode_dirty(handle, inode);
3787		ret2 = ext4_journal_stop(handle);
3788		if (ret2)
3789			break;
3790	}
3791	if (ret == -ENOSPC &&
3792			ext4_should_retry_alloc(inode->i_sb, &retries)) {
3793		ret = 0;
3794		goto retry;
3795	}
3796	mutex_unlock(&inode->i_mutex);
3797	return ret > 0 ? ret2 : ret;
3798}
3799
3800/*
3801 * This function converts a range of blocks to written extents.
3802 * The caller passes the start offset and the size; all unwritten
3803 * extents within this range will be converted to
3804 * written extents.
3805 *
3806 * This function is called from the direct IO end_io callback
3807 * function, to convert the fallocated extents after IO is completed.
3808 * Returns 0 on success.
3809 */
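/*
 * A sketch of the expected call site (assuming the direct IO completion
 * path of this kernel, e.g. the end_io worker):
 *
 *	ret = ext4_convert_unwritten_extents(inode, offset, size);
 *
 * where offset/size describe the byte range of the just-completed
 * write into preallocated (uninitialized) extents.
 */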
3810int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3811				    ssize_t len)
3812{
3813	handle_t *handle;
3814	unsigned int max_blocks;
3815	int ret = 0;
3816	int ret2 = 0;
3817	struct ext4_map_blocks map;
3818	unsigned int credits, blkbits = inode->i_blkbits;
3819
3820	map.m_lblk = offset >> blkbits;
3821	/*
3822	 * We can't just convert len to max_blocks, because the range need
3823	 * not be block-aligned: e.g. blocksize = 4096, offset = 3072, len = 2048.
3824	 */
3825	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3826		      map.m_lblk);
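	/*
	 * Same round-up as in ext4_fallocate() above: align the end of
	 * the byte range to a block boundary before subtracting the
	 * start block, so a range straddling a block boundary is not
	 * undercounted.
	 */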
3827	/*
3828	 * credits needed to insert one extent into the extent tree
3829	 */
3830	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3831	while (ret >= 0 && ret < max_blocks) {
3832		map.m_lblk += ret;
3833		map.m_len = (max_blocks -= ret);
3834		handle = ext4_journal_start(inode, credits);
3835		if (IS_ERR(handle)) {
3836			ret = PTR_ERR(handle);
3837			break;
3838		}
3839		ret = ext4_map_blocks(handle, inode, &map,
3840				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3841		if (ret <= 0) {
3842			WARN_ON(ret <= 0);
3843			printk(KERN_ERR "%s: ext4_ext_map_blocks "
3844				    "returned error inode#%lu, block=%u, "
3845				    "max_blocks=%u", __func__,
3846				    inode->i_ino, map.m_lblk, map.m_len);
3847		}
3848		ext4_mark_inode_dirty(handle, inode);
3849		ret2 = ext4_journal_stop(handle);
3850		if (ret <= 0 || ret2)
3851			break;
3852	}
3853	return ret > 0 ? ret2 : ret;
3854}

3855/*
3856 * Callback function called for each extent to gather FIEMAP information.
3857 */
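/*
 * The walker, ext4_ext_walk_space() (used near the end of this file),
 * interprets this callback's return value: EXT_CONTINUE asks for the
 * next extent, EXT_BREAK stops the walk (used below once the caller's
 * fiemap buffer is full), and a negative value is propagated as an
 * error.
 */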
3858static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3859		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
3860		       void *data)
3861{
3862	struct fiemap_extent_info *fieinfo = data;
3863	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3864	__u64	logical;
3865	__u64	physical;
3866	__u64	length;
3867	__u32	flags = 0;
3868	int	error;
3869
3870	logical =  (__u64)newex->ec_block << blksize_bits;
3871
3872	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3873		pgoff_t offset;
3874		struct page *page;
3875		struct buffer_head *bh = NULL;
3876
3877		offset = logical >> PAGE_SHIFT;
3878		page = find_get_page(inode->i_mapping, offset);
3879		if (!page || !page_has_buffers(page))
3880			return EXT_CONTINUE;
3881
3882		bh = page_buffers(page);
3883
3884		if (!bh)
3885			return EXT_CONTINUE;
3886
3887		if (buffer_delay(bh)) {
3888			flags |= FIEMAP_EXTENT_DELALLOC;
3889			page_cache_release(page);
3890		} else {
3891			page_cache_release(page);
3892			return EXT_CONTINUE;
3893		}
3894	}
3895
3896	physical = (__u64)newex->ec_start << blksize_bits;
3897	length =   (__u64)newex->ec_len << blksize_bits;
3898
3899	if (ex && ext4_ext_is_uninitialized(ex))
3900		flags |= FIEMAP_EXTENT_UNWRITTEN;
3901
3902	/*
3903	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3904	 *
3905	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3906	 * this also indicates no more allocated blocks.
3907	 *
3908	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3909	 */
3910	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3911	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3912		loff_t size = i_size_read(inode);
3913		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3914
3915		flags |= FIEMAP_EXTENT_LAST;
3916		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3917		    logical + length > size)
3918			length = (size - logical + bs - 1) & ~(bs-1);
3919	}
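	/*
	 * Illustrative numbers for the clamp above: with bs = 4096,
	 * logical = 0 and i_size = 5000, length becomes
	 * (5000 + 4095) & ~4095 = 8192, i.e. the reported delalloc
	 * extent ends at the last block that i_size reaches into.
	 */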
3920
3921	error = fiemap_fill_next_extent(fieinfo, logical, physical,
3922					length, flags);
3923	if (error < 0)
3924		return error;
3925	if (error == 1)
3926		return EXT_BREAK;
3927
3928	return EXT_CONTINUE;
3929}
3930
3931/* fiemap flags we can handle specified here */
3932#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3933
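/*
 * Report the location of the inode's extended attributes as a single
 * FIEMAP extent: either the in-inode area following the fixed inode
 * fields (FIEMAP_EXTENT_DATA_INLINE) or the external xattr block.
 */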
3934static int ext4_xattr_fiemap(struct inode *inode,
3935				struct fiemap_extent_info *fieinfo)
3936{
3937	__u64 physical = 0;
3938	__u64 length;
3939	__u32 flags = FIEMAP_EXTENT_LAST;
3940	int blockbits = inode->i_sb->s_blocksize_bits;
3941	int error = 0;
3942
3943	/* in-inode? */
3944	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
3945		struct ext4_iloc iloc;
3946		int offset;	/* offset of xattr in inode */
3947
3948		error = ext4_get_inode_loc(inode, &iloc);
3949		if (error)
3950			return error;
3951		physical = (__u64)iloc.bh->b_blocknr << blockbits;
3952		offset = EXT4_GOOD_OLD_INODE_SIZE +
3953				EXT4_I(inode)->i_extra_isize;
3954		physical += offset;
3955		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3956		flags |= FIEMAP_EXTENT_DATA_INLINE;
3957		brelse(iloc.bh);
3958	} else { /* external block */
3959		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
3960		length = inode->i_sb->s_blocksize;
3961	}
3962
3963	if (physical)
3964		error = fiemap_fill_next_extent(fieinfo, 0, physical,
3965						length, flags);
3966	return (error < 0 ? error : 0);
3967}
3968
3969int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3970		__u64 start, __u64 len)
3971{
3972	ext4_lblk_t start_blk;
3973	int error = 0;
3974
3975	/* fallback to generic here if not in extents fmt */
3976	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3977		return generic_block_fiemap(inode, fieinfo, start, len,
3978			ext4_get_block);
3979
3980	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3981		return -EBADR;
3982
3983	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3984		error = ext4_xattr_fiemap(inode, fieinfo);
3985	} else {
3986		ext4_lblk_t len_blks;
3987		__u64 last_blk;
3988
3989		start_blk = start >> inode->i_sb->s_blocksize_bits;
3990		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3991		if (last_blk >= EXT_MAX_BLOCK)
3992			last_blk = EXT_MAX_BLOCK-1;
3993		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
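		/*
		 * Example with assumed numbers: blocksize 4096, start = 0
		 * and a huge len push last_blk past EXT_MAX_BLOCK, so it
		 * is clamped above and len_blks becomes
		 * (EXT_MAX_BLOCK - 1) - 0 + 1 = EXT_MAX_BLOCK.
		 */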
3994
3995		/*
3996		 * Walk the extent tree gathering extent information.
3997		 * ext4_ext_fiemap_cb will push extents back to user.
3998		 */
3999		error = ext4_ext_walk_space(inode, start_blk, len_blks,
4000					  ext4_ext_fiemap_cb, fieinfo);
4001	}
4002
4003	return error;
4004}
4005
4006