extents.c revision 19f5fb7ad679bb361222c7916086435020c37cce
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
44#include "ext4_jbd2.h"
45#include "ext4_extents.h"
46
47
48/*
49 * ext_pblock:
50 * combine low and high parts of physical block number into ext4_fsblk_t
51 */
52ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
53{
54	ext4_fsblk_t block;
55
56	block = le32_to_cpu(ex->ee_start_lo);
57	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
58	return block;
59}
60
61/*
62 * idx_pblock:
63 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
64 */
65ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
66{
67	ext4_fsblk_t block;
68
69	block = le32_to_cpu(ix->ei_leaf_lo);
70	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
71	return block;
72}
73
74/*
75 * ext4_ext_store_pblock:
76 * stores a large physical block number into an extent struct,
77 * breaking it into parts
78 */
79void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
80{
81	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
82	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
83}
84
85/*
86 * ext4_idx_store_pblock:
87 * stores a large physical block number into an index struct,
88 * breaking it into parts
89 */
90static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91{
92	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
93	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94}
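/*
 * Note on the four helpers above: an ext4 physical block number is
 * 48 bits wide, stored as a 32-bit low part (ee_start_lo/ei_leaf_lo)
 * plus a 16-bit high part (ee_start_hi/ei_leaf_hi).  The two-step
 * shift ("<< 31" then "<< 1") instead of a single "<< 32" presumably
 * keeps the expression well defined even if the block type were only
 * 32 bits wide; with a 64-bit ext4_fsblk_t the result is identical.
 */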
95
96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97					    struct inode *inode,
98					    int needed)
99{
100	int err;
101
102	if (!ext4_handle_valid(handle))
103		return 0;
104	if (handle->h_buffer_credits > needed)
105		return 0;
106	err = ext4_journal_extend(handle, needed);
107	if (err <= 0)
108		return err;
109	err = ext4_truncate_restart_trans(handle, inode, needed);
110	/*
111	 * We have dropped i_data_sem so someone might have cached again
112	 * an extent we are going to truncate.
113	 */
114	ext4_ext_invalidate_cache(inode);
115
116	return err;
117}
118
119/*
120 * could return:
121 *  - EROFS
122 *  - ENOMEM
123 */
124static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
125				struct ext4_ext_path *path)
126{
127	if (path->p_bh) {
128		/* path points to block */
129		return ext4_journal_get_write_access(handle, path->p_bh);
130	}
131	/* path points to leaf/index in inode body */
132	/* we use in-core data, no need to protect them */
133	return 0;
134}
135
136/*
137 * could return:
138 *  - EROFS
139 *  - ENOMEM
140 *  - EIO
141 */
142static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
143				struct ext4_ext_path *path)
144{
145	int err;
146	if (path->p_bh) {
147		/* path points to block */
148		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
149	} else {
150		/* path points to leaf/index in inode body */
151		err = ext4_mark_inode_dirty(handle, inode);
152	}
153	return err;
154}
155
156static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
157			      struct ext4_ext_path *path,
158			      ext4_lblk_t block)
159{
160	struct ext4_inode_info *ei = EXT4_I(inode);
161	ext4_fsblk_t bg_start;
162	ext4_fsblk_t last_block;
163	ext4_grpblk_t colour;
164	ext4_group_t block_group;
165	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
166	int depth;
167
168	if (path) {
169		struct ext4_extent *ex;
170		depth = path->p_depth;
171
172		/* try to predict block placement */
173		ex = path[depth].p_ext;
174		if (ex)
175			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
176
177		/* it looks like index is empty;
178		 * try to find starting block from index itself */
179		if (path[depth].p_bh)
180			return path[depth].p_bh->b_blocknr;
181	}
182
183	/* OK. use inode's group */
184	block_group = ei->i_block_group;
185	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186		/*
187		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188		 * block groups per flexgroup, reserve the first block
189		 * group for directories and special files.  Regular
190		 * files will start at the second block group.  This
191		 * tends to speed up directory access and improves
192		 * fsck times.
193		 */
194		block_group &= ~(flex_size-1);
195		if (S_ISREG(inode->i_mode))
196			block_group++;
197	}
198	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
199		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
200	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
201
202	/*
203	 * If we are doing delayed allocation, we don't need to take
204	 * colour into account.
205	 */
206	if (test_opt(inode->i_sb, DELALLOC))
207		return bg_start;
208
209	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
210		colour = (current->pid % 16) *
211			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
212	else
213		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
214	return bg_start + colour + block;
215}
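/*
 * In short, the goal heuristic above prefers a physical block right
 * next to a neighbouring extent (or the index block) found in @path.
 * Failing that, it falls back to the inode's block group - skipping
 * the first group of a flex group for regular files - and, unless
 * delayed allocation is enabled, adds a PID-based "colour" offset so
 * that concurrent allocators in the same group tend to be spread out.
 */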
216
217/*
218 * Allocation for a metadata block
219 */
220static ext4_fsblk_t
221ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
222			struct ext4_ext_path *path,
223			struct ext4_extent *ex, int *err)
224{
225	ext4_fsblk_t goal, newblock;
226
227	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
228	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
229	return newblock;
230}
231
232static inline int ext4_ext_space_block(struct inode *inode, int check)
233{
234	int size;
235
236	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
237			/ sizeof(struct ext4_extent);
238	if (!check) {
239#ifdef AGGRESSIVE_TEST
240		if (size > 6)
241			size = 6;
242#endif
243	}
244	return size;
245}
246
247static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
248{
249	int size;
250
251	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
252			/ sizeof(struct ext4_extent_idx);
253	if (!check) {
254#ifdef AGGRESSIVE_TEST
255		if (size > 5)
256			size = 5;
257#endif
258	}
259	return size;
260}
261
262static inline int ext4_ext_space_root(struct inode *inode, int check)
263{
264	int size;
265
266	size = sizeof(EXT4_I(inode)->i_data);
267	size -= sizeof(struct ext4_extent_header);
268	size /= sizeof(struct ext4_extent);
269	if (!check) {
270#ifdef AGGRESSIVE_TEST
271		if (size > 3)
272			size = 3;
273#endif
274	}
275	return size;
276}
277
278static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
279{
280	int size;
281
282	size = sizeof(EXT4_I(inode)->i_data);
283	size -= sizeof(struct ext4_extent_header);
284	size /= sizeof(struct ext4_extent_idx);
285	if (!check) {
286#ifdef AGGRESSIVE_TEST
287		if (size > 4)
288			size = 4;
289#endif
290	}
291	return size;
292}
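/*
 * Rough capacities implied by the helpers above (assuming the usual
 * 12-byte on-disk header and 12-byte extent/index records): a 4 KiB
 * block holds (4096 - 12) / 12 = 340 extents or indexes, while the
 * 60-byte i_data root holds (60 - 12) / 12 = 4.  AGGRESSIVE_TEST
 * shrinks these limits so that tree splits are exercised early.
 */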
293
294/*
295 * Calculate the number of metadata blocks needed
296 * to allocate a block at @lblock
297 * Worst case is one block per extent
298 */
299int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
300{
301	struct ext4_inode_info *ei = EXT4_I(inode);
302	int idxs, num = 0;
303
304	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
305		/ sizeof(struct ext4_extent_idx));
306
307	/*
308	 * If the new delayed allocation block is contiguous with the
309	 * previous da block, it can share index blocks with the
310	 * previous block, so we only need to allocate a new index
311 * block every idxs leaf blocks.  At idxs**2 blocks, we need
312 * an additional index block, and at idxs**3 blocks, yet
313 * another index block.
314	 */
315	if (ei->i_da_metadata_calc_len &&
316	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
317		if ((ei->i_da_metadata_calc_len % idxs) == 0)
318			num++;
319		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
320			num++;
321		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
322			num++;
323			ei->i_da_metadata_calc_len = 0;
324		} else
325			ei->i_da_metadata_calc_len++;
326		ei->i_da_metadata_calc_last_lblock++;
327		return num;
328	}
329
330	/*
331	 * In the worst case we need a new set of index blocks at
332	 * every level of the inode's extent tree.
333	 */
334	ei->i_da_metadata_calc_len = 1;
335	ei->i_da_metadata_calc_last_lblock = lblock;
336	return ext_depth(inode) + 1;
337}
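/*
 * Worked example for the function above, assuming 4 KiB blocks
 * (idxs = 340): a long run of contiguous delayed-allocation blocks
 * is charged roughly one extra metadata block every 340 blocks (a
 * new leaf), another every 340^2 blocks (a new level-1 index), and
 * so on.  A block that is not contiguous with the previous one is
 * charged the full worst case of ext_depth(inode) + 1 blocks.
 */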
338
339static int
340ext4_ext_max_entries(struct inode *inode, int depth)
341{
342	int max;
343
344	if (depth == ext_depth(inode)) {
345		if (depth == 0)
346			max = ext4_ext_space_root(inode, 1);
347		else
348			max = ext4_ext_space_root_idx(inode, 1);
349	} else {
350		if (depth == 0)
351			max = ext4_ext_space_block(inode, 1);
352		else
353			max = ext4_ext_space_block_idx(inode, 1);
354	}
355
356	return max;
357}
358
359static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
360{
361	ext4_fsblk_t block = ext_pblock(ext);
362	int len = ext4_ext_get_actual_len(ext);
363
364	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
365}
366
367static int ext4_valid_extent_idx(struct inode *inode,
368				struct ext4_extent_idx *ext_idx)
369{
370	ext4_fsblk_t block = idx_pblock(ext_idx);
371
372	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
373}
374
375static int ext4_valid_extent_entries(struct inode *inode,
376				struct ext4_extent_header *eh,
377				int depth)
378{
379	struct ext4_extent *ext;
380	struct ext4_extent_idx *ext_idx;
381	unsigned short entries;
382	if (eh->eh_entries == 0)
383		return 1;
384
385	entries = le16_to_cpu(eh->eh_entries);
386
387	if (depth == 0) {
388		/* leaf entries */
389		ext = EXT_FIRST_EXTENT(eh);
390		while (entries) {
391			if (!ext4_valid_extent(inode, ext))
392				return 0;
393			ext++;
394			entries--;
395		}
396	} else {
397		ext_idx = EXT_FIRST_INDEX(eh);
398		while (entries) {
399			if (!ext4_valid_extent_idx(inode, ext_idx))
400				return 0;
401			ext_idx++;
402			entries--;
403		}
404	}
405	return 1;
406}
407
408static int __ext4_ext_check(const char *function, struct inode *inode,
409					struct ext4_extent_header *eh,
410					int depth)
411{
412	const char *error_msg;
413	int max = 0;
414
415	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
416		error_msg = "invalid magic";
417		goto corrupted;
418	}
419	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
420		error_msg = "unexpected eh_depth";
421		goto corrupted;
422	}
423	if (unlikely(eh->eh_max == 0)) {
424		error_msg = "invalid eh_max";
425		goto corrupted;
426	}
427	max = ext4_ext_max_entries(inode, depth);
428	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
429		error_msg = "too large eh_max";
430		goto corrupted;
431	}
432	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
433		error_msg = "invalid eh_entries";
434		goto corrupted;
435	}
436	if (!ext4_valid_extent_entries(inode, eh, depth)) {
437		error_msg = "invalid extent entries";
438		goto corrupted;
439	}
440	return 0;
441
442corrupted:
443	ext4_error(inode->i_sb, function,
444			"bad header/extent in inode #%lu: %s - magic %x, "
445			"entries %u, max %u(%u), depth %u(%u)",
446			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
447			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
448			max, le16_to_cpu(eh->eh_depth), depth);
449
450	return -EIO;
451}
452
453#define ext4_ext_check(inode, eh, depth)	\
454	__ext4_ext_check(__func__, inode, eh, depth)
455
456int ext4_ext_check_inode(struct inode *inode)
457{
458	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
459}
460
461#ifdef EXT_DEBUG
462static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
463{
464	int k, l = path->p_depth;
465
466	ext_debug("path:");
467	for (k = 0; k <= l; k++, path++) {
468		if (path->p_idx) {
469		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
470			    idx_pblock(path->p_idx));
471		} else if (path->p_ext) {
472			ext_debug("  %d:[%d]%d:%llu ",
473				  le32_to_cpu(path->p_ext->ee_block),
474				  ext4_ext_is_uninitialized(path->p_ext),
475				  ext4_ext_get_actual_len(path->p_ext),
476				  ext_pblock(path->p_ext));
477		} else
478			ext_debug("  []");
479	}
480	ext_debug("\n");
481}
482
483static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
484{
485	int depth = ext_depth(inode);
486	struct ext4_extent_header *eh;
487	struct ext4_extent *ex;
488	int i;
489
490	if (!path)
491		return;
492
493	eh = path[depth].p_hdr;
494	ex = EXT_FIRST_EXTENT(eh);
495
496	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
497
498	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
499		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
500			  ext4_ext_is_uninitialized(ex),
501			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
502	}
503	ext_debug("\n");
504}
505#else
506#define ext4_ext_show_path(inode, path)
507#define ext4_ext_show_leaf(inode, path)
508#endif
509
510void ext4_ext_drop_refs(struct ext4_ext_path *path)
511{
512	int depth = path->p_depth;
513	int i;
514
515	for (i = 0; i <= depth; i++, path++)
516		if (path->p_bh) {
517			brelse(path->p_bh);
518			path->p_bh = NULL;
519		}
520}
521
522/*
523 * ext4_ext_binsearch_idx:
524 * binary search for the closest index of the given block
525 * the header must be checked before calling this
526 */
527static void
528ext4_ext_binsearch_idx(struct inode *inode,
529			struct ext4_ext_path *path, ext4_lblk_t block)
530{
531	struct ext4_extent_header *eh = path->p_hdr;
532	struct ext4_extent_idx *r, *l, *m;
533
534
535	ext_debug("binsearch for %u(idx):  ", block);
536
537	l = EXT_FIRST_INDEX(eh) + 1;
538	r = EXT_LAST_INDEX(eh);
539	while (l <= r) {
540		m = l + (r - l) / 2;
541		if (block < le32_to_cpu(m->ei_block))
542			r = m - 1;
543		else
544			l = m + 1;
545		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
546				m, le32_to_cpu(m->ei_block),
547				r, le32_to_cpu(r->ei_block));
548	}
549
550	path->p_idx = l - 1;
551	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
552		  idx_pblock(path->p_idx));
553
554#ifdef CHECK_BINSEARCH
555	{
556		struct ext4_extent_idx *chix, *ix;
557		int k;
558
559		chix = ix = EXT_FIRST_INDEX(eh);
560		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
561		  if (k != 0 &&
562		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
563				printk(KERN_DEBUG "k=%d, ix=0x%p, "
564				       "first=0x%p\n", k,
565				       ix, EXT_FIRST_INDEX(eh));
566				printk(KERN_DEBUG "%u <= %u\n",
567				       le32_to_cpu(ix->ei_block),
568				       le32_to_cpu(ix[-1].ei_block));
569			}
570			BUG_ON(k && le32_to_cpu(ix->ei_block)
571					   <= le32_to_cpu(ix[-1].ei_block));
572			if (block < le32_to_cpu(ix->ei_block))
573				break;
574			chix = ix;
575		}
576		BUG_ON(chix != path->p_idx);
577	}
578#endif
579
580}
581
582/*
583 * ext4_ext_binsearch:
584 * binary search for closest extent of the given block
585 * the header must be checked before calling this
586 */
587static void
588ext4_ext_binsearch(struct inode *inode,
589		struct ext4_ext_path *path, ext4_lblk_t block)
590{
591	struct ext4_extent_header *eh = path->p_hdr;
592	struct ext4_extent *r, *l, *m;
593
594	if (eh->eh_entries == 0) {
595		/*
596		 * this leaf is empty:
597		 * we get such a leaf in split/add case
598		 */
599		return;
600	}
601
602	ext_debug("binsearch for %u:  ", block);
603
604	l = EXT_FIRST_EXTENT(eh) + 1;
605	r = EXT_LAST_EXTENT(eh);
606
607	while (l <= r) {
608		m = l + (r - l) / 2;
609		if (block < le32_to_cpu(m->ee_block))
610			r = m - 1;
611		else
612			l = m + 1;
613		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
614				m, le32_to_cpu(m->ee_block),
615				r, le32_to_cpu(r->ee_block));
616	}
617
618	path->p_ext = l - 1;
619	ext_debug("  -> %d:%llu:[%d]%d ",
620			le32_to_cpu(path->p_ext->ee_block),
621			ext_pblock(path->p_ext),
622			ext4_ext_is_uninitialized(path->p_ext),
623			ext4_ext_get_actual_len(path->p_ext));
624
625#ifdef CHECK_BINSEARCH
626	{
627		struct ext4_extent *chex, *ex;
628		int k;
629
630		chex = ex = EXT_FIRST_EXTENT(eh);
631		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
632			BUG_ON(k && le32_to_cpu(ex->ee_block)
633					  <= le32_to_cpu(ex[-1].ee_block));
634			if (block < le32_to_cpu(ex->ee_block))
635				break;
636			chex = ex;
637		}
638		BUG_ON(chex != path->p_ext);
639	}
640#endif
641
642}
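/*
 * Both binary searches above return the rightmost entry whose start
 * block is <= the requested block: p_idx then names the subtree that
 * may contain it, and p_ext the extent that may cover it.  Starting
 * the search at FIRST+1 guarantees that "l - 1" is always a valid
 * entry; an empty leaf (the split/add case) leaves p_ext NULL.
 */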
643
644int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
645{
646	struct ext4_extent_header *eh;
647
648	eh = ext_inode_hdr(inode);
649	eh->eh_depth = 0;
650	eh->eh_entries = 0;
651	eh->eh_magic = EXT4_EXT_MAGIC;
652	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
653	ext4_mark_inode_dirty(handle, inode);
654	ext4_ext_invalidate_cache(inode);
655	return 0;
656}
657
658struct ext4_ext_path *
659ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
660					struct ext4_ext_path *path)
661{
662	struct ext4_extent_header *eh;
663	struct buffer_head *bh;
664	short int depth, i, ppos = 0, alloc = 0;
665
666	eh = ext_inode_hdr(inode);
667	depth = ext_depth(inode);
668
669	/* account possible depth increase */
670	if (!path) {
671		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
672				GFP_NOFS);
673		if (!path)
674			return ERR_PTR(-ENOMEM);
675		alloc = 1;
676	}
677	path[0].p_hdr = eh;
678	path[0].p_bh = NULL;
679
680	i = depth;
681	/* walk through the tree */
682	while (i) {
683		int need_to_validate = 0;
684
685		ext_debug("depth %d: num %d, max %d\n",
686			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
687
688		ext4_ext_binsearch_idx(inode, path + ppos, block);
689		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
690		path[ppos].p_depth = i;
691		path[ppos].p_ext = NULL;
692
693		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
694		if (unlikely(!bh))
695			goto err;
696		if (!bh_uptodate_or_lock(bh)) {
697			if (bh_submit_read(bh) < 0) {
698				put_bh(bh);
699				goto err;
700			}
701			/* validate the extent entries */
702			need_to_validate = 1;
703		}
704		eh = ext_block_hdr(bh);
705		ppos++;
706		BUG_ON(ppos > depth);
707		path[ppos].p_bh = bh;
708		path[ppos].p_hdr = eh;
709		i--;
710
711		if (need_to_validate && ext4_ext_check(inode, eh, i))
712			goto err;
713	}
714
715	path[ppos].p_depth = i;
716	path[ppos].p_ext = NULL;
717	path[ppos].p_idx = NULL;
718
719	/* find extent */
720	ext4_ext_binsearch(inode, path + ppos, block);
721	/* if not an empty leaf */
722	if (path[ppos].p_ext)
723		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
724
725	ext4_ext_show_path(inode, path);
726
727	return path;
728
729err:
730	ext4_ext_drop_refs(path);
731	if (alloc)
732		kfree(path);
733	return ERR_PTR(-EIO);
734}
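/*
 * Usage note for ext4_ext_find_extent(): path[0] describes the root
 * header kept in the inode's i_data, path[depth] the leaf containing
 * (or nearest to) @block.  depth + 2 entries are allocated so the
 * array can survive a later increase in tree depth.  On success the
 * caller owns the array and must eventually call ext4_ext_drop_refs()
 * and kfree() it (or pass it back in for reuse); on error the
 * function cleans up and returns ERR_PTR(-EIO) or ERR_PTR(-ENOMEM).
 */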
735
736/*
737 * ext4_ext_insert_index:
738 * insert new index [@logical;@ptr] into the block at @curp;
739 * check where to insert: before @curp or after @curp
740 */
741int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
742				struct ext4_ext_path *curp,
743				int logical, ext4_fsblk_t ptr)
744{
745	struct ext4_extent_idx *ix;
746	int len, err;
747
748	err = ext4_ext_get_access(handle, inode, curp);
749	if (err)
750		return err;
751
752	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
753	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
754	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
755		/* insert after */
756		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
757			len = (len - 1) * sizeof(struct ext4_extent_idx);
758			len = len < 0 ? 0 : len;
759			ext_debug("insert new index %d after: %llu. "
760					"move %d from 0x%p to 0x%p\n",
761					logical, ptr, len,
762					(curp->p_idx + 1), (curp->p_idx + 2));
763			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
764		}
765		ix = curp->p_idx + 1;
766	} else {
767		/* insert before */
768		len = len * sizeof(struct ext4_extent_idx);
769		len = len < 0 ? 0 : len;
770		ext_debug("insert new index %d before: %llu. "
771				"move %d from 0x%p to 0x%p\n",
772				logical, ptr, len,
773				curp->p_idx, (curp->p_idx + 1));
774		memmove(curp->p_idx + 1, curp->p_idx, len);
775		ix = curp->p_idx;
776	}
777
778	ix->ei_block = cpu_to_le32(logical);
779	ext4_idx_store_pblock(ix, ptr);
780	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
781
782	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
783			     > le16_to_cpu(curp->p_hdr->eh_max));
784	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
785
786	err = ext4_ext_dirty(handle, inode, curp);
787	ext4_std_error(inode->i_sb, err);
788
789	return err;
790}
791
792/*
793 * ext4_ext_split:
794 * inserts new subtree into the path, using free index entry
795 * at depth @at:
796 * - allocates all needed blocks (new leaf and all intermediate index blocks)
797 * - makes decision where to split
798 * - moves remaining extents and index entries (right to the split point)
799 *   into the newly allocated blocks
800 * - initializes subtree
801 */
802static int ext4_ext_split(handle_t *handle, struct inode *inode,
803				struct ext4_ext_path *path,
804				struct ext4_extent *newext, int at)
805{
806	struct buffer_head *bh = NULL;
807	int depth = ext_depth(inode);
808	struct ext4_extent_header *neh;
809	struct ext4_extent_idx *fidx;
810	struct ext4_extent *ex;
811	int i = at, k, m, a;
812	ext4_fsblk_t newblock, oldblock;
813	__le32 border;
814	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
815	int err = 0;
816
817	/* make decision: where to split? */
818	/* FIXME: now decision is simplest: at current extent */
819
820	/* if current leaf will be split, then we should use
821	 * border from split point */
822	BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
823	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
824		border = path[depth].p_ext[1].ee_block;
825		ext_debug("leaf will be split."
826				" next leaf starts at %d\n",
827				  le32_to_cpu(border));
828	} else {
829		border = newext->ee_block;
830		ext_debug("leaf will be added."
831				" next leaf starts at %d\n",
832				le32_to_cpu(border));
833	}
834
835	/*
836	 * If an error occurs, we stop processing
837	 * and mark the filesystem read-only. The index won't
838	 * be inserted and the tree will be in a consistent
839	 * state. The next mount will repair the buffers too.
840	 */
841
842	/*
843	 * Get array to track all allocated blocks.
844	 * We need this to handle errors and free blocks
845	 * upon them.
846	 */
847	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
848	if (!ablocks)
849		return -ENOMEM;
850
851	/* allocate all needed blocks */
852	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
853	for (a = 0; a < depth - at; a++) {
854		newblock = ext4_ext_new_meta_block(handle, inode, path,
855						   newext, &err);
856		if (newblock == 0)
857			goto cleanup;
858		ablocks[a] = newblock;
859	}
860
861	/* initialize new leaf */
862	newblock = ablocks[--a];
863	BUG_ON(newblock == 0);
864	bh = sb_getblk(inode->i_sb, newblock);
865	if (!bh) {
866		err = -EIO;
867		goto cleanup;
868	}
869	lock_buffer(bh);
870
871	err = ext4_journal_get_create_access(handle, bh);
872	if (err)
873		goto cleanup;
874
875	neh = ext_block_hdr(bh);
876	neh->eh_entries = 0;
877	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
878	neh->eh_magic = EXT4_EXT_MAGIC;
879	neh->eh_depth = 0;
880	ex = EXT_FIRST_EXTENT(neh);
881
882	/* move remainder of path[depth] to the new leaf */
883	BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
884	/* start copy from next extent */
885	/* TODO: we could do it by single memmove */
886	m = 0;
887	path[depth].p_ext++;
888	while (path[depth].p_ext <=
889			EXT_MAX_EXTENT(path[depth].p_hdr)) {
890		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
891				le32_to_cpu(path[depth].p_ext->ee_block),
892				ext_pblock(path[depth].p_ext),
893				ext4_ext_is_uninitialized(path[depth].p_ext),
894				ext4_ext_get_actual_len(path[depth].p_ext),
895				newblock);
896		/*memmove(ex++, path[depth].p_ext++,
897				sizeof(struct ext4_extent));
898		neh->eh_entries++;*/
899		path[depth].p_ext++;
900		m++;
901	}
902	if (m) {
903		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
904		le16_add_cpu(&neh->eh_entries, m);
905	}
906
907	set_buffer_uptodate(bh);
908	unlock_buffer(bh);
909
910	err = ext4_handle_dirty_metadata(handle, inode, bh);
911	if (err)
912		goto cleanup;
913	brelse(bh);
914	bh = NULL;
915
916	/* correct old leaf */
917	if (m) {
918		err = ext4_ext_get_access(handle, inode, path + depth);
919		if (err)
920			goto cleanup;
921		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
922		err = ext4_ext_dirty(handle, inode, path + depth);
923		if (err)
924			goto cleanup;
925
926	}
927
928	/* create intermediate indexes */
929	k = depth - at - 1;
930	BUG_ON(k < 0);
931	if (k)
932		ext_debug("create %d intermediate indices\n", k);
933	/* insert new index into current index block */
934	/* current depth stored in i var */
935	i = depth - 1;
936	while (k--) {
937		oldblock = newblock;
938		newblock = ablocks[--a];
939		bh = sb_getblk(inode->i_sb, newblock);
940		if (!bh) {
941			err = -EIO;
942			goto cleanup;
943		}
944		lock_buffer(bh);
945
946		err = ext4_journal_get_create_access(handle, bh);
947		if (err)
948			goto cleanup;
949
950		neh = ext_block_hdr(bh);
951		neh->eh_entries = cpu_to_le16(1);
952		neh->eh_magic = EXT4_EXT_MAGIC;
953		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
954		neh->eh_depth = cpu_to_le16(depth - i);
955		fidx = EXT_FIRST_INDEX(neh);
956		fidx->ei_block = border;
957		ext4_idx_store_pblock(fidx, oldblock);
958
959		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
960				i, newblock, le32_to_cpu(border), oldblock);
961		/* copy indexes */
962		m = 0;
963		path[i].p_idx++;
964
965		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
966				EXT_MAX_INDEX(path[i].p_hdr));
967		BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
968				EXT_LAST_INDEX(path[i].p_hdr));
969		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
970			ext_debug("%d: move %d:%llu in new index %llu\n", i,
971					le32_to_cpu(path[i].p_idx->ei_block),
972					idx_pblock(path[i].p_idx),
973					newblock);
974			/*memmove(++fidx, path[i].p_idx++,
975					sizeof(struct ext4_extent_idx));
976			neh->eh_entries++;
977			BUG_ON(neh->eh_entries > neh->eh_max);*/
978			path[i].p_idx++;
979			m++;
980		}
981		if (m) {
982			memmove(++fidx, path[i].p_idx - m,
983				sizeof(struct ext4_extent_idx) * m);
984			le16_add_cpu(&neh->eh_entries, m);
985		}
986		set_buffer_uptodate(bh);
987		unlock_buffer(bh);
988
989		err = ext4_handle_dirty_metadata(handle, inode, bh);
990		if (err)
991			goto cleanup;
992		brelse(bh);
993		bh = NULL;
994
995		/* correct old index */
996		if (m) {
997			err = ext4_ext_get_access(handle, inode, path + i);
998			if (err)
999				goto cleanup;
1000			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1001			err = ext4_ext_dirty(handle, inode, path + i);
1002			if (err)
1003				goto cleanup;
1004		}
1005
1006		i--;
1007	}
1008
1009	/* insert new index */
1010	err = ext4_ext_insert_index(handle, inode, path + at,
1011				    le32_to_cpu(border), newblock);
1012
1013cleanup:
1014	if (bh) {
1015		if (buffer_locked(bh))
1016			unlock_buffer(bh);
1017		brelse(bh);
1018	}
1019
1020	if (err) {
1021		/* free all allocated blocks in error case */
1022		for (i = 0; i < depth; i++) {
1023			if (!ablocks[i])
1024				continue;
1025			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
1026					 EXT4_FREE_BLOCKS_METADATA);
1027		}
1028	}
1029	kfree(ablocks);
1030
1031	return err;
1032}
1033
1034/*
1035 * ext4_ext_grow_indepth:
1036 * implements tree growing procedure:
1037 * - allocates new block
1038 * - moves top-level data (index block or leaf) into the new block
1039 * - initializes new top-level, creating index that points to the
1040 *   just created block
1041 */
1042static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1043					struct ext4_ext_path *path,
1044					struct ext4_extent *newext)
1045{
1046	struct ext4_ext_path *curp = path;
1047	struct ext4_extent_header *neh;
1048	struct ext4_extent_idx *fidx;
1049	struct buffer_head *bh;
1050	ext4_fsblk_t newblock;
1051	int err = 0;
1052
1053	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
1054	if (newblock == 0)
1055		return err;
1056
1057	bh = sb_getblk(inode->i_sb, newblock);
1058	if (!bh) {
1059		err = -EIO;
1060		ext4_std_error(inode->i_sb, err);
1061		return err;
1062	}
1063	lock_buffer(bh);
1064
1065	err = ext4_journal_get_create_access(handle, bh);
1066	if (err) {
1067		unlock_buffer(bh);
1068		goto out;
1069	}
1070
1071	/* move top-level index/leaf into new block */
1072	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
1073
1074	/* set size of new block */
1075	neh = ext_block_hdr(bh);
1076	/* old root could have indexes or leaves
1077	 * so calculate eh_max the right way */
1078	if (ext_depth(inode))
1079		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1080	else
1081		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1082	neh->eh_magic = EXT4_EXT_MAGIC;
1083	set_buffer_uptodate(bh);
1084	unlock_buffer(bh);
1085
1086	err = ext4_handle_dirty_metadata(handle, inode, bh);
1087	if (err)
1088		goto out;
1089
1090	/* create index in new top-level index: num,max,pointer */
1091	err = ext4_ext_get_access(handle, inode, curp);
1092	if (err)
1093		goto out;
1094
1095	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1096	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1097	curp->p_hdr->eh_entries = cpu_to_le16(1);
1098	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1099
1100	if (path[0].p_hdr->eh_depth)
1101		curp->p_idx->ei_block =
1102			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1103	else
1104		curp->p_idx->ei_block =
1105			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1106	ext4_idx_store_pblock(curp->p_idx, newblock);
1107
1108	neh = ext_inode_hdr(inode);
1109	fidx = EXT_FIRST_INDEX(neh);
1110	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1111		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1112		  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
1113
1114	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1115	err = ext4_ext_dirty(handle, inode, curp);
1116out:
1117	brelse(bh);
1118
1119	return err;
1120}
1121
1122/*
1123 * ext4_ext_create_new_leaf:
1124 * finds empty index and adds new leaf.
1125 * if no free index is found, then it requests growing the tree in depth.
1126 */
1127static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1128					struct ext4_ext_path *path,
1129					struct ext4_extent *newext)
1130{
1131	struct ext4_ext_path *curp;
1132	int depth, i, err = 0;
1133
1134repeat:
1135	i = depth = ext_depth(inode);
1136
1137	/* walk up to the tree and look for free index entry */
1138	curp = path + depth;
1139	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1140		i--;
1141		curp--;
1142	}
1143
1144	/* we use already allocated block for index block,
1145	 * so subsequent data blocks should be contiguous */
1146	if (EXT_HAS_FREE_INDEX(curp)) {
1147		/* if we found index with free entry, then use that
1148		 * entry: create all needed subtree and add new leaf */
1149		err = ext4_ext_split(handle, inode, path, newext, i);
1150		if (err)
1151			goto out;
1152
1153		/* refill path */
1154		ext4_ext_drop_refs(path);
1155		path = ext4_ext_find_extent(inode,
1156				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1157				    path);
1158		if (IS_ERR(path))
1159			err = PTR_ERR(path);
1160	} else {
1161		/* tree is full, time to grow in depth */
1162		err = ext4_ext_grow_indepth(handle, inode, path, newext);
1163		if (err)
1164			goto out;
1165
1166		/* refill path */
1167		ext4_ext_drop_refs(path);
1168		path = ext4_ext_find_extent(inode,
1169				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1170				    path);
1171		if (IS_ERR(path)) {
1172			err = PTR_ERR(path);
1173			goto out;
1174		}
1175
1176		/*
1177		 * only first (depth 0 -> 1) produces free space;
1178		 * in all other cases we have to split the grown tree
1179		 */
1180		depth = ext_depth(inode);
1181		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1182			/* now we need to split */
1183			goto repeat;
1184		}
1185	}
1186
1187out:
1188	return err;
1189}
1190
1191/*
1192 * search for the closest allocated block to the left of *logical
1193 * and return it at @logical + its physical address at @phys
1194 * if *logical is the smallest allocated block, the function
1195 * returns 0 at @phys
1196 * return value contains 0 (success) or error code
1197 */
1198int
1199ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1200			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1201{
1202	struct ext4_extent_idx *ix;
1203	struct ext4_extent *ex;
1204	int depth, ee_len;
1205
1206	BUG_ON(path == NULL);
1207	depth = path->p_depth;
1208	*phys = 0;
1209
1210	if (depth == 0 && path->p_ext == NULL)
1211		return 0;
1212
1213	/* usually extent in the path covers blocks smaller
1214	 * than *logical, but it can be that the extent is the
1215	 * first one in the file */
1216
1217	ex = path[depth].p_ext;
1218	ee_len = ext4_ext_get_actual_len(ex);
1219	if (*logical < le32_to_cpu(ex->ee_block)) {
1220		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1221		while (--depth >= 0) {
1222			ix = path[depth].p_idx;
1223			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1224		}
1225		return 0;
1226	}
1227
1228	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1229
1230	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1231	*phys = ext_pblock(ex) + ee_len - 1;
1232	return 0;
1233}
1234
1235/*
1236 * search for the closest allocated block to the right of *logical
1237 * and return it at @logical + its physical address at @phys
1238 * if there is no allocated block to the right, the function
1239 * returns 0 at @phys
1240 * return value contains 0 (success) or error code
1241 */
1242int
1243ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1244			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1245{
1246	struct buffer_head *bh = NULL;
1247	struct ext4_extent_header *eh;
1248	struct ext4_extent_idx *ix;
1249	struct ext4_extent *ex;
1250	ext4_fsblk_t block;
1251	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1252	int ee_len;
1253
1254	BUG_ON(path == NULL);
1255	depth = path->p_depth;
1256	*phys = 0;
1257
1258	if (depth == 0 && path->p_ext == NULL)
1259		return 0;
1260
1261	/* usually extent in the path covers blocks smaller
1262	 * than *logical, but it can be that the extent is the
1263	 * first one in the file */
1264
1265	ex = path[depth].p_ext;
1266	ee_len = ext4_ext_get_actual_len(ex);
1267	if (*logical < le32_to_cpu(ex->ee_block)) {
1268		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1269		while (--depth >= 0) {
1270			ix = path[depth].p_idx;
1271			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1272		}
1273		*logical = le32_to_cpu(ex->ee_block);
1274		*phys = ext_pblock(ex);
1275		return 0;
1276	}
1277
1278	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1279
1280	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1281		/* next allocated block in this leaf */
1282		ex++;
1283		*logical = le32_to_cpu(ex->ee_block);
1284		*phys = ext_pblock(ex);
1285		return 0;
1286	}
1287
1288	/* go up and search for index to the right */
1289	while (--depth >= 0) {
1290		ix = path[depth].p_idx;
1291		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1292			goto got_index;
1293	}
1294
1295	/* we've gone up to the root and found no index to the right */
1296	return 0;
1297
1298got_index:
1299	/* we've found index to the right, let's
1300	 * follow it and find the closest allocated
1301	 * block to the right */
1302	ix++;
1303	block = idx_pblock(ix);
1304	while (++depth < path->p_depth) {
1305		bh = sb_bread(inode->i_sb, block);
1306		if (bh == NULL)
1307			return -EIO;
1308		eh = ext_block_hdr(bh);
1309		/* subtract from p_depth to get proper eh_depth */
1310		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1311			put_bh(bh);
1312			return -EIO;
1313		}
1314		ix = EXT_FIRST_INDEX(eh);
1315		block = idx_pblock(ix);
1316		put_bh(bh);
1317	}
1318
1319	bh = sb_bread(inode->i_sb, block);
1320	if (bh == NULL)
1321		return -EIO;
1322	eh = ext_block_hdr(bh);
1323	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1324		put_bh(bh);
1325		return -EIO;
1326	}
1327	ex = EXT_FIRST_EXTENT(eh);
1328	*logical = le32_to_cpu(ex->ee_block);
1329	*phys = ext_pblock(ex);
1330	put_bh(bh);
1331	return 0;
1332}
1333
1334/*
1335 * ext4_ext_next_allocated_block:
1336 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1337 * NOTE: it considers block number from index entry as
1338 * allocated block. Thus, index entries have to be consistent
1339 * with leaves.
1340 */
1341static ext4_lblk_t
1342ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1343{
1344	int depth;
1345
1346	BUG_ON(path == NULL);
1347	depth = path->p_depth;
1348
1349	if (depth == 0 && path->p_ext == NULL)
1350		return EXT_MAX_BLOCK;
1351
1352	while (depth >= 0) {
1353		if (depth == path->p_depth) {
1354			/* leaf */
1355			if (path[depth].p_ext !=
1356					EXT_LAST_EXTENT(path[depth].p_hdr))
1357			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1358		} else {
1359			/* index */
1360			if (path[depth].p_idx !=
1361					EXT_LAST_INDEX(path[depth].p_hdr))
1362			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1363		}
1364		depth--;
1365	}
1366
1367	return EXT_MAX_BLOCK;
1368}
1369
1370/*
1371 * ext4_ext_next_leaf_block:
1372 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1373 */
1374static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1375					struct ext4_ext_path *path)
1376{
1377	int depth;
1378
1379	BUG_ON(path == NULL);
1380	depth = path->p_depth;
1381
1382	/* zero-tree has no leaf blocks at all */
1383	/* a zero-depth tree has no leaf blocks at all */
1384		return EXT_MAX_BLOCK;
1385
1386	/* go to index block */
1387	depth--;
1388
1389	while (depth >= 0) {
1390		if (path[depth].p_idx !=
1391				EXT_LAST_INDEX(path[depth].p_hdr))
1392			return (ext4_lblk_t)
1393				le32_to_cpu(path[depth].p_idx[1].ei_block);
1394		depth--;
1395	}
1396
1397	return EXT_MAX_BLOCK;
1398}
1399
1400/*
1401 * ext4_ext_correct_indexes:
1402 * if leaf gets modified and modified extent is first in the leaf,
1403 * then we have to correct all indexes above.
1404 * TODO: do we need to correct tree in all cases?
1405 */
1406static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1407				struct ext4_ext_path *path)
1408{
1409	struct ext4_extent_header *eh;
1410	int depth = ext_depth(inode);
1411	struct ext4_extent *ex;
1412	__le32 border;
1413	int k, err = 0;
1414
1415	eh = path[depth].p_hdr;
1416	ex = path[depth].p_ext;
1417	BUG_ON(ex == NULL);
1418	BUG_ON(eh == NULL);
1419
1420	if (depth == 0) {
1421		/* there is no tree at all */
1422		return 0;
1423	}
1424
1425	if (ex != EXT_FIRST_EXTENT(eh)) {
1426		/* we correct tree if first leaf got modified only */
1427		return 0;
1428	}
1429
1430	/*
1431	 * TODO: we need correction if border is smaller than current one
1432	 */
1433	k = depth - 1;
1434	border = path[depth].p_ext->ee_block;
1435	err = ext4_ext_get_access(handle, inode, path + k);
1436	if (err)
1437		return err;
1438	path[k].p_idx->ei_block = border;
1439	err = ext4_ext_dirty(handle, inode, path + k);
1440	if (err)
1441		return err;
1442
1443	while (k--) {
1444		/* change all left-side indexes */
1445		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1446			break;
1447		err = ext4_ext_get_access(handle, inode, path + k);
1448		if (err)
1449			break;
1450		path[k].p_idx->ei_block = border;
1451		err = ext4_ext_dirty(handle, inode, path + k);
1452		if (err)
1453			break;
1454	}
1455
1456	return err;
1457}
1458
1459int
1460ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1461				struct ext4_extent *ex2)
1462{
1463	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1464
1465	/*
1466	 * Make sure that either both extents are uninitialized, or
1467	 * both are _not_.
1468	 */
1469	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1470		return 0;
1471
1472	if (ext4_ext_is_uninitialized(ex1))
1473		max_len = EXT_UNINIT_MAX_LEN;
1474	else
1475		max_len = EXT_INIT_MAX_LEN;
1476
1477	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1478	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1479
1480	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1481			le32_to_cpu(ex2->ee_block))
1482		return 0;
1483
1484	/*
1485	 * To allow future support for preallocated extents to be added
1486 * as an RO_COMPAT feature, refuse to merge two extents if
1487	 * this can result in the top bit of ee_len being set.
1488	 */
1489	if (ext1_ee_len + ext2_ee_len > max_len)
1490		return 0;
1491#ifdef AGGRESSIVE_TEST
1492	if (ext1_ee_len >= 4)
1493		return 0;
1494#endif
1495
1496	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1497		return 1;
1498	return 0;
1499}
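/*
 * Background for the max_len check above: the top bit of ee_len marks
 * an uninitialized (preallocated) extent, so an uninitialized extent
 * may cover at most EXT_UNINIT_MAX_LEN (32767) blocks while an
 * initialized one may reach EXT_INIT_MAX_LEN (32768).  Merges that
 * would push the combined length past the applicable limit are
 * refused rather than corrupting that flag bit.
 */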
1500
1501/*
1502 * This function tries to merge the "ex" extent to the next extent in the tree.
1503 * It always tries to merge towards right. If you want to merge towards
1504 * left, pass "ex - 1" as argument instead of "ex".
1505 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1506 * 1 if they got merged.
1507 */
1508int ext4_ext_try_to_merge(struct inode *inode,
1509			  struct ext4_ext_path *path,
1510			  struct ext4_extent *ex)
1511{
1512	struct ext4_extent_header *eh;
1513	unsigned int depth, len;
1514	int merge_done = 0;
1515	int uninitialized = 0;
1516
1517	depth = ext_depth(inode);
1518	BUG_ON(path[depth].p_hdr == NULL);
1519	eh = path[depth].p_hdr;
1520
1521	while (ex < EXT_LAST_EXTENT(eh)) {
1522		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1523			break;
1524		/* merge with next extent! */
1525		if (ext4_ext_is_uninitialized(ex))
1526			uninitialized = 1;
1527		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1528				+ ext4_ext_get_actual_len(ex + 1));
1529		if (uninitialized)
1530			ext4_ext_mark_uninitialized(ex);
1531
1532		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1533			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1534				* sizeof(struct ext4_extent);
1535			memmove(ex + 1, ex + 2, len);
1536		}
1537		le16_add_cpu(&eh->eh_entries, -1);
1538		merge_done = 1;
1539		WARN_ON(eh->eh_entries == 0);
1540		if (!eh->eh_entries)
1541			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
1542			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
1543	}
1544
1545	return merge_done;
1546}
1547
1548/*
1549 * check if a portion of the "newext" extent overlaps with an
1550 * existing extent.
1551 *
1552 * If there is an overlap discovered, it updates the length of the newext
1553 * such that there will be no overlap, and then returns 1.
1554 * If there is no overlap found, it returns 0.
1555 */
1556unsigned int ext4_ext_check_overlap(struct inode *inode,
1557				    struct ext4_extent *newext,
1558				    struct ext4_ext_path *path)
1559{
1560	ext4_lblk_t b1, b2;
1561	unsigned int depth, len1;
1562	unsigned int ret = 0;
1563
1564	b1 = le32_to_cpu(newext->ee_block);
1565	len1 = ext4_ext_get_actual_len(newext);
1566	depth = ext_depth(inode);
1567	if (!path[depth].p_ext)
1568		goto out;
1569	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1570
1571	/*
1572	 * get the next allocated block if the extent in the path
1573	 * is before the requested block(s)
1574	 */
1575	if (b2 < b1) {
1576		b2 = ext4_ext_next_allocated_block(path);
1577		if (b2 == EXT_MAX_BLOCK)
1578			goto out;
1579	}
1580
1581	/* check for wrap through zero on extent logical start block*/
1582	if (b1 + len1 < b1) {
1583		len1 = EXT_MAX_BLOCK - b1;
1584		newext->ee_len = cpu_to_le16(len1);
1585		ret = 1;
1586	}
1587
1588	/* check for overlap */
1589	if (b1 + len1 > b2) {
1590		newext->ee_len = cpu_to_le16(b2 - b1);
1591		ret = 1;
1592	}
1593out:
1594	return ret;
1595}
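/*
 * Example for ext4_ext_check_overlap(): if @newext is meant to cover
 * logical blocks 100..149 (len 50) but the next allocated extent
 * already starts at block 120, ee_len is trimmed to 20 so the new
 * extent stops just short of the existing one, and 1 is returned.
 */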
1596
1597/*
1598 * ext4_ext_insert_extent:
1599 * tries to merge the requested extent into the existing extent or
1600 * inserts requested extent as new one into the tree,
1601 * creating new leaf in the no-space case.
1602 */
1603int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1604				struct ext4_ext_path *path,
1605				struct ext4_extent *newext, int flag)
1606{
1607	struct ext4_extent_header *eh;
1608	struct ext4_extent *ex, *fex;
1609	struct ext4_extent *nearex; /* nearest extent */
1610	struct ext4_ext_path *npath = NULL;
1611	int depth, len, err;
1612	ext4_lblk_t next;
1613	unsigned uninitialized = 0;
1614
1615	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
1616	depth = ext_depth(inode);
1617	ex = path[depth].p_ext;
1618	BUG_ON(path[depth].p_hdr == NULL);
1619
1620	/* try to insert block into found extent and return */
1621	if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1622		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1623		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1624				ext4_ext_is_uninitialized(newext),
1625				ext4_ext_get_actual_len(newext),
1626				le32_to_cpu(ex->ee_block),
1627				ext4_ext_is_uninitialized(ex),
1628				ext4_ext_get_actual_len(ex), ext_pblock(ex));
1629		err = ext4_ext_get_access(handle, inode, path + depth);
1630		if (err)
1631			return err;
1632
1633		/*
1634		 * ext4_can_extents_be_merged should have checked that either
1635		 * both extents are uninitialized, or both aren't. Thus we
1636		 * need to check only one of them here.
1637		 */
1638		if (ext4_ext_is_uninitialized(ex))
1639			uninitialized = 1;
1640		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1641					+ ext4_ext_get_actual_len(newext));
1642		if (uninitialized)
1643			ext4_ext_mark_uninitialized(ex);
1644		eh = path[depth].p_hdr;
1645		nearex = ex;
1646		goto merge;
1647	}
1648
1649repeat:
1650	depth = ext_depth(inode);
1651	eh = path[depth].p_hdr;
1652	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1653		goto has_space;
1654
1655	/* probably next leaf has space for us? */
1656	fex = EXT_LAST_EXTENT(eh);
1657	next = ext4_ext_next_leaf_block(inode, path);
1658	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1659	    && next != EXT_MAX_BLOCK) {
1660		ext_debug("next leaf block - %d\n", next);
1661		BUG_ON(npath != NULL);
1662		npath = ext4_ext_find_extent(inode, next, NULL);
1663		if (IS_ERR(npath))
1664			return PTR_ERR(npath);
1665		BUG_ON(npath->p_depth != path->p_depth);
1666		eh = npath[depth].p_hdr;
1667		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1668			ext_debug("next leaf isn't full(%d)\n",
1669				  le16_to_cpu(eh->eh_entries));
1670			path = npath;
1671			goto repeat;
1672		}
1673		ext_debug("next leaf has no free space(%d,%d)\n",
1674			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1675	}
1676
1677	/*
1678	 * There is no free space in the found leaf.
1679	 * We are going to add a new leaf to the tree.
1680	 */
1681	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1682	if (err)
1683		goto cleanup;
1684	depth = ext_depth(inode);
1685	eh = path[depth].p_hdr;
1686
1687has_space:
1688	nearex = path[depth].p_ext;
1689
1690	err = ext4_ext_get_access(handle, inode, path + depth);
1691	if (err)
1692		goto cleanup;
1693
1694	if (!nearex) {
1695		/* there is no extent in this leaf, create first one */
1696		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1697				le32_to_cpu(newext->ee_block),
1698				ext_pblock(newext),
1699				ext4_ext_is_uninitialized(newext),
1700				ext4_ext_get_actual_len(newext));
1701		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1702	} else if (le32_to_cpu(newext->ee_block)
1703			   > le32_to_cpu(nearex->ee_block)) {
1704/*		BUG_ON(newext->ee_block == nearex->ee_block); */
1705		if (nearex != EXT_LAST_EXTENT(eh)) {
1706			len = EXT_MAX_EXTENT(eh) - nearex;
1707			len = (len - 1) * sizeof(struct ext4_extent);
1708			len = len < 0 ? 0 : len;
1709			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1710					"move %d from 0x%p to 0x%p\n",
1711					le32_to_cpu(newext->ee_block),
1712					ext_pblock(newext),
1713					ext4_ext_is_uninitialized(newext),
1714					ext4_ext_get_actual_len(newext),
1715					nearex, len, nearex + 1, nearex + 2);
1716			memmove(nearex + 2, nearex + 1, len);
1717		}
1718		path[depth].p_ext = nearex + 1;
1719	} else {
1720		BUG_ON(newext->ee_block == nearex->ee_block);
1721		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1722		len = len < 0 ? 0 : len;
1723		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1724				"move %d from 0x%p to 0x%p\n",
1725				le32_to_cpu(newext->ee_block),
1726				ext_pblock(newext),
1727				ext4_ext_is_uninitialized(newext),
1728				ext4_ext_get_actual_len(newext),
1729				nearex, len, nearex + 1, nearex + 2);
1730		memmove(nearex + 1, nearex, len);
1731		path[depth].p_ext = nearex;
1732	}
1733
1734	le16_add_cpu(&eh->eh_entries, 1);
1735	nearex = path[depth].p_ext;
1736	nearex->ee_block = newext->ee_block;
1737	ext4_ext_store_pblock(nearex, ext_pblock(newext));
1738	nearex->ee_len = newext->ee_len;
1739
1740merge:
1741	/* try to merge extents to the right */
1742	if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1743		ext4_ext_try_to_merge(inode, path, nearex);
1744
1745	/* try to merge extents to the left */
1746
1747	/* time to correct all indexes above */
1748	err = ext4_ext_correct_indexes(handle, inode, path);
1749	if (err)
1750		goto cleanup;
1751
1752	err = ext4_ext_dirty(handle, inode, path + depth);
1753
1754cleanup:
1755	if (npath) {
1756		ext4_ext_drop_refs(npath);
1757		kfree(npath);
1758	}
1759	ext4_ext_invalidate_cache(inode);
1760	return err;
1761}
1762
1763int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1764			ext4_lblk_t num, ext_prepare_callback func,
1765			void *cbdata)
1766{
1767	struct ext4_ext_path *path = NULL;
1768	struct ext4_ext_cache cbex;
1769	struct ext4_extent *ex;
1770	ext4_lblk_t next, start = 0, end = 0;
1771	ext4_lblk_t last = block + num;
1772	int depth, exists, err = 0;
1773
1774	BUG_ON(func == NULL);
1775	BUG_ON(inode == NULL);
1776
1777	while (block < last && block != EXT_MAX_BLOCK) {
1778		num = last - block;
1779		/* find extent for this block */
1780		down_read(&EXT4_I(inode)->i_data_sem);
1781		path = ext4_ext_find_extent(inode, block, path);
1782		up_read(&EXT4_I(inode)->i_data_sem);
1783		if (IS_ERR(path)) {
1784			err = PTR_ERR(path);
1785			path = NULL;
1786			break;
1787		}
1788
1789		depth = ext_depth(inode);
1790		BUG_ON(path[depth].p_hdr == NULL);
1791		ex = path[depth].p_ext;
1792		next = ext4_ext_next_allocated_block(path);
1793
1794		exists = 0;
1795		if (!ex) {
1796			/* there is no extent yet, so try to allocate
1797			 * all requested space */
1798			start = block;
1799			end = block + num;
1800		} else if (le32_to_cpu(ex->ee_block) > block) {
1801			/* need to allocate space before found extent */
1802			start = block;
1803			end = le32_to_cpu(ex->ee_block);
1804			if (block + num < end)
1805				end = block + num;
1806		} else if (block >= le32_to_cpu(ex->ee_block)
1807					+ ext4_ext_get_actual_len(ex)) {
1808			/* need to allocate space after found extent */
1809			start = block;
1810			end = block + num;
1811			if (end >= next)
1812				end = next;
1813		} else if (block >= le32_to_cpu(ex->ee_block)) {
1814			/*
1815			 * some part of requested space is covered
1816			 * by found extent
1817			 */
1818			start = block;
1819			end = le32_to_cpu(ex->ee_block)
1820				+ ext4_ext_get_actual_len(ex);
1821			if (block + num < end)
1822				end = block + num;
1823			exists = 1;
1824		} else {
1825			BUG();
1826		}
1827		BUG_ON(end <= start);
1828
1829		if (!exists) {
1830			cbex.ec_block = start;
1831			cbex.ec_len = end - start;
1832			cbex.ec_start = 0;
1833			cbex.ec_type = EXT4_EXT_CACHE_GAP;
1834		} else {
1835			cbex.ec_block = le32_to_cpu(ex->ee_block);
1836			cbex.ec_len = ext4_ext_get_actual_len(ex);
1837			cbex.ec_start = ext_pblock(ex);
1838			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1839		}
1840
1841		BUG_ON(cbex.ec_len == 0);
1842		err = func(inode, path, &cbex, ex, cbdata);
1843		ext4_ext_drop_refs(path);
1844
1845		if (err < 0)
1846			break;
1847
1848		if (err == EXT_REPEAT)
1849			continue;
1850		else if (err == EXT_BREAK) {
1851			err = 0;
1852			break;
1853		}
1854
1855		if (ext_depth(inode) != depth) {
1856			/* depth was changed. we have to realloc path */
1857			kfree(path);
1858			path = NULL;
1859		}
1860
1861		block = cbex.ec_block + cbex.ec_len;
1862	}
1863
1864	if (path) {
1865		ext4_ext_drop_refs(path);
1866		kfree(path);
1867	}
1868
1869	return err;
1870}
1871
1872static void
1873ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1874			__u32 len, ext4_fsblk_t start, int type)
1875{
1876	struct ext4_ext_cache *cex;
1877	BUG_ON(len == 0);
1878	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1879	cex = &EXT4_I(inode)->i_cached_extent;
1880	cex->ec_type = type;
1881	cex->ec_block = block;
1882	cex->ec_len = len;
1883	cex->ec_start = start;
1884	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1885}
1886
1887/*
1888 * ext4_ext_put_gap_in_cache:
1889 * calculate boundaries of the gap that the requested block fits into
1890 * and cache this gap
1891 */
1892static void
1893ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1894				ext4_lblk_t block)
1895{
1896	int depth = ext_depth(inode);
1897	unsigned long len;
1898	ext4_lblk_t lblock;
1899	struct ext4_extent *ex;
1900
1901	ex = path[depth].p_ext;
1902	if (ex == NULL) {
1903		/* there is no extent yet, so gap is [0;-] */
1904		lblock = 0;
1905		len = EXT_MAX_BLOCK;
1906		ext_debug("cache gap(whole file):");
1907	} else if (block < le32_to_cpu(ex->ee_block)) {
1908		lblock = block;
1909		len = le32_to_cpu(ex->ee_block) - block;
1910		ext_debug("cache gap(before): %u [%u:%u]",
1911				block,
1912				le32_to_cpu(ex->ee_block),
1913				 ext4_ext_get_actual_len(ex));
1914	} else if (block >= le32_to_cpu(ex->ee_block)
1915			+ ext4_ext_get_actual_len(ex)) {
1916		ext4_lblk_t next;
1917		lblock = le32_to_cpu(ex->ee_block)
1918			+ ext4_ext_get_actual_len(ex);
1919
1920		next = ext4_ext_next_allocated_block(path);
1921		ext_debug("cache gap(after): [%u:%u] %u",
1922				le32_to_cpu(ex->ee_block),
1923				ext4_ext_get_actual_len(ex),
1924				block);
1925		BUG_ON(next == lblock);
1926		len = next - lblock;
1927	} else {
1928		lblock = len = 0;
1929		BUG();
1930	}
1931
1932	ext_debug(" -> %u:%lu\n", lblock, len);
1933	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1934}
1935
1936static int
1937ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1938			struct ext4_extent *ex)
1939{
1940	struct ext4_ext_cache *cex;
1941	int ret = EXT4_EXT_CACHE_NO;
1942
1943	/*
1944	 * We borrow i_block_reservation_lock to protect i_cached_extent
1945	 */
1946	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1947	cex = &EXT4_I(inode)->i_cached_extent;
1948
1949	/* has cache valid data? */
1950	if (cex->ec_type == EXT4_EXT_CACHE_NO)
1951		goto errout;
1952
1953	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1954			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1955	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1956		ex->ee_block = cpu_to_le32(cex->ec_block);
1957		ext4_ext_store_pblock(ex, cex->ec_start);
1958		ex->ee_len = cpu_to_le16(cex->ec_len);
1959		ext_debug("%u cached by %u:%u:%llu\n",
1960				block,
1961				cex->ec_block, cex->ec_len, cex->ec_start);
1962		ret = cex->ec_type;
1963	}
1964errout:
1965	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1966	return ret;
1967}
1968
1969/*
1970 * ext4_ext_rm_idx:
1971 * removes index from the index block.
1972 * It's used in truncate case only, thus all requests are for
1973 * last index in the block only.
1974 */
1975static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1976			struct ext4_ext_path *path)
1977{
1978	int err;
1979	ext4_fsblk_t leaf;
1980
1981	/* free index block */
1982	path--;
1983	leaf = idx_pblock(path->p_idx);
1984	BUG_ON(path->p_hdr->eh_entries == 0);
1985	err = ext4_ext_get_access(handle, inode, path);
1986	if (err)
1987		return err;
1988	le16_add_cpu(&path->p_hdr->eh_entries, -1);
1989	err = ext4_ext_dirty(handle, inode, path);
1990	if (err)
1991		return err;
1992	ext_debug("index is empty, remove it, free block %llu\n", leaf);
1993	ext4_free_blocks(handle, inode, 0, leaf, 1,
1994			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1995	return err;
1996}
1997
1998/*
1999 * ext4_ext_calc_credits_for_single_extent:
2000 * This routine returns max. credits that needed to insert an extent
2001 * to the extent tree.
2002 * When pass the actual path, the caller should calculate credits
2003 * under i_data_sem.
2004 */
2005int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2006						struct ext4_ext_path *path)
2007{
2008	if (path) {
2009		int depth = ext_depth(inode);
2010		int ret = 0;
2011
2012		/* is there space in the leaf? */
2013		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2014				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2015
2016			/*
2017			 *  There is some space in the leaf; no
2018			 *  need to account for a leaf block credit.
2019			 *
2020			 *  Bitmaps, block group descriptor blocks
2021			 *  and other metadata blocks still need to be
2022			 *  accounted for.
2023			 */
2024			/* 1 bitmap, 1 block group descriptor */
2025			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2026			return ret;
2027		}
2028	}
2029
2030	return ext4_chunk_trans_blocks(inode, nrblocks);
2031}
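
/*
 * Illustrative numbers (an example, not taken from the code): when the path
 * shows free slots in the leaf, the estimate is just
 * 2 + EXT4_META_TRANS_BLOCKS(sb) -- one bitmap plus one group descriptor plus
 * the usual metadata overhead.  Without a path, or with a full leaf, the
 * caller falls back to ext4_chunk_trans_blocks(), which also accounts for
 * possible tree growth.
 */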
2032
2033/*
2034 * How many index/leaf blocks need to change/allocate to modify nrblocks?
2035 *
2036 * if nrblocks are fit in a single extent (chunk flag is 1), then
2037 * in the worse case, each tree level index/leaf need to be changed
2038 * if the tree split due to insert a new extent, then the old tree
2039 * index/leaf need to be updated too
2040 *
2041 * If the nrblocks are discontiguous, they could cause
2042 * the whole tree split more than once, but this is really rare.
2043 */
2044int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2045{
2046	int index;
2047	int depth = ext_depth(inode);
2048
2049	if (chunk)
2050		index = depth * 2;
2051	else
2052		index = depth * 3;
2053
2054	return index;
2055}
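
/*
 * Worked example (illustrative): for a tree of depth 3, a single contiguous
 * chunk (chunk != 0) is charged 3 * 2 = 6 index/leaf blocks, while a
 * discontiguous request (chunk == 0) is charged 3 * 3 = 9, allowing for the
 * extra splits that scattered insertions can cause.
 */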
2056
2057static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2058				struct ext4_extent *ex,
2059				ext4_lblk_t from, ext4_lblk_t to)
2060{
2061	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2062	int flags = EXT4_FREE_BLOCKS_FORGET;
2063
2064	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2065		flags |= EXT4_FREE_BLOCKS_METADATA;
2066#ifdef EXTENTS_STATS
2067	{
2068		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2069		spin_lock(&sbi->s_ext_stats_lock);
2070		sbi->s_ext_blocks += ee_len;
2071		sbi->s_ext_extents++;
2072		if (ee_len < sbi->s_ext_min)
2073			sbi->s_ext_min = ee_len;
2074		if (ee_len > sbi->s_ext_max)
2075			sbi->s_ext_max = ee_len;
2076		if (ext_depth(inode) > sbi->s_depth_max)
2077			sbi->s_depth_max = ext_depth(inode);
2078		spin_unlock(&sbi->s_ext_stats_lock);
2079	}
2080#endif
2081	if (from >= le32_to_cpu(ex->ee_block)
2082	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2083		/* tail removal */
2084		ext4_lblk_t num;
2085		ext4_fsblk_t start;
2086
2087		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2088		start = ext_pblock(ex) + ee_len - num;
2089		ext_debug("free last %u blocks starting %llu\n", num, start);
2090		ext4_free_blocks(handle, inode, 0, start, num, flags);
2091	} else if (from == le32_to_cpu(ex->ee_block)
2092		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2093		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
2094			from, to, le32_to_cpu(ex->ee_block), ee_len);
2095	} else {
2096		printk(KERN_INFO "strange request: removal(2) "
2097				"%u-%u from %u:%u\n",
2098				from, to, le32_to_cpu(ex->ee_block), ee_len);
2099	}
2100	return 0;
2101}
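
/*
 * Worked example of the tail-removal arithmetic above (illustrative numbers):
 * for an extent covering logical blocks [100..109] at physical block 5000,
 * a request with from = 105, to = 109 gives
 *	num   = 100 + 10 - 105 = 5
 *	start = 5000 + 10 - 5  = 5005
 * so the last five physical blocks 5005..5009 are freed.
 */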
2102
2103static int
2104ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2105		struct ext4_ext_path *path, ext4_lblk_t start)
2106{
2107	int err = 0, correct_index = 0;
2108	int depth = ext_depth(inode), credits;
2109	struct ext4_extent_header *eh;
2110	ext4_lblk_t a, b, block;
2111	unsigned num;
2112	ext4_lblk_t ex_ee_block;
2113	unsigned short ex_ee_len;
2114	unsigned uninitialized = 0;
2115	struct ext4_extent *ex;
2116
2117	/* the header must be checked already in ext4_ext_remove_space() */
2118	ext_debug("truncate since %u in leaf\n", start);
2119	if (!path[depth].p_hdr)
2120		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2121	eh = path[depth].p_hdr;
2122	BUG_ON(eh == NULL);
2123
2124	/* find where to start removing */
2125	ex = EXT_LAST_EXTENT(eh);
2126
2127	ex_ee_block = le32_to_cpu(ex->ee_block);
2128	ex_ee_len = ext4_ext_get_actual_len(ex);
2129
2130	while (ex >= EXT_FIRST_EXTENT(eh) &&
2131			ex_ee_block + ex_ee_len > start) {
2132
2133		if (ext4_ext_is_uninitialized(ex))
2134			uninitialized = 1;
2135		else
2136			uninitialized = 0;
2137
2138		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2139			 uninitialized, ex_ee_len);
2140		path[depth].p_ext = ex;
2141
2142		a = ex_ee_block > start ? ex_ee_block : start;
2143		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
2144			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
2145
2146		ext_debug("  border %u:%u\n", a, b);
2147
2148		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
2149			block = 0;
2150			num = 0;
2151			BUG();
2152		} else if (a != ex_ee_block) {
2153			/* remove tail of the extent */
2154			block = ex_ee_block;
2155			num = a - block;
2156		} else if (b != ex_ee_block + ex_ee_len - 1) {
2157			/* remove head of the extent */
2158			block = a;
2159			num = b - a;
2160			/* there is no "make a hole" API yet */
2161			BUG();
2162		} else {
2163			/* remove whole extent: excellent! */
2164			block = ex_ee_block;
2165			num = 0;
2166			BUG_ON(a != ex_ee_block);
2167			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
2168		}
2169
2170		/*
2171		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2172		 * descriptor) for each block group; assume two block
2173		 * groups plus ex_ee_len/blocks_per_block_group for
2174		 * the worst case
2175		 */
2176		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2177		if (ex == EXT_FIRST_EXTENT(eh)) {
2178			correct_index = 1;
2179			credits += (ext_depth(inode)) + 1;
2180		}
2181		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2182
2183		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2184		if (err)
2185			goto out;
2186
2187		err = ext4_ext_get_access(handle, inode, path + depth);
2188		if (err)
2189			goto out;
2190
2191		err = ext4_remove_blocks(handle, inode, ex, a, b);
2192		if (err)
2193			goto out;
2194
2195		if (num == 0) {
2196			/* this extent is removed; mark slot entirely unused */
2197			ext4_ext_store_pblock(ex, 0);
2198			le16_add_cpu(&eh->eh_entries, -1);
2199		}
2200
2201		ex->ee_block = cpu_to_le32(block);
2202		ex->ee_len = cpu_to_le16(num);
2203		/*
2204		 * Do not mark uninitialized if all the blocks in the
2205		 * extent have been removed.
2206		 */
2207		if (uninitialized && num)
2208			ext4_ext_mark_uninitialized(ex);
2209
2210		err = ext4_ext_dirty(handle, inode, path + depth);
2211		if (err)
2212			goto out;
2213
2214		ext_debug("new extent: %u:%u:%llu\n", block, num,
2215				ext_pblock(ex));
2216		ex--;
2217		ex_ee_block = le32_to_cpu(ex->ee_block);
2218		ex_ee_len = ext4_ext_get_actual_len(ex);
2219	}
2220
2221	if (correct_index && eh->eh_entries)
2222		err = ext4_ext_correct_indexes(handle, inode, path);
2223
2224	/* if this leaf is free, then we should
2225	 * remove it from index block above */
2226	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2227		err = ext4_ext_rm_idx(handle, inode, path + depth);
2228
2229out:
2230	return err;
2231}
2232
2233/*
2234 * ext4_ext_more_to_rm:
2235 * returns 1 if current index has to be freed (even partial)
2236 */
2237static int
2238ext4_ext_more_to_rm(struct ext4_ext_path *path)
2239{
2240	BUG_ON(path->p_idx == NULL);
2241
2242	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2243		return 0;
2244
2245	/*
2246	 * if truncation at a deeper level happened, it wasn't partial,
2247	 * so we have to consider the current index for truncation
2248	 */
2249	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2250		return 0;
2251	return 1;
2252}
2253
2254static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2255{
2256	struct super_block *sb = inode->i_sb;
2257	int depth = ext_depth(inode);
2258	struct ext4_ext_path *path;
2259	handle_t *handle;
2260	int i = 0, err = 0;
2261
2262	ext_debug("truncate since %u\n", start);
2263
2264	/* probably first extent we're gonna free will be last in block */
2265	handle = ext4_journal_start(inode, depth + 1);
2266	if (IS_ERR(handle))
2267		return PTR_ERR(handle);
2268
2269	ext4_ext_invalidate_cache(inode);
2270
2271	/*
2272	 * We start scanning from right side, freeing all the blocks
2273	 * after i_size and walking into the tree depth-wise.
2274	 */
2275	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2276	if (path == NULL) {
2277		ext4_journal_stop(handle);
2278		return -ENOMEM;
2279	}
2280	path[0].p_hdr = ext_inode_hdr(inode);
2281	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2282		err = -EIO;
2283		goto out;
2284	}
2285	path[0].p_depth = depth;
2286
2287	while (i >= 0 && err == 0) {
2288		if (i == depth) {
2289			/* this is leaf block */
2290			err = ext4_ext_rm_leaf(handle, inode, path, start);
2291			/* root level has p_bh == NULL, brelse() eats this */
2292			brelse(path[i].p_bh);
2293			path[i].p_bh = NULL;
2294			i--;
2295			continue;
2296		}
2297
2298		/* this is index block */
2299		if (!path[i].p_hdr) {
2300			ext_debug("initialize header\n");
2301			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2302		}
2303
2304		if (!path[i].p_idx) {
2305			/* this level hasn't been touched yet */
2306			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2307			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2308			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2309				  path[i].p_hdr,
2310				  le16_to_cpu(path[i].p_hdr->eh_entries));
2311		} else {
2312		/* we were already here, look at the next index */
2313			path[i].p_idx--;
2314		}
2315
2316		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2317				i, EXT_FIRST_INDEX(path[i].p_hdr),
2318				path[i].p_idx);
2319		if (ext4_ext_more_to_rm(path + i)) {
2320			struct buffer_head *bh;
2321			/* go to the next level */
2322			ext_debug("move to level %d (block %llu)\n",
2323				  i + 1, idx_pblock(path[i].p_idx));
2324			memset(path + i + 1, 0, sizeof(*path));
2325			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
2326			if (!bh) {
2327				/* should we reset i_size? */
2328				err = -EIO;
2329				break;
2330			}
2331			if (WARN_ON(i + 1 > depth)) {
2332				err = -EIO;
2333				break;
2334			}
2335			if (ext4_ext_check(inode, ext_block_hdr(bh),
2336							depth - i - 1)) {
2337				err = -EIO;
2338				break;
2339			}
2340			path[i + 1].p_bh = bh;
2341
2342			/* save the actual number of indexes since this
2343			 * number changes at the next iteration */
2344			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2345			i++;
2346		} else {
2347			/* we finished processing this index, go up */
2348			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2349				/* index is empty, remove it;
2350				 * handle must already be prepared by
2351				 * ext4_ext_rm_leaf() */
2352				err = ext4_ext_rm_idx(handle, inode, path + i);
2353			}
2354			/* root level has p_bh == NULL, brelse() eats this */
2355			brelse(path[i].p_bh);
2356			path[i].p_bh = NULL;
2357			i--;
2358			ext_debug("return to level %d\n", i);
2359		}
2360	}
2361
2362	/* TODO: flexible tree reduction should be here */
2363	if (path->p_hdr->eh_entries == 0) {
2364		/*
2365		 * truncate to zero freed all the tree,
2366		 * so we need to correct eh_depth
2367		 */
2368		err = ext4_ext_get_access(handle, inode, path);
2369		if (err == 0) {
2370			ext_inode_hdr(inode)->eh_depth = 0;
2371			ext_inode_hdr(inode)->eh_max =
2372				cpu_to_le16(ext4_ext_space_root(inode, 0));
2373			err = ext4_ext_dirty(handle, inode, path);
2374		}
2375	}
2376out:
2377	ext4_ext_drop_refs(path);
2378	kfree(path);
2379	ext4_journal_stop(handle);
2380
2381	return err;
2382}
2383
2384/*
2385 * called at mount time
2386 */
2387void ext4_ext_init(struct super_block *sb)
2388{
2389	/*
2390	 * possible initialization would be here
2391	 */
2392
2393	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2394#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2395		printk(KERN_INFO "EXT4-fs: file extents enabled");
2396#ifdef AGGRESSIVE_TEST
2397		printk(", aggressive tests");
2398#endif
2399#ifdef CHECK_BINSEARCH
2400		printk(", check binsearch");
2401#endif
2402#ifdef EXTENTS_STATS
2403		printk(", stats");
2404#endif
2405		printk("\n");
2406#endif
2407#ifdef EXTENTS_STATS
2408		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2409		EXT4_SB(sb)->s_ext_min = 1 << 30;
2410		EXT4_SB(sb)->s_ext_max = 0;
2411#endif
2412	}
2413}
2414
2415/*
2416 * called at umount time
2417 */
2418void ext4_ext_release(struct super_block *sb)
2419{
2420	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2421		return;
2422
2423#ifdef EXTENTS_STATS
2424	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2425		struct ext4_sb_info *sbi = EXT4_SB(sb);
2426		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2427			sbi->s_ext_blocks, sbi->s_ext_extents,
2428			sbi->s_ext_blocks / sbi->s_ext_extents);
2429		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2430			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2431	}
2432#endif
2433}
2434
2435static void bi_complete(struct bio *bio, int error)
2436{
2437	complete((struct completion *)bio->bi_private);
2438}
2439
2440/* FIXME!! we need to try to merge to left or right after zero-out  */
2441static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2442{
2443	int ret = -EIO;
2444	struct bio *bio;
2445	int blkbits, blocksize;
2446	sector_t ee_pblock;
2447	struct completion event;
2448	unsigned int ee_len, len, done, offset;
2449
2450
2451	blkbits   = inode->i_blkbits;
2452	blocksize = inode->i_sb->s_blocksize;
2453	ee_len    = ext4_ext_get_actual_len(ex);
2454	ee_pblock = ext_pblock(ex);
2455
2456	/* convert ee_pblock to 512 byte sectors */
2457	ee_pblock = ee_pblock << (blkbits - 9);
2458
2459	while (ee_len > 0) {
2460
2461		if (ee_len > BIO_MAX_PAGES)
2462			len = BIO_MAX_PAGES;
2463		else
2464			len = ee_len;
2465
2466		bio = bio_alloc(GFP_NOIO, len);
2467		bio->bi_sector = ee_pblock;
2468		bio->bi_bdev   = inode->i_sb->s_bdev;
2469
2470		done = 0;
2471		offset = 0;
2472		while (done < len) {
2473			ret = bio_add_page(bio, ZERO_PAGE(0),
2474							blocksize, offset);
2475			if (ret != blocksize) {
2476				/*
2477				 * We can't add any more pages because of
2478				 * hardware limitations.  Start a new bio.
2479				 */
2480				break;
2481			}
2482			done++;
2483			offset += blocksize;
2484			if (offset >= PAGE_CACHE_SIZE)
2485				offset = 0;
2486		}
2487
2488		init_completion(&event);
2489		bio->bi_private = &event;
2490		bio->bi_end_io = bi_complete;
2491		submit_bio(WRITE, bio);
2492		wait_for_completion(&event);
2493
2494		if (test_bit(BIO_UPTODATE, &bio->bi_flags))
2495			ret = 0;
2496		else {
2497			ret = -EIO;
2498			break;
2499		}
2500		bio_put(bio);
2501		ee_len    -= done;
2502		ee_pblock += done  << (blkbits - 9);
2503	}
2504	return ret;
2505}
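
/*
 * Unit conversion used above, with illustrative numbers: for a 4K filesystem
 * (blkbits = 12) the shift by (blkbits - 9) = 3 turns filesystem blocks into
 * 512-byte sectors, so an extent starting at physical block 1000 is submitted
 * at bio->bi_sector = 8000.  Each pass zeroes at most BIO_MAX_PAGES blocks.
 */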
2506
2507#define EXT4_EXT_ZERO_LEN 7
2508/*
2509 * This function is called by ext4_ext_get_blocks() if someone tries to write
2510 * to an uninitialized extent. It may result in splitting the uninitialized
2511 * extent into multiple extents (up to three - one initialized and two
2512 * uninitialized).
2513 * There are three possibilities:
2514 *   a> There is no split required: Entire extent should be initialized
2515 *   b> Splits in two extents: Write is happening at either end of the extent
2516 *   c> Splits in three extents: Someone is writing in the middle of the extent
2517 */
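/*
 * Sketch of the three cases (illustrative only; "w" marks the range being
 * written, "u" the part left uninitialized):
 *
 *   a>  |wwwwwwwwwwww|          write covers the whole extent
 *   b>  |wwwww|uuuuuuu|         write at the start (or, mirrored, at the end)
 *   c>  |uuu|wwwww|uuuu|        write in the middle: ex1, ex2, ex3
 */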
2518static int ext4_ext_convert_to_initialized(handle_t *handle,
2519						struct inode *inode,
2520						struct ext4_ext_path *path,
2521						ext4_lblk_t iblock,
2522						unsigned int max_blocks)
2523{
2524	struct ext4_extent *ex, newex, orig_ex;
2525	struct ext4_extent *ex1 = NULL;
2526	struct ext4_extent *ex2 = NULL;
2527	struct ext4_extent *ex3 = NULL;
2528	struct ext4_extent_header *eh;
2529	ext4_lblk_t ee_block;
2530	unsigned int allocated, ee_len, depth;
2531	ext4_fsblk_t newblock;
2532	int err = 0;
2533	int ret = 0;
2534
2535	depth = ext_depth(inode);
2536	eh = path[depth].p_hdr;
2537	ex = path[depth].p_ext;
2538	ee_block = le32_to_cpu(ex->ee_block);
2539	ee_len = ext4_ext_get_actual_len(ex);
2540	allocated = ee_len - (iblock - ee_block);
2541	newblock = iblock - ee_block + ext_pblock(ex);
2542	ex2 = ex;
2543	orig_ex.ee_block = ex->ee_block;
2544	orig_ex.ee_len   = cpu_to_le16(ee_len);
2545	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2546
2547	err = ext4_ext_get_access(handle, inode, path + depth);
2548	if (err)
2549		goto out;
2550	/* If the extent is no longer than 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2551	if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
2552		err =  ext4_ext_zeroout(inode, &orig_ex);
2553		if (err)
2554			goto fix_extent_len;
2555		/* update the extent length and mark as initialized */
2556		ex->ee_block = orig_ex.ee_block;
2557		ex->ee_len   = orig_ex.ee_len;
2558		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2559		ext4_ext_dirty(handle, inode, path + depth);
2560		/* zeroed the full extent */
2561		return allocated;
2562	}
2563
2564	/* ex1: ee_block to iblock - 1 : uninitialized */
2565	if (iblock > ee_block) {
2566		ex1 = ex;
2567		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2568		ext4_ext_mark_uninitialized(ex1);
2569		ex2 = &newex;
2570	}
2571	/*
2572	 * for sanity, update the length of the ex2 extent before
2573	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2574	 * overlap of blocks.
2575	 */
2576	if (!ex1 && allocated > max_blocks)
2577		ex2->ee_len = cpu_to_le16(max_blocks);
2578	/* ex3: to ee_block + ee_len : uninitialised */
2579	if (allocated > max_blocks) {
2580		unsigned int newdepth;
2581		/* If the extent has no more than EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2582		if (allocated <= EXT4_EXT_ZERO_LEN) {
2583			/*
2584			 * iblock == ee_block is handled by the zeroout
2585			 * at the beginning.
2586			 * Mark first half uninitialized.
2587			 * Mark second half initialized and zero out the
2588			 * initialized extent
2589			 */
2590			ex->ee_block = orig_ex.ee_block;
2591			ex->ee_len   = cpu_to_le16(ee_len - allocated);
2592			ext4_ext_mark_uninitialized(ex);
2593			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2594			ext4_ext_dirty(handle, inode, path + depth);
2595
2596			ex3 = &newex;
2597			ex3->ee_block = cpu_to_le32(iblock);
2598			ext4_ext_store_pblock(ex3, newblock);
2599			ex3->ee_len = cpu_to_le16(allocated);
2600			err = ext4_ext_insert_extent(handle, inode, path,
2601							ex3, 0);
2602			if (err == -ENOSPC) {
2603				err =  ext4_ext_zeroout(inode, &orig_ex);
2604				if (err)
2605					goto fix_extent_len;
2606				ex->ee_block = orig_ex.ee_block;
2607				ex->ee_len   = orig_ex.ee_len;
2608				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2609				ext4_ext_dirty(handle, inode, path + depth);
2610				/* blocks available from iblock */
2611				return allocated;
2612
2613			} else if (err)
2614				goto fix_extent_len;
2615
2616			/*
2617			 * We need to zero out the second half because
2618			 * a fallocate request can update the file size and
2619			 * converting the second half to an initialized extent
2620			 * implies that we could leak some junk data to user
2621			 * space.
2622			 */
2623			err =  ext4_ext_zeroout(inode, ex3);
2624			if (err) {
2625				/*
2626				 * We should actually mark the
2627				 * second half as uninitialized and return an error;
2628				 * the insert would have changed the extent
2629				 */
2630				depth = ext_depth(inode);
2631				ext4_ext_drop_refs(path);
2632				path = ext4_ext_find_extent(inode,
2633								iblock, path);
2634				if (IS_ERR(path)) {
2635					err = PTR_ERR(path);
2636					return err;
2637				}
2638				/* get the second half extent details */
2639				ex = path[depth].p_ext;
2640				err = ext4_ext_get_access(handle, inode,
2641								path + depth);
2642				if (err)
2643					return err;
2644				ext4_ext_mark_uninitialized(ex);
2645				ext4_ext_dirty(handle, inode, path + depth);
2646				return err;
2647			}
2648
2649			/* zeroed the second half */
2650			return allocated;
2651		}
2652		ex3 = &newex;
2653		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2654		ext4_ext_store_pblock(ex3, newblock + max_blocks);
2655		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2656		ext4_ext_mark_uninitialized(ex3);
2657		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2658		if (err == -ENOSPC) {
2659			err =  ext4_ext_zeroout(inode, &orig_ex);
2660			if (err)
2661				goto fix_extent_len;
2662			/* update the extent length and mark as initialized */
2663			ex->ee_block = orig_ex.ee_block;
2664			ex->ee_len   = orig_ex.ee_len;
2665			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2666			ext4_ext_dirty(handle, inode, path + depth);
2667			/* zeroed the full extent */
2668			/* blocks available from iblock */
2669			return allocated;
2670
2671		} else if (err)
2672			goto fix_extent_len;
2673		/*
2674		 * The depth, and hence eh & ex might change
2675		 * as part of the insert above.
2676		 */
2677		newdepth = ext_depth(inode);
2678		/*
2679		 * update the extent length after successful insert of the
2680		 * split extent
2681		 */
2682		orig_ex.ee_len = cpu_to_le16(ee_len -
2683						ext4_ext_get_actual_len(ex3));
2684		depth = newdepth;
2685		ext4_ext_drop_refs(path);
2686		path = ext4_ext_find_extent(inode, iblock, path);
2687		if (IS_ERR(path)) {
2688			err = PTR_ERR(path);
2689			goto out;
2690		}
2691		eh = path[depth].p_hdr;
2692		ex = path[depth].p_ext;
2693		if (ex2 != &newex)
2694			ex2 = ex;
2695
2696		err = ext4_ext_get_access(handle, inode, path + depth);
2697		if (err)
2698			goto out;
2699
2700		allocated = max_blocks;
2701
2702		/* If the extent has no more than EXT4_EXT_ZERO_LEN blocks and we
2703		 * are trying to insert an extent in the middle, zero it out directly;
2704		 * otherwise give the extent a chance to merge to the left
2705		 */
2706		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2707							iblock != ee_block) {
2708			err =  ext4_ext_zeroout(inode, &orig_ex);
2709			if (err)
2710				goto fix_extent_len;
2711			/* update the extent length and mark as initialized */
2712			ex->ee_block = orig_ex.ee_block;
2713			ex->ee_len   = orig_ex.ee_len;
2714			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2715			ext4_ext_dirty(handle, inode, path + depth);
2716			/* zero out the first half */
2717			/* blocks available from iblock */
2718			return allocated;
2719		}
2720	}
2721	/*
2722	 * If there was a change of depth as part of the
2723	 * insertion of ex3 above, we need to update the length
2724	 * of the ex1 extent again here
2725	 */
2726	if (ex1 && ex1 != ex) {
2727		ex1 = ex;
2728		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2729		ext4_ext_mark_uninitialized(ex1);
2730		ex2 = &newex;
2731	}
2732	/* ex2: iblock to iblock + maxblocks-1 : initialised */
2733	ex2->ee_block = cpu_to_le32(iblock);
2734	ext4_ext_store_pblock(ex2, newblock);
2735	ex2->ee_len = cpu_to_le16(allocated);
2736	if (ex2 != ex)
2737		goto insert;
2738	/*
2739	 * New (initialized) extent starts from the first block
2740	 * in the current extent. i.e., ex2 == ex
2741	 * We have to see if it can be merged with the extent
2742	 * on the left.
2743	 */
2744	if (ex2 > EXT_FIRST_EXTENT(eh)) {
2745		/*
2746		 * To merge left, pass "ex2 - 1" to try_to_merge(),
2747		 * since it merges towards right _only_.
2748		 */
2749		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2750		if (ret) {
2751			err = ext4_ext_correct_indexes(handle, inode, path);
2752			if (err)
2753				goto out;
2754			depth = ext_depth(inode);
2755			ex2--;
2756		}
2757	}
2758	/*
2759	 * Try to merge towards the right. This might be required
2760	 * only when the whole extent is being written to.
2761	 * i.e. ex2 == ex and ex3 == NULL.
2762	 */
2763	if (!ex3) {
2764		ret = ext4_ext_try_to_merge(inode, path, ex2);
2765		if (ret) {
2766			err = ext4_ext_correct_indexes(handle, inode, path);
2767			if (err)
2768				goto out;
2769		}
2770	}
2771	/* Mark modified extent as dirty */
2772	err = ext4_ext_dirty(handle, inode, path + depth);
2773	goto out;
2774insert:
2775	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2776	if (err == -ENOSPC) {
2777		err =  ext4_ext_zeroout(inode, &orig_ex);
2778		if (err)
2779			goto fix_extent_len;
2780		/* update the extent length and mark as initialized */
2781		ex->ee_block = orig_ex.ee_block;
2782		ex->ee_len   = orig_ex.ee_len;
2783		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2784		ext4_ext_dirty(handle, inode, path + depth);
2785		/* zero out the first half */
2786		return allocated;
2787	} else if (err)
2788		goto fix_extent_len;
2789out:
2790	ext4_ext_show_leaf(inode, path);
2791	return err ? err : allocated;
2792
2793fix_extent_len:
2794	ex->ee_block = orig_ex.ee_block;
2795	ex->ee_len   = orig_ex.ee_len;
2796	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2797	ext4_ext_mark_uninitialized(ex);
2798	ext4_ext_dirty(handle, inode, path + depth);
2799	return err;
2800}
2801
2802/*
2803 * This function is called by ext4_ext_get_blocks() from
2804 * ext4_get_blocks_dio_write() when DIO writes
2805 * to an uninitialized extent.
2806 *
2807 * Writing to an uninitialized extent may result in splitting the
2808 * uninitialized extent into multiple initialized/uninitialized extents
2809 * (up to three). There are three possibilities:
2810 *   a> There is no split required: Entire extent should be uninitialized
2811 *   b> Splits in two extents: Write is happening at either end of the extent
2812 *   c> Splits in three extents: Someone is writing in the middle of the extent
2813 *
2814 * One or more index blocks may be needed if the extent tree grows after
2815 * the uninitialized extent is split. To prevent ENOSPC from occurring at
2816 * IO completion, we need to split the uninitialized extent before DIO
2817 * submits the IO. The uninitialized extent handled here will be split
2818 * into three uninitialized extents (at most). After IO completes, the part
2819 * being filled will be converted to initialized by the end_io callback
2820 * via ext4_convert_unwritten_extents().
2821 *
2822 * Returns the size of the uninitialized extent to be written on success.
2823 */
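/*
 * Worked example for the split bookkeeping (illustrative numbers): for an
 * uninitialized extent with ee_block = 100, ee_len = 50 and a DIO write at
 * iblock = 120 for max_blocks = 10,
 *	allocated = 50 - (120 - 100) = 30
 * so ex1 keeps [100..119] uninitialized, ex2 covers the 10 blocks being
 * written (still uninitialized until end_io converts them), and ex3 keeps
 * the remaining [130..149] uninitialized.
 */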
2824static int ext4_split_unwritten_extents(handle_t *handle,
2825					struct inode *inode,
2826					struct ext4_ext_path *path,
2827					ext4_lblk_t iblock,
2828					unsigned int max_blocks,
2829					int flags)
2830{
2831	struct ext4_extent *ex, newex, orig_ex;
2832	struct ext4_extent *ex1 = NULL;
2833	struct ext4_extent *ex2 = NULL;
2834	struct ext4_extent *ex3 = NULL;
2835	struct ext4_extent_header *eh;
2836	ext4_lblk_t ee_block;
2837	unsigned int allocated, ee_len, depth;
2838	ext4_fsblk_t newblock;
2839	int err = 0;
2840
2841	ext_debug("ext4_split_unwritten_extents: inode %lu,"
2842		  "iblock %llu, max_blocks %u\n", inode->i_ino,
2843		  (unsigned long long)iblock, max_blocks);
2844	depth = ext_depth(inode);
2845	eh = path[depth].p_hdr;
2846	ex = path[depth].p_ext;
2847	ee_block = le32_to_cpu(ex->ee_block);
2848	ee_len = ext4_ext_get_actual_len(ex);
2849	allocated = ee_len - (iblock - ee_block);
2850	newblock = iblock - ee_block + ext_pblock(ex);
2851	ex2 = ex;
2852	orig_ex.ee_block = ex->ee_block;
2853	orig_ex.ee_len   = cpu_to_le16(ee_len);
2854	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2855
2856	/*
2857	 * If the uninitialized extent begins at the same logical
2858	 * block where the write begins, and the write completely
2859	 * covers the extent, then we don't need to split it.
2860	 */
2861	if ((iblock == ee_block) && (allocated <= max_blocks))
2862		return allocated;
2863
2864	err = ext4_ext_get_access(handle, inode, path + depth);
2865	if (err)
2866		goto out;
2867	/* ex1: ee_block to iblock - 1 : uninitialized */
2868	if (iblock > ee_block) {
2869		ex1 = ex;
2870		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2871		ext4_ext_mark_uninitialized(ex1);
2872		ex2 = &newex;
2873	}
2874	/*
2875	 * for sanity, update the length of the ex2 extent before
2876	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2877	 * overlap of blocks.
2878	 */
2879	if (!ex1 && allocated > max_blocks)
2880		ex2->ee_len = cpu_to_le16(max_blocks);
2881	/* ex3: to ee_block + ee_len : uninitialised */
2882	if (allocated > max_blocks) {
2883		unsigned int newdepth;
2884		ex3 = &newex;
2885		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2886		ext4_ext_store_pblock(ex3, newblock + max_blocks);
2887		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2888		ext4_ext_mark_uninitialized(ex3);
2889		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2890		if (err == -ENOSPC) {
2891			err =  ext4_ext_zeroout(inode, &orig_ex);
2892			if (err)
2893				goto fix_extent_len;
2894			/* update the extent length and mark as initialized */
2895			ex->ee_block = orig_ex.ee_block;
2896			ex->ee_len   = orig_ex.ee_len;
2897			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2898			ext4_ext_dirty(handle, inode, path + depth);
2899			/* zeroed the full extent */
2900			/* blocks available from iblock */
2901			return allocated;
2902
2903		} else if (err)
2904			goto fix_extent_len;
2905		/*
2906		 * The depth, and hence eh & ex might change
2907		 * as part of the insert above.
2908		 */
2909		newdepth = ext_depth(inode);
2910		/*
2911		 * update the extent length after successful insert of the
2912		 * split extent
2913		 */
2914		orig_ex.ee_len = cpu_to_le16(ee_len -
2915						ext4_ext_get_actual_len(ex3));
2916		depth = newdepth;
2917		ext4_ext_drop_refs(path);
2918		path = ext4_ext_find_extent(inode, iblock, path);
2919		if (IS_ERR(path)) {
2920			err = PTR_ERR(path);
2921			goto out;
2922		}
2923		eh = path[depth].p_hdr;
2924		ex = path[depth].p_ext;
2925		if (ex2 != &newex)
2926			ex2 = ex;
2927
2928		err = ext4_ext_get_access(handle, inode, path + depth);
2929		if (err)
2930			goto out;
2931
2932		allocated = max_blocks;
2933	}
2934	/*
2935	 * If there was a change of depth as part of the
2936	 * insertion of ex3 above, we need to update the length
2937	 * of the ex1 extent again here
2938	 */
2939	if (ex1 && ex1 != ex) {
2940		ex1 = ex;
2941		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2942		ext4_ext_mark_uninitialized(ex1);
2943		ex2 = &newex;
2944	}
2945	/*
2946	 * ex2: iblock to iblock + maxblocks-1 : to be written by direct IO,
2947	 * still uninitialized.
2948	 */
2949	ex2->ee_block = cpu_to_le32(iblock);
2950	ext4_ext_store_pblock(ex2, newblock);
2951	ex2->ee_len = cpu_to_le16(allocated);
2952	ext4_ext_mark_uninitialized(ex2);
2953	if (ex2 != ex)
2954		goto insert;
2955	/* Mark modified extent as dirty */
2956	err = ext4_ext_dirty(handle, inode, path + depth);
2957	ext_debug("out here\n");
2958	goto out;
2959insert:
2960	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2961	if (err == -ENOSPC) {
2962		err =  ext4_ext_zeroout(inode, &orig_ex);
2963		if (err)
2964			goto fix_extent_len;
2965		/* update the extent length and mark as initialized */
2966		ex->ee_block = orig_ex.ee_block;
2967		ex->ee_len   = orig_ex.ee_len;
2968		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2969		ext4_ext_dirty(handle, inode, path + depth);
2970		/* zero out the first half */
2971		return allocated;
2972	} else if (err)
2973		goto fix_extent_len;
2974out:
2975	ext4_ext_show_leaf(inode, path);
2976	return err ? err : allocated;
2977
2978fix_extent_len:
2979	ex->ee_block = orig_ex.ee_block;
2980	ex->ee_len   = orig_ex.ee_len;
2981	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2982	ext4_ext_mark_uninitialized(ex);
2983	ext4_ext_dirty(handle, inode, path + depth);
2984	return err;
2985}
2986static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2987					      struct inode *inode,
2988					      struct ext4_ext_path *path)
2989{
2990	struct ext4_extent *ex;
2991	struct ext4_extent_header *eh;
2992	int depth;
2993	int err = 0;
2994	int ret = 0;
2995
2996	depth = ext_depth(inode);
2997	eh = path[depth].p_hdr;
2998	ex = path[depth].p_ext;
2999
3000	err = ext4_ext_get_access(handle, inode, path + depth);
3001	if (err)
3002		goto out;
3003	/* first mark the extent as initialized */
3004	ext4_ext_mark_initialized(ex);
3005
3006	/*
3007	 * We have to see if it can be merged with the extent
3008	 * on the left.
3009	 */
3010	if (ex > EXT_FIRST_EXTENT(eh)) {
3011		/*
3012		 * To merge left, pass "ex - 1" to try_to_merge(),
3013		 * since it merges towards right _only_.
3014		 */
3015		ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3016		if (ret) {
3017			err = ext4_ext_correct_indexes(handle, inode, path);
3018			if (err)
3019				goto out;
3020			depth = ext_depth(inode);
3021			ex--;
3022		}
3023	}
3024	/*
3025	 * Try to Merge towards right.
3026	 */
3027	ret = ext4_ext_try_to_merge(inode, path, ex);
3028	if (ret) {
3029		err = ext4_ext_correct_indexes(handle, inode, path);
3030		if (err)
3031			goto out;
3032		depth = ext_depth(inode);
3033	}
3034	/* Mark modified extent as dirty */
3035	err = ext4_ext_dirty(handle, inode, path + depth);
3036out:
3037	ext4_ext_show_leaf(inode, path);
3038	return err;
3039}
3040
3041static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3042			sector_t block, int count)
3043{
3044	int i;
3045	for (i = 0; i < count; i++)
3046		unmap_underlying_metadata(bdev, block + i);
3047}
3048
3049static int
3050ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3051			ext4_lblk_t iblock, unsigned int max_blocks,
3052			struct ext4_ext_path *path, int flags,
3053			unsigned int allocated, struct buffer_head *bh_result,
3054			ext4_fsblk_t newblock)
3055{
3056	int ret = 0;
3057	int err = 0;
3058	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3059
3060	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3061		  "block %llu, max_blocks %u, flags %d, allocated %u",
3062		  inode->i_ino, (unsigned long long)iblock, max_blocks,
3063		  flags, allocated);
3064	ext4_ext_show_leaf(inode, path);
3065
3066	/* DIO get_block() before submit the IO, split the extent */
3067	if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3068		ret = ext4_split_unwritten_extents(handle,
3069						inode, path, iblock,
3070						max_blocks, flags);
3071		/*
3072		 * Flag the inode (non-AIO case) or end_io struct (AIO case)
3073		 * so that this IO gets converted to written when the IO is
3074		 * completed
3075		 */
3076		if (io)
3077			io->flag = DIO_AIO_UNWRITTEN;
3078		else
3079			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3080		goto out;
3081	}
3082	/* async DIO end_io complete, convert the filled extent to written */
3083	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3084		ret = ext4_convert_unwritten_extents_dio(handle, inode,
3085							path);
3086		if (ret >= 0)
3087			ext4_update_inode_fsync_trans(handle, inode, 1);
3088		goto out2;
3089	}
3090	/* buffered IO case */
3091	/*
3092	 * repeat fallocate creation request
3093	 * we already have an unwritten extent
3094	 */
3095	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3096		goto map_out;
3097
3098	/* buffered READ or buffered write_begin() lookup */
3099	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3100		/*
3101		 * We have blocks reserved already.  We
3102		 * return allocated blocks so that delalloc
3103		 * won't do block reservation for us.  But
3104		 * the buffer head will be unmapped so that
3105		 * a read from the block returns 0s.
3106		 */
3107		set_buffer_unwritten(bh_result);
3108		goto out1;
3109	}
3110
3111	/* buffered write, writepage time, convert */
3112	ret = ext4_ext_convert_to_initialized(handle, inode,
3113						path, iblock,
3114						max_blocks);
3115	if (ret >= 0)
3116		ext4_update_inode_fsync_trans(handle, inode, 1);
3117out:
3118	if (ret <= 0) {
3119		err = ret;
3120		goto out2;
3121	} else
3122		allocated = ret;
3123	set_buffer_new(bh_result);
3124	/*
3125	 * if we allocated more blocks than requested,
3126	 * we need to make sure we unmap the extra blocks
3127	 * allocated. The blocks actually needed will get
3128	 * unmapped later when we find the buffer_head marked
3129	 * new.
3130	 */
3131	if (allocated > max_blocks) {
3132		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3133					newblock + max_blocks,
3134					allocated - max_blocks);
3135		allocated = max_blocks;
3136	}
3137
3138	/*
3139	 * If we have done fallocate at an offset that is already
3140	 * delayed allocated, we would have the block reservation
3141	 * and quota reservation done in the delayed write path.
3142	 * But fallocate would have already updated the quota and block
3143	 * count for this offset. So cancel these reservations.
3144	 */
3145	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3146		ext4_da_update_reserve_space(inode, allocated, 0);
3147
3148map_out:
3149	set_buffer_mapped(bh_result);
3150out1:
3151	if (allocated > max_blocks)
3152		allocated = max_blocks;
3153	ext4_ext_show_leaf(inode, path);
3154	bh_result->b_bdev = inode->i_sb->s_bdev;
3155	bh_result->b_blocknr = newblock;
3156out2:
3157	if (path) {
3158		ext4_ext_drop_refs(path);
3159		kfree(path);
3160	}
3161	return err ? err : allocated;
3162}
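
/*
 * In short, the dispatch above works as follows: DIO_CREATE_EXT splits the
 * unwritten extent before the IO is submitted, DIO_CONVERT_EXT marks the
 * filled range as written after the IO completes, UNINIT_EXT (a repeated
 * fallocate) just maps the existing unwritten extent, a plain lookup returns
 * an unwritten buffer_head, and a buffered write converts to initialized via
 * ext4_ext_convert_to_initialized().
 */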
3163/*
3164 * Block allocation/map/preallocation routine for extent-based files
3165 *
3166 *
3167 * Needs to be called with
3168 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
3169 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3170 *
3171 * return > 0, number of blocks already mapped/allocated
3172 *          if create == 0 and these are pre-allocated blocks
3173 *          	buffer head is unmapped
3174 *          otherwise blocks are mapped
3175 *
3176 * return = 0, if plain look up failed (blocks have not been allocated)
3177 *          buffer head is unmapped
3178 *
3179 * return < 0, error case.
3180 */
3181int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3182			ext4_lblk_t iblock,
3183			unsigned int max_blocks, struct buffer_head *bh_result,
3184			int flags)
3185{
3186	struct ext4_ext_path *path = NULL;
3187	struct ext4_extent_header *eh;
3188	struct ext4_extent newex, *ex;
3189	ext4_fsblk_t newblock;
3190	int err = 0, depth, ret, cache_type;
3191	unsigned int allocated = 0;
3192	struct ext4_allocation_request ar;
3193	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3194
3195	__clear_bit(BH_New, &bh_result->b_state);
3196	ext_debug("blocks %u/%u requested for inode %lu\n",
3197			iblock, max_blocks, inode->i_ino);
3198
3199	/* check in cache */
3200	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
3201	if (cache_type) {
3202		if (cache_type == EXT4_EXT_CACHE_GAP) {
3203			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3204				/*
3205				 * block isn't allocated yet and
3206				 * user doesn't want to allocate it
3207				 */
3208				goto out2;
3209			}
3210			/* we should allocate requested block */
3211		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3212			/* block is already allocated */
3213			newblock = iblock
3214				   - le32_to_cpu(newex.ee_block)
3215				   + ext_pblock(&newex);
3216			/* number of remaining blocks in the extent */
3217			allocated = ext4_ext_get_actual_len(&newex) -
3218					(iblock - le32_to_cpu(newex.ee_block));
3219			goto out;
3220		} else {
3221			BUG();
3222		}
3223	}
3224
3225	/* find extent for this block */
3226	path = ext4_ext_find_extent(inode, iblock, NULL);
3227	if (IS_ERR(path)) {
3228		err = PTR_ERR(path);
3229		path = NULL;
3230		goto out2;
3231	}
3232
3233	depth = ext_depth(inode);
3234
3235	/*
3236	 * a consistent leaf must not be empty;
3237	 * this situation is possible, though, _during_ tree modification;
3238	 * this is why the assert can't be put in ext4_ext_find_extent()
3239	 */
3240	if (path[depth].p_ext == NULL && depth != 0) {
3241		ext4_error(inode->i_sb, __func__, "bad extent address "
3242			   "inode: %lu, iblock: %d, depth: %d",
3243			   inode->i_ino, iblock, depth);
3244		err = -EIO;
3245		goto out2;
3246	}
3247	eh = path[depth].p_hdr;
3248
3249	ex = path[depth].p_ext;
3250	if (ex) {
3251		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3252		ext4_fsblk_t ee_start = ext_pblock(ex);
3253		unsigned short ee_len;
3254
3255		/*
3256		 * Uninitialized extents are treated as holes, except that
3257		 * we split out initialized portions during a write.
3258		 */
3259		ee_len = ext4_ext_get_actual_len(ex);
3260		/* if found extent covers block, simply return it */
3261		if (iblock >= ee_block && iblock < ee_block + ee_len) {
3262			newblock = iblock - ee_block + ee_start;
3263			/* number of remaining blocks in the extent */
3264			allocated = ee_len - (iblock - ee_block);
3265			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
3266					ee_block, ee_len, newblock);
3267
3268			/* Do not put uninitialized extent in the cache */
3269			if (!ext4_ext_is_uninitialized(ex)) {
3270				ext4_ext_put_in_cache(inode, ee_block,
3271							ee_len, ee_start,
3272							EXT4_EXT_CACHE_EXTENT);
3273				goto out;
3274			}
3275			ret = ext4_ext_handle_uninitialized_extents(handle,
3276					inode, iblock, max_blocks, path,
3277					flags, allocated, bh_result, newblock);
3278			return ret;
3279		}
3280	}
3281
3282	/*
3283	 * requested block isn't allocated yet;
3284	 * we can't try to create blocks if the create flag is zero
3285	 */
3286	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3287		/*
3288		 * put just found gap into cache to speed up
3289		 * subsequent requests
3290		 */
3291		ext4_ext_put_gap_in_cache(inode, path, iblock);
3292		goto out2;
3293	}
3294	/*
3295	 * Okay, we need to do block allocation.
3296	 */
3297
3298	/* find neighbour allocated blocks */
3299	ar.lleft = iblock;
3300	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3301	if (err)
3302		goto out2;
3303	ar.lright = iblock;
3304	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3305	if (err)
3306		goto out2;
3307
3308	/*
3309	 * See if request is beyond maximum number of blocks we can have in
3310	 * a single extent. For an initialized extent this limit is
3311	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3312	 * EXT_UNINIT_MAX_LEN.
3313	 */
3314	if (max_blocks > EXT_INIT_MAX_LEN &&
3315	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3316		max_blocks = EXT_INIT_MAX_LEN;
3317	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
3318		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3319		max_blocks = EXT_UNINIT_MAX_LEN;
3320
3321	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
3322	newex.ee_block = cpu_to_le32(iblock);
3323	newex.ee_len = cpu_to_le16(max_blocks);
3324	err = ext4_ext_check_overlap(inode, &newex, path);
3325	if (err)
3326		allocated = ext4_ext_get_actual_len(&newex);
3327	else
3328		allocated = max_blocks;
3329
3330	/* allocate new block */
3331	ar.inode = inode;
3332	ar.goal = ext4_ext_find_goal(inode, path, iblock);
3333	ar.logical = iblock;
3334	ar.len = allocated;
3335	if (S_ISREG(inode->i_mode))
3336		ar.flags = EXT4_MB_HINT_DATA;
3337	else
3338		/* disable in-core preallocation for non-regular files */
3339		ar.flags = 0;
3340	newblock = ext4_mb_new_blocks(handle, &ar, &err);
3341	if (!newblock)
3342		goto out2;
3343	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3344		  ar.goal, newblock, allocated);
3345
3346	/* try to insert new extent into found leaf and return */
3347	ext4_ext_store_pblock(&newex, newblock);
3348	newex.ee_len = cpu_to_le16(ar.len);
3349	/* Mark uninitialized */
3350	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
3351		ext4_ext_mark_uninitialized(&newex);
3352		/*
3353		 * An io_end structure was created for every async
3354		 * direct IO write to the middle of the file.
3355		 * To avoid an unnecessary conversion for every AIO DIO rewrite
3356		 * to the middle of the file, here we flag only the IO that
3357		 * really needs the conversion.
3358		 * For the non-async direct IO case, flag the inode state
3359		 * so that we perform the conversion when IO is done.
3360		 */
3361		if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3362			if (io)
3363				io->flag = DIO_AIO_UNWRITTEN;
3364			else
3365				ext4_set_inode_state(inode,
3366						     EXT4_STATE_DIO_UNWRITTEN);
3367		}
3368	}
3369	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3370	if (err) {
3371		/* free data blocks we just allocated */
3372		/* not a good idea to call discard here directly,
3373		 * but otherwise we'd need to call it for every free() */
3374		ext4_discard_preallocations(inode);
3375		ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
3376				 ext4_ext_get_actual_len(&newex), 0);
3377		goto out2;
3378	}
3379
3380	/* previous routine could use block we allocated */
3381	newblock = ext_pblock(&newex);
3382	allocated = ext4_ext_get_actual_len(&newex);
3383	if (allocated > max_blocks)
3384		allocated = max_blocks;
3385	set_buffer_new(bh_result);
3386
3387	/*
3388	 * Update reserved blocks/metadata blocks after successful
3389	 * block allocation which had been deferred till now.
3390	 */
3391	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3392		ext4_da_update_reserve_space(inode, allocated, 1);
3393
3394	/*
3395	 * Cache the extent and update transaction to commit on fdatasync only
3396	 * when it is _not_ an uninitialized extent.
3397	 */
3398	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3399		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
3400						EXT4_EXT_CACHE_EXTENT);
3401		ext4_update_inode_fsync_trans(handle, inode, 1);
3402	} else
3403		ext4_update_inode_fsync_trans(handle, inode, 0);
3404out:
3405	if (allocated > max_blocks)
3406		allocated = max_blocks;
3407	ext4_ext_show_leaf(inode, path);
3408	set_buffer_mapped(bh_result);
3409	bh_result->b_bdev = inode->i_sb->s_bdev;
3410	bh_result->b_blocknr = newblock;
3411out2:
3412	if (path) {
3413		ext4_ext_drop_refs(path);
3414		kfree(path);
3415	}
3416	return err ? err : allocated;
3417}
3418
3419void ext4_ext_truncate(struct inode *inode)
3420{
3421	struct address_space *mapping = inode->i_mapping;
3422	struct super_block *sb = inode->i_sb;
3423	ext4_lblk_t last_block;
3424	handle_t *handle;
3425	int err = 0;
3426
3427	/*
3428	 * probably first extent we're gonna free will be last in block
3429	 */
3430	err = ext4_writepage_trans_blocks(inode);
3431	handle = ext4_journal_start(inode, err);
3432	if (IS_ERR(handle))
3433		return;
3434
3435	if (inode->i_size & (sb->s_blocksize - 1))
3436		ext4_block_truncate_page(handle, mapping, inode->i_size);
3437
3438	if (ext4_orphan_add(handle, inode))
3439		goto out_stop;
3440
3441	down_write(&EXT4_I(inode)->i_data_sem);
3442	ext4_ext_invalidate_cache(inode);
3443
3444	ext4_discard_preallocations(inode);
3445
3446	/*
3447	 * TODO: optimization is possible here.
3448	 * Probably we need not scan at all,
3449	 * because page truncation is enough.
3450	 */
3451
3452	/* we have to know where to truncate from in crash case */
3453	EXT4_I(inode)->i_disksize = inode->i_size;
3454	ext4_mark_inode_dirty(handle, inode);
3455
3456	last_block = (inode->i_size + sb->s_blocksize - 1)
3457			>> EXT4_BLOCK_SIZE_BITS(sb);
3458	err = ext4_ext_remove_space(inode, last_block);
3459
3460	/* In a multi-transaction truncate, we only make the final
3461	 * transaction synchronous.
3462	 */
3463	if (IS_SYNC(inode))
3464		ext4_handle_sync(handle);
3465
3466out_stop:
3467	up_write(&EXT4_I(inode)->i_data_sem);
3468	/*
3469	 * If this was a simple ftruncate() and the file will remain alive,
3470	 * then we need to clear up the orphan record which we created above.
3471	 * However, if this was a real unlink then we were called by
3472	 * ext4_delete_inode(), and we allow that function to clean up the
3473	 * orphan info for us.
3474	 */
3475	if (inode->i_nlink)
3476		ext4_orphan_del(handle, inode);
3477
3478	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3479	ext4_mark_inode_dirty(handle, inode);
3480	ext4_journal_stop(handle);
3481}
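
/*
 * Worked example for the last_block rounding above (illustrative numbers):
 * with a 4K block size and i_size = 10000, last_block becomes
 * (10000 + 4095) >> 12 = 3, so blocks 0..2 (which still back i_size) are
 * kept and removal starts at logical block 3.
 */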
3482
3483static void ext4_falloc_update_inode(struct inode *inode,
3484				int mode, loff_t new_size, int update_ctime)
3485{
3486	struct timespec now;
3487
3488	if (update_ctime) {
3489		now = current_fs_time(inode->i_sb);
3490		if (!timespec_equal(&inode->i_ctime, &now))
3491			inode->i_ctime = now;
3492	}
3493	/*
3494	 * Update only when preallocation was requested beyond
3495	 * the file size.
3496	 */
3497	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3498		if (new_size > i_size_read(inode))
3499			i_size_write(inode, new_size);
3500		if (new_size > EXT4_I(inode)->i_disksize)
3501			ext4_update_i_disksize(inode, new_size);
3502	}
3503
3504}
3505
3506/*
3507 * preallocate space for a file. This implements ext4's fallocate inode
3508 * operation, which gets called from the sys_fallocate system call.
3509 * For block-mapped files, posix_fallocate should fall back to the method
3510 * of writing zeroes to the required new blocks (the same behavior that is
3511 * expected of file systems which do not support the fallocate() system call).
3512 */
3513long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3514{
3515	handle_t *handle;
3516	ext4_lblk_t block;
3517	loff_t new_size;
3518	unsigned int max_blocks;
3519	int ret = 0;
3520	int ret2 = 0;
3521	int retries = 0;
3522	struct buffer_head map_bh;
3523	unsigned int credits, blkbits = inode->i_blkbits;
3524
3525	/*
3526	 * currently supporting (pre)allocate mode for extent-based
3527	 * files _only_
3528	 */
3529	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3530		return -EOPNOTSUPP;
3531
3532	/* preallocation to directories is currently not supported */
3533	if (S_ISDIR(inode->i_mode))
3534		return -ENODEV;
3535
3536	block = offset >> blkbits;
3537	/*
3538	 * We can't just convert len to max_blocks: e.g. with blocksize = 4096,
3539	 * offset = 3072 and len = 2048, the request still spans two blocks.
3540	 */
3541	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3542							- block;
3543	/*
3544	 * credits to insert 1 extent into extent tree
3545	 */
3546	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3547	mutex_lock(&inode->i_mutex);
3548retry:
3549	while (ret >= 0 && ret < max_blocks) {
3550		block = block + ret;
3551		max_blocks = max_blocks - ret;
3552		handle = ext4_journal_start(inode, credits);
3553		if (IS_ERR(handle)) {
3554			ret = PTR_ERR(handle);
3555			break;
3556		}
3557		map_bh.b_state = 0;
3558		ret = ext4_get_blocks(handle, inode, block,
3559				      max_blocks, &map_bh,
3560				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3561		if (ret <= 0) {
3562#ifdef EXT4FS_DEBUG
3563			WARN_ON(ret <= 0);
3564			printk(KERN_ERR "%s: ext4_ext_get_blocks "
3565				    "returned error inode#%lu, block=%u, "
3566				    "max_blocks=%u", __func__,
3567				    inode->i_ino, block, max_blocks);
3568#endif
3569			ext4_mark_inode_dirty(handle, inode);
3570			ret2 = ext4_journal_stop(handle);
3571			break;
3572		}
3573		if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3574						blkbits) >> blkbits))
3575			new_size = offset + len;
3576		else
3577			new_size = (block + ret) << blkbits;
3578
3579		ext4_falloc_update_inode(inode, mode, new_size,
3580						buffer_new(&map_bh));
3581		ext4_mark_inode_dirty(handle, inode);
3582		ret2 = ext4_journal_stop(handle);
3583		if (ret2)
3584			break;
3585	}
3586	if (ret == -ENOSPC &&
3587			ext4_should_retry_alloc(inode->i_sb, &retries)) {
3588		ret = 0;
3589		goto retry;
3590	}
3591	mutex_unlock(&inode->i_mutex);
3592	return ret > 0 ? ret2 : ret;
3593}
3594
3595/*
3596 * This function converts a range of blocks to written extents.
3597 * The caller of this function will pass the start offset and the size;
3598 * all unwritten extents within this range will be converted to
3599 * written extents.
3600 *
3601 * This function is called from the direct IO end_io callback
3602 * function to convert the fallocated extents after IO is completed.
3603 * Returns 0 on success.
3604 */
3605int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3606				    ssize_t len)
3607{
3608	handle_t *handle;
3609	ext4_lblk_t block;
3610	unsigned int max_blocks;
3611	int ret = 0;
3612	int ret2 = 0;
3613	struct buffer_head map_bh;
3614	unsigned int credits, blkbits = inode->i_blkbits;
3615
3616	block = offset >> blkbits;
3617	/*
3618	 * We can't just convert len to max_blocks: e.g. with blocksize = 4096,
3619	 * offset = 3072 and len = 2048, the request still spans two blocks.
3620	 */
3621	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3622							- block;
3623	/*
3624	 * credits to insert 1 extent into extent tree
3625	 */
3626	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3627	while (ret >= 0 && ret < max_blocks) {
3628		block = block + ret;
3629		max_blocks = max_blocks - ret;
3630		handle = ext4_journal_start(inode, credits);
3631		if (IS_ERR(handle)) {
3632			ret = PTR_ERR(handle);
3633			break;
3634		}
3635		map_bh.b_state = 0;
3636		ret = ext4_get_blocks(handle, inode, block,
3637				      max_blocks, &map_bh,
3638				      EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3639		if (ret <= 0) {
3640			WARN_ON(ret <= 0);
3641			printk(KERN_ERR "%s: ext4_ext_get_blocks "
3642				    "returned error inode#%lu, block=%u, "
3643				    "max_blocks=%u", __func__,
3644				    inode->i_ino, block, max_blocks);
3645		}
3646		ext4_mark_inode_dirty(handle, inode);
3647		ret2 = ext4_journal_stop(handle);
3648		if (ret <= 0 || ret2)
3649			break;
3650	}
3651	return ret > 0 ? ret2 : ret;
3652}
3653/*
3654 * Callback function called for each extent to gather FIEMAP information.
3655 */
3656static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3657		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
3658		       void *data)
3659{
3660	struct fiemap_extent_info *fieinfo = data;
3661	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3662	__u64	logical;
3663	__u64	physical;
3664	__u64	length;
3665	__u32	flags = 0;
3666	int	error;
3667
3668	logical =  (__u64)newex->ec_block << blksize_bits;
3669
3670	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3671		pgoff_t offset;
3672		struct page *page;
3673		struct buffer_head *bh = NULL;
3674
3675		offset = logical >> PAGE_SHIFT;
3676		page = find_get_page(inode->i_mapping, offset);
3677		if (!page || !page_has_buffers(page))
3678			return EXT_CONTINUE;
3679
3680		bh = page_buffers(page);
3681
3682		if (!bh)
3683			return EXT_CONTINUE;
3684
3685		if (buffer_delay(bh)) {
3686			flags |= FIEMAP_EXTENT_DELALLOC;
3687			page_cache_release(page);
3688		} else {
3689			page_cache_release(page);
3690			return EXT_CONTINUE;
3691		}
3692	}
3693
3694	physical = (__u64)newex->ec_start << blksize_bits;
3695	length =   (__u64)newex->ec_len << blksize_bits;
3696
3697	if (ex && ext4_ext_is_uninitialized(ex))
3698		flags |= FIEMAP_EXTENT_UNWRITTEN;
3699
3700	/*
3701	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3702	 *
3703	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3704	 * this also indicates no more allocated blocks.
3705	 *
3706	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3707	 */
3708	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3709	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3710		loff_t size = i_size_read(inode);
3711		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3712
3713		flags |= FIEMAP_EXTENT_LAST;
3714		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3715		    logical+length > size)
3716			length = (size - logical + bs - 1) & ~(bs-1);
3717	}
3718
3719	error = fiemap_fill_next_extent(fieinfo, logical, physical,
3720					length, flags);
3721	if (error < 0)
3722		return error;
3723	if (error == 1)
3724		return EXT_BREAK;
3725
3726	return EXT_CONTINUE;
3727}
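
/*
 * Unit conversion used by the callback above, with illustrative numbers:
 * for a 4K filesystem (blksize_bits = 12) a cached extent with
 * ec_block = 100, ec_len = 10, ec_start = 5000 is reported to fiemap as
 * logical = 409600, physical = 20480000 and length = 40960 bytes.
 */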
3728
3729/* fiemap flags we can handle specified here */
3730#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3731
3732static int ext4_xattr_fiemap(struct inode *inode,
3733				struct fiemap_extent_info *fieinfo)
3734{
3735	__u64 physical = 0;
3736	__u64 length;
3737	__u32 flags = FIEMAP_EXTENT_LAST;
3738	int blockbits = inode->i_sb->s_blocksize_bits;
3739	int error = 0;
3740
3741	/* in-inode? */
3742	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
3743		struct ext4_iloc iloc;
3744		int offset;	/* offset of xattr in inode */
3745
3746		error = ext4_get_inode_loc(inode, &iloc);
3747		if (error)
3748			return error;
3749		physical = iloc.bh->b_blocknr << blockbits;
3750		offset = EXT4_GOOD_OLD_INODE_SIZE +
3751				EXT4_I(inode)->i_extra_isize;
3752		physical += offset;
3753		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3754		flags |= FIEMAP_EXTENT_DATA_INLINE;
3755	} else { /* external block */
3756		physical = EXT4_I(inode)->i_file_acl << blockbits;
3757		length = inode->i_sb->s_blocksize;
3758	}
3759
3760	if (physical)
3761		error = fiemap_fill_next_extent(fieinfo, 0, physical,
3762						length, flags);
3763	return (error < 0 ? error : 0);
3764}
3765
3766int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3767		__u64 start, __u64 len)
3768{
3769	ext4_lblk_t start_blk;
3770	ext4_lblk_t len_blks;
3771	int error = 0;
3772
3773	/* fall back to generic here if not in extent format */
3774	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3775		return generic_block_fiemap(inode, fieinfo, start, len,
3776			ext4_get_block);
3777
3778	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3779		return -EBADR;
3780
3781	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3782		error = ext4_xattr_fiemap(inode, fieinfo);
3783	} else {
3784		start_blk = start >> inode->i_sb->s_blocksize_bits;
3785		len_blks = len >> inode->i_sb->s_blocksize_bits;
3786
3787		/*
3788		 * Walk the extent tree gathering extent information.
3789		 * ext4_ext_fiemap_cb will push extents back to user.
3790		 */
3791		error = ext4_ext_walk_space(inode, start_blk, len_blks,
3792					  ext4_ext_fiemap_cb, fieinfo);
3793	}
3794
3795	return error;
3796}
3797
3798