extents.c revision 9102e4fa8016af8bf1a263df913ee8fdafd4dfb0
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include "ext4_jbd2.h"
44#include "ext4_extents.h"
45
46
47/*
48 * ext_pblock:
49 * combine low and high parts of physical block number into ext4_fsblk_t
50 */
51static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
52{
53	ext4_fsblk_t block;
54
55	block = le32_to_cpu(ex->ee_start_lo);
56	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
57	return block;
58}
59
60/*
61 * idx_pblock:
62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63 */
64ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65{
66	ext4_fsblk_t block;
67
68	block = le32_to_cpu(ix->ei_leaf_lo);
69	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
70	return block;
71}
72
73/*
74 * ext4_ext_store_pblock:
75 * stores a large physical block number into an extent struct,
76 * breaking it into parts
77 */
78void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79{
80	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
82}
83
84/*
85 * ext4_idx_store_pblock:
86 * stores a large physical block number into an index struct,
87 * breaking it into parts
88 */
89static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
90{
91	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
92	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93}
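/*
 * Illustrative example (added comment, not part of revision 9102e4fa):
 * physical block numbers are 48 bits wide and are split into a 32-bit
 * low part and a 16-bit high part.  E.g. pb = 0x123456789ab is stored
 * as ee_start_lo = 0x456789ab and ee_start_hi = 0x0123; ext_pblock()
 * and idx_pblock() rebuild it as lo | ((hi << 31) << 1) = 0x123456789ab.
 */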
94
95static int ext4_ext_journal_restart(handle_t *handle, int needed)
96{
97	int err;
98
99	if (handle->h_buffer_credits > needed)
100		return 0;
101	err = ext4_journal_extend(handle, needed);
102	if (err)
103		return err;
104	return ext4_journal_restart(handle, needed);
105}
106
107/*
108 * could return:
109 *  - EROFS
110 *  - ENOMEM
111 */
112static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
113				struct ext4_ext_path *path)
114{
115	if (path->p_bh) {
116		/* path points to block */
117		return ext4_journal_get_write_access(handle, path->p_bh);
118	}
119	/* path points to leaf/index in inode body */
120	/* we use in-core data, no need to protect them */
121	return 0;
122}
123
124/*
125 * could return:
126 *  - EROFS
127 *  - ENOMEM
128 *  - EIO
129 */
130static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
131				struct ext4_ext_path *path)
132{
133	int err;
134	if (path->p_bh) {
135		/* path points to block */
136		err = ext4_journal_dirty_metadata(handle, path->p_bh);
137	} else {
138		/* path points to leaf/index in inode body */
139		err = ext4_mark_inode_dirty(handle, inode);
140	}
141	return err;
142}
143
144static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
145			      struct ext4_ext_path *path,
146			      ext4_lblk_t block)
147{
148	struct ext4_inode_info *ei = EXT4_I(inode);
149	ext4_fsblk_t bg_start;
150	ext4_fsblk_t last_block;
151	ext4_grpblk_t colour;
152	int depth;
153
154	if (path) {
155		struct ext4_extent *ex;
156		depth = path->p_depth;
157
158		/* try to predict block placement */
159		ex = path[depth].p_ext;
160		if (ex)
161			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
162
163		/* it looks like index is empty;
164		 * try to find starting block from index itself */
165		if (path[depth].p_bh)
166			return path[depth].p_bh->b_blocknr;
167	}
168
169	/* OK. use inode's group */
170	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
171		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
172	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
173
174	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
175		colour = (current->pid % 16) *
176			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
177	else
178		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
179	return bg_start + colour + block;
180}
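/*
 * Heuristic summary (added comment, not in the original source): when
 * the path already holds an extent, the goal simply continues it
 * physically; otherwise the goal falls back to the start of the
 * inode's block group, offset by a per-process "colour" (up to 15/16
 * of a group) plus the requested logical block, which spreads
 * concurrently writing processes apart.
 */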
181
182static ext4_fsblk_t
183ext4_ext_new_block(handle_t *handle, struct inode *inode,
184			struct ext4_ext_path *path,
185			struct ext4_extent *ex, int *err)
186{
187	ext4_fsblk_t goal, newblock;
188
189	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
190	newblock = ext4_new_block(handle, inode, goal, err);
191	return newblock;
192}
193
194static int ext4_ext_space_block(struct inode *inode)
195{
196	int size;
197
198	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
199			/ sizeof(struct ext4_extent);
200#ifdef AGGRESSIVE_TEST
201	if (size > 6)
202		size = 6;
203#endif
204	return size;
205}
206
207static int ext4_ext_space_block_idx(struct inode *inode)
208{
209	int size;
210
211	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
212			/ sizeof(struct ext4_extent_idx);
213#ifdef AGGRESSIVE_TEST
214	if (size > 5)
215		size = 5;
216#endif
217	return size;
218}
219
220static int ext4_ext_space_root(struct inode *inode)
221{
222	int size;
223
224	size = sizeof(EXT4_I(inode)->i_data);
225	size -= sizeof(struct ext4_extent_header);
226	size /= sizeof(struct ext4_extent);
227#ifdef AGGRESSIVE_TEST
228	if (size > 3)
229		size = 3;
230#endif
231	return size;
232}
233
234static int ext4_ext_space_root_idx(struct inode *inode)
235{
236	int size;
237
238	size = sizeof(EXT4_I(inode)->i_data);
239	size -= sizeof(struct ext4_extent_header);
240	size /= sizeof(struct ext4_extent_idx);
241#ifdef AGGRESSIVE_TEST
242	if (size > 4)
243		size = 4;
244#endif
245	return size;
246}
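/*
 * Worked sizes (added comment, not in the original source): with a
 * 4 KiB block size and the on-disk structures all 12 bytes each
 * (header, extent, index), a full block holds (4096 - 12) / 12 = 340
 * entries, while the 60-byte i_data root holds (60 - 12) / 12 = 4
 * entries; the AGGRESSIVE_TEST limits above shrink these for testing.
 */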
247
248static int
249ext4_ext_max_entries(struct inode *inode, int depth)
250{
251	int max;
252
253	if (depth == ext_depth(inode)) {
254		if (depth == 0)
255			max = ext4_ext_space_root(inode);
256		else
257			max = ext4_ext_space_root_idx(inode);
258	} else {
259		if (depth == 0)
260			max = ext4_ext_space_block(inode);
261		else
262			max = ext4_ext_space_block_idx(inode);
263	}
264
265	return max;
266}
267
268static int __ext4_ext_check_header(const char *function, struct inode *inode,
269					struct ext4_extent_header *eh,
270					int depth)
271{
272	const char *error_msg;
273	int max = 0;
274
275	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
276		error_msg = "invalid magic";
277		goto corrupted;
278	}
279	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
280		error_msg = "unexpected eh_depth";
281		goto corrupted;
282	}
283	if (unlikely(eh->eh_max == 0)) {
284		error_msg = "invalid eh_max";
285		goto corrupted;
286	}
287	max = ext4_ext_max_entries(inode, depth);
288	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
289		error_msg = "too large eh_max";
290		goto corrupted;
291	}
292	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
293		error_msg = "invalid eh_entries";
294		goto corrupted;
295	}
296	return 0;
297
298corrupted:
299	ext4_error(inode->i_sb, function,
300			"bad header in inode #%lu: %s - magic %x, "
301			"entries %u, max %u(%u), depth %u(%u)",
302			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
303			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
304			max, le16_to_cpu(eh->eh_depth), depth);
305
306	return -EIO;
307}
308
309#define ext4_ext_check_header(inode, eh, depth)	\
310	__ext4_ext_check_header(__func__, inode, eh, depth)
311
312#ifdef EXT_DEBUG
313static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
314{
315	int k, l = path->p_depth;
316
317	ext_debug("path:");
318	for (k = 0; k <= l; k++, path++) {
319		if (path->p_idx) {
320		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
321			    idx_pblock(path->p_idx));
322		} else if (path->p_ext) {
323			ext_debug("  %d:%d:%llu ",
324				  le32_to_cpu(path->p_ext->ee_block),
325				  ext4_ext_get_actual_len(path->p_ext),
326				  ext_pblock(path->p_ext));
327		} else
328			ext_debug("  []");
329	}
330	ext_debug("\n");
331}
332
333static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
334{
335	int depth = ext_depth(inode);
336	struct ext4_extent_header *eh;
337	struct ext4_extent *ex;
338	int i;
339
340	if (!path)
341		return;
342
343	eh = path[depth].p_hdr;
344	ex = EXT_FIRST_EXTENT(eh);
345
346	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
347		ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
348			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
349	}
350	ext_debug("\n");
351}
352#else
353#define ext4_ext_show_path(inode,path)
354#define ext4_ext_show_leaf(inode,path)
355#endif
356
357void ext4_ext_drop_refs(struct ext4_ext_path *path)
358{
359	int depth = path->p_depth;
360	int i;
361
362	for (i = 0; i <= depth; i++, path++)
363		if (path->p_bh) {
364			brelse(path->p_bh);
365			path->p_bh = NULL;
366		}
367}
368
369/*
370 * ext4_ext_binsearch_idx:
371 * binary search for the closest index of the given block
372 * the header must be checked before calling this
373 */
374static void
375ext4_ext_binsearch_idx(struct inode *inode,
376			struct ext4_ext_path *path, ext4_lblk_t block)
377{
378	struct ext4_extent_header *eh = path->p_hdr;
379	struct ext4_extent_idx *r, *l, *m;
380
381
382	ext_debug("binsearch for %u(idx):  ", block);
383
384	l = EXT_FIRST_INDEX(eh) + 1;
385	r = EXT_LAST_INDEX(eh);
386	while (l <= r) {
387		m = l + (r - l) / 2;
388		if (block < le32_to_cpu(m->ei_block))
389			r = m - 1;
390		else
391			l = m + 1;
392		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
393				m, le32_to_cpu(m->ei_block),
394				r, le32_to_cpu(r->ei_block));
395	}
396
397	path->p_idx = l - 1;
398	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
399		  idx_pblock(path->p_idx));
400
401#ifdef CHECK_BINSEARCH
402	{
403		struct ext4_extent_idx *chix, *ix;
404		int k;
405
406		chix = ix = EXT_FIRST_INDEX(eh);
407		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
408		  if (k != 0 &&
409		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
410				printk("k=%d, ix=0x%p, first=0x%p\n", k,
411					ix, EXT_FIRST_INDEX(eh));
412				printk("%u <= %u\n",
413				       le32_to_cpu(ix->ei_block),
414				       le32_to_cpu(ix[-1].ei_block));
415			}
416			BUG_ON(k && le32_to_cpu(ix->ei_block)
417					   <= le32_to_cpu(ix[-1].ei_block));
418			if (block < le32_to_cpu(ix->ei_block))
419				break;
420			chix = ix;
421		}
422		BUG_ON(chix != path->p_idx);
423	}
424#endif
425
426}
427
428/*
429 * ext4_ext_binsearch:
430 * binary search for closest extent of the given block
431 * the header must be checked before calling this
432 */
433static void
434ext4_ext_binsearch(struct inode *inode,
435		struct ext4_ext_path *path, ext4_lblk_t block)
436{
437	struct ext4_extent_header *eh = path->p_hdr;
438	struct ext4_extent *r, *l, *m;
439
440	if (eh->eh_entries == 0) {
441		/*
442		 * this leaf is empty:
443		 * we get such a leaf in split/add case
444		 */
445		return;
446	}
447
448	ext_debug("binsearch for %u:  ", block);
449
450	l = EXT_FIRST_EXTENT(eh) + 1;
451	r = EXT_LAST_EXTENT(eh);
452
453	while (l <= r) {
454		m = l + (r - l) / 2;
455		if (block < le32_to_cpu(m->ee_block))
456			r = m - 1;
457		else
458			l = m + 1;
459		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
460				m, le32_to_cpu(m->ee_block),
461				r, le32_to_cpu(r->ee_block));
462	}
463
464	path->p_ext = l - 1;
465	ext_debug("  -> %d:%llu:%d ",
466			le32_to_cpu(path->p_ext->ee_block),
467			ext_pblock(path->p_ext),
468			ext4_ext_get_actual_len(path->p_ext));
469
470#ifdef CHECK_BINSEARCH
471	{
472		struct ext4_extent *chex, *ex;
473		int k;
474
475		chex = ex = EXT_FIRST_EXTENT(eh);
476		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
477			BUG_ON(k && le32_to_cpu(ex->ee_block)
478					  <= le32_to_cpu(ex[-1].ee_block));
479			if (block < le32_to_cpu(ex->ee_block))
480				break;
481			chex = ex;
482		}
483		BUG_ON(chex != path->p_ext);
484	}
485#endif
486
487}
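/*
 * Search example (added comment, not in the original source): both
 * binary searches above leave the pointer on the rightmost entry whose
 * starting block is <= the requested block.  For a leaf with extents
 * starting at 0, 100 and 200, a lookup of block 150 sets p_ext to the
 * extent that starts at 100.
 */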
488
489int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
490{
491	struct ext4_extent_header *eh;
492
493	eh = ext_inode_hdr(inode);
494	eh->eh_depth = 0;
495	eh->eh_entries = 0;
496	eh->eh_magic = EXT4_EXT_MAGIC;
497	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
498	ext4_mark_inode_dirty(handle, inode);
499	ext4_ext_invalidate_cache(inode);
500	return 0;
501}
502
503struct ext4_ext_path *
504ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
505					struct ext4_ext_path *path)
506{
507	struct ext4_extent_header *eh;
508	struct buffer_head *bh;
509	short int depth, i, ppos = 0, alloc = 0;
510
511	eh = ext_inode_hdr(inode);
512	depth = ext_depth(inode);
513	if (ext4_ext_check_header(inode, eh, depth))
514		return ERR_PTR(-EIO);
515
516
517	/* account possible depth increase */
518	if (!path) {
519		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
520				GFP_NOFS);
521		if (!path)
522			return ERR_PTR(-ENOMEM);
523		alloc = 1;
524	}
525	path[0].p_hdr = eh;
526	path[0].p_bh = NULL;
527
528	i = depth;
529	/* walk through the tree */
530	while (i) {
531		ext_debug("depth %d: num %d, max %d\n",
532			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
533
534		ext4_ext_binsearch_idx(inode, path + ppos, block);
535		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
536		path[ppos].p_depth = i;
537		path[ppos].p_ext = NULL;
538
539		bh = sb_bread(inode->i_sb, path[ppos].p_block);
540		if (!bh)
541			goto err;
542
543		eh = ext_block_hdr(bh);
544		ppos++;
545		BUG_ON(ppos > depth);
546		path[ppos].p_bh = bh;
547		path[ppos].p_hdr = eh;
548		i--;
549
550		if (ext4_ext_check_header(inode, eh, i))
551			goto err;
552	}
553
554	path[ppos].p_depth = i;
555	path[ppos].p_ext = NULL;
556	path[ppos].p_idx = NULL;
557
558	/* find extent */
559	ext4_ext_binsearch(inode, path + ppos, block);
560	/* if not an empty leaf */
561	if (path[ppos].p_ext)
562		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
563
564	ext4_ext_show_path(inode, path);
565
566	return path;
567
568err:
569	ext4_ext_drop_refs(path);
570	if (alloc)
571		kfree(path);
572	return ERR_PTR(-EIO);
573}
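/*
 * Path layout (added comment, not in the original source): on success
 * path[0] refers to the root header inside the inode and path[depth]
 * to the leaf, so a lookup in a depth-2 tree fills path[0].p_idx,
 * path[1].p_idx and path[2].p_ext.
 */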
574
575/*
576 * ext4_ext_insert_index:
577 * insert new index [@logical;@ptr] into the block at @curp;
578 * check where to insert: before @curp or after @curp
579 */
580static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
581				struct ext4_ext_path *curp,
582				int logical, ext4_fsblk_t ptr)
583{
584	struct ext4_extent_idx *ix;
585	int len, err;
586
587	err = ext4_ext_get_access(handle, inode, curp);
588	if (err)
589		return err;
590
591	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
592	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
593	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
594		/* insert after */
595		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
596			len = (len - 1) * sizeof(struct ext4_extent_idx);
597			len = len < 0 ? 0 : len;
598			ext_debug("insert new index %d after: %llu. "
599					"move %d from 0x%p to 0x%p\n",
600					logical, ptr, len,
601					(curp->p_idx + 1), (curp->p_idx + 2));
602			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
603		}
604		ix = curp->p_idx + 1;
605	} else {
606		/* insert before */
607		len = len * sizeof(struct ext4_extent_idx);
608		len = len < 0 ? 0 : len;
609		ext_debug("insert new index %d before: %llu. "
610				"move %d from 0x%p to 0x%p\n",
611				logical, ptr, len,
612				curp->p_idx, (curp->p_idx + 1));
613		memmove(curp->p_idx + 1, curp->p_idx, len);
614		ix = curp->p_idx;
615	}
616
617	ix->ei_block = cpu_to_le32(logical);
618	ext4_idx_store_pblock(ix, ptr);
619	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
620
621	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
622			     > le16_to_cpu(curp->p_hdr->eh_max));
623	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
624
625	err = ext4_ext_dirty(handle, inode, curp);
626	ext4_std_error(inode->i_sb, err);
627
628	return err;
629}
630
631/*
632 * ext4_ext_split:
633 * inserts new subtree into the path, using free index entry
634 * at depth @at:
635 * - allocates all needed blocks (new leaf and all intermediate index blocks)
636 * - makes decision where to split
637 * - moves remaining extents and index entries (right to the split point)
638 *   into the newly allocated blocks
639 * - initializes subtree
640 */
641static int ext4_ext_split(handle_t *handle, struct inode *inode,
642				struct ext4_ext_path *path,
643				struct ext4_extent *newext, int at)
644{
645	struct buffer_head *bh = NULL;
646	int depth = ext_depth(inode);
647	struct ext4_extent_header *neh;
648	struct ext4_extent_idx *fidx;
649	struct ext4_extent *ex;
650	int i = at, k, m, a;
651	ext4_fsblk_t newblock, oldblock;
652	__le32 border;
653	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
654	int err = 0;
655
656	/* make decision: where to split? */
657	/* FIXME: now decision is simplest: at current extent */
658
659	/* if current leaf will be split, then we should use
660	 * border from split point */
661	BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
662	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
663		border = path[depth].p_ext[1].ee_block;
664		ext_debug("leaf will be split."
665				" next leaf starts at %d\n",
666				  le32_to_cpu(border));
667	} else {
668		border = newext->ee_block;
669		ext_debug("leaf will be added."
670				" next leaf starts at %d\n",
671				le32_to_cpu(border));
672	}
673
674	/*
675	 * If error occurs, then we break processing
676	 * and mark filesystem read-only. index won't
677	 * be inserted and tree will be in consistent
678	 * state. Next mount will repair buffers too.
679	 */
680
681	/*
682	 * Get array to track all allocated blocks.
683	 * We need this to handle errors and to free
684	 * the allocated blocks upon failure.
685	 */
686	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
687	if (!ablocks)
688		return -ENOMEM;
689
690	/* allocate all needed blocks */
691	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
692	for (a = 0; a < depth - at; a++) {
693		newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
694		if (newblock == 0)
695			goto cleanup;
696		ablocks[a] = newblock;
697	}
698
699	/* initialize new leaf */
700	newblock = ablocks[--a];
701	BUG_ON(newblock == 0);
702	bh = sb_getblk(inode->i_sb, newblock);
703	if (!bh) {
704		err = -EIO;
705		goto cleanup;
706	}
707	lock_buffer(bh);
708
709	err = ext4_journal_get_create_access(handle, bh);
710	if (err)
711		goto cleanup;
712
713	neh = ext_block_hdr(bh);
714	neh->eh_entries = 0;
715	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
716	neh->eh_magic = EXT4_EXT_MAGIC;
717	neh->eh_depth = 0;
718	ex = EXT_FIRST_EXTENT(neh);
719
720	/* move remainder of path[depth] to the new leaf */
721	BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
722	/* start copy from next extent */
723	/* TODO: we could do it by single memmove */
724	m = 0;
725	path[depth].p_ext++;
726	while (path[depth].p_ext <=
727			EXT_MAX_EXTENT(path[depth].p_hdr)) {
728		ext_debug("move %d:%llu:%d in new leaf %llu\n",
729				le32_to_cpu(path[depth].p_ext->ee_block),
730				ext_pblock(path[depth].p_ext),
731				ext4_ext_get_actual_len(path[depth].p_ext),
732				newblock);
733		/*memmove(ex++, path[depth].p_ext++,
734				sizeof(struct ext4_extent));
735		neh->eh_entries++;*/
736		path[depth].p_ext++;
737		m++;
738	}
739	if (m) {
740		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
741		le16_add_cpu(&neh->eh_entries, m);
742	}
743
744	set_buffer_uptodate(bh);
745	unlock_buffer(bh);
746
747	err = ext4_journal_dirty_metadata(handle, bh);
748	if (err)
749		goto cleanup;
750	brelse(bh);
751	bh = NULL;
752
753	/* correct old leaf */
754	if (m) {
755		err = ext4_ext_get_access(handle, inode, path + depth);
756		if (err)
757			goto cleanup;
758		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
759		err = ext4_ext_dirty(handle, inode, path + depth);
760		if (err)
761			goto cleanup;
762
763	}
764
765	/* create intermediate indexes */
766	k = depth - at - 1;
767	BUG_ON(k < 0);
768	if (k)
769		ext_debug("create %d intermediate indices\n", k);
770	/* insert new index into current index block */
771	/* current depth stored in i var */
772	i = depth - 1;
773	while (k--) {
774		oldblock = newblock;
775		newblock = ablocks[--a];
776		bh = sb_getblk(inode->i_sb, newblock);
777		if (!bh) {
778			err = -EIO;
779			goto cleanup;
780		}
781		lock_buffer(bh);
782
783		err = ext4_journal_get_create_access(handle, bh);
784		if (err)
785			goto cleanup;
786
787		neh = ext_block_hdr(bh);
788		neh->eh_entries = cpu_to_le16(1);
789		neh->eh_magic = EXT4_EXT_MAGIC;
790		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
791		neh->eh_depth = cpu_to_le16(depth - i);
792		fidx = EXT_FIRST_INDEX(neh);
793		fidx->ei_block = border;
794		ext4_idx_store_pblock(fidx, oldblock);
795
796		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
797				i, newblock, le32_to_cpu(border), oldblock);
798		/* copy indexes */
799		m = 0;
800		path[i].p_idx++;
801
802		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
803				EXT_MAX_INDEX(path[i].p_hdr));
804		BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
805				EXT_LAST_INDEX(path[i].p_hdr));
806		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
807			ext_debug("%d: move %d:%llu in new index %llu\n", i,
808					le32_to_cpu(path[i].p_idx->ei_block),
809					idx_pblock(path[i].p_idx),
810					newblock);
811			/*memmove(++fidx, path[i].p_idx++,
812					sizeof(struct ext4_extent_idx));
813			neh->eh_entries++;
814			BUG_ON(neh->eh_entries > neh->eh_max);*/
815			path[i].p_idx++;
816			m++;
817		}
818		if (m) {
819			memmove(++fidx, path[i].p_idx - m,
820				sizeof(struct ext4_extent_idx) * m);
821			le16_add_cpu(&neh->eh_entries, m);
822		}
823		set_buffer_uptodate(bh);
824		unlock_buffer(bh);
825
826		err = ext4_journal_dirty_metadata(handle, bh);
827		if (err)
828			goto cleanup;
829		brelse(bh);
830		bh = NULL;
831
832		/* correct old index */
833		if (m) {
834			err = ext4_ext_get_access(handle, inode, path + i);
835			if (err)
836				goto cleanup;
837			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
838			err = ext4_ext_dirty(handle, inode, path + i);
839			if (err)
840				goto cleanup;
841		}
842
843		i--;
844	}
845
846	/* insert new index */
847	err = ext4_ext_insert_index(handle, inode, path + at,
848				    le32_to_cpu(border), newblock);
849
850cleanup:
851	if (bh) {
852		if (buffer_locked(bh))
853			unlock_buffer(bh);
854		brelse(bh);
855	}
856
857	if (err) {
858		/* free all allocated blocks in error case */
859		for (i = 0; i < depth; i++) {
860			if (!ablocks[i])
861				continue;
862			ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
863		}
864	}
865	kfree(ablocks);
866
867	return err;
868}
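/*
 * Split accounting (added comment, not in the original source): a
 * split at level 'at' allocates depth - at new blocks in total (one
 * new leaf plus depth - at - 1 intermediate index blocks) and then
 * inserts exactly one new index entry into the existing block at
 * level 'at'.
 */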
869
870/*
871 * ext4_ext_grow_indepth:
872 * implements tree growing procedure:
873 * - allocates new block
874 * - moves top-level data (index block or leaf) into the new block
875 * - initializes new top-level, creating index that points to the
876 *   just created block
877 */
878static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
879					struct ext4_ext_path *path,
880					struct ext4_extent *newext)
881{
882	struct ext4_ext_path *curp = path;
883	struct ext4_extent_header *neh;
884	struct ext4_extent_idx *fidx;
885	struct buffer_head *bh;
886	ext4_fsblk_t newblock;
887	int err = 0;
888
889	newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
890	if (newblock == 0)
891		return err;
892
893	bh = sb_getblk(inode->i_sb, newblock);
894	if (!bh) {
895		err = -EIO;
896		ext4_std_error(inode->i_sb, err);
897		return err;
898	}
899	lock_buffer(bh);
900
901	err = ext4_journal_get_create_access(handle, bh);
902	if (err) {
903		unlock_buffer(bh);
904		goto out;
905	}
906
907	/* move top-level index/leaf into new block */
908	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
909
910	/* set size of new block */
911	neh = ext_block_hdr(bh);
912	/* old root could have indexes or leaves
913	 * so calculate eh_max the right way */
914	if (ext_depth(inode))
915	  neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
916	else
917	  neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
918	neh->eh_magic = EXT4_EXT_MAGIC;
919	set_buffer_uptodate(bh);
920	unlock_buffer(bh);
921
922	err = ext4_journal_dirty_metadata(handle, bh);
923	if (err)
924		goto out;
925
926	/* create index in new top-level index: num,max,pointer */
927	err = ext4_ext_get_access(handle, inode, curp);
928	if (err)
929		goto out;
930
931	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
932	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
933	curp->p_hdr->eh_entries = cpu_to_le16(1);
934	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
935
936	if (path[0].p_hdr->eh_depth)
937		curp->p_idx->ei_block =
938			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
939	else
940		curp->p_idx->ei_block =
941			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
942	ext4_idx_store_pblock(curp->p_idx, newblock);
943
944	neh = ext_inode_hdr(inode);
945	fidx = EXT_FIRST_INDEX(neh);
946	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
947		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
948		  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
949
950	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
951	err = ext4_ext_dirty(handle, inode, curp);
952out:
953	brelse(bh);
954
955	return err;
956}
957
958/*
959 * ext4_ext_create_new_leaf:
960 * finds an empty index and adds a new leaf.
961 * if no free index is found, then it requests growing the tree in depth.
962 */
963static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
964					struct ext4_ext_path *path,
965					struct ext4_extent *newext)
966{
967	struct ext4_ext_path *curp;
968	int depth, i, err = 0;
969
970repeat:
971	i = depth = ext_depth(inode);
972
973	/* walk up the tree and look for a free index entry */
974	curp = path + depth;
975	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
976		i--;
977		curp--;
978	}
979
980	/* we use already allocated block for index block,
981	 * so subsequent data blocks should be contiguous */
982	if (EXT_HAS_FREE_INDEX(curp)) {
983		/* if we found index with free entry, then use that
984		 * entry: create all needed subtree and add new leaf */
985		err = ext4_ext_split(handle, inode, path, newext, i);
986		if (err)
987			goto out;
988
989		/* refill path */
990		ext4_ext_drop_refs(path);
991		path = ext4_ext_find_extent(inode,
992				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
993				    path);
994		if (IS_ERR(path))
995			err = PTR_ERR(path);
996	} else {
997		/* tree is full, time to grow in depth */
998		err = ext4_ext_grow_indepth(handle, inode, path, newext);
999		if (err)
1000			goto out;
1001
1002		/* refill path */
1003		ext4_ext_drop_refs(path);
1004		path = ext4_ext_find_extent(inode,
1005				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1006				    path);
1007		if (IS_ERR(path)) {
1008			err = PTR_ERR(path);
1009			goto out;
1010		}
1011
1012		/*
1013		 * only first (depth 0 -> 1) produces free space;
1014		 * in all other cases we have to split the grown tree
1015		 */
1016		depth = ext_depth(inode);
1017		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1018			/* now we need to split */
1019			goto repeat;
1020		}
1021	}
1022
1023out:
1024	return err;
1025}
1026
1027/*
1028 * search for the closest allocated block to the left of *logical
1029 * and returns it at @logical + its physical address at @phys
1030 * if *logical is the smallest allocated block, the function
1031 * returns 0 at @phys
1032 * return value contains 0 (success) or error code
1033 */
1034int
1035ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1036			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1037{
1038	struct ext4_extent_idx *ix;
1039	struct ext4_extent *ex;
1040	int depth, ee_len;
1041
1042	BUG_ON(path == NULL);
1043	depth = path->p_depth;
1044	*phys = 0;
1045
1046	if (depth == 0 && path->p_ext == NULL)
1047		return 0;
1048
1049	/* usually extent in the path covers blocks smaller
1050	 * than *logical, but it can be that the extent is the
1051	 * first one in the file */
1052
1053	ex = path[depth].p_ext;
1054	ee_len = ext4_ext_get_actual_len(ex);
1055	if (*logical < le32_to_cpu(ex->ee_block)) {
1056		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1057		while (--depth >= 0) {
1058			ix = path[depth].p_idx;
1059			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1060		}
1061		return 0;
1062	}
1063
1064	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1065
1066	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1067	*phys = ext_pblock(ex) + ee_len - 1;
1068	return 0;
1069}
1070
1071/*
1072 * search for the closest allocated block to the right of *logical
1073 * and returns it at @logical + its physical address at @phys
1074 * if *logical is the largest allocated block, the function
1075 * returns 0 at @phys
1076 * return value contains 0 (success) or error code
1077 */
1078int
1079ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1080			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1081{
1082	struct buffer_head *bh = NULL;
1083	struct ext4_extent_header *eh;
1084	struct ext4_extent_idx *ix;
1085	struct ext4_extent *ex;
1086	ext4_fsblk_t block;
1087	int depth, ee_len;
1088
1089	BUG_ON(path == NULL);
1090	depth = path->p_depth;
1091	*phys = 0;
1092
1093	if (depth == 0 && path->p_ext == NULL)
1094		return 0;
1095
1096	/* usually extent in the path covers blocks smaller
1097	 * than *logical, but it can be that the extent is the
1098	 * first one in the file */
1099
1100	ex = path[depth].p_ext;
1101	ee_len = ext4_ext_get_actual_len(ex);
1102	if (*logical < le32_to_cpu(ex->ee_block)) {
1103		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1104		while (--depth >= 0) {
1105			ix = path[depth].p_idx;
1106			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1107		}
1108		*logical = le32_to_cpu(ex->ee_block);
1109		*phys = ext_pblock(ex);
1110		return 0;
1111	}
1112
1113	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1114
1115	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1116		/* next allocated block in this leaf */
1117		ex++;
1118		*logical = le32_to_cpu(ex->ee_block);
1119		*phys = ext_pblock(ex);
1120		return 0;
1121	}
1122
1123	/* go up and search for index to the right */
1124	while (--depth >= 0) {
1125		ix = path[depth].p_idx;
1126		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1127			break;
1128	}
1129
1130	if (depth < 0) {
1131		/* we've gone up to the root and
1132		 * found no index to the right */
1133		return 0;
1134	}
1135
1136	/* we've found index to the right, let's
1137	 * follow it and find the closest allocated
1138	 * block to the right */
1139	ix++;
1140	block = idx_pblock(ix);
1141	while (++depth < path->p_depth) {
1142		bh = sb_bread(inode->i_sb, block);
1143		if (bh == NULL)
1144			return -EIO;
1145		eh = ext_block_hdr(bh);
1146		if (ext4_ext_check_header(inode, eh, depth)) {
1147			put_bh(bh);
1148			return -EIO;
1149		}
1150		ix = EXT_FIRST_INDEX(eh);
1151		block = idx_pblock(ix);
1152		put_bh(bh);
1153	}
1154
1155	bh = sb_bread(inode->i_sb, block);
1156	if (bh == NULL)
1157		return -EIO;
1158	eh = ext_block_hdr(bh);
1159	if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
1160		put_bh(bh);
1161		return -EIO;
1162	}
1163	ex = EXT_FIRST_EXTENT(eh);
1164	*logical = le32_to_cpu(ex->ee_block);
1165	*phys = ext_pblock(ex);
1166	put_bh(bh);
1167	return 0;
1168
1169}
1170
1171/*
1172 * ext4_ext_next_allocated_block:
1173 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1174 * NOTE: it considers block number from index entry as
1175 * allocated block. Thus, index entries have to be consistent
1176 * with leaves.
1177 */
1178static ext4_lblk_t
1179ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1180{
1181	int depth;
1182
1183	BUG_ON(path == NULL);
1184	depth = path->p_depth;
1185
1186	if (depth == 0 && path->p_ext == NULL)
1187		return EXT_MAX_BLOCK;
1188
1189	while (depth >= 0) {
1190		if (depth == path->p_depth) {
1191			/* leaf */
1192			if (path[depth].p_ext !=
1193					EXT_LAST_EXTENT(path[depth].p_hdr))
1194			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1195		} else {
1196			/* index */
1197			if (path[depth].p_idx !=
1198					EXT_LAST_INDEX(path[depth].p_hdr))
1199			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1200		}
1201		depth--;
1202	}
1203
1204	return EXT_MAX_BLOCK;
1205}
1206
1207/*
1208 * ext4_ext_next_leaf_block:
1209 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1210 */
1211static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1212					struct ext4_ext_path *path)
1213{
1214	int depth;
1215
1216	BUG_ON(path == NULL);
1217	depth = path->p_depth;
1218
1219	/* zero-tree has no leaf blocks at all */
1220	if (depth == 0)
1221		return EXT_MAX_BLOCK;
1222
1223	/* go to index block */
1224	depth--;
1225
1226	while (depth >= 0) {
1227		if (path[depth].p_idx !=
1228				EXT_LAST_INDEX(path[depth].p_hdr))
1229			return (ext4_lblk_t)
1230				le32_to_cpu(path[depth].p_idx[1].ei_block);
1231		depth--;
1232	}
1233
1234	return EXT_MAX_BLOCK;
1235}
1236
1237/*
1238 * ext4_ext_correct_indexes:
1239 * if leaf gets modified and modified extent is first in the leaf,
1240 * then we have to correct all indexes above.
1241 * TODO: do we need to correct tree in all cases?
1242 */
1243static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1244				struct ext4_ext_path *path)
1245{
1246	struct ext4_extent_header *eh;
1247	int depth = ext_depth(inode);
1248	struct ext4_extent *ex;
1249	__le32 border;
1250	int k, err = 0;
1251
1252	eh = path[depth].p_hdr;
1253	ex = path[depth].p_ext;
1254	BUG_ON(ex == NULL);
1255	BUG_ON(eh == NULL);
1256
1257	if (depth == 0) {
1258		/* there is no tree at all */
1259		return 0;
1260	}
1261
1262	if (ex != EXT_FIRST_EXTENT(eh)) {
1263		/* we correct tree if first leaf got modified only */
1264		return 0;
1265	}
1266
1267	/*
1268	 * TODO: we need correction if border is smaller than current one
1269	 */
1270	k = depth - 1;
1271	border = path[depth].p_ext->ee_block;
1272	err = ext4_ext_get_access(handle, inode, path + k);
1273	if (err)
1274		return err;
1275	path[k].p_idx->ei_block = border;
1276	err = ext4_ext_dirty(handle, inode, path + k);
1277	if (err)
1278		return err;
1279
1280	while (k--) {
1281		/* change all left-side indexes */
1282		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1283			break;
1284		err = ext4_ext_get_access(handle, inode, path + k);
1285		if (err)
1286			break;
1287		path[k].p_idx->ei_block = border;
1288		err = ext4_ext_dirty(handle, inode, path + k);
1289		if (err)
1290			break;
1291	}
1292
1293	return err;
1294}
1295
1296static int
1297ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1298				struct ext4_extent *ex2)
1299{
1300	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1301
1302	/*
1303	 * Make sure that either both extents are uninitialized, or
1304	 * both are _not_.
1305	 */
1306	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1307		return 0;
1308
1309	if (ext4_ext_is_uninitialized(ex1))
1310		max_len = EXT_UNINIT_MAX_LEN;
1311	else
1312		max_len = EXT_INIT_MAX_LEN;
1313
1314	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1315	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1316
1317	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1318			le32_to_cpu(ex2->ee_block))
1319		return 0;
1320
1321	/*
1322	 * To allow future support for preallocated extents to be added
1323 * as an RO_COMPAT feature, refuse to merge two extents if
1324	 * this can result in the top bit of ee_len being set.
1325	 */
1326	if (ext1_ee_len + ext2_ee_len > max_len)
1327		return 0;
1328#ifdef AGGRESSIVE_TEST
1329	if (ext1_ee_len >= 4)
1330		return 0;
1331#endif
1332
1333	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1334		return 1;
1335	return 0;
1336}
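/*
 * Merge example (added comment, not in the original source): an
 * initialized extent [ee_block 100, len 8, pblk 500] can be merged
 * with [ee_block 108, len 4, pblk 508]: the logical and physical
 * ranges are contiguous, neither extent is uninitialized, and
 * 8 + 4 stays below EXT_INIT_MAX_LEN (assuming AGGRESSIVE_TEST is
 * not defined).
 */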
1337
1338/*
1339 * This function tries to merge the "ex" extent to the next extent in the tree.
1340 * It always tries to merge towards right. If you want to merge towards
1341 * left, pass "ex - 1" as argument instead of "ex".
1342 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1343 * 1 if they got merged.
1344 */
1345int ext4_ext_try_to_merge(struct inode *inode,
1346			  struct ext4_ext_path *path,
1347			  struct ext4_extent *ex)
1348{
1349	struct ext4_extent_header *eh;
1350	unsigned int depth, len;
1351	int merge_done = 0;
1352	int uninitialized = 0;
1353
1354	depth = ext_depth(inode);
1355	BUG_ON(path[depth].p_hdr == NULL);
1356	eh = path[depth].p_hdr;
1357
1358	while (ex < EXT_LAST_EXTENT(eh)) {
1359		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1360			break;
1361		/* merge with next extent! */
1362		if (ext4_ext_is_uninitialized(ex))
1363			uninitialized = 1;
1364		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1365				+ ext4_ext_get_actual_len(ex + 1));
1366		if (uninitialized)
1367			ext4_ext_mark_uninitialized(ex);
1368
1369		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1370			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1371				* sizeof(struct ext4_extent);
1372			memmove(ex + 1, ex + 2, len);
1373		}
1374		le16_add_cpu(&eh->eh_entries, -1);
1375		merge_done = 1;
1376		WARN_ON(eh->eh_entries == 0);
1377		if (!eh->eh_entries)
1378			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
1379			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
1380	}
1381
1382	return merge_done;
1383}
1384
1385/*
1386 * check if a portion of the "newext" extent overlaps with an
1387 * existing extent.
1388 *
1389 * If there is an overlap discovered, it updates the length of the newext
1390 * such that there will be no overlap, and then returns 1.
1391 * If there is no overlap found, it returns 0.
1392 */
1393unsigned int ext4_ext_check_overlap(struct inode *inode,
1394				    struct ext4_extent *newext,
1395				    struct ext4_ext_path *path)
1396{
1397	ext4_lblk_t b1, b2;
1398	unsigned int depth, len1;
1399	unsigned int ret = 0;
1400
1401	b1 = le32_to_cpu(newext->ee_block);
1402	len1 = ext4_ext_get_actual_len(newext);
1403	depth = ext_depth(inode);
1404	if (!path[depth].p_ext)
1405		goto out;
1406	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1407
1408	/*
1409	 * get the next allocated block if the extent in the path
1410	 * is before the requested block(s)
1411	 */
1412	if (b2 < b1) {
1413		b2 = ext4_ext_next_allocated_block(path);
1414		if (b2 == EXT_MAX_BLOCK)
1415			goto out;
1416	}
1417
1418	/* check for wrap through zero on extent logical start block*/
1419	if (b1 + len1 < b1) {
1420		len1 = EXT_MAX_BLOCK - b1;
1421		newext->ee_len = cpu_to_le16(len1);
1422		ret = 1;
1423	}
1424
1425	/* check for overlap */
1426	if (b1 + len1 > b2) {
1427		newext->ee_len = cpu_to_le16(b2 - b1);
1428		ret = 1;
1429	}
1430out:
1431	return ret;
1432}
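/*
 * Overlap example (added comment, not in the original source): if
 * newext covers logical blocks 100..149 (len 50) and the next
 * allocated block found from the path is b2 = 120, ee_len is trimmed
 * to 120 - 100 = 20 and the function returns 1.
 */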
1433
1434/*
1435 * ext4_ext_insert_extent:
1436 * tries to merge the requested extent into the existing extent or
1437 * inserts requested extent as new one into the tree,
1438 * creating new leaf in the no-space case.
1439 */
1440int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1441				struct ext4_ext_path *path,
1442				struct ext4_extent *newext)
1443{
1444	struct ext4_extent_header * eh;
1445	struct ext4_extent *ex, *fex;
1446	struct ext4_extent *nearex; /* nearest extent */
1447	struct ext4_ext_path *npath = NULL;
1448	int depth, len, err;
1449	ext4_lblk_t next;
1450	unsigned uninitialized = 0;
1451
1452	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
1453	depth = ext_depth(inode);
1454	ex = path[depth].p_ext;
1455	BUG_ON(path[depth].p_hdr == NULL);
1456
1457	/* try to insert block into found extent and return */
1458	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1459		ext_debug("append %d block to %d:%d (from %llu)\n",
1460				ext4_ext_get_actual_len(newext),
1461				le32_to_cpu(ex->ee_block),
1462				ext4_ext_get_actual_len(ex), ext_pblock(ex));
1463		err = ext4_ext_get_access(handle, inode, path + depth);
1464		if (err)
1465			return err;
1466
1467		/*
1468		 * ext4_can_extents_be_merged should have checked that either
1469		 * both extents are uninitialized, or both aren't. Thus we
1470		 * need to check only one of them here.
1471		 */
1472		if (ext4_ext_is_uninitialized(ex))
1473			uninitialized = 1;
1474		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1475					+ ext4_ext_get_actual_len(newext));
1476		if (uninitialized)
1477			ext4_ext_mark_uninitialized(ex);
1478		eh = path[depth].p_hdr;
1479		nearex = ex;
1480		goto merge;
1481	}
1482
1483repeat:
1484	depth = ext_depth(inode);
1485	eh = path[depth].p_hdr;
1486	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1487		goto has_space;
1488
1489	/* probably next leaf has space for us? */
1490	fex = EXT_LAST_EXTENT(eh);
1491	next = ext4_ext_next_leaf_block(inode, path);
1492	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1493	    && next != EXT_MAX_BLOCK) {
1494		ext_debug("next leaf block - %d\n", next);
1495		BUG_ON(npath != NULL);
1496		npath = ext4_ext_find_extent(inode, next, NULL);
1497		if (IS_ERR(npath))
1498			return PTR_ERR(npath);
1499		BUG_ON(npath->p_depth != path->p_depth);
1500		eh = npath[depth].p_hdr;
1501		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1502			ext_debug("next leaf isnt full(%d)\n",
1503				  le16_to_cpu(eh->eh_entries));
1504			path = npath;
1505			goto repeat;
1506		}
1507		ext_debug("next leaf has no free space(%d,%d)\n",
1508			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1509	}
1510
1511	/*
1512	 * There is no free space in the found leaf.
1513	 * We're gonna add a new leaf in the tree.
1514	 */
1515	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1516	if (err)
1517		goto cleanup;
1518	depth = ext_depth(inode);
1519	eh = path[depth].p_hdr;
1520
1521has_space:
1522	nearex = path[depth].p_ext;
1523
1524	err = ext4_ext_get_access(handle, inode, path + depth);
1525	if (err)
1526		goto cleanup;
1527
1528	if (!nearex) {
1529		/* there is no extent in this leaf, create first one */
1530		ext_debug("first extent in the leaf: %d:%llu:%d\n",
1531				le32_to_cpu(newext->ee_block),
1532				ext_pblock(newext),
1533				ext4_ext_get_actual_len(newext));
1534		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1535	} else if (le32_to_cpu(newext->ee_block)
1536			   > le32_to_cpu(nearex->ee_block)) {
1537/*		BUG_ON(newext->ee_block == nearex->ee_block); */
1538		if (nearex != EXT_LAST_EXTENT(eh)) {
1539			len = EXT_MAX_EXTENT(eh) - nearex;
1540			len = (len - 1) * sizeof(struct ext4_extent);
1541			len = len < 0 ? 0 : len;
1542			ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
1543					"move %d from 0x%p to 0x%p\n",
1544					le32_to_cpu(newext->ee_block),
1545					ext_pblock(newext),
1546					ext4_ext_get_actual_len(newext),
1547					nearex, len, nearex + 1, nearex + 2);
1548			memmove(nearex + 2, nearex + 1, len);
1549		}
1550		path[depth].p_ext = nearex + 1;
1551	} else {
1552		BUG_ON(newext->ee_block == nearex->ee_block);
1553		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1554		len = len < 0 ? 0 : len;
1555		ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
1556				"move %d from 0x%p to 0x%p\n",
1557				le32_to_cpu(newext->ee_block),
1558				ext_pblock(newext),
1559				ext4_ext_get_actual_len(newext),
1560				nearex, len, nearex + 1, nearex + 2);
1561		memmove(nearex + 1, nearex, len);
1562		path[depth].p_ext = nearex;
1563	}
1564
1565	le16_add_cpu(&eh->eh_entries, 1);
1566	nearex = path[depth].p_ext;
1567	nearex->ee_block = newext->ee_block;
1568	ext4_ext_store_pblock(nearex, ext_pblock(newext));
1569	nearex->ee_len = newext->ee_len;
1570
1571merge:
1572	/* try to merge extents to the right */
1573	ext4_ext_try_to_merge(inode, path, nearex);
1574
1575	/* try to merge extents to the left */
1576
1577	/* time to correct all indexes above */
1578	err = ext4_ext_correct_indexes(handle, inode, path);
1579	if (err)
1580		goto cleanup;
1581
1582	err = ext4_ext_dirty(handle, inode, path + depth);
1583
1584cleanup:
1585	if (npath) {
1586		ext4_ext_drop_refs(npath);
1587		kfree(npath);
1588	}
1589	ext4_ext_tree_changed(inode);
1590	ext4_ext_invalidate_cache(inode);
1591	return err;
1592}
1593
1594static void
1595ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1596			__u32 len, ext4_fsblk_t start, int type)
1597{
1598	struct ext4_ext_cache *cex;
1599	BUG_ON(len == 0);
1600	cex = &EXT4_I(inode)->i_cached_extent;
1601	cex->ec_type = type;
1602	cex->ec_block = block;
1603	cex->ec_len = len;
1604	cex->ec_start = start;
1605}
1606
1607/*
1608 * ext4_ext_put_gap_in_cache:
1609 * calculate boundaries of the gap that the requested block fits into
1610 * and cache this gap
1611 */
1612static void
1613ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1614				ext4_lblk_t block)
1615{
1616	int depth = ext_depth(inode);
1617	unsigned long len;
1618	ext4_lblk_t lblock;
1619	struct ext4_extent *ex;
1620
1621	ex = path[depth].p_ext;
1622	if (ex == NULL) {
1623		/* there is no extent yet, so gap is [0;-] */
1624		lblock = 0;
1625		len = EXT_MAX_BLOCK;
1626		ext_debug("cache gap(whole file):");
1627	} else if (block < le32_to_cpu(ex->ee_block)) {
1628		lblock = block;
1629		len = le32_to_cpu(ex->ee_block) - block;
1630		ext_debug("cache gap(before): %u [%u:%u]",
1631				block,
1632				le32_to_cpu(ex->ee_block),
1633				 ext4_ext_get_actual_len(ex));
1634	} else if (block >= le32_to_cpu(ex->ee_block)
1635			+ ext4_ext_get_actual_len(ex)) {
1636		ext4_lblk_t next;
1637		lblock = le32_to_cpu(ex->ee_block)
1638			+ ext4_ext_get_actual_len(ex);
1639
1640		next = ext4_ext_next_allocated_block(path);
1641		ext_debug("cache gap(after): [%u:%u] %u",
1642				le32_to_cpu(ex->ee_block),
1643				ext4_ext_get_actual_len(ex),
1644				block);
1645		BUG_ON(next == lblock);
1646		len = next - lblock;
1647	} else {
1648		lblock = len = 0;
1649		BUG();
1650	}
1651
1652	ext_debug(" -> %u:%lu\n", lblock, len);
1653	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1654}
1655
1656static int
1657ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1658			struct ext4_extent *ex)
1659{
1660	struct ext4_ext_cache *cex;
1661
1662	cex = &EXT4_I(inode)->i_cached_extent;
1663
1664	/* has cache valid data? */
1665	if (cex->ec_type == EXT4_EXT_CACHE_NO)
1666		return EXT4_EXT_CACHE_NO;
1667
1668	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1669			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1670	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1671		ex->ee_block = cpu_to_le32(cex->ec_block);
1672		ext4_ext_store_pblock(ex, cex->ec_start);
1673		ex->ee_len = cpu_to_le16(cex->ec_len);
1674		ext_debug("%u cached by %u:%u:%llu\n",
1675				block,
1676				cex->ec_block, cex->ec_len, cex->ec_start);
1677		return cex->ec_type;
1678	}
1679
1680	/* not in cache */
1681	return EXT4_EXT_CACHE_NO;
1682}
1683
1684/*
1685 * ext4_ext_rm_idx:
1686 * removes index from the index block.
1687 * It's used in truncate case only, thus all requests are for
1688 * last index in the block only.
1689 */
1690static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1691			struct ext4_ext_path *path)
1692{
1693	struct buffer_head *bh;
1694	int err;
1695	ext4_fsblk_t leaf;
1696
1697	/* free index block */
1698	path--;
1699	leaf = idx_pblock(path->p_idx);
1700	BUG_ON(path->p_hdr->eh_entries == 0);
1701	err = ext4_ext_get_access(handle, inode, path);
1702	if (err)
1703		return err;
1704	le16_add_cpu(&path->p_hdr->eh_entries, -1);
1705	err = ext4_ext_dirty(handle, inode, path);
1706	if (err)
1707		return err;
1708	ext_debug("index is empty, remove it, free block %llu\n", leaf);
1709	bh = sb_find_get_block(inode->i_sb, leaf);
1710	ext4_forget(handle, 1, inode, bh, leaf);
1711	ext4_free_blocks(handle, inode, leaf, 1, 1);
1712	return err;
1713}
1714
1715/*
1716 * ext4_ext_calc_credits_for_insert:
1717 * This routine returns max. credits that the extent tree can consume.
1718 * It should be OK for low-performance paths like ->writepage()
1719 * To allow many writing processes to fit into a single transaction,
1720 * the caller should calculate credits under i_data_sem and
1721 * pass the actual path.
1722 */
1723int ext4_ext_calc_credits_for_insert(struct inode *inode,
1724						struct ext4_ext_path *path)
1725{
1726	int depth, needed;
1727
1728	if (path) {
1729		/* probably there is space in leaf? */
1730		depth = ext_depth(inode);
1731		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1732				< le16_to_cpu(path[depth].p_hdr->eh_max))
1733			return 1;
1734	}
1735
1736	/*
1737	 * given 32-bit logical block (4294967296 blocks), max. tree
1738	 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1739	 * Let's also add one more level for imbalance.
1740	 */
1741	depth = 5;
1742
1743	/* allocation of new data block(s) */
1744	needed = 2;
1745
1746	/*
1747	 * tree can be full, so it would need to grow in depth:
1748	 * we need one credit to modify old root, credits for
1749	 * new root will be added in split accounting
1750	 */
1751	needed += 1;
1752
1753	/*
1754	 * Index split can happen, we would need:
1755	 *    allocate intermediate indexes (bitmap + group)
1756	 *  + change two blocks at each level, but root (already included)
1757	 */
1758	needed += (depth * 2) + (depth * 2);
1759
1760	/* any allocation modifies superblock */
1761	needed += 1;
1762
1763	return needed;
1764}
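/*
 * Worked total (added comment, not in the original source): on the
 * pessimistic path above, with depth taken as 5, the function returns
 * 2 + 1 + (5 * 2 + 5 * 2) + 1 = 24 credits whenever no path is passed
 * in or the leaf found by the path is already full.
 */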
1765
1766static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1767				struct ext4_extent *ex,
1768				ext4_lblk_t from, ext4_lblk_t to)
1769{
1770	struct buffer_head *bh;
1771	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
1772	int i, metadata = 0;
1773
1774	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1775		metadata = 1;
1776#ifdef EXTENTS_STATS
1777	{
1778		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1779		spin_lock(&sbi->s_ext_stats_lock);
1780		sbi->s_ext_blocks += ee_len;
1781		sbi->s_ext_extents++;
1782		if (ee_len < sbi->s_ext_min)
1783			sbi->s_ext_min = ee_len;
1784		if (ee_len > sbi->s_ext_max)
1785			sbi->s_ext_max = ee_len;
1786		if (ext_depth(inode) > sbi->s_depth_max)
1787			sbi->s_depth_max = ext_depth(inode);
1788		spin_unlock(&sbi->s_ext_stats_lock);
1789	}
1790#endif
1791	if (from >= le32_to_cpu(ex->ee_block)
1792	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
1793		/* tail removal */
1794		ext4_lblk_t num;
1795		ext4_fsblk_t start;
1796
1797		num = le32_to_cpu(ex->ee_block) + ee_len - from;
1798		start = ext_pblock(ex) + ee_len - num;
1799		ext_debug("free last %u blocks starting %llu\n", num, start);
1800		for (i = 0; i < num; i++) {
1801			bh = sb_find_get_block(inode->i_sb, start + i);
1802			ext4_forget(handle, 0, inode, bh, start + i);
1803		}
1804		ext4_free_blocks(handle, inode, start, num, metadata);
1805	} else if (from == le32_to_cpu(ex->ee_block)
1806		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
1807		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
1808			from, to, le32_to_cpu(ex->ee_block), ee_len);
1809	} else {
1810		printk(KERN_INFO "strange request: removal(2) "
1811				"%u-%u from %u:%u\n",
1812				from, to, le32_to_cpu(ex->ee_block), ee_len);
1813	}
1814	return 0;
1815}
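/*
 * Tail-removal example (added comment, not in the original source):
 * for an extent [ee_block 100, len 10, pblk 500] truncated with
 * from = 104, to = 109, num = 100 + 10 - 104 = 6 and
 * start = 500 + 10 - 6 = 504, so physical blocks 504..509 are freed.
 */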
1816
1817static int
1818ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1819		struct ext4_ext_path *path, ext4_lblk_t start)
1820{
1821	int err = 0, correct_index = 0;
1822	int depth = ext_depth(inode), credits;
1823	struct ext4_extent_header *eh;
1824	ext4_lblk_t a, b, block;
1825	unsigned num;
1826	ext4_lblk_t ex_ee_block;
1827	unsigned short ex_ee_len;
1828	unsigned uninitialized = 0;
1829	struct ext4_extent *ex;
1830
1831	/* the header must be checked already in ext4_ext_remove_space() */
1832	ext_debug("truncate since %u in leaf\n", start);
1833	if (!path[depth].p_hdr)
1834		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1835	eh = path[depth].p_hdr;
1836	BUG_ON(eh == NULL);
1837
1838	/* find where to start removing */
1839	ex = EXT_LAST_EXTENT(eh);
1840
1841	ex_ee_block = le32_to_cpu(ex->ee_block);
1842	if (ext4_ext_is_uninitialized(ex))
1843		uninitialized = 1;
1844	ex_ee_len = ext4_ext_get_actual_len(ex);
1845
1846	while (ex >= EXT_FIRST_EXTENT(eh) &&
1847			ex_ee_block + ex_ee_len > start) {
1848		ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
1849		path[depth].p_ext = ex;
1850
1851		a = ex_ee_block > start ? ex_ee_block : start;
1852		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
1853			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
1854
1855		ext_debug("  border %u:%u\n", a, b);
1856
1857		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
1858			block = 0;
1859			num = 0;
1860			BUG();
1861		} else if (a != ex_ee_block) {
1862			/* remove tail of the extent */
1863			block = ex_ee_block;
1864			num = a - block;
1865		} else if (b != ex_ee_block + ex_ee_len - 1) {
1866			/* remove head of the extent */
1867			block = a;
1868			num = b - a;
1869			/* there is no "make a hole" API yet */
1870			BUG();
1871		} else {
1872			/* remove whole extent: excellent! */
1873			block = ex_ee_block;
1874			num = 0;
1875			BUG_ON(a != ex_ee_block);
1876			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1877		}
1878
1879		/* at present, extent can't cross block group: */
1880		/* leaf + bitmap + group desc + sb + inode */
1881		credits = 5;
1882		if (ex == EXT_FIRST_EXTENT(eh)) {
1883			correct_index = 1;
1884			credits += (ext_depth(inode)) + 1;
1885		}
1886#ifdef CONFIG_QUOTA
1887		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1888#endif
1889
1890		err = ext4_ext_journal_restart(handle, credits);
1891		if (err)
1892			goto out;
1893
1894		err = ext4_ext_get_access(handle, inode, path + depth);
1895		if (err)
1896			goto out;
1897
1898		err = ext4_remove_blocks(handle, inode, ex, a, b);
1899		if (err)
1900			goto out;
1901
1902		if (num == 0) {
1903			/* this extent is removed; mark slot entirely unused */
1904			ext4_ext_store_pblock(ex, 0);
1905			le16_add_cpu(&eh->eh_entries, -1);
1906		}
1907
1908		ex->ee_block = cpu_to_le32(block);
1909		ex->ee_len = cpu_to_le16(num);
1910		/*
1911		 * Do not mark uninitialized if all the blocks in the
1912		 * extent have been removed.
1913		 */
1914		if (uninitialized && num)
1915			ext4_ext_mark_uninitialized(ex);
1916
1917		err = ext4_ext_dirty(handle, inode, path + depth);
1918		if (err)
1919			goto out;
1920
1921		ext_debug("new extent: %u:%u:%llu\n", block, num,
1922				ext_pblock(ex));
1923		ex--;
1924		ex_ee_block = le32_to_cpu(ex->ee_block);
1925		ex_ee_len = ext4_ext_get_actual_len(ex);
1926	}
1927
1928	if (correct_index && eh->eh_entries)
1929		err = ext4_ext_correct_indexes(handle, inode, path);
1930
1931	/* if this leaf is free, then we should
1932	 * remove it from index block above */
1933	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
1934		err = ext4_ext_rm_idx(handle, inode, path + depth);
1935
1936out:
1937	return err;
1938}
1939
1940/*
1941 * ext4_ext_more_to_rm:
1942 * returns 1 if current index has to be freed (even partial)
1943 */
1944static int
1945ext4_ext_more_to_rm(struct ext4_ext_path *path)
1946{
1947	BUG_ON(path->p_idx == NULL);
1948
1949	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
1950		return 0;
1951
1952	/*
1953	 * if truncate on deeper level happened, it wasn't partial,
1954	 * so we have to consider current index for truncation
1955	 */
1956	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
1957		return 0;
1958	return 1;
1959}
1960
1961static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
1962{
1963	struct super_block *sb = inode->i_sb;
1964	int depth = ext_depth(inode);
1965	struct ext4_ext_path *path;
1966	handle_t *handle;
1967	int i = 0, err = 0;
1968
1969	ext_debug("truncate since %u\n", start);
1970
1971	/* probably first extent we're gonna free will be last in block */
1972	handle = ext4_journal_start(inode, depth + 1);
1973	if (IS_ERR(handle))
1974		return PTR_ERR(handle);
1975
1976	ext4_ext_invalidate_cache(inode);
1977
1978	/*
1979	 * We start scanning from right side, freeing all the blocks
1980	 * after i_size and walking into the tree depth-wise.
1981	 */
1982	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
1983	if (path == NULL) {
1984		ext4_journal_stop(handle);
1985		return -ENOMEM;
1986	}
1987	path[0].p_hdr = ext_inode_hdr(inode);
1988	if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
1989		err = -EIO;
1990		goto out;
1991	}
1992	path[0].p_depth = depth;
1993
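	/*
	 * The walk below is iterative: i is the current level (0 is the
	 * root, depth is the leaf level).  At each index level the entries
	 * are visited right to left, descending into each child block;
	 * leaves are emptied by ext4_ext_rm_leaf() and index blocks that
	 * become empty are removed on the way back up.
	 */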
1994	while (i >= 0 && err == 0) {
1995		if (i == depth) {
1996			/* this is leaf block */
1997			err = ext4_ext_rm_leaf(handle, inode, path, start);
1998			/* root level has p_bh == NULL, brelse() eats this */
1999			brelse(path[i].p_bh);
2000			path[i].p_bh = NULL;
2001			i--;
2002			continue;
2003		}
2004
2005		/* this is index block */
2006		if (!path[i].p_hdr) {
2007			ext_debug("initialize header\n");
2008			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2009		}
2010
2011		if (!path[i].p_idx) {
2012			/* this level hasn't been touched yet */
2013			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2014			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2015			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2016				  path[i].p_hdr,
2017				  le16_to_cpu(path[i].p_hdr->eh_entries));
2018		} else {
2019			/* we were already here, move on to the next index */
2020			path[i].p_idx--;
2021		}
2022
2023		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2024				i, EXT_FIRST_INDEX(path[i].p_hdr),
2025				path[i].p_idx);
2026		if (ext4_ext_more_to_rm(path + i)) {
2027			struct buffer_head *bh;
2028			/* go to the next level */
2029			ext_debug("move to level %d (block %llu)\n",
2030				  i + 1, idx_pblock(path[i].p_idx));
2031			memset(path + i + 1, 0, sizeof(*path));
2032			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
2033			if (!bh) {
2034				/* should we reset i_size? */
2035				err = -EIO;
2036				break;
2037			}
2038			if (WARN_ON(i + 1 > depth)) {
2039				err = -EIO;
2040				break;
2041			}
2042			if (ext4_ext_check_header(inode, ext_block_hdr(bh),
2043							depth - i - 1)) {
2044				err = -EIO;
2045				break;
2046			}
2047			path[i + 1].p_bh = bh;
2048
2049			/* save the actual number of index entries since this
2050			 * number can change during the next iteration */
2051			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2052			i++;
2053		} else {
2054			/* we finished processing this index, go up */
2055			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2056				/* index is empty, remove it;
2057				 * the handle has already been prepared by the
2058				 * leaf removal (ext4_ext_rm_leaf()) */
2059				err = ext4_ext_rm_idx(handle, inode, path + i);
2060			}
2061			/* root level has p_bh == NULL, brelse() eats this */
2062			brelse(path[i].p_bh);
2063			path[i].p_bh = NULL;
2064			i--;
2065			ext_debug("return to level %d\n", i);
2066		}
2067	}
2068
2069	/* TODO: flexible tree reduction should be here */
2070	if (path->p_hdr->eh_entries == 0) {
2071		/*
2072		 * truncate to zero freed all the tree,
2073		 * so we need to correct eh_depth
2074		 */
2075		err = ext4_ext_get_access(handle, inode, path);
2076		if (err == 0) {
2077			ext_inode_hdr(inode)->eh_depth = 0;
2078			ext_inode_hdr(inode)->eh_max =
2079				cpu_to_le16(ext4_ext_space_root(inode));
2080			err = ext4_ext_dirty(handle, inode, path);
2081		}
2082	}
2083out:
2084	ext4_ext_tree_changed(inode);
2085	ext4_ext_drop_refs(path);
2086	kfree(path);
2087	ext4_journal_stop(handle);
2088
2089	return err;
2090}
2091
2092/*
2093 * called at mount time
2094 */
2095void ext4_ext_init(struct super_block *sb)
2096{
2097	/*
2098	 * possible initialization would be here
2099	 */
2100
2101	if (test_opt(sb, EXTENTS)) {
2102		printk(KERN_INFO "EXT4-fs: file extents enabled");
2103#ifdef AGGRESSIVE_TEST
2104		printk(", aggressive tests");
2105#endif
2106#ifdef CHECK_BINSEARCH
2107		printk(", check binsearch");
2108#endif
2109#ifdef EXTENTS_STATS
2110		printk(", stats");
2111#endif
2112		printk("\n");
2113#ifdef EXTENTS_STATS
2114		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2115		EXT4_SB(sb)->s_ext_min = 1 << 30;
2116		EXT4_SB(sb)->s_ext_max = 0;
2117#endif
2118	}
2119}
2120
2121/*
2122 * called at umount time
2123 */
2124void ext4_ext_release(struct super_block *sb)
2125{
2126	if (!test_opt(sb, EXTENTS))
2127		return;
2128
2129#ifdef EXTENTS_STATS
2130	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2131		struct ext4_sb_info *sbi = EXT4_SB(sb);
2132		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2133			sbi->s_ext_blocks, sbi->s_ext_extents,
2134			sbi->s_ext_blocks / sbi->s_ext_extents);
2135		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2136			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2137	}
2138#endif
2139}
2140
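/* completion callback used by ext4_ext_zeroout() to wait for its bios */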
2141static void bi_complete(struct bio *bio, int error)
2142{
2143	complete((struct completion *)bio->bi_private);
2144}
2145
2146/* FIXME!! we need to try to merge to left or right after zero-out  */
2147static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2148{
2149	int ret = -EIO;
2150	struct bio *bio;
2151	int blkbits, blocksize;
2152	sector_t ee_pblock;
2153	struct completion event;
2154	unsigned int ee_len, len, done, offset;
2155
2156
2157	blkbits   = inode->i_blkbits;
2158	blocksize = inode->i_sb->s_blocksize;
2159	ee_len    = ext4_ext_get_actual_len(ex);
2160	ee_pblock = ext_pblock(ex);
2161
2162	/* convert ee_pblock to 512 byte sectors */
2163	ee_pblock = ee_pblock << (blkbits - 9);
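	/* e.g. with 4K blocks (blkbits == 12) this shifts by 3: 8 sectors per block */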
2164
2165	while (ee_len > 0) {
2166
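		/* zero out at most BIO_MAX_PAGES blocks per submitted bio */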
2167		if (ee_len > BIO_MAX_PAGES)
2168			len = BIO_MAX_PAGES;
2169		else
2170			len = ee_len;
2171
2172		bio = bio_alloc(GFP_NOIO, len);
2173		if (!bio)
2174			return -ENOMEM;
2175		bio->bi_sector = ee_pblock;
2176		bio->bi_bdev   = inode->i_sb->s_bdev;
2177
2178		done = 0;
2179		offset = 0;
2180		while (done < len) {
2181			ret = bio_add_page(bio, ZERO_PAGE(0),
2182							blocksize, offset);
2183			if (ret != blocksize) {
2184				/*
2185				 * We can't add any more pages because of
2186				 * hardware limitations.  Start a new bio.
2187				 */
2188				break;
2189			}
2190			done++;
2191			offset += blocksize;
2192			if (offset >= PAGE_CACHE_SIZE)
2193				offset = 0;
2194		}
2195
2196		init_completion(&event);
2197		bio->bi_private = &event;
2198		bio->bi_end_io = bi_complete;
2199		submit_bio(WRITE, bio);
2200		wait_for_completion(&event);
2201
2202		if (test_bit(BIO_UPTODATE, &bio->bi_flags))
2203			ret = 0;
2204		else {
2205			ret = -EIO;
2206			break;
2207		}
2208		bio_put(bio);
2209		ee_len    -= done;
2210		ee_pblock += done  << (blkbits - 9);
2211	}
2212	return ret;
2213}
2214
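/*
 * Threshold (in blocks) below which ext4_ext_convert_to_initialized()
 * prefers to zero out (part of) an uninitialized extent in place rather
 * than split it.
 */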
2215#define EXT4_EXT_ZERO_LEN 7
2216
2217/*
2218 * This function is called by ext4_ext_get_blocks() if someone tries to write
2219 * to an uninitialized extent. It may result in splitting the uninitialized
2220 * extent into multiple extents (up to three - one initialized and two
2221 * uninitialized).
2222 * There are three possibilities:
2223 *   a> There is no split required: Entire extent should be initialized
2224 *   b> Splits in two extents: Write is happening at either end of the extent
2225 *   c> Splits in three extents: Someone is writing in the middle of the extent
2226 */
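/*
 * Layout after a full three-way split (case c above), with the write
 * covering [iblock, iblock + max_blocks):
 *
 *   ee_block      iblock        iblock + max_blocks      ee_block + ee_len
 *      |--- ex1 ----|---------- ex2 ----------|---------- ex3 ----------|
 *      uninitialized        initialized              uninitialized
 */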
2227static int ext4_ext_convert_to_initialized(handle_t *handle,
2228						struct inode *inode,
2229						struct ext4_ext_path *path,
2230						ext4_lblk_t iblock,
2231						unsigned long max_blocks)
2232{
2233	struct ext4_extent *ex, newex, orig_ex;
2234	struct ext4_extent *ex1 = NULL;
2235	struct ext4_extent *ex2 = NULL;
2236	struct ext4_extent *ex3 = NULL;
2237	struct ext4_extent_header *eh;
2238	ext4_lblk_t ee_block;
2239	unsigned int allocated, ee_len, depth;
2240	ext4_fsblk_t newblock;
2241	int err = 0;
2242	int ret = 0;
2243
2244	depth = ext_depth(inode);
2245	eh = path[depth].p_hdr;
2246	ex = path[depth].p_ext;
2247	ee_block = le32_to_cpu(ex->ee_block);
2248	ee_len = ext4_ext_get_actual_len(ex);
2249	allocated = ee_len - (iblock - ee_block);
2250	newblock = iblock - ee_block + ext_pblock(ex);
2251	ex2 = ex;
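	/*
	 * Keep a copy of the extent as it was on entry: the error paths
	 * below and the -ENOSPC fallbacks restore it from orig_ex (see
	 * fix_extent_len).
	 */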
2252	orig_ex.ee_block = ex->ee_block;
2253	orig_ex.ee_len   = cpu_to_le16(ee_len);
2254	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2255
2256	err = ext4_ext_get_access(handle, inode, path + depth);
2257	if (err)
2258		goto out;
2259	/* If the extent is at most 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2260	if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
2261		err =  ext4_ext_zeroout(inode, &orig_ex);
2262		if (err)
2263			goto fix_extent_len;
2264		/* update the extent length and mark as initialized */
2265		ex->ee_block = orig_ex.ee_block;
2266		ex->ee_len   = orig_ex.ee_len;
2267		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2268		ext4_ext_dirty(handle, inode, path + depth);
2269		/* zeroed the full extent */
2270		return allocated;
2271	}
2272
2273	/* ex1: ee_block to iblock - 1 : uninitialized */
2274	if (iblock > ee_block) {
2275		ex1 = ex;
2276		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2277		ext4_ext_mark_uninitialized(ex1);
2278		ex2 = &newex;
2279	}
2280	/*
2281	 * for sanity, update the length of the ex2 extent before
2282	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2283	 * overlap of blocks.
2284	 */
2285	if (!ex1 && allocated > max_blocks)
2286		ex2->ee_len = cpu_to_le16(max_blocks);
2287	/* ex3: to ee_block + ee_len : uninitialized */
2288	if (allocated > max_blocks) {
2289		unsigned int newdepth;
2290		/* If no more than EXT4_EXT_ZERO_LEN blocks remain, zero them out directly */
2291		if (allocated <= EXT4_EXT_ZERO_LEN) {
2292			/* Mark first half uninitialized.
2293			 * Mark second half initialized and zero out the
2294			 * initialized extent
2295			 */
2296			ex->ee_block = orig_ex.ee_block;
2297			ex->ee_len   = cpu_to_le16(ee_len - allocated);
2298			ext4_ext_mark_uninitialized(ex);
2299			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2300			ext4_ext_dirty(handle, inode, path + depth);
2301
2302			ex3 = &newex;
2303			ex3->ee_block = cpu_to_le32(iblock);
2304			ext4_ext_store_pblock(ex3, newblock);
2305			ex3->ee_len = cpu_to_le16(allocated);
2306			err = ext4_ext_insert_extent(handle, inode, path, ex3);
2307			if (err == -ENOSPC) {
2308				err =  ext4_ext_zeroout(inode, &orig_ex);
2309				if (err)
2310					goto fix_extent_len;
2311				ex->ee_block = orig_ex.ee_block;
2312				ex->ee_len   = orig_ex.ee_len;
2313				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2314				ext4_ext_dirty(handle, inode, path + depth);
2315				/* zeroed the full extent */
2316				return allocated;
2317
2318			} else if (err)
2319				goto fix_extent_len;
2320
2321			/*
2322			 * We need to zero out the second half because
2323			 * a fallocate request can update the file size and
2324			 * converting the second half to an initialized extent
2325			 * without zeroing it could leak stale junk data to user
2326			 * space.
2327			 */
2328			err =  ext4_ext_zeroout(inode, ex3);
2329			if (err) {
2330				/*
2331				 * We should actually mark the
2332				 * second half as uninitialized and return the error;
2333				 * the insert above may have changed the extent
2334				 */
2335				depth = ext_depth(inode);
2336				ext4_ext_drop_refs(path);
2337				path = ext4_ext_find_extent(inode,
2338								iblock, path);
2339				if (IS_ERR(path)) {
2340					err = PTR_ERR(path);
2341					return err;
2342				}
2343				ex = path[depth].p_ext;
2344				err = ext4_ext_get_access(handle, inode,
2345								path + depth);
2346				if (err)
2347					return err;
2348				ext4_ext_mark_uninitialized(ex);
2349				ext4_ext_dirty(handle, inode, path + depth);
2350				return err;
2351			}
2352
2353			/* zeroed the second half */
2354			return allocated;
2355		}
2356		ex3 = &newex;
2357		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2358		ext4_ext_store_pblock(ex3, newblock + max_blocks);
2359		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2360		ext4_ext_mark_uninitialized(ex3);
2361		err = ext4_ext_insert_extent(handle, inode, path, ex3);
2362		if (err == -ENOSPC) {
2363			err =  ext4_ext_zeroout(inode, &orig_ex);
2364			if (err)
2365				goto fix_extent_len;
2366			/* update the extent length and mark as initialized */
2367			ex->ee_block = orig_ex.ee_block;
2368			ex->ee_len   = orig_ex.ee_len;
2369			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2370			ext4_ext_dirty(handle, inode, path + depth);
2371			/* zeroed the full extent */
2372			return allocated;
2373
2374		} else if (err)
2375			goto fix_extent_len;
2376		/*
2377		 * The depth, and hence eh & ex might change
2378		 * as part of the insert above.
2379		 */
2380		newdepth = ext_depth(inode);
2381		/*
2382		 * update the extent length after successful insert of the
2383		 * split extent
2384		 */
2385		orig_ex.ee_len = cpu_to_le16(ee_len -
2386						ext4_ext_get_actual_len(ex3));
2387		if (newdepth != depth) {
2388			depth = newdepth;
2389			ext4_ext_drop_refs(path);
2390			path = ext4_ext_find_extent(inode, iblock, path);
2391			if (IS_ERR(path)) {
2392				err = PTR_ERR(path);
2393				goto out;
2394			}
2395			eh = path[depth].p_hdr;
2396			ex = path[depth].p_ext;
2397			if (ex2 != &newex)
2398				ex2 = ex;
2399
2400			err = ext4_ext_get_access(handle, inode, path + depth);
2401			if (err)
2402				goto out;
2403		}
2404		allocated = max_blocks;
2405
2406		/* If the extent is at most EXT4_EXT_ZERO_LEN blocks and we are trying
2407		 * to insert an extent in the middle, zero it out directly;
2408		 * otherwise give the extent a chance to merge to the left
2409		 */
2410		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2411							iblock != ee_block) {
2412			err =  ext4_ext_zeroout(inode, &orig_ex);
2413			if (err)
2414				goto fix_extent_len;
2415			/* update the extent length and mark as initialized */
2416			ex->ee_block = orig_ex.ee_block;
2417			ex->ee_len   = orig_ex.ee_len;
2418			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2419			ext4_ext_dirty(handle, inode, path + depth);
2420			/* zero out the first half */
2421			return allocated;
2422		}
2423	}
2424	/*
2425	 * If there was a change of depth as part of the
2426	 * insertion of ex3 above, we need to update the length
2427	 * of the ex1 extent again here
2428	 */
2429	if (ex1 && ex1 != ex) {
2430		ex1 = ex;
2431		ex1->ee_len = cpu_to_le16(iblock - ee_block);
2432		ext4_ext_mark_uninitialized(ex1);
2433		ex2 = &newex;
2434	}
2435	/* ex2: iblock to iblock + max_blocks - 1 : initialized */
2436	ex2->ee_block = cpu_to_le32(iblock);
2437	ext4_ext_store_pblock(ex2, newblock);
2438	ex2->ee_len = cpu_to_le16(allocated);
2439	if (ex2 != ex)
2440		goto insert;
2441	/*
2442	 * New (initialized) extent starts from the first block
2443	 * in the current extent. i.e., ex2 == ex
2444	 * We have to see if it can be merged with the extent
2445	 * on the left.
2446	 */
2447	if (ex2 > EXT_FIRST_EXTENT(eh)) {
2448		/*
2449		 * To merge left, pass "ex2 - 1" to try_to_merge(),
2450		 * since it merges towards right _only_.
2451		 */
2452		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2453		if (ret) {
2454			err = ext4_ext_correct_indexes(handle, inode, path);
2455			if (err)
2456				goto out;
2457			depth = ext_depth(inode);
2458			ex2--;
2459		}
2460	}
2461	/*
2462	 * Try to merge towards the right. This might be required
2463	 * only when the whole extent is being written to.
2464	 * i.e. ex2 == ex and ex3 == NULL.
2465	 */
2466	if (!ex3) {
2467		ret = ext4_ext_try_to_merge(inode, path, ex2);
2468		if (ret) {
2469			err = ext4_ext_correct_indexes(handle, inode, path);
2470			if (err)
2471				goto out;
2472		}
2473	}
2474	/* Mark modified extent as dirty */
2475	err = ext4_ext_dirty(handle, inode, path + depth);
2476	goto out;
2477insert:
2478	err = ext4_ext_insert_extent(handle, inode, path, &newex);
2479	if (err == -ENOSPC) {
2480		err =  ext4_ext_zeroout(inode, &orig_ex);
2481		if (err)
2482			goto fix_extent_len;
2483		/* update the extent length and mark as initialized */
2484		ex->ee_block = orig_ex.ee_block;
2485		ex->ee_len   = orig_ex.ee_len;
2486		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2487		ext4_ext_dirty(handle, inode, path + depth);
2488		/* zero out the first half */
2489		return allocated;
2490	} else if (err)
2491		goto fix_extent_len;
2492out:
2493	return err ? err : allocated;
2494
2495fix_extent_len:
2496	ex->ee_block = orig_ex.ee_block;
2497	ex->ee_len   = orig_ex.ee_len;
2498	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2499	ext4_ext_mark_uninitialized(ex);
2500	ext4_ext_dirty(handle, inode, path + depth);
2501	return err;
2502}
2503
2504/*
2505 * Block allocation/map/preallocation routine for extent-based files
2506 *
2507 *
2508 * Needs to be called with
2509 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
2510 * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem).
2511 *
2512 * return > 0, number of blocks already mapped/allocated;
2513 *          if create == 0 and these are pre-allocated blocks,
2514 *          	the buffer head is unmapped;
2515 *          otherwise blocks are mapped
2516 *
2517 * return = 0, if plain lookup failed (blocks have not been allocated);
2518 *          the buffer head is unmapped
2519 *
2520 * return < 0, error case.
2521 */
2522int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2523			ext4_lblk_t iblock,
2524			unsigned long max_blocks, struct buffer_head *bh_result,
2525			int create, int extend_disksize)
2526{
2527	struct ext4_ext_path *path = NULL;
2528	struct ext4_extent_header *eh;
2529	struct ext4_extent newex, *ex;
2530	ext4_fsblk_t goal, newblock;
2531	int err = 0, depth, ret;
2532	unsigned long allocated = 0;
2533	struct ext4_allocation_request ar;
2534
2535	__clear_bit(BH_New, &bh_result->b_state);
2536	ext_debug("blocks %u/%lu requested for inode %lu\n",
2537			iblock, max_blocks, inode->i_ino);
2538
2539	/* check in cache */
2540	goal = ext4_ext_in_cache(inode, iblock, &newex);
2541	if (goal) {
2542		if (goal == EXT4_EXT_CACHE_GAP) {
2543			if (!create) {
2544				/*
2545				 * block isn't allocated yet and
2546				 * user doesn't want to allocate it
2547				 */
2548				goto out2;
2549			}
2550			/* we should allocate requested block */
2551		} else if (goal == EXT4_EXT_CACHE_EXTENT) {
2552			/* block is already allocated */
2553			newblock = iblock
2554				   - le32_to_cpu(newex.ee_block)
2555				   + ext_pblock(&newex);
2556			/* number of remaining blocks in the extent */
2557			allocated = ext4_ext_get_actual_len(&newex) -
2558					(iblock - le32_to_cpu(newex.ee_block));
2559			goto out;
2560		} else {
2561			BUG();
2562		}
2563	}
2564
2565	/* find extent for this block */
2566	path = ext4_ext_find_extent(inode, iblock, NULL);
2567	if (IS_ERR(path)) {
2568		err = PTR_ERR(path);
2569		path = NULL;
2570		goto out2;
2571	}
2572
2573	depth = ext_depth(inode);
2574
2575	/*
2576	 * consistent leaf must not be empty;
2577	 * this situation is possible, though, _during_ tree modification;
2578	 * this is why assert can't be put in ext4_ext_find_extent()
2579	 */
2580	BUG_ON(path[depth].p_ext == NULL && depth != 0);
2581	eh = path[depth].p_hdr;
2582
2583	ex = path[depth].p_ext;
2584	if (ex) {
2585		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
2586		ext4_fsblk_t ee_start = ext_pblock(ex);
2587		unsigned short ee_len;
2588
2589		/*
2590		 * Uninitialized extents are treated as holes, except that
2591		 * we split out initialized portions during a write.
2592		 */
2593		ee_len = ext4_ext_get_actual_len(ex);
2594		/* if found extent covers block, simply return it */
2595		if (iblock >= ee_block && iblock < ee_block + ee_len) {
2596			newblock = iblock - ee_block + ee_start;
2597			/* number of remaining blocks in the extent */
2598			allocated = ee_len - (iblock - ee_block);
2599			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
2600					ee_block, ee_len, newblock);
2601
2602			/* Do not put uninitialized extent in the cache */
2603			if (!ext4_ext_is_uninitialized(ex)) {
2604				ext4_ext_put_in_cache(inode, ee_block,
2605							ee_len, ee_start,
2606							EXT4_EXT_CACHE_EXTENT);
2607				goto out;
2608			}
2609			if (create == EXT4_CREATE_UNINITIALIZED_EXT)
2610				goto out;
2611			if (!create) {
2612				/*
2613				 * We have blocks reserved already.  We
2614				 * return allocated blocks so that delalloc
2615				 * won't do block reservation for us.  But
2616				 * the buffer head will be unmapped so that
2617				 * a read from the block returns 0s.
2618				 */
2619				if (allocated > max_blocks)
2620					allocated = max_blocks;
2621				/* mark the buffer unwritten */
2622				__set_bit(BH_Unwritten, &bh_result->b_state);
2623				goto out2;
2624			}
2625
2626			ret = ext4_ext_convert_to_initialized(handle, inode,
2627								path, iblock,
2628								max_blocks);
2629			if (ret <= 0) {
2630				err = ret;
2631				goto out2;
2632			} else
2633				allocated = ret;
2634			goto outnew;
2635		}
2636	}
2637
2638	/*
2639	 * requested block isn't allocated yet;
2640	 * we must not try to allocate it if the create flag is zero
2641	 */
2642	if (!create) {
2643		/*
2644		 * put just found gap into cache to speed up
2645		 * subsequent requests
2646		 */
2647		ext4_ext_put_gap_in_cache(inode, path, iblock);
2648		goto out2;
2649	}
2650	/*
2651	 * Okay, we need to do block allocation.  Lazily initialize the block
2652	 * allocation info here if necessary.
2653	 */
2654	if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2655		ext4_init_block_alloc_info(inode);
2656
2657	/* find neighbour allocated blocks */
2658	ar.lleft = iblock;
2659	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
2660	if (err)
2661		goto out2;
2662	ar.lright = iblock;
2663	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
2664	if (err)
2665		goto out2;
2666
2667	/*
2668	 * See if request is beyond maximum number of blocks we can have in
2669	 * a single extent. For an initialized extent this limit is
2670	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
2671	 * EXT_UNINIT_MAX_LEN.
2672	 */
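	/*
	 * (EXT_UNINIT_MAX_LEN is one block smaller, presumably because the
	 * top bit of ee_len is what marks an extent uninitialized; see
	 * ext4_ext_mark_uninitialized().)
	 */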
2673	if (max_blocks > EXT_INIT_MAX_LEN &&
2674	    create != EXT4_CREATE_UNINITIALIZED_EXT)
2675		max_blocks = EXT_INIT_MAX_LEN;
2676	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
2677		 create == EXT4_CREATE_UNINITIALIZED_EXT)
2678		max_blocks = EXT_UNINIT_MAX_LEN;
2679
2680	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
2681	newex.ee_block = cpu_to_le32(iblock);
2682	newex.ee_len = cpu_to_le16(max_blocks);
2683	err = ext4_ext_check_overlap(inode, &newex, path);
2684	if (err)
2685		allocated = ext4_ext_get_actual_len(&newex);
2686	else
2687		allocated = max_blocks;
2688
2689	/* allocate new block */
2690	ar.inode = inode;
2691	ar.goal = ext4_ext_find_goal(inode, path, iblock);
2692	ar.logical = iblock;
2693	ar.len = allocated;
2694	if (S_ISREG(inode->i_mode))
2695		ar.flags = EXT4_MB_HINT_DATA;
2696	else
2697		/* disable in-core preallocation for non-regular files */
2698		ar.flags = 0;
2699	newblock = ext4_mb_new_blocks(handle, &ar, &err);
2700	if (!newblock)
2701		goto out2;
2702	ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2703			goal, newblock, allocated);
2704
2705	/* try to insert new extent into found leaf and return */
2706	ext4_ext_store_pblock(&newex, newblock);
2707	newex.ee_len = cpu_to_le16(ar.len);
2708	if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
2709		ext4_ext_mark_uninitialized(&newex);
2710	err = ext4_ext_insert_extent(handle, inode, path, &newex);
2711	if (err) {
2712		/* free data blocks we just allocated */
2713		/* not a good idea to call discard here directly,
2714		 * but otherwise we'd need to call it on every free() */
2715		ext4_mb_discard_inode_preallocations(inode);
2716		ext4_free_blocks(handle, inode, ext_pblock(&newex),
2717					ext4_ext_get_actual_len(&newex), 0);
2718		goto out2;
2719	}
2720
2721	/* previous routine could use block we allocated */
2722	newblock = ext_pblock(&newex);
2723	allocated = ext4_ext_get_actual_len(&newex);
2724outnew:
2725	if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
2726		EXT4_I(inode)->i_disksize = inode->i_size;
2727
2728	__set_bit(BH_New, &bh_result->b_state);
2729
2730	/* Cache only when it is _not_ an uninitialized extent */
2731	if (create != EXT4_CREATE_UNINITIALIZED_EXT)
2732		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2733						EXT4_EXT_CACHE_EXTENT);
2734out:
2735	if (allocated > max_blocks)
2736		allocated = max_blocks;
2737	ext4_ext_show_leaf(inode, path);
2738	__set_bit(BH_Mapped, &bh_result->b_state);
2739	bh_result->b_bdev = inode->i_sb->s_bdev;
2740	bh_result->b_blocknr = newblock;
2741out2:
2742	if (path) {
2743		ext4_ext_drop_refs(path);
2744		kfree(path);
2745	}
2746	return err ? err : allocated;
2747}
2748
2749void ext4_ext_truncate(struct inode * inode, struct page *page)
2750{
2751	struct address_space *mapping = inode->i_mapping;
2752	struct super_block *sb = inode->i_sb;
2753	ext4_lblk_t last_block;
2754	handle_t *handle;
2755	int err = 0;
2756
2757	/*
2758	 * the first extent we are going to free is probably the last one in its block
2759	 */
2760	err = ext4_writepage_trans_blocks(inode) + 3;
2761	handle = ext4_journal_start(inode, err);
2762	if (IS_ERR(handle)) {
2763		if (page) {
2764			clear_highpage(page);
2765			flush_dcache_page(page);
2766			unlock_page(page);
2767			page_cache_release(page);
2768		}
2769		return;
2770	}
2771
2772	if (page)
2773		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2774
2775	down_write(&EXT4_I(inode)->i_data_sem);
2776	ext4_ext_invalidate_cache(inode);
2777
2778	ext4_mb_discard_inode_preallocations(inode);
2779
2780	/*
2781	 * TODO: optimization is possible here.
2782	 * Probably we need not scan at all,
2783	 * because page truncation is enough.
2784	 */
2785	if (ext4_orphan_add(handle, inode))
2786		goto out_stop;
2787
2788	/* we have to know where to truncate from in the crash case */
2789	EXT4_I(inode)->i_disksize = inode->i_size;
2790	ext4_mark_inode_dirty(handle, inode);
2791
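	/*
	 * last_block is the first logical block past the (rounded-up) new
	 * size; everything from that block onwards is removed.
	 */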
2792	last_block = (inode->i_size + sb->s_blocksize - 1)
2793			>> EXT4_BLOCK_SIZE_BITS(sb);
2794	err = ext4_ext_remove_space(inode, last_block);
2795
2796	/* In a multi-transaction truncate, we only make the final
2797	 * transaction synchronous.
2798	 */
2799	if (IS_SYNC(inode))
2800		handle->h_sync = 1;
2801
2802out_stop:
2803	/*
2804	 * If this was a simple ftruncate() and the file will remain alive,
2805	 * then we need to clear up the orphan record which we created above.
2806	 * However, if this was a real unlink then we were called by
2807	 * ext4_delete_inode(), and we allow that function to clean up the
2808	 * orphan info for us.
2809	 */
2810	if (inode->i_nlink)
2811		ext4_orphan_del(handle, inode);
2812
2813	up_write(&EXT4_I(inode)->i_data_sem);
2814	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
2815	ext4_mark_inode_dirty(handle, inode);
2816	ext4_journal_stop(handle);
2817}
2818
2819/*
2820 * ext4_ext_writepage_trans_blocks:
2821 * calculate the maximum number of blocks we could modify
2822 * in order to allocate a new block for an inode
2823 */
2824int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2825{
2826	int needed;
2827
2828	needed = ext4_ext_calc_credits_for_insert(inode, NULL);
2829
2830	/* caller wants to allocate num blocks, but note it includes sb */
2831	needed = needed * num - (num - 1);
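	/*
	 * e.g. for num == 2 this gives 2 * needed - 1: the superblock
	 * credit included in `needed' only has to be counted once per
	 * transaction, not once per allocation.
	 */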
2832
2833#ifdef CONFIG_QUOTA
2834	needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2835#endif
2836
2837	return needed;
2838}
2839
2840static void ext4_falloc_update_inode(struct inode *inode,
2841				int mode, loff_t new_size, int update_ctime)
2842{
2843	struct timespec now;
2844
2845	if (update_ctime) {
2846		now = current_fs_time(inode->i_sb);
2847		if (!timespec_equal(&inode->i_ctime, &now))
2848			inode->i_ctime = now;
2849	}
2850	/*
2851	 * Update only when preallocation was requested beyond
2852	 * the file size.
2853	 */
2854	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
2855				new_size > i_size_read(inode)) {
2856		i_size_write(inode, new_size);
2857		EXT4_I(inode)->i_disksize = new_size;
2858	}
2859
2860}
2861
2862/*
2863 * preallocate space for a file. This implements ext4's fallocate inode
2864 * operation, which is called from the sys_fallocate system call.
2865 * For block-mapped files, posix_fallocate should fall back to the method
2866 * of writing zeroes to the required new blocks (the same behavior that is
2867 * expected of file systems which do not support the fallocate() system call).
2868 */
2869long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2870{
2871	handle_t *handle;
2872	ext4_lblk_t block;
2873	loff_t new_size;
2874	unsigned long max_blocks;
2875	int ret = 0;
2876	int ret2 = 0;
2877	int retries = 0;
2878	struct buffer_head map_bh;
2879	unsigned int credits, blkbits = inode->i_blkbits;
2880
2881	/*
2882	 * currently supporting (pre)allocate mode for extent-based
2883	 * files _only_
2884	 */
2885	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
2886		return -EOPNOTSUPP;
2887
2888	/* preallocation to directories is currently not supported */
2889	if (S_ISDIR(inode->i_mode))
2890		return -ENODEV;
2891
2892	block = offset >> blkbits;
2893	/*
2894	 * We can't just convert len to max_blocks because the range may not be
2895	 * block aligned: e.g. blocksize = 4096, offset = 3072, len = 2048 spans two blocks.
2896	 */
2897	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
2898							- block;
2899	/*
2900	 * credits to insert 1 extent into extent tree + buffers to be able to
2901	 * modify 1 super block, 1 block bitmap and 1 group descriptor.
2902	 */
2903	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
2904	mutex_lock(&inode->i_mutex);
2905retry:
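	/*
	 * Allocate the range in chunks: each pass maps at most one new
	 * (uninitialized) extent, then block and max_blocks are advanced
	 * by the number of blocks actually mapped until the whole range
	 * is covered.
	 */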
2906	while (ret >= 0 && ret < max_blocks) {
2907		block = block + ret;
2908		max_blocks = max_blocks - ret;
2909		handle = ext4_journal_start(inode, credits);
2910		if (IS_ERR(handle)) {
2911			ret = PTR_ERR(handle);
2912			break;
2913		}
2914		ret = ext4_get_blocks_wrap(handle, inode, block,
2915					  max_blocks, &map_bh,
2916					  EXT4_CREATE_UNINITIALIZED_EXT, 0);
2917		if (ret <= 0) {
2918#ifdef EXT4FS_DEBUG
2919			WARN_ON(ret <= 0);
2920			printk(KERN_ERR "%s: ext4_ext_get_blocks "
2921				    "returned error inode#%lu, block=%u, "
2922				    "max_blocks=%lu\n", __func__,
2923				    inode->i_ino, block, max_blocks);
2924#endif
2925			ext4_mark_inode_dirty(handle, inode);
2926			ret2 = ext4_journal_stop(handle);
2927			break;
2928		}
2929		if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
2930						blkbits) >> blkbits))
2931			new_size = offset + len;
2932		else
2933			new_size = (block + ret) << blkbits;
2934
2935		ext4_falloc_update_inode(inode, mode, new_size,
2936						buffer_new(&map_bh));
2937		ext4_mark_inode_dirty(handle, inode);
2938		ret2 = ext4_journal_stop(handle);
2939		if (ret2)
2940			break;
2941	}
2942	if (ret == -ENOSPC &&
2943			ext4_should_retry_alloc(inode->i_sb, &retries)) {
2944		ret = 0;
2945		goto retry;
2946	}
2947	mutex_unlock(&inode->i_mutex);
2948	return ret > 0 ? ret2 : ret;
2949}
2950