extents.c revision 65922cb5ced76ba7182e955d4aada96f93446b1a
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits > needed)
		return 0;
	err = ext4_journal_extend(handle, needed);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	if (err == 0)
		err = -EAGAIN;

	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	if (path->p_bh) {
		/* path points to block */
		return ext4_journal_get_write_access(handle, path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return 0;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	int err;
	if (path->p_bh) {
		/* path points to block */
		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_fsblk_t bg_start;
	ext4_fsblk_t last_block;
	ext4_grpblk_t colour;
	ext4_group_t block_group;
	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
	int depth;

	if (path) {
		struct ext4_extent *ex;
		depth = path->p_depth;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * an ELF object sections out-of-order but in a way
		 * that eventually results in a contiguous object or
		 * executable
		 * file, or some database extending a table
		 * space file.  However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space.  Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	block_group = ei->i_block_group;
	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
		/*
		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
		 * block groups per flexgroup, reserve the first block
		 * group for directories and special files.  Regular
		 * files will start at the second block group.  This
		 * tends to speed up directory access and improves
		 * fsck times.
		 */
		block_group &= ~(flex_size-1);
		if (S_ISREG(inode->i_mode))
			block_group++;
	}
	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

	/*
	 * If we are doing delayed allocation, we don't need to take
	 * colour into account.
	 */
	if (test_opt(inode->i_sb, DELALLOC))
		return bg_start;

	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
		colour = (current->pid % 16) *
			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	else
		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
	return bg_start + colour + block;
}
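/*
 * Worked example (illustrative, not from the original source): if the
 * path ends in an extent mapping logical block 100 to physical block
 * 5000, a goal request for logical block 110 yields 5000 + (110 - 100)
 * = 5010, i.e. the block that keeps the file physically contiguous.
 * Only when neither an extent nor an index block is available does the
 * allocator fall back to the block-group heuristic above, offset by a
 * per-PID colour.
 */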
/*
 * Allocation for a metadata block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
	return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 6)
			size = 6;
#endif
	}
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 5)
			size = 5;
#endif
	}
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 3)
			size = 3;
#endif
	}
	return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 4)
			size = 4;
#endif
	}
	return size;
}

/*
 * Calculate the number of metadata blocks needed
 * to allocate @blocks.
 * Worst case is one block per extent.
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs, num = 0;

	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
	 */
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
	 */
	ei->i_da_metadata_calc_len = 1;
	ei->i_da_metadata_calc_last_lblock = lblock;
	return ext_depth(inode) + 1;
}
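/*
 * Worked example (illustrative, not from the original source): with
 * 4KiB blocks, both struct ext4_extent_header and struct
 * ext4_extent_idx are 12 bytes, so idxs = (4096 - 12) / 12 = 340.
 * A file being extended one delayed-allocation block at a time is thus
 * charged one extra metadata block at every 340th contiguous block (a
 * new leaf), one more at every 340**2-th block (a new level-1 index),
 * and so on; all other blocks in the contiguous run are charged zero.
 */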
error_msg = "invalid magic"; 389 goto corrupted; 390 } 391 if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { 392 error_msg = "unexpected eh_depth"; 393 goto corrupted; 394 } 395 if (unlikely(eh->eh_max == 0)) { 396 error_msg = "invalid eh_max"; 397 goto corrupted; 398 } 399 max = ext4_ext_max_entries(inode, depth); 400 if (unlikely(le16_to_cpu(eh->eh_max) > max)) { 401 error_msg = "too large eh_max"; 402 goto corrupted; 403 } 404 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { 405 error_msg = "invalid eh_entries"; 406 goto corrupted; 407 } 408 if (!ext4_valid_extent_entries(inode, eh, depth)) { 409 error_msg = "invalid extent entries"; 410 goto corrupted; 411 } 412 return 0; 413 414corrupted: 415 ext4_error_inode(inode, function, line, 0, 416 "bad header/extent: %s - magic %x, " 417 "entries %u, max %u(%u), depth %u(%u)", 418 error_msg, le16_to_cpu(eh->eh_magic), 419 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 420 max, le16_to_cpu(eh->eh_depth), depth); 421 422 return -EIO; 423} 424 425#define ext4_ext_check(inode, eh, depth) \ 426 __ext4_ext_check(__func__, __LINE__, inode, eh, depth) 427 428int ext4_ext_check_inode(struct inode *inode) 429{ 430 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); 431} 432 433#ifdef EXT_DEBUG 434static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 435{ 436 int k, l = path->p_depth; 437 438 ext_debug("path:"); 439 for (k = 0; k <= l; k++, path++) { 440 if (path->p_idx) { 441 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 442 ext4_idx_pblock(path->p_idx)); 443 } else if (path->p_ext) { 444 ext_debug(" %d:[%d]%d:%llu ", 445 le32_to_cpu(path->p_ext->ee_block), 446 ext4_ext_is_uninitialized(path->p_ext), 447 ext4_ext_get_actual_len(path->p_ext), 448 ext4_ext_pblock(path->p_ext)); 449 } else 450 ext_debug(" []"); 451 } 452 ext_debug("\n"); 453} 454 455static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) 456{ 457 int depth = ext_depth(inode); 458 struct ext4_extent_header *eh; 459 struct ext4_extent *ex; 460 int i; 461 462 if (!path) 463 return; 464 465 eh = path[depth].p_hdr; 466 ex = EXT_FIRST_EXTENT(eh); 467 468 ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); 469 470 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 471 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), 472 ext4_ext_is_uninitialized(ex), 473 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); 474 } 475 ext_debug("\n"); 476} 477#else 478#define ext4_ext_show_path(inode, path) 479#define ext4_ext_show_leaf(inode, path) 480#endif 481 482void ext4_ext_drop_refs(struct ext4_ext_path *path) 483{ 484 int depth = path->p_depth; 485 int i; 486 487 for (i = 0; i <= depth; i++, path++) 488 if (path->p_bh) { 489 brelse(path->p_bh); 490 path->p_bh = NULL; 491 } 492} 493 494/* 495 * ext4_ext_binsearch_idx: 496 * binary search for the closest index of the given block 497 * the header must be checked before calling this 498 */ 499static void 500ext4_ext_binsearch_idx(struct inode *inode, 501 struct ext4_ext_path *path, ext4_lblk_t block) 502{ 503 struct ext4_extent_header *eh = path->p_hdr; 504 struct ext4_extent_idx *r, *l, *m; 505 506 507 ext_debug("binsearch for %u(idx): ", block); 508 509 l = EXT_FIRST_INDEX(eh) + 1; 510 r = EXT_LAST_INDEX(eh); 511 while (l <= r) { 512 m = l + (r - l) / 2; 513 if (block < le32_to_cpu(m->ei_block)) 514 r = m - 1; 515 else 516 l = m + 1; 517 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), 518 m, 
/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;

	ext_debug("binsearch for %u(idx):  ", block);

	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	path->p_idx = l - 1;
	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
			if (k != 0 &&
			    le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					   <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}

/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u:  ", block);

	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
				m, le32_to_cpu(m->ee_block),
				r, le32_to_cpu(r->ee_block));
	}

	path->p_ext = l - 1;
	ext_debug("  -> %d:%llu:[%d]%d ",
			le32_to_cpu(path->p_ext->ee_block),
			ext4_ext_pblock(path->p_ext),
			ext4_ext_is_uninitialized(path->p_ext),
			ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
					  <= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}
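/*
 * Note (added for clarity, not in the original source): both binary
 * searches share the same invariant -- the loop leaves l pointing one
 * past the last entry whose start block is <= the target, so the
 * result (l - 1) is the rightmost entry not beyond @block.  E.g. for
 * leaf extents starting at 0, 100 and 200, a search for block 150
 * selects the extent starting at 100.
 */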
int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
	struct ext4_extent_header *eh;

	eh = ext_inode_hdr(inode);
	eh->eh_depth = 0;
	eh->eh_entries = 0;
	eh->eh_magic = EXT4_EXT_MAGIC;
	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
	ext4_mark_inode_dirty(handle, inode);
	ext4_ext_invalidate_cache(inode);
	return 0;
}

struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
					struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	struct buffer_head *bh;
	short int depth, i, ppos = 0, alloc = 0;

	eh = ext_inode_hdr(inode);
	depth = ext_depth(inode);

	/* account possible depth increase */
	if (!path) {
		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
				GFP_NOFS);
		if (!path)
			return ERR_PTR(-ENOMEM);
		alloc = 1;
	}
	path[0].p_hdr = eh;
	path[0].p_bh = NULL;

	i = depth;
	/* walk through the tree */
	while (i) {
		int need_to_validate = 0;

		ext_debug("depth %d: num %d, max %d\n",
			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

		ext4_ext_binsearch_idx(inode, path + ppos, block);
		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
		path[ppos].p_depth = i;
		path[ppos].p_ext = NULL;

		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
		if (unlikely(!bh))
			goto err;
		if (!bh_uptodate_or_lock(bh)) {
			trace_ext4_ext_load_extent(inode, block,
						path[ppos].p_block);
			if (bh_submit_read(bh) < 0) {
				put_bh(bh);
				goto err;
			}
			/* validate the extent entries */
			need_to_validate = 1;
		}
		eh = ext_block_hdr(bh);
		ppos++;
		if (unlikely(ppos > depth)) {
			put_bh(bh);
			EXT4_ERROR_INODE(inode,
					 "ppos %d > depth %d", ppos, depth);
			goto err;
		}
		path[ppos].p_bh = bh;
		path[ppos].p_hdr = eh;
		i--;

		if (need_to_validate && ext4_ext_check(inode, eh, i))
			goto err;
	}

	path[ppos].p_depth = i;
	path[ppos].p_ext = NULL;
	path[ppos].p_idx = NULL;

	/* find extent */
	ext4_ext_binsearch(inode, path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].p_ext)
		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	ext4_ext_drop_refs(path);
	if (alloc)
		kfree(path);
	return ERR_PTR(-EIO);
}
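/*
 * Note (added for clarity, not in the original source): the returned
 * path is an array indexed by tree level.  For a depth-2 tree, path[0]
 * describes the root in the inode body, path[1] the intermediate index
 * block and path[2] the leaf, with p_ext set only at the leaf level.
 * Callers must pair every successful call with ext4_ext_drop_refs()
 * (and kfree() if they passed path == NULL), as the error path above
 * does.
 */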
" 755 "move %d from 0x%p to 0x%p\n", 756 logical, ptr, len, 757 curp->p_idx, (curp->p_idx + 1)); 758 memmove(curp->p_idx + 1, curp->p_idx, len); 759 ix = curp->p_idx; 760 } 761 762 ix->ei_block = cpu_to_le32(logical); 763 ext4_idx_store_pblock(ix, ptr); 764 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 765 766 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) 767 > le16_to_cpu(curp->p_hdr->eh_max))) { 768 EXT4_ERROR_INODE(inode, 769 "logical %d == ei_block %d!", 770 logical, le32_to_cpu(curp->p_idx->ei_block)); 771 return -EIO; 772 } 773 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { 774 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); 775 return -EIO; 776 } 777 778 err = ext4_ext_dirty(handle, inode, curp); 779 ext4_std_error(inode->i_sb, err); 780 781 return err; 782} 783 784/* 785 * ext4_ext_split: 786 * inserts new subtree into the path, using free index entry 787 * at depth @at: 788 * - allocates all needed blocks (new leaf and all intermediate index blocks) 789 * - makes decision where to split 790 * - moves remaining extents and index entries (right to the split point) 791 * into the newly allocated blocks 792 * - initializes subtree 793 */ 794static int ext4_ext_split(handle_t *handle, struct inode *inode, 795 struct ext4_ext_path *path, 796 struct ext4_extent *newext, int at) 797{ 798 struct buffer_head *bh = NULL; 799 int depth = ext_depth(inode); 800 struct ext4_extent_header *neh; 801 struct ext4_extent_idx *fidx; 802 struct ext4_extent *ex; 803 int i = at, k, m, a; 804 ext4_fsblk_t newblock, oldblock; 805 __le32 border; 806 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ 807 int err = 0; 808 809 /* make decision: where to split? */ 810 /* FIXME: now decision is simplest: at current extent */ 811 812 /* if current leaf will be split, then we should use 813 * border from split point */ 814 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { 815 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); 816 return -EIO; 817 } 818 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 819 border = path[depth].p_ext[1].ee_block; 820 ext_debug("leaf will be split." 821 " next leaf starts at %d\n", 822 le32_to_cpu(border)); 823 } else { 824 border = newext->ee_block; 825 ext_debug("leaf will be added." 826 " next leaf starts at %d\n", 827 le32_to_cpu(border)); 828 } 829 830 /* 831 * If error occurs, then we break processing 832 * and mark filesystem read-only. index won't 833 * be inserted and tree will be in consistent 834 * state. Next mount will repair buffers too. 835 */ 836 837 /* 838 * Get array to track all allocated blocks. 839 * We need this to handle errors and free blocks 840 * upon them. 
	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
	if (!ablocks)
		return -ENOMEM;

	/* allocate all needed blocks */
	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
						   newext, &err);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
	}

	/* initialize new leaf */
	newblock = ablocks[--a];
	if (unlikely(newblock == 0)) {
		EXT4_ERROR_INODE(inode, "newblock == 0!");
		err = -EIO;
		goto cleanup;
	}
	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		goto cleanup;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err)
		goto cleanup;

	neh = ext_block_hdr(bh);
	neh->eh_entries = 0;
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;
	ex = EXT_FIRST_EXTENT(neh);

	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
		     path[depth].p_hdr->eh_max)) {
		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
				 path[depth].p_hdr->eh_entries,
				 path[depth].p_hdr->eh_max);
		err = -EIO;
		goto cleanup;
	}
	/* start copy from next extent */
	/* TODO: we could do it by single memmove */
	m = 0;
	path[depth].p_ext++;
	while (path[depth].p_ext <=
			EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
				le32_to_cpu(path[depth].p_ext->ee_block),
				ext4_ext_pblock(path[depth].p_ext),
				ext4_ext_is_uninitialized(path[depth].p_ext),
				ext4_ext_get_actual_len(path[depth].p_ext),
				newblock);
		/*memmove(ex++, path[depth].p_ext++,
				sizeof(struct ext4_extent));
		neh->eh_entries++;*/
		path[depth].p_ext++;
		m++;
	}
	if (m) {
		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
		le16_add_cpu(&neh->eh_entries, m);
	}

	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto cleanup;
	brelse(bh);
	bh = NULL;

	/* correct old leaf */
	if (m) {
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto cleanup;
		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto cleanup;

	}

	/* create intermediate indexes */
	k = depth - at - 1;
	if (unlikely(k < 0)) {
		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
		err = -EIO;
		goto cleanup;
	}
	if (k)
		ext_debug("create %d intermediate indices\n", k);
	/* insert new index into current index block */
	/* current depth stored in i var */
	i = depth - 1;
	while (k--) {
		oldblock = newblock;
		newblock = ablocks[--a];
		bh = sb_getblk(inode->i_sb, newblock);
		if (!bh) {
			err = -EIO;
			goto cleanup;
		}
		lock_buffer(bh);

		err = ext4_journal_get_create_access(handle, bh);
		if (err)
			goto cleanup;

		neh = ext_block_hdr(bh);
		neh->eh_entries = cpu_to_le16(1);
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);
		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
				i, newblock, le32_to_cpu(border), oldblock);

		/* copy indexes */
		m = 0;
		path[i].p_idx++;

		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
				EXT_MAX_INDEX(path[i].p_hdr));
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
					 le32_to_cpu(path[i].p_ext->ee_block));
			err = -EIO;
			goto cleanup;
		}
		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
			ext_debug("%d: move %d:%llu in new index %llu\n", i,
					le32_to_cpu(path[i].p_idx->ei_block),
					ext4_idx_pblock(path[i].p_idx),
					newblock);
			/*memmove(++fidx, path[i].p_idx++,
					sizeof(struct ext4_extent_idx));
			neh->eh_entries++;
			BUG_ON(neh->eh_entries > neh->eh_max);*/
			path[i].p_idx++;
			m++;
		}
		if (m) {
			memmove(++fidx, path[i].p_idx - m,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto cleanup;
		brelse(bh);
		bh = NULL;

		/* correct old index */
		if (m) {
			err = ext4_ext_get_access(handle, inode, path + i);
			if (err)
				goto cleanup;
			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
			err = ext4_ext_dirty(handle, inode, path + i);
			if (err)
				goto cleanup;
		}

		i--;
	}

	/* insert new index */
	err = ext4_ext_insert_index(handle, inode, path + at,
				    le32_to_cpu(border), newblock);

cleanup:
	if (bh) {
		if (buffer_locked(bh))
			unlock_buffer(bh);
		brelse(bh);
	}

	if (err) {
		/* free all allocated blocks in error case */
		for (i = 0; i < depth; i++) {
			if (!ablocks[i])
				continue;
			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
		}
	}
	kfree(ablocks);

	return err;
}
/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp = path;
	struct ext4_extent_header *neh;
	struct buffer_head *bh;
	ext4_fsblk_t newblock;
	int err = 0;

	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
	if (newblock == 0)
		return err;

	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		ext4_std_error(inode->i_sb, err);
		return err;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	/* move top-level index/leaf into new block */
	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate eh_max the right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* create index in new top-level index: num,max,pointer */
	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		goto out;

	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
	curp->p_hdr->eh_entries = cpu_to_le16(1);
	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);

	if (path[0].p_hdr->eh_depth)
		curp->p_idx->ei_block =
			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
	else
		curp->p_idx->ei_block =
			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
	ext4_idx_store_pblock(curp->p_idx, newblock);

	neh = ext_inode_hdr(inode);
	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
	err = ext4_ext_dirty(handle, inode, curp);
out:
	brelse(bh);

	return err;
}
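/*
 * Example of the depth 0 -> 1 transition (illustrative, assuming 4KiB
 * blocks; not in the original source): the four extents that fit in
 * the inode's i_data are copied into a freshly allocated block that
 * can hold ~340 entries, and the root is rewritten as a single index
 * entry pointing at that block.  This is the only growth step that
 * directly produces free space; see ext4_ext_create_new_leaf() below.
 */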
/*
 * ext4_ext_create_new_leaf:
 * finds an empty index and adds a new leaf.
 * If no free index is found, then it requests growing the tree in depth.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	i = depth = ext_depth(inode);

	/* walk up the tree and look for free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, path, newext, i);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, path, newext);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}

/*
 * search the closest allocated block to the left for *logical
 * and returns it at @logical + its physical address at @phys.
 * If *logical is the smallest allocated block, the function
 * returns 0 at @phys.
 * The return value contains 0 (success) or an error code.
 */
static int ext4_ext_search_left(struct inode *inode,
				struct ext4_ext_path *path,
				ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? ix->ei_block : 0,
				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
				  depth);
				return -EIO;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext4_ext_pblock(ex) + ee_len - 1;
	return 0;
}
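/*
 * Usage sketch (illustrative, not in the original source): for a file
 * whose only extent maps logical 100..107 to physical 5000..5007,
 * calling this with *logical == 200 returns *logical = 107 and
 * *phys = 5007, the closest allocated block to the left; with
 * *logical == 50 it returns *phys == 0, since nothing is allocated
 * below the first extent.
 */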
/*
 * search the closest allocated block to the right for *logical
 * and returns it at @logical + its physical address at @phys.
 * If *logical is the largest allocated block, the function
 * returns 0 at @phys.
 * The return value contains 0 (success) or an error code.
 */
static int ext4_ext_search_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	ext4_fsblk_t block;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "first_extent(path[%d].p_hdr) != ex",
					 depth);
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
						 "ix != EXT_FIRST_INDEX *logical %d!",
						 *logical);
				return -EIO;
			}
		}
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext4_ext_pblock(ex);
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext4_ext_pblock(ex);
		return 0;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	block = ext4_idx_pblock(ix);
	while (++depth < path->p_depth) {
		bh = sb_bread(inode->i_sb, block);
		if (bh == NULL)
			return -EIO;
		eh = ext_block_hdr(bh);
		/* subtract from p_depth to get proper eh_depth */
		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
			put_bh(bh);
			return -EIO;
		}
		ix = EXT_FIRST_INDEX(eh);
		block = ext4_idx_pblock(ix);
		put_bh(bh);
	}

	bh = sb_bread(inode->i_sb, block);
	if (bh == NULL)
		return -EIO;
	eh = ext_block_hdr(bh);
	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
		put_bh(bh);
		return -EIO;
	}
	ex = EXT_FIRST_EXTENT(eh);
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext4_ext_pblock(ex);
	put_bh(bh);
	return 0;
}

/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
static ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	if (depth == 0 && path->p_ext == NULL)
		return EXT_MAX_BLOCK;

	while (depth >= 0) {
		if (depth == path->p_depth) {
			/* leaf */
			if (path[depth].p_ext !=
					EXT_LAST_EXTENT(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_ext[1].ee_block);
		} else {
			/* index */
			if (path[depth].p_idx !=
					EXT_LAST_INDEX(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_idx[1].ei_block);
		}
		depth--;
	}

	return EXT_MAX_BLOCK;
}

/*
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCK
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
					struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	/* zero-tree has no leaf blocks at all */
	if (depth == 0)
		return EXT_MAX_BLOCK;

	/* go to index block */
	depth--;

	while (depth >= 0) {
		if (path[depth].p_idx !=
				EXT_LAST_INDEX(path[depth].p_hdr))
			return (ext4_lblk_t)
				le32_to_cpu(path[depth].p_idx[1].ei_block);
		depth--;
	}

	return EXT_MAX_BLOCK;
}
/*
 * ext4_ext_correct_indexes:
 * if the leaf gets modified and the modified extent is first in the leaf,
 * then we have to correct all indexes above.
 * TODO: do we need to correct the tree in all cases?
 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	int depth = ext_depth(inode);
	struct ext4_extent *ex;
	__le32 border;
	int k, err = 0;

	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	if (unlikely(ex == NULL || eh == NULL)) {
		EXT4_ERROR_INODE(inode,
				 "ex %p == NULL or eh %p == NULL", ex, eh);
		return -EIO;
	}

	if (depth == 0) {
		/* there is no tree at all */
		return 0;
	}

	if (ex != EXT_FIRST_EXTENT(eh)) {
		/* we correct tree if first leaf got modified only */
		return 0;
	}

	/*
	 * TODO: we need correction if border is smaller than current one
	 */
	k = depth - 1;
	border = path[depth].p_ext->ee_block;
	err = ext4_ext_get_access(handle, inode, path + k);
	if (err)
		return err;
	path[k].p_idx->ei_block = border;
	err = ext4_ext_dirty(handle, inode, path + k);
	if (err)
		return err;

	while (k--) {
		/* change all left-side indexes */
		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
			break;
		err = ext4_ext_get_access(handle, inode, path + k);
		if (err)
			break;
		path[k].p_idx->ei_block = border;
		err = ext4_ext_dirty(handle, inode, path + k);
		if (err)
			break;
	}

	return err;
}

int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
				struct ext4_extent *ex2)
{
	unsigned short ext1_ee_len, ext2_ee_len, max_len;

	/*
	 * Make sure that either both extents are uninitialized, or
	 * both are _not_.
	 */
	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
		return 0;

	if (ext4_ext_is_uninitialized(ex1))
		max_len = EXT_UNINIT_MAX_LEN;
	else
		max_len = EXT_INIT_MAX_LEN;

	ext1_ee_len = ext4_ext_get_actual_len(ex1);
	ext2_ee_len = ext4_ext_get_actual_len(ex2);

	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
			le32_to_cpu(ex2->ee_block))
		return 0;

	/*
	 * To allow future support for preallocated extents to be added
	 * as an RO_COMPAT feature, refuse to merge two extents if
	 * this can result in the top bit of ee_len being set.
	 */
	if (ext1_ee_len + ext2_ee_len > max_len)
		return 0;
#ifdef AGGRESSIVE_TEST
	if (ext1_ee_len >= 4)
		return 0;
#endif

	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
		return 1;
	return 0;
}
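/*
 * Example (illustrative, not in the original source): an initialized
 * extent [100, +8) at physical 5000 and [108, +8) at physical 5008 are
 * mergeable -- logically and physically contiguous, same
 * initialization state, and the combined length 16 stays below
 * EXT_INIT_MAX_LEN (32768).  The same pair at physical 5000 and 6000
 * is not.
 */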
/*
 * This function tries to merge the "ex" extent to the next extent in the tree.
 * It always tries to merge towards the right. If you want to merge towards
 * the left, pass "ex - 1" as the argument instead of "ex".
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
static int ext4_ext_try_to_merge(struct inode *inode,
				 struct ext4_ext_path *path,
				 struct ext4_extent *ex)
{
	struct ext4_extent_header *eh;
	unsigned int depth, len;
	int merge_done = 0;
	int uninitialized = 0;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	while (ex < EXT_LAST_EXTENT(eh)) {
		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
			break;
		/* merge with next extent! */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
				+ ext4_ext_get_actual_len(ex + 1));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);

		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
			len = (EXT_LAST_EXTENT(eh) - ex - 1)
				* sizeof(struct ext4_extent);
			memmove(ex + 1, ex + 2, len);
		}
		le16_add_cpu(&eh->eh_entries, -1);
		merge_done = 1;
		WARN_ON(eh->eh_entries == 0);
		if (!eh->eh_entries)
			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
	}

	return merge_done;
}

/*
 * check if a portion of the "newext" extent overlaps with an
 * existing extent.
 *
 * If there is an overlap discovered, it updates the length of the newext
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
static unsigned int ext4_ext_check_overlap(struct inode *inode,
					   struct ext4_extent *newext,
					   struct ext4_ext_path *path)
{
	ext4_lblk_t b1, b2;
	unsigned int depth, len1;
	unsigned int ret = 0;

	b1 = le32_to_cpu(newext->ee_block);
	len1 = ext4_ext_get_actual_len(newext);
	depth = ext_depth(inode);
	if (!path[depth].p_ext)
		goto out;
	b2 = le32_to_cpu(path[depth].p_ext->ee_block);

	/*
	 * get the next allocated block if the extent in the path
	 * is before the requested block(s)
	 */
	if (b2 < b1) {
		b2 = ext4_ext_next_allocated_block(path);
		if (b2 == EXT_MAX_BLOCK)
			goto out;
	}

	/* check for wrap through zero on extent logical start block*/
	if (b1 + len1 < b1) {
		len1 = EXT_MAX_BLOCK - b1;
		newext->ee_len = cpu_to_le16(len1);
		ret = 1;
	}

	/* check for overlap */
	if (b1 + len1 > b2) {
		newext->ee_len = cpu_to_le16(b2 - b1);
		ret = 1;
	}
out:
	return ret;
}
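/*
 * Example (illustrative, not in the original source): inserting newext
 * [100, +50) into a tree that already has an extent starting at
 * logical 120 trims newext->ee_len to 20 and returns 1, so the caller
 * allocates only the non-overlapping head; a newext that collides with
 * nothing is left untouched and 0 is returned.
 */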
/*
 * ext4_ext_insert_extent:
 * tries to merge the requested extent into an existing extent or
 * inserts the requested extent as a new one into the tree,
 * creating a new leaf in the no-space case.
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int flag)
{
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *fex;
	struct ext4_extent *nearex; /* nearest extent */
	struct ext4_ext_path *npath = NULL;
	int depth, len, err;
	ext4_lblk_t next;
	unsigned uninitialized = 0;

	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
		return -EIO;
	}
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	if (unlikely(path[depth].p_hdr == NULL)) {
		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
		return -EIO;
	}

	/* try to insert block into found extent and return */
	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
		&& ext4_can_extents_be_merged(inode, ex, newext)) {
		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
			  ext4_ext_is_uninitialized(newext),
			  ext4_ext_get_actual_len(newext),
			  le32_to_cpu(ex->ee_block),
			  ext4_ext_is_uninitialized(ex),
			  ext4_ext_get_actual_len(ex),
			  ext4_ext_pblock(ex));
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			return err;

		/*
		 * ext4_can_extents_be_merged should have checked that either
		 * both extents are uninitialized, or both aren't. Thus we
		 * need to check only one of them here.
		 */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
					+ ext4_ext_get_actual_len(newext));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);
		eh = path[depth].p_hdr;
		nearex = ex;
		goto merge;
	}

repeat:
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
		goto has_space;

	/* probably next leaf has space for us? */
	fex = EXT_LAST_EXTENT(eh);
	next = ext4_ext_next_leaf_block(inode, path);
	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
	    && next != EXT_MAX_BLOCK) {
		ext_debug("next leaf block - %d\n", next);
		BUG_ON(npath != NULL);
		npath = ext4_ext_find_extent(inode, next, NULL);
		if (IS_ERR(npath))
			return PTR_ERR(npath);
		BUG_ON(npath->p_depth != path->p_depth);
		eh = npath[depth].p_hdr;
		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
			ext_debug("next leaf isn't full(%d)\n",
				  le16_to_cpu(eh->eh_entries));
			path = npath;
			goto repeat;
		}
		ext_debug("next leaf has no free space(%d,%d)\n",
			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
	}

	/*
	 * There is no free space in the found leaf.
	 * We're gonna add a new leaf in the tree.
	 */
	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
	if (err)
		goto cleanup;
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;

has_space:
	nearex = path[depth].p_ext;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto cleanup;

	if (!nearex) {
		/* there is no extent in this leaf, create first one */
		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
				le32_to_cpu(newext->ee_block),
				ext4_ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext));
		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
	} else if (le32_to_cpu(newext->ee_block)
			   > le32_to_cpu(nearex->ee_block)) {
/*		BUG_ON(newext->ee_block == nearex->ee_block); */
		if (nearex != EXT_LAST_EXTENT(eh)) {
			len = EXT_MAX_EXTENT(eh) - nearex;
			len = (len - 1) * sizeof(struct ext4_extent);
			len = len < 0 ? 0 : len;
			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
					"move %d from 0x%p to 0x%p\n",
					le32_to_cpu(newext->ee_block),
					ext4_ext_pblock(newext),
					ext4_ext_is_uninitialized(newext),
					ext4_ext_get_actual_len(newext),
					nearex, len, nearex + 1, nearex + 2);
			memmove(nearex + 2, nearex + 1, len);
		}
		path[depth].p_ext = nearex + 1;
	} else {
		BUG_ON(newext->ee_block == nearex->ee_block);
		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
		len = len < 0 ? 0 : len;
		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
				"move %d from 0x%p to 0x%p\n",
				le32_to_cpu(newext->ee_block),
				ext4_ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext),
				nearex, len, nearex + 1, nearex + 2);
		memmove(nearex + 1, nearex, len);
		path[depth].p_ext = nearex;
	}

	le16_add_cpu(&eh->eh_entries, 1);
	nearex = path[depth].p_ext;
	nearex->ee_block = newext->ee_block;
	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
	nearex->ee_len = newext->ee_len;

merge:
	/* try to merge extents to the right */
	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
		ext4_ext_try_to_merge(inode, path, nearex);

	/* try to merge extents to the left */

	/* time to correct all indexes above */
	err = ext4_ext_correct_indexes(handle, inode, path);
	if (err)
		goto cleanup;

	err = ext4_ext_dirty(handle, inode, path + depth);

cleanup:
	if (npath) {
		ext4_ext_drop_refs(npath);
		kfree(npath);
	}
	ext4_ext_invalidate_cache(inode);
	return err;
}
NULL", depth); 1856 err = -EIO; 1857 break; 1858 } 1859 ex = path[depth].p_ext; 1860 next = ext4_ext_next_allocated_block(path); 1861 1862 exists = 0; 1863 if (!ex) { 1864 /* there is no extent yet, so try to allocate 1865 * all requested space */ 1866 start = block; 1867 end = block + num; 1868 } else if (le32_to_cpu(ex->ee_block) > block) { 1869 /* need to allocate space before found extent */ 1870 start = block; 1871 end = le32_to_cpu(ex->ee_block); 1872 if (block + num < end) 1873 end = block + num; 1874 } else if (block >= le32_to_cpu(ex->ee_block) 1875 + ext4_ext_get_actual_len(ex)) { 1876 /* need to allocate space after found extent */ 1877 start = block; 1878 end = block + num; 1879 if (end >= next) 1880 end = next; 1881 } else if (block >= le32_to_cpu(ex->ee_block)) { 1882 /* 1883 * some part of requested space is covered 1884 * by found extent 1885 */ 1886 start = block; 1887 end = le32_to_cpu(ex->ee_block) 1888 + ext4_ext_get_actual_len(ex); 1889 if (block + num < end) 1890 end = block + num; 1891 exists = 1; 1892 } else { 1893 BUG(); 1894 } 1895 BUG_ON(end <= start); 1896 1897 if (!exists) { 1898 cbex.ec_block = start; 1899 cbex.ec_len = end - start; 1900 cbex.ec_start = 0; 1901 } else { 1902 cbex.ec_block = le32_to_cpu(ex->ee_block); 1903 cbex.ec_len = ext4_ext_get_actual_len(ex); 1904 cbex.ec_start = ext4_ext_pblock(ex); 1905 } 1906 1907 if (unlikely(cbex.ec_len == 0)) { 1908 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); 1909 err = -EIO; 1910 break; 1911 } 1912 err = func(inode, path, &cbex, ex, cbdata); 1913 ext4_ext_drop_refs(path); 1914 1915 if (err < 0) 1916 break; 1917 1918 if (err == EXT_REPEAT) 1919 continue; 1920 else if (err == EXT_BREAK) { 1921 err = 0; 1922 break; 1923 } 1924 1925 if (ext_depth(inode) != depth) { 1926 /* depth was changed. 
static void
ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
			__u32 len, ext4_fsblk_t start)
{
	struct ext4_ext_cache *cex;
	BUG_ON(len == 0);
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;
	cex->ec_block = block;
	cex->ec_len = len;
	cex->ec_start = start;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

/*
 * ext4_ext_put_gap_in_cache:
 * calculate boundaries of the gap that the requested block fits into
 * and cache this gap
 */
static void
ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
				ext4_lblk_t block)
{
	int depth = ext_depth(inode);
	unsigned long len;
	ext4_lblk_t lblock;
	struct ext4_extent *ex;

	ex = path[depth].p_ext;
	if (ex == NULL) {
		/* there is no extent yet, so gap is [0;-] */
		lblock = 0;
		len = EXT_MAX_BLOCK;
		ext_debug("cache gap(whole file):");
	} else if (block < le32_to_cpu(ex->ee_block)) {
		lblock = block;
		len = le32_to_cpu(ex->ee_block) - block;
		ext_debug("cache gap(before): %u [%u:%u]",
				block,
				le32_to_cpu(ex->ee_block),
				 ext4_ext_get_actual_len(ex));
	} else if (block >= le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex)) {
		ext4_lblk_t next;
		lblock = le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex);

		next = ext4_ext_next_allocated_block(path);
		ext_debug("cache gap(after): [%u:%u] %u",
				le32_to_cpu(ex->ee_block),
				ext4_ext_get_actual_len(ex),
				block);
		BUG_ON(next == lblock);
		len = next - lblock;
	} else {
		lblock = len = 0;
		BUG();
	}

	ext_debug(" -> %u:%lu\n", lblock, len);
	ext4_ext_put_in_cache(inode, lblock, len, 0);
}

/*
 * Return 0 if cache is invalid; 1 if the cache is valid
 */
static int
ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
			struct ext4_extent *ex)
{
	struct ext4_ext_cache *cex;
	int ret = 0;

	/*
	 * We borrow i_block_reservation_lock to protect i_cached_extent
	 */
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;

	/* has cache valid data? */
	if (cex->ec_len == 0)
		goto errout;

	if (in_range(block, cex->ec_block, cex->ec_len)) {
		ex->ee_block = cpu_to_le32(cex->ec_block);
		ext4_ext_store_pblock(ex, cex->ec_start);
		ex->ee_len = cpu_to_le16(cex->ec_len);
		ext_debug("%u cached by %u:%u:%llu\n",
				block,
				cex->ec_block, cex->ec_len, cex->ec_start);
		ret = 1;
	}
errout:
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
	return ret;
}
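/*
 * Note (added for clarity, not in the original source): the cache is a
 * single entry per inode, so it only helps sequential access patterns.
 * After mapping logical blocks 0..99, ext4_ext_put_in_cache(inode, 0,
 * 100, start) lets the next lookup for any block in [0, 100) skip the
 * tree walk; the first lookup outside that range misses and the cache
 * is repopulated.  Gaps are cached with ec_start == 0 by
 * ext4_ext_put_gap_in_cache() above.
 */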
2044 */ 2045static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2046 struct ext4_ext_path *path) 2047{ 2048 int err; 2049 ext4_fsblk_t leaf; 2050 2051 /* free index block */ 2052 path--; 2053 leaf = ext4_idx_pblock(path->p_idx); 2054 if (unlikely(path->p_hdr->eh_entries == 0)) { 2055 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2056 return -EIO; 2057 } 2058 err = ext4_ext_get_access(handle, inode, path); 2059 if (err) 2060 return err; 2061 le16_add_cpu(&path->p_hdr->eh_entries, -1); 2062 err = ext4_ext_dirty(handle, inode, path); 2063 if (err) 2064 return err; 2065 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2066 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2067 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2068 return err; 2069} 2070 2071/* 2072 * ext4_ext_calc_credits_for_single_extent: 2073 * This routine returns the max. credits that are needed to insert an extent 2074 * into the extent tree. 2075 * When the actual path is passed, the caller should calculate credits 2076 * under i_data_sem. 2077 */ 2078int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, 2079 struct ext4_ext_path *path) 2080{ 2081 if (path) { 2082 int depth = ext_depth(inode); 2083 int ret = 0; 2084 2085 /* probably there is space in leaf? */ 2086 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 2087 < le16_to_cpu(path[depth].p_hdr->eh_max)) { 2088 2089 /* 2090 * There is some space in the leaf, so there is no 2091 * need to account for the leaf block credit; 2092 * 2093 * bitmaps and block group descriptor blocks 2094 * and other metadata blocks still need to be 2095 * accounted. 2096 */ 2097 /* 1 bitmap, 1 block group descriptor */ 2098 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 2099 return ret; 2100 } 2101 } 2102 2103 return ext4_chunk_trans_blocks(inode, nrblocks); 2104} 2105 2106/* 2107 * How many index/leaf blocks need to be changed/allocated to modify nrblocks? 2108 * 2109 * If nrblocks fit in a single extent (chunk flag is 1), then in the 2110 * worst case each tree level's index/leaf needs to be changed, and 2111 * if the tree splits due to inserting a new extent, then the old 2112 * index/leaf need to be updated too. 2113 * 2114 * If the nrblocks are discontiguous, they could cause 2115 * the whole tree to split more than once, but this is really rare. 
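 * For example, with a tree of depth 2, a single contiguous chunk (chunk == 1) is budgeted at 2 * 2 = 4 index/leaf blocks, while the discontiguous case is budgeted at 2 * 3 = 6.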
2116 */ 2117int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 2118{ 2119 int index; 2120 int depth = ext_depth(inode); 2121 2122 if (chunk) 2123 index = depth * 2; 2124 else 2125 index = depth * 3; 2126 2127 return index; 2128} 2129 2130static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2131 struct ext4_extent *ex, 2132 ext4_lblk_t from, ext4_lblk_t to) 2133{ 2134 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2135 int flags = EXT4_FREE_BLOCKS_FORGET; 2136 2137 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2138 flags |= EXT4_FREE_BLOCKS_METADATA; 2139#ifdef EXTENTS_STATS 2140 { 2141 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2142 spin_lock(&sbi->s_ext_stats_lock); 2143 sbi->s_ext_blocks += ee_len; 2144 sbi->s_ext_extents++; 2145 if (ee_len < sbi->s_ext_min) 2146 sbi->s_ext_min = ee_len; 2147 if (ee_len > sbi->s_ext_max) 2148 sbi->s_ext_max = ee_len; 2149 if (ext_depth(inode) > sbi->s_depth_max) 2150 sbi->s_depth_max = ext_depth(inode); 2151 spin_unlock(&sbi->s_ext_stats_lock); 2152 } 2153#endif 2154 if (from >= le32_to_cpu(ex->ee_block) 2155 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2156 /* tail removal */ 2157 ext4_lblk_t num; 2158 ext4_fsblk_t start; 2159 2160 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2161 start = ext4_ext_pblock(ex) + ee_len - num; 2162 ext_debug("free last %u blocks starting %llu\n", num, start); 2163 ext4_free_blocks(handle, inode, NULL, start, num, flags); 2164 } else if (from == le32_to_cpu(ex->ee_block) 2165 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2166 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2167 from, to, le32_to_cpu(ex->ee_block), ee_len); 2168 } else { 2169 printk(KERN_INFO "strange request: removal(2) " 2170 "%u-%u from %u:%u\n", 2171 from, to, le32_to_cpu(ex->ee_block), ee_len); 2172 } 2173 return 0; 2174} 2175 2176static int 2177ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2178 struct ext4_ext_path *path, ext4_lblk_t start) 2179{ 2180 int err = 0, correct_index = 0; 2181 int depth = ext_depth(inode), credits; 2182 struct ext4_extent_header *eh; 2183 ext4_lblk_t a, b, block; 2184 unsigned num; 2185 ext4_lblk_t ex_ee_block; 2186 unsigned short ex_ee_len; 2187 unsigned uninitialized = 0; 2188 struct ext4_extent *ex; 2189 2190 /* the header must be checked already in ext4_ext_remove_space() */ 2191 ext_debug("truncate since %u in leaf\n", start); 2192 if (!path[depth].p_hdr) 2193 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2194 eh = path[depth].p_hdr; 2195 if (unlikely(path[depth].p_hdr == NULL)) { 2196 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2197 return -EIO; 2198 } 2199 /* find where to start removing */ 2200 ex = EXT_LAST_EXTENT(eh); 2201 2202 ex_ee_block = le32_to_cpu(ex->ee_block); 2203 ex_ee_len = ext4_ext_get_actual_len(ex); 2204 2205 while (ex >= EXT_FIRST_EXTENT(eh) && 2206 ex_ee_block + ex_ee_len > start) { 2207 2208 if (ext4_ext_is_uninitialized(ex)) 2209 uninitialized = 1; 2210 else 2211 uninitialized = 0; 2212 2213 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, 2214 uninitialized, ex_ee_len); 2215 path[depth].p_ext = ex; 2216 2217 a = ex_ee_block > start ? ex_ee_block : start; 2218 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? 
2219 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; 2220 2221 ext_debug(" border %u:%u\n", a, b); 2222 2223 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { 2224 block = 0; 2225 num = 0; 2226 BUG(); 2227 } else if (a != ex_ee_block) { 2228 /* remove tail of the extent */ 2229 block = ex_ee_block; 2230 num = a - block; 2231 } else if (b != ex_ee_block + ex_ee_len - 1) { 2232 /* remove head of the extent */ 2233 block = a; 2234 num = b - a; 2235 /* there is no "make a hole" API yet */ 2236 BUG(); 2237 } else { 2238 /* remove whole extent: excellent! */ 2239 block = ex_ee_block; 2240 num = 0; 2241 BUG_ON(a != ex_ee_block); 2242 BUG_ON(b != ex_ee_block + ex_ee_len - 1); 2243 } 2244 2245 /* 2246 * 3 for leaf, sb, and inode plus 2 (bmap and group 2247 * descriptor) for each block group; assume two block 2248 * groups plus ex_ee_len/blocks_per_block_group for 2249 * the worst case 2250 */ 2251 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2252 if (ex == EXT_FIRST_EXTENT(eh)) { 2253 correct_index = 1; 2254 credits += (ext_depth(inode)) + 1; 2255 } 2256 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2257 2258 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2259 if (err) 2260 goto out; 2261 2262 err = ext4_ext_get_access(handle, inode, path + depth); 2263 if (err) 2264 goto out; 2265 2266 err = ext4_remove_blocks(handle, inode, ex, a, b); 2267 if (err) 2268 goto out; 2269 2270 if (num == 0) { 2271 /* this extent is removed; mark slot entirely unused */ 2272 ext4_ext_store_pblock(ex, 0); 2273 le16_add_cpu(&eh->eh_entries, -1); 2274 } 2275 2276 ex->ee_block = cpu_to_le32(block); 2277 ex->ee_len = cpu_to_le16(num); 2278 /* 2279 * Do not mark uninitialized if all the blocks in the 2280 * extent have been removed. 2281 */ 2282 if (uninitialized && num) 2283 ext4_ext_mark_uninitialized(ex); 2284 2285 err = ext4_ext_dirty(handle, inode, path + depth); 2286 if (err) 2287 goto out; 2288 2289 ext_debug("new extent: %u:%u:%llu\n", block, num, 2290 ext4_ext_pblock(ex)); 2291 ex--; 2292 ex_ee_block = le32_to_cpu(ex->ee_block); 2293 ex_ee_len = ext4_ext_get_actual_len(ex); 2294 } 2295 2296 if (correct_index && eh->eh_entries) 2297 err = ext4_ext_correct_indexes(handle, inode, path); 2298 2299 /* if this leaf is free, then we should 2300 * remove it from index block above */ 2301 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2302 err = ext4_ext_rm_idx(handle, inode, path + depth); 2303 2304out: 2305 return err; 2306} 2307 2308/* 2309 * ext4_ext_more_to_rm: 2310 * returns 1 if current index has to be freed (even partial) 2311 */ 2312static int 2313ext4_ext_more_to_rm(struct ext4_ext_path *path) 2314{ 2315 BUG_ON(path->p_idx == NULL); 2316 2317 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2318 return 0; 2319 2320 /* 2321 * if truncate on deeper level happened, it wasn't partial, 2322 * so we have to consider current index for truncation 2323 */ 2324 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2325 return 0; 2326 return 1; 2327} 2328 2329static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2330{ 2331 struct super_block *sb = inode->i_sb; 2332 int depth = ext_depth(inode); 2333 struct ext4_ext_path *path; 2334 handle_t *handle; 2335 int i, err; 2336 2337 ext_debug("truncate since %u\n", start); 2338 2339 /* probably first extent we're gonna free will be last in block */ 2340 handle = ext4_journal_start(inode, depth + 1); 2341 if (IS_ERR(handle)) 2342 return PTR_ERR(handle); 2343 2344again: 2345 
ext4_ext_invalidate_cache(inode); 2346 2347 /* 2348 * We start scanning from the right side, freeing all the blocks 2349 * after i_size and walking into the tree depth-wise. 2350 */ 2351 depth = ext_depth(inode); 2352 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2353 if (path == NULL) { 2354 ext4_journal_stop(handle); 2355 return -ENOMEM; 2356 } 2357 path[0].p_depth = depth; 2358 path[0].p_hdr = ext_inode_hdr(inode); 2359 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2360 err = -EIO; 2361 goto out; 2362 } 2363 i = err = 0; 2364 2365 while (i >= 0 && err == 0) { 2366 if (i == depth) { 2367 /* this is leaf block */ 2368 err = ext4_ext_rm_leaf(handle, inode, path, start); 2369 /* root level has p_bh == NULL, brelse() eats this */ 2370 brelse(path[i].p_bh); 2371 path[i].p_bh = NULL; 2372 i--; 2373 continue; 2374 } 2375 2376 /* this is index block */ 2377 if (!path[i].p_hdr) { 2378 ext_debug("initialize header\n"); 2379 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 2380 } 2381 2382 if (!path[i].p_idx) { 2383 /* this level hasn't been touched yet */ 2384 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 2385 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 2386 ext_debug("init index ptr: hdr 0x%p, num %d\n", 2387 path[i].p_hdr, 2388 le16_to_cpu(path[i].p_hdr->eh_entries)); 2389 } else { 2390 /* we were already here, so look at the next index */ 2391 path[i].p_idx--; 2392 } 2393 2394 ext_debug("level %d - index, first 0x%p, cur 0x%p\n", 2395 i, EXT_FIRST_INDEX(path[i].p_hdr), 2396 path[i].p_idx); 2397 if (ext4_ext_more_to_rm(path + i)) { 2398 struct buffer_head *bh; 2399 /* go to the next level */ 2400 ext_debug("move to level %d (block %llu)\n", 2401 i + 1, ext4_idx_pblock(path[i].p_idx)); 2402 memset(path + i + 1, 0, sizeof(*path)); 2403 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); 2404 if (!bh) { 2405 /* should we reset i_size? 
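 * For now a failed sb_bread() simply aborts the walk with -EIO and leaves the remaining extents in place.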
*/ 2406 err = -EIO; 2407 break; 2408 } 2409 if (WARN_ON(i + 1 > depth)) { 2410 err = -EIO; 2411 break; 2412 } 2413 if (ext4_ext_check(inode, ext_block_hdr(bh), 2414 depth - i - 1)) { 2415 err = -EIO; 2416 break; 2417 } 2418 path[i + 1].p_bh = bh; 2419 2420 /* save actual number of indexes since this 2421 * number is changed at the next iteration */ 2422 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 2423 i++; 2424 } else { 2425 /* we finished processing this index, go up */ 2426 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 2427 /* index is empty, remove it; 2428 * handle must be already prepared by the 2429 * truncatei_leaf() */ 2430 err = ext4_ext_rm_idx(handle, inode, path + i); 2431 } 2432 /* root level has p_bh == NULL, brelse() eats this */ 2433 brelse(path[i].p_bh); 2434 path[i].p_bh = NULL; 2435 i--; 2436 ext_debug("return to level %d\n", i); 2437 } 2438 } 2439 2440 /* TODO: flexible tree reduction should be here */ 2441 if (path->p_hdr->eh_entries == 0) { 2442 /* 2443 * truncate to zero freed all the tree, 2444 * so we need to correct eh_depth 2445 */ 2446 err = ext4_ext_get_access(handle, inode, path); 2447 if (err == 0) { 2448 ext_inode_hdr(inode)->eh_depth = 0; 2449 ext_inode_hdr(inode)->eh_max = 2450 cpu_to_le16(ext4_ext_space_root(inode, 0)); 2451 err = ext4_ext_dirty(handle, inode, path); 2452 } 2453 } 2454out: 2455 ext4_ext_drop_refs(path); 2456 kfree(path); 2457 if (err == -EAGAIN) 2458 goto again; 2459 ext4_journal_stop(handle); 2460 2461 return err; 2462} 2463 2464/* 2465 * called at mount time 2466 */ 2467void ext4_ext_init(struct super_block *sb) 2468{ 2469 /* 2470 * possible initialization would be here 2471 */ 2472 2473 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2474#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 2475 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2476#ifdef AGGRESSIVE_TEST 2477 printk(", aggressive tests"); 2478#endif 2479#ifdef CHECK_BINSEARCH 2480 printk(", check binsearch"); 2481#endif 2482#ifdef EXTENTS_STATS 2483 printk(", stats"); 2484#endif 2485 printk("\n"); 2486#endif 2487#ifdef EXTENTS_STATS 2488 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2489 EXT4_SB(sb)->s_ext_min = 1 << 30; 2490 EXT4_SB(sb)->s_ext_max = 0; 2491#endif 2492 } 2493} 2494 2495/* 2496 * called at umount time 2497 */ 2498void ext4_ext_release(struct super_block *sb) 2499{ 2500 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) 2501 return; 2502 2503#ifdef EXTENTS_STATS 2504 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 2505 struct ext4_sb_info *sbi = EXT4_SB(sb); 2506 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 2507 sbi->s_ext_blocks, sbi->s_ext_extents, 2508 sbi->s_ext_blocks / sbi->s_ext_extents); 2509 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 2510 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 2511 } 2512#endif 2513} 2514 2515/* FIXME!! 
we need to try to merge to left or right after zero-out */ 2516static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2517{ 2518 ext4_fsblk_t ee_pblock; 2519 unsigned int ee_len; 2520 int ret; 2521 2522 ee_len = ext4_ext_get_actual_len(ex); 2523 ee_pblock = ext4_ext_pblock(ex); 2524 2525 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); 2526 if (ret > 0) 2527 ret = 0; 2528 2529 return ret; 2530} 2531 2532#define EXT4_EXT_ZERO_LEN 7 2533/* 2534 * This function is called by ext4_ext_map_blocks() if someone tries to write 2535 * to an uninitialized extent. It may result in splitting the uninitialized 2536 * extent into multiple extents (up to three - one initialized and two 2537 * uninitialized). 2538 * There are three possibilities: 2539 * a> There is no split required: Entire extent should be initialized 2540 * b> Splits in two extents: Write is happening at either end of the extent 2541 * c> Splits in three extents: Someone is writing in the middle of the extent 2542 */ 2543static int ext4_ext_convert_to_initialized(handle_t *handle, 2544 struct inode *inode, 2545 struct ext4_map_blocks *map, 2546 struct ext4_ext_path *path) 2547{ 2548 struct ext4_extent *ex, newex, orig_ex; 2549 struct ext4_extent *ex1 = NULL; 2550 struct ext4_extent *ex2 = NULL; 2551 struct ext4_extent *ex3 = NULL; 2552 struct ext4_extent_header *eh; 2553 ext4_lblk_t ee_block, eof_block; 2554 unsigned int allocated, ee_len, depth; 2555 ext4_fsblk_t newblock; 2556 int err = 0; 2557 int ret = 0; 2558 int may_zeroout; 2559 2560 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical " 2561 "block %llu, max_blocks %u\n", inode->i_ino, 2562 (unsigned long long)map->m_lblk, map->m_len); 2563 2564 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 2565 inode->i_sb->s_blocksize_bits; 2566 if (eof_block < map->m_lblk + map->m_len) 2567 eof_block = map->m_lblk + map->m_len; 2568 2569 depth = ext_depth(inode); 2570 eh = path[depth].p_hdr; 2571 ex = path[depth].p_ext; 2572 ee_block = le32_to_cpu(ex->ee_block); 2573 ee_len = ext4_ext_get_actual_len(ex); 2574 allocated = ee_len - (map->m_lblk - ee_block); 2575 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); 2576 2577 ex2 = ex; 2578 orig_ex.ee_block = ex->ee_block; 2579 orig_ex.ee_len = cpu_to_le16(ee_len); 2580 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); 2581 2582 /* 2583 * It is safe to convert the extent to initialized via explicit 2584 * zeroout only if the extent is fully inside i_size or new_size. 2585 */ 2586 may_zeroout = ee_block + ee_len <= eof_block; 2587 2588 err = ext4_ext_get_access(handle, inode, path + depth); 2589 if (err) 2590 goto out; 2591 /* If the extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero out directly */ 2592 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2593 err = ext4_ext_zeroout(inode, &orig_ex); 2594 if (err) 2595 goto fix_extent_len; 2596 /* update the extent length and mark as initialized */ 2597 ex->ee_block = orig_ex.ee_block; 2598 ex->ee_len = orig_ex.ee_len; 2599 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2600 ext4_ext_dirty(handle, inode, path + depth); 2601 /* zeroed the full extent */ 2602 return allocated; 2603 } 2604 2605 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ 2606 if (map->m_lblk > ee_block) { 2607 ex1 = ex; 2608 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2609 ext4_ext_mark_uninitialized(ex1); 2610 ex2 = &newex; 2611 } 2612 /* 2613 * for sanity, update the length of the ex2 extent before 2614 * we insert ex3, if ex1 is NULL. 
This is to avoid temporary 2615 * overlap of blocks. 2616 */ 2617 if (!ex1 && allocated > map->m_len) 2618 ex2->ee_len = cpu_to_le16(map->m_len); 2619 /* ex3: to ee_block + ee_len : uninitialised */ 2620 if (allocated > map->m_len) { 2621 unsigned int newdepth; 2622 /* If the extent has less than EXT4_EXT_ZERO_LEN blocks, zero out directly */ 2623 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { 2624 /* 2625 * map->m_lblk == ee_block is handled by the zeroout 2626 * at the beginning. 2627 * Mark first half uninitialized. 2628 * Mark second half initialized and zero out the 2629 * initialized extent 2630 */ 2631 ex->ee_block = orig_ex.ee_block; 2632 ex->ee_len = cpu_to_le16(ee_len - allocated); 2633 ext4_ext_mark_uninitialized(ex); 2634 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2635 ext4_ext_dirty(handle, inode, path + depth); 2636 2637 ex3 = &newex; 2638 ex3->ee_block = cpu_to_le32(map->m_lblk); 2639 ext4_ext_store_pblock(ex3, newblock); 2640 ex3->ee_len = cpu_to_le16(allocated); 2641 err = ext4_ext_insert_extent(handle, inode, path, 2642 ex3, 0); 2643 if (err == -ENOSPC) { 2644 err = ext4_ext_zeroout(inode, &orig_ex); 2645 if (err) 2646 goto fix_extent_len; 2647 ex->ee_block = orig_ex.ee_block; 2648 ex->ee_len = orig_ex.ee_len; 2649 ext4_ext_store_pblock(ex, 2650 ext4_ext_pblock(&orig_ex)); 2651 ext4_ext_dirty(handle, inode, path + depth); 2652 /* blocks available from map->m_lblk */ 2653 return allocated; 2654 2655 } else if (err) 2656 goto fix_extent_len; 2657 2658 /* 2659 * We need to zero out the second half because 2660 * a fallocate request can update the file size and 2661 * converting the second half to an initialized extent 2662 * implies that we could leak junk data to user 2663 * space. 2664 */ 2665 err = ext4_ext_zeroout(inode, ex3); 2666 if (err) { 2667 /* 2668 * We should actually mark the 2669 * second half as uninitialized and return an error; 2670 * the insert would have changed the extent. 2671 */ 2672 depth = ext_depth(inode); 2673 ext4_ext_drop_refs(path); 2674 path = ext4_ext_find_extent(inode, map->m_lblk, 2675 path); 2676 if (IS_ERR(path)) { 2677 err = PTR_ERR(path); 2678 return err; 2679 } 2680 /* get the second half extent details */ 2681 ex = path[depth].p_ext; 2682 err = ext4_ext_get_access(handle, inode, 2683 path + depth); 2684 if (err) 2685 return err; 2686 ext4_ext_mark_uninitialized(ex); 2687 ext4_ext_dirty(handle, inode, path + depth); 2688 return err; 2689 } 2690 2691 /* zeroed the second half */ 2692 return allocated; 2693 } 2694 ex3 = &newex; 2695 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); 2696 ext4_ext_store_pblock(ex3, newblock + map->m_len); 2697 ex3->ee_len = cpu_to_le16(allocated - map->m_len); 2698 ext4_ext_mark_uninitialized(ex3); 2699 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2700 if (err == -ENOSPC && may_zeroout) { 2701 err = ext4_ext_zeroout(inode, &orig_ex); 2702 if (err) 2703 goto fix_extent_len; 2704 /* update the extent length and mark as initialized */ 2705 ex->ee_block = orig_ex.ee_block; 2706 ex->ee_len = orig_ex.ee_len; 2707 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2708 ext4_ext_dirty(handle, inode, path + depth); 2709 /* zeroed the full extent */ 2710 /* blocks available from map->m_lblk */ 2711 return allocated; 2712 2713 } else if (err) 2714 goto fix_extent_len; 2715 /* 2716 * The depth, and hence eh & ex might change 2717 * as part of the insert above. 
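 * This is why the path is dropped and looked up again with ext4_ext_find_extent() below before ex is dereferenced.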
2718 */ 2719 newdepth = ext_depth(inode); 2720 /* 2721 * update the extent length after successful insert of the 2722 * split extent 2723 */ 2724 ee_len -= ext4_ext_get_actual_len(ex3); 2725 orig_ex.ee_len = cpu_to_le16(ee_len); 2726 may_zeroout = ee_block + ee_len <= eof_block; 2727 2728 depth = newdepth; 2729 ext4_ext_drop_refs(path); 2730 path = ext4_ext_find_extent(inode, map->m_lblk, path); 2731 if (IS_ERR(path)) { 2732 err = PTR_ERR(path); 2733 goto out; 2734 } 2735 eh = path[depth].p_hdr; 2736 ex = path[depth].p_ext; 2737 if (ex2 != &newex) 2738 ex2 = ex; 2739 2740 err = ext4_ext_get_access(handle, inode, path + depth); 2741 if (err) 2742 goto out; 2743 2744 allocated = map->m_len; 2745 2746 /* If the extent has less than EXT4_EXT_ZERO_LEN blocks and we are trying 2747 * to insert an extent in the middle, zero out directly; 2748 * otherwise give the extent a chance to merge to the left 2749 */ 2750 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2751 map->m_lblk != ee_block && may_zeroout) { 2752 err = ext4_ext_zeroout(inode, &orig_ex); 2753 if (err) 2754 goto fix_extent_len; 2755 /* update the extent length and mark as initialized */ 2756 ex->ee_block = orig_ex.ee_block; 2757 ex->ee_len = orig_ex.ee_len; 2758 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2759 ext4_ext_dirty(handle, inode, path + depth); 2760 /* zero out the first half */ 2761 /* blocks available from map->m_lblk */ 2762 return allocated; 2763 } 2764 } 2765 /* 2766 * If there was a change of depth as part of the 2767 * insertion of ex3 above, we need to update the length 2768 * of the ex1 extent again here 2769 */ 2770 if (ex1 && ex1 != ex) { 2771 ex1 = ex; 2772 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2773 ext4_ext_mark_uninitialized(ex1); 2774 ex2 = &newex; 2775 } 2776 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ 2777 ex2->ee_block = cpu_to_le32(map->m_lblk); 2778 ext4_ext_store_pblock(ex2, newblock); 2779 ex2->ee_len = cpu_to_le16(allocated); 2780 if (ex2 != ex) 2781 goto insert; 2782 /* 2783 * New (initialized) extent starts from the first block 2784 * in the current extent. i.e., ex2 == ex 2785 * We have to see if it can be merged with the extent 2786 * on the left. 2787 */ 2788 if (ex2 > EXT_FIRST_EXTENT(eh)) { 2789 /* 2790 * To merge left, pass "ex2 - 1" to try_to_merge(), 2791 * since it merges towards right _only_. 2792 */ 2793 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); 2794 if (ret) { 2795 err = ext4_ext_correct_indexes(handle, inode, path); 2796 if (err) 2797 goto out; 2798 depth = ext_depth(inode); 2799 ex2--; 2800 } 2801 } 2802 /* 2803 * Try to merge towards the right. This might be required 2804 * only when the whole extent is being written to. 2805 * i.e. ex2 == ex and ex3 == NULL. 
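 * (If ex3 was inserted, the extent on the right is the still-uninitialized tail, which cannot be merged with the now-initialized ex2.)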
2806 */ 2807 if (!ex3) { 2808 ret = ext4_ext_try_to_merge(inode, path, ex2); 2809 if (ret) { 2810 err = ext4_ext_correct_indexes(handle, inode, path); 2811 if (err) 2812 goto out; 2813 } 2814 } 2815 /* Mark modified extent as dirty */ 2816 err = ext4_ext_dirty(handle, inode, path + depth); 2817 goto out; 2818insert: 2819 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2820 if (err == -ENOSPC && may_zeroout) { 2821 err = ext4_ext_zeroout(inode, &orig_ex); 2822 if (err) 2823 goto fix_extent_len; 2824 /* update the extent length and mark as initialized */ 2825 ex->ee_block = orig_ex.ee_block; 2826 ex->ee_len = orig_ex.ee_len; 2827 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2828 ext4_ext_dirty(handle, inode, path + depth); 2829 /* zero out the first half */ 2830 return allocated; 2831 } else if (err) 2832 goto fix_extent_len; 2833out: 2834 ext4_ext_show_leaf(inode, path); 2835 return err ? err : allocated; 2836 2837fix_extent_len: 2838 ex->ee_block = orig_ex.ee_block; 2839 ex->ee_len = orig_ex.ee_len; 2840 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2841 ext4_ext_mark_uninitialized(ex); 2842 ext4_ext_dirty(handle, inode, path + depth); 2843 return err; 2844} 2845 2846/* 2847 * This function is called by ext4_ext_map_blocks() from 2848 * ext4_get_blocks_dio_write() when DIO is used to write 2849 * to an uninitialized extent. 2850 * 2851 * Writing to an uninitialized extent may result in splitting the uninitialized 2852 * extent into multiple initialized/uninitialized extents (up to three). 2853 * There are three possibilities: 2854 * a> There is no split required: Entire extent should be uninitialized 2855 * b> Splits in two extents: Write is happening at either end of the extent 2856 * c> Splits in three extents: Someone is writing in the middle of the extent 2857 * 2858 * One or more index blocks may be needed if the extent tree grows after 2859 * the uninitialized extent is split. To prevent ENOSPC from occurring at IO 2860 * completion, we need to split the uninitialized extent before the DIO is 2861 * submitted. The uninitialized extent in question will be split 2862 * into (at most) three uninitialized extents. After IO completes, the part 2863 * being filled will be converted to initialized by the end_io callback function 2864 * via ext4_convert_unwritten_extents(). 2865 * 2866 * Returns the size of the uninitialized extent to be written on success. 
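 * For example, a DIO write covering the middle of the uninitialized extent [ee_block, ee_block + ee_len) leaves the head [ee_block, m_lblk) and the tail [m_lblk + m_len, ee_block + ee_len) as uninitialized extents; the middle piece also stays uninitialized and is only converted once the IO has completed.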
2867 */ 2868static int ext4_split_unwritten_extents(handle_t *handle, 2869 struct inode *inode, 2870 struct ext4_map_blocks *map, 2871 struct ext4_ext_path *path, 2872 int flags) 2873{ 2874 struct ext4_extent *ex, newex, orig_ex; 2875 struct ext4_extent *ex1 = NULL; 2876 struct ext4_extent *ex2 = NULL; 2877 struct ext4_extent *ex3 = NULL; 2878 ext4_lblk_t ee_block, eof_block; 2879 unsigned int allocated, ee_len, depth; 2880 ext4_fsblk_t newblock; 2881 int err = 0; 2882 int may_zeroout; 2883 2884 ext_debug("ext4_split_unwritten_extents: inode %lu, logical " 2885 "block %llu, max_blocks %u\n", inode->i_ino, 2886 (unsigned long long)map->m_lblk, map->m_len); 2887 2888 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 2889 inode->i_sb->s_blocksize_bits; 2890 if (eof_block < map->m_lblk + map->m_len) 2891 eof_block = map->m_lblk + map->m_len; 2892 2893 depth = ext_depth(inode); 2894 ex = path[depth].p_ext; 2895 ee_block = le32_to_cpu(ex->ee_block); 2896 ee_len = ext4_ext_get_actual_len(ex); 2897 allocated = ee_len - (map->m_lblk - ee_block); 2898 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); 2899 2900 ex2 = ex; 2901 orig_ex.ee_block = ex->ee_block; 2902 orig_ex.ee_len = cpu_to_le16(ee_len); 2903 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); 2904 2905 /* 2906 * It is safe to convert the extent to initialized via explicit 2907 * zeroout only if the extent is fully inside i_size or new_size. 2908 */ 2909 may_zeroout = ee_block + ee_len <= eof_block; 2910 2911 /* 2912 * If the uninitialized extent begins at the same logical 2913 * block where the write begins, and the write completely 2914 * covers the extent, then we don't need to split it. 2915 */ 2916 if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) 2917 return allocated; 2918 2919 err = ext4_ext_get_access(handle, inode, path + depth); 2920 if (err) 2921 goto out; 2922 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ 2923 if (map->m_lblk > ee_block) { 2924 ex1 = ex; 2925 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2926 ext4_ext_mark_uninitialized(ex1); 2927 ex2 = &newex; 2928 } 2929 /* 2930 * for sanity, update the length of the ex2 extent before 2931 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2932 * overlap of blocks. 2933 */ 2934 if (!ex1 && allocated > map->m_len) 2935 ex2->ee_len = cpu_to_le16(map->m_len); 2936 /* ex3: to ee_block + ee_len : uninitialised */ 2937 if (allocated > map->m_len) { 2938 unsigned int newdepth; 2939 ex3 = &newex; 2940 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); 2941 ext4_ext_store_pblock(ex3, newblock + map->m_len); 2942 ex3->ee_len = cpu_to_le16(allocated - map->m_len); 2943 ext4_ext_mark_uninitialized(ex3); 2944 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 2945 if (err == -ENOSPC && may_zeroout) { 2946 err = ext4_ext_zeroout(inode, &orig_ex); 2947 if (err) 2948 goto fix_extent_len; 2949 /* update the extent length and mark as initialized */ 2950 ex->ee_block = orig_ex.ee_block; 2951 ex->ee_len = orig_ex.ee_len; 2952 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2953 ext4_ext_dirty(handle, inode, path + depth); 2954 /* zeroed the full extent */ 2955 /* blocks available from map->m_lblk */ 2956 return allocated; 2957 2958 } else if (err) 2959 goto fix_extent_len; 2960 /* 2961 * The depth, and hence eh & ex might change 2962 * as part of the insert above. 
2963 */ 2964 newdepth = ext_depth(inode); 2965 /* 2966 * update the extent length after successful insert of the 2967 * split extent 2968 */ 2969 ee_len -= ext4_ext_get_actual_len(ex3); 2970 orig_ex.ee_len = cpu_to_le16(ee_len); 2971 may_zeroout = ee_block + ee_len <= eof_block; 2972 2973 depth = newdepth; 2974 ext4_ext_drop_refs(path); 2975 path = ext4_ext_find_extent(inode, map->m_lblk, path); 2976 if (IS_ERR(path)) { 2977 err = PTR_ERR(path); 2978 goto out; 2979 } 2980 ex = path[depth].p_ext; 2981 if (ex2 != &newex) 2982 ex2 = ex; 2983 2984 err = ext4_ext_get_access(handle, inode, path + depth); 2985 if (err) 2986 goto out; 2987 2988 allocated = map->m_len; 2989 } 2990 /* 2991 * If there was a change of depth as part of the 2992 * insertion of ex3 above, we need to update the length 2993 * of the ex1 extent again here 2994 */ 2995 if (ex1 && ex1 != ex) { 2996 ex1 = ex; 2997 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2998 ext4_ext_mark_uninitialized(ex1); 2999 ex2 = &newex; 3000 } 3001 /* 3002 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written 3003 * using direct I/O, uninitialised still. 3004 */ 3005 ex2->ee_block = cpu_to_le32(map->m_lblk); 3006 ext4_ext_store_pblock(ex2, newblock); 3007 ex2->ee_len = cpu_to_le16(allocated); 3008 ext4_ext_mark_uninitialized(ex2); 3009 if (ex2 != ex) 3010 goto insert; 3011 /* Mark modified extent as dirty */ 3012 err = ext4_ext_dirty(handle, inode, path + depth); 3013 ext_debug("out here\n"); 3014 goto out; 3015insert: 3016 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3017 if (err == -ENOSPC && may_zeroout) { 3018 err = ext4_ext_zeroout(inode, &orig_ex); 3019 if (err) 3020 goto fix_extent_len; 3021 /* update the extent length and mark as initialized */ 3022 ex->ee_block = orig_ex.ee_block; 3023 ex->ee_len = orig_ex.ee_len; 3024 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 3025 ext4_ext_dirty(handle, inode, path + depth); 3026 /* zero out the first half */ 3027 return allocated; 3028 } else if (err) 3029 goto fix_extent_len; 3030out: 3031 ext4_ext_show_leaf(inode, path); 3032 return err ? err : allocated; 3033 3034fix_extent_len: 3035 ex->ee_block = orig_ex.ee_block; 3036 ex->ee_len = orig_ex.ee_len; 3037 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 3038 ext4_ext_mark_uninitialized(ex); 3039 ext4_ext_dirty(handle, inode, path + depth); 3040 return err; 3041} 3042static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3043 struct inode *inode, 3044 struct ext4_ext_path *path) 3045{ 3046 struct ext4_extent *ex; 3047 struct ext4_extent_header *eh; 3048 int depth; 3049 int err = 0; 3050 int ret = 0; 3051 3052 depth = ext_depth(inode); 3053 eh = path[depth].p_hdr; 3054 ex = path[depth].p_ext; 3055 3056 err = ext4_ext_get_access(handle, inode, path + depth); 3057 if (err) 3058 goto out; 3059 /* first mark the extent as initialized */ 3060 ext4_ext_mark_initialized(ex); 3061 3062 /* 3063 * We have to see if it can be merged with the extent 3064 * on the left. 3065 */ 3066 if (ex > EXT_FIRST_EXTENT(eh)) { 3067 /* 3068 * To merge left, pass "ex - 1" to try_to_merge(), 3069 * since it merges towards right _only_. 3070 */ 3071 ret = ext4_ext_try_to_merge(inode, path, ex - 1); 3072 if (ret) { 3073 err = ext4_ext_correct_indexes(handle, inode, path); 3074 if (err) 3075 goto out; 3076 depth = ext_depth(inode); 3077 ex--; 3078 } 3079 } 3080 /* 3081 * Try to Merge towards right. 
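 * Now that ex has been marked initialized it may also have become mergeable with its right neighbour.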
3082 */ 3083 ret = ext4_ext_try_to_merge(inode, path, ex); 3084 if (ret) { 3085 err = ext4_ext_correct_indexes(handle, inode, path); 3086 if (err) 3087 goto out; 3088 depth = ext_depth(inode); 3089 } 3090 /* Mark modified extent as dirty */ 3091 err = ext4_ext_dirty(handle, inode, path + depth); 3092out: 3093 ext4_ext_show_leaf(inode, path); 3094 return err; 3095} 3096 3097static void unmap_underlying_metadata_blocks(struct block_device *bdev, 3098 sector_t block, int count) 3099{ 3100 int i; 3101 for (i = 0; i < count; i++) 3102 unmap_underlying_metadata(bdev, block + i); 3103} 3104 3105/* 3106 * Handle EOFBLOCKS_FL flag, clearing it if necessary 3107 */ 3108static int check_eofblocks_fl(handle_t *handle, struct inode *inode, 3109 ext4_lblk_t lblk, 3110 struct ext4_ext_path *path, 3111 unsigned int len) 3112{ 3113 int i, depth; 3114 struct ext4_extent_header *eh; 3115 struct ext4_extent *last_ex; 3116 3117 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 3118 return 0; 3119 3120 depth = ext_depth(inode); 3121 eh = path[depth].p_hdr; 3122 3123 if (unlikely(!eh->eh_entries)) { 3124 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " 3125 "EOFBLOCKS_FL set"); 3126 return -EIO; 3127 } 3128 last_ex = EXT_LAST_EXTENT(eh); 3129 /* 3130 * We should clear the EOFBLOCKS_FL flag if we are writing the 3131 * last block in the last extent in the file. We test this by 3132 * first checking to see if the caller to 3133 * ext4_ext_get_blocks() was interested in the last block (or 3134 * a block beyond the last block) in the current extent. If 3135 * this turns out to be false, we can bail out from this 3136 * function immediately. 3137 */ 3138 if (lblk + len < le32_to_cpu(last_ex->ee_block) + 3139 ext4_ext_get_actual_len(last_ex)) 3140 return 0; 3141 /* 3142 * If the caller does appear to be planning to write at or 3143 * beyond the end of the current extent, we then test to see 3144 * if the current extent is the last extent in the file, by 3145 * checking to make sure it was reached via the rightmost node 3146 * at each level of the tree. 
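 * (i.e. path[i].p_idx == EXT_LAST_INDEX(path[i].p_hdr) at every index level above the leaf).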
3147 */ 3148 for (i = depth-1; i >= 0; i--) 3149 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3150 return 0; 3151 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3152 return ext4_mark_inode_dirty(handle, inode); 3153} 3154 3155static int 3156ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3157 struct ext4_map_blocks *map, 3158 struct ext4_ext_path *path, int flags, 3159 unsigned int allocated, ext4_fsblk_t newblock) 3160{ 3161 int ret = 0; 3162 int err = 0; 3163 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3164 3165 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " 3166 "block %llu, max_blocks %u, flags %d, allocated %u", 3167 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3168 flags, allocated); 3169 ext4_ext_show_leaf(inode, path); 3170 3171 /* get_block() called before submitting the IO: split the extent */ 3172 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3173 ret = ext4_split_unwritten_extents(handle, inode, map, 3174 path, flags); 3175 /* 3176 * Flag the inode (non-AIO case) or the end_io struct (AIO case) 3177 * that this IO needs conversion to written when the IO is 3178 * completed 3179 */ 3180 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 3181 io->flag = EXT4_IO_END_UNWRITTEN; 3182 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 3183 } else 3184 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3185 if (ext4_should_dioread_nolock(inode)) 3186 map->m_flags |= EXT4_MAP_UNINIT; 3187 goto out; 3188 } 3189 /* IO end_io completed: convert the filled extent to written */ 3190 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3191 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3192 path); 3193 if (ret >= 0) { 3194 ext4_update_inode_fsync_trans(handle, inode, 1); 3195 err = check_eofblocks_fl(handle, inode, map->m_lblk, 3196 path, map->m_len); 3197 } else 3198 err = ret; 3199 goto out2; 3200 } 3201 /* buffered IO case */ 3202 /* 3203 * repeated fallocate creation request: 3204 * we already have an unwritten extent 3205 */ 3206 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3207 goto map_out; 3208 3209 /* buffered READ or buffered write_begin() lookup */ 3210 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3211 /* 3212 * We have blocks reserved already. We 3213 * return allocated blocks so that delalloc 3214 * won't do block reservation for us. But 3215 * the buffer head will be unmapped so that 3216 * a read from the block returns 0s. 3217 */ 3218 map->m_flags |= EXT4_MAP_UNWRITTEN; 3219 goto out1; 3220 } 3221 3222 /* buffered write, writepage time, convert */ 3223 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3224 if (ret >= 0) { 3225 ext4_update_inode_fsync_trans(handle, inode, 1); 3226 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, 3227 map->m_len); 3228 if (err < 0) 3229 goto out2; 3230 } 3231 3232out: 3233 if (ret <= 0) { 3234 err = ret; 3235 goto out2; 3236 } else 3237 allocated = ret; 3238 map->m_flags |= EXT4_MAP_NEW; 3239 /* 3240 * if we allocated more blocks than requested, 3241 * we need to make sure we unmap the extra blocks 3242 * allocated. The block actually needed will get 3243 * unmapped later when we find the buffer_head marked 3244 * new. 
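 * unmap_underlying_metadata_blocks(), called just below, walks that extra range one block at a time and drops any stale buffer_head alias the block device mapping may still hold.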
3245 */ 3246 if (allocated > map->m_len) { 3247 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3248 newblock + map->m_len, 3249 allocated - map->m_len); 3250 allocated = map->m_len; 3251 } 3252 3253 /* 3254 * If we have done fallocate with the offset that is already 3255 * delayed allocated, we would have block reservation 3256 * and quota reservation done in the delayed write path. 3257 * But fallocate would have already updated quota and block 3258 * count for this offset. So cancel these reservation 3259 */ 3260 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3261 ext4_da_update_reserve_space(inode, allocated, 0); 3262 3263map_out: 3264 map->m_flags |= EXT4_MAP_MAPPED; 3265out1: 3266 if (allocated > map->m_len) 3267 allocated = map->m_len; 3268 ext4_ext_show_leaf(inode, path); 3269 map->m_pblk = newblock; 3270 map->m_len = allocated; 3271out2: 3272 if (path) { 3273 ext4_ext_drop_refs(path); 3274 kfree(path); 3275 } 3276 return err ? err : allocated; 3277} 3278 3279/* 3280 * Block allocation/map/preallocation routine for extents based files 3281 * 3282 * 3283 * Need to be called with 3284 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 3285 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 3286 * 3287 * return > 0, number of of blocks already mapped/allocated 3288 * if create == 0 and these are pre-allocated blocks 3289 * buffer head is unmapped 3290 * otherwise blocks are mapped 3291 * 3292 * return = 0, if plain look up failed (blocks have not been allocated) 3293 * buffer head is unmapped 3294 * 3295 * return < 0, error case. 3296 */ 3297int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 3298 struct ext4_map_blocks *map, int flags) 3299{ 3300 struct ext4_ext_path *path = NULL; 3301 struct ext4_extent newex, *ex; 3302 ext4_fsblk_t newblock = 0; 3303 int err = 0, depth, ret; 3304 unsigned int allocated = 0; 3305 struct ext4_allocation_request ar; 3306 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3307 3308 ext_debug("blocks %u/%u requested for inode %lu\n", 3309 map->m_lblk, map->m_len, inode->i_ino); 3310 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3311 3312 /* check in cache */ 3313 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3314 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3315 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3316 /* 3317 * block isn't allocated yet and 3318 * user doesn't want to allocate it 3319 */ 3320 goto out2; 3321 } 3322 /* we should allocate requested block */ 3323 } else { 3324 /* block is already allocated */ 3325 newblock = map->m_lblk 3326 - le32_to_cpu(newex.ee_block) 3327 + ext4_ext_pblock(&newex); 3328 /* number of remaining blocks in the extent */ 3329 allocated = ext4_ext_get_actual_len(&newex) - 3330 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3331 goto out; 3332 } 3333 } 3334 3335 /* find extent for this block */ 3336 path = ext4_ext_find_extent(inode, map->m_lblk, NULL); 3337 if (IS_ERR(path)) { 3338 err = PTR_ERR(path); 3339 path = NULL; 3340 goto out2; 3341 } 3342 3343 depth = ext_depth(inode); 3344 3345 /* 3346 * consistent leaf must not be empty; 3347 * this situation is possible, though, _during_ tree modification; 3348 * this is why assert can't be put in ext4_ext_find_extent() 3349 */ 3350 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3351 EXT4_ERROR_INODE(inode, "bad extent address " 3352 "lblock: %lu, depth: %d pblock %lld", 3353 (unsigned long) map->m_lblk, depth, 3354 path[depth].p_block); 3355 err = -EIO; 3356 goto 
out2; 3357 } 3358 3359 ex = path[depth].p_ext; 3360 if (ex) { 3361 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3362 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 3363 unsigned short ee_len; 3364 3365 /* 3366 * Uninitialized extents are treated as holes, except that 3367 * we split out initialized portions during a write. 3368 */ 3369 ee_len = ext4_ext_get_actual_len(ex); 3370 /* if found extent covers block, simply return it */ 3371 if (in_range(map->m_lblk, ee_block, ee_len)) { 3372 newblock = map->m_lblk - ee_block + ee_start; 3373 /* number of remaining blocks in the extent */ 3374 allocated = ee_len - (map->m_lblk - ee_block); 3375 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3376 ee_block, ee_len, newblock); 3377 3378 /* Do not put uninitialized extent in the cache */ 3379 if (!ext4_ext_is_uninitialized(ex)) { 3380 ext4_ext_put_in_cache(inode, ee_block, 3381 ee_len, ee_start); 3382 goto out; 3383 } 3384 ret = ext4_ext_handle_uninitialized_extents(handle, 3385 inode, map, path, flags, allocated, 3386 newblock); 3387 return ret; 3388 } 3389 } 3390 3391 /* 3392 * requested block isn't allocated yet; 3393 * we couldn't try to create block if create flag is zero 3394 */ 3395 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3396 /* 3397 * put just found gap into cache to speed up 3398 * subsequent requests 3399 */ 3400 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 3401 goto out2; 3402 } 3403 /* 3404 * Okay, we need to do block allocation. 3405 */ 3406 3407 /* find neighbour allocated blocks */ 3408 ar.lleft = map->m_lblk; 3409 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3410 if (err) 3411 goto out2; 3412 ar.lright = map->m_lblk; 3413 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3414 if (err) 3415 goto out2; 3416 3417 /* 3418 * See if request is beyond maximum number of blocks we can have in 3419 * a single extent. For an initialized extent this limit is 3420 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3421 * EXT_UNINIT_MAX_LEN. 3422 */ 3423 if (map->m_len > EXT_INIT_MAX_LEN && 3424 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3425 map->m_len = EXT_INIT_MAX_LEN; 3426 else if (map->m_len > EXT_UNINIT_MAX_LEN && 3427 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3428 map->m_len = EXT_UNINIT_MAX_LEN; 3429 3430 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 3431 newex.ee_block = cpu_to_le32(map->m_lblk); 3432 newex.ee_len = cpu_to_le16(map->m_len); 3433 err = ext4_ext_check_overlap(inode, &newex, path); 3434 if (err) 3435 allocated = ext4_ext_get_actual_len(&newex); 3436 else 3437 allocated = map->m_len; 3438 3439 /* allocate new block */ 3440 ar.inode = inode; 3441 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 3442 ar.logical = map->m_lblk; 3443 ar.len = allocated; 3444 if (S_ISREG(inode->i_mode)) 3445 ar.flags = EXT4_MB_HINT_DATA; 3446 else 3447 /* disable in-core preallocation for non-regular files */ 3448 ar.flags = 0; 3449 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3450 if (!newblock) 3451 goto out2; 3452 ext_debug("allocate new block: goal %llu, found %llu/%u\n", 3453 ar.goal, newblock, allocated); 3454 3455 /* try to insert new extent into found leaf and return */ 3456 ext4_ext_store_pblock(&newex, newblock); 3457 newex.ee_len = cpu_to_le16(ar.len); 3458 /* Mark uninitialized */ 3459 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3460 ext4_ext_mark_uninitialized(&newex); 3461 /* 3462 * io_end structure was created for every IO write to an 3463 * uninitialized extent. 
To avoid unnecessary conversion, 3464 * here we flag the IO that really needs the conversion. 3465 * For the non-async direct IO case, flag the inode state 3466 * that we need to perform conversion when the IO is done. 3467 */ 3468 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3469 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 3470 io->flag = EXT4_IO_END_UNWRITTEN; 3471 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 3472 } else 3473 ext4_set_inode_state(inode, 3474 EXT4_STATE_DIO_UNWRITTEN); 3475 } 3476 if (ext4_should_dioread_nolock(inode)) 3477 map->m_flags |= EXT4_MAP_UNINIT; 3478 } 3479 3480 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); 3481 if (err) 3482 goto out2; 3483 3484 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3485 if (err) { 3486 /* free data blocks we just allocated */ 3487 /* not a good idea to call discard here directly, 3488 * but otherwise we'd need to call it on every free() */ 3489 ext4_discard_preallocations(inode); 3490 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), 3491 ext4_ext_get_actual_len(&newex), 0); 3492 goto out2; 3493 } 3494 3495 /* previous routine could use block we allocated */ 3496 newblock = ext4_ext_pblock(&newex); 3497 allocated = ext4_ext_get_actual_len(&newex); 3498 if (allocated > map->m_len) 3499 allocated = map->m_len; 3500 map->m_flags |= EXT4_MAP_NEW; 3501 3502 /* 3503 * Update reserved blocks/metadata blocks after successful 3504 * block allocation which had been deferred till now. 3505 */ 3506 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3507 ext4_da_update_reserve_space(inode, allocated, 1); 3508 3509 /* 3510 * Cache the extent and update transaction to commit on fdatasync only 3511 * when it is _not_ an uninitialized extent. 3512 */ 3513 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3514 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); 3515 ext4_update_inode_fsync_trans(handle, inode, 1); 3516 } else 3517 ext4_update_inode_fsync_trans(handle, inode, 0); 3518out: 3519 if (allocated > map->m_len) 3520 allocated = map->m_len; 3521 ext4_ext_show_leaf(inode, path); 3522 map->m_flags |= EXT4_MAP_MAPPED; 3523 map->m_pblk = newblock; 3524 map->m_len = allocated; 3525out2: 3526 if (path) { 3527 ext4_ext_drop_refs(path); 3528 kfree(path); 3529 } 3530 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 3531 newblock, map->m_len, err ? err : allocated); 3532 return err ? err : allocated; 3533} 3534 3535void ext4_ext_truncate(struct inode *inode) 3536{ 3537 struct address_space *mapping = inode->i_mapping; 3538 struct super_block *sb = inode->i_sb; 3539 ext4_lblk_t last_block; 3540 handle_t *handle; 3541 int err = 0; 3542 3543 /* 3544 * finish any pending end_io work so we won't run the risk of 3545 * converting any truncated blocks to initialized later 3546 */ 3547 ext4_flush_completed_IO(inode); 3548 3549 /* 3550 * probably first extent we're gonna free will be last in block 3551 */ 3552 err = ext4_writepage_trans_blocks(inode); 3553 handle = ext4_journal_start(inode, err); 3554 if (IS_ERR(handle)) 3555 return; 3556 3557 if (inode->i_size & (sb->s_blocksize - 1)) 3558 ext4_block_truncate_page(handle, mapping, inode->i_size); 3559 3560 if (ext4_orphan_add(handle, inode)) 3561 goto out_stop; 3562 3563 down_write(&EXT4_I(inode)->i_data_sem); 3564 ext4_ext_invalidate_cache(inode); 3565 3566 ext4_discard_preallocations(inode); 3567 3568 /* 3569 * TODO: optimization is possible here. 3570 * Probably we need not scan at all, 3571 * because page truncation is enough. 
3572 */ 3573 3574 /* we have to know where to truncate from in crash case */ 3575 EXT4_I(inode)->i_disksize = inode->i_size; 3576 ext4_mark_inode_dirty(handle, inode); 3577 3578 last_block = (inode->i_size + sb->s_blocksize - 1) 3579 >> EXT4_BLOCK_SIZE_BITS(sb); 3580 err = ext4_ext_remove_space(inode, last_block); 3581 3582 /* In a multi-transaction truncate, we only make the final 3583 * transaction synchronous. 3584 */ 3585 if (IS_SYNC(inode)) 3586 ext4_handle_sync(handle); 3587 3588out_stop: 3589 up_write(&EXT4_I(inode)->i_data_sem); 3590 /* 3591 * If this was a simple ftruncate() and the file will remain alive, 3592 * then we need to clear up the orphan record which we created above. 3593 * However, if this was a real unlink then we were called by 3594 * ext4_delete_inode(), and we allow that function to clean up the 3595 * orphan info for us. 3596 */ 3597 if (inode->i_nlink) 3598 ext4_orphan_del(handle, inode); 3599 3600 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3601 ext4_mark_inode_dirty(handle, inode); 3602 ext4_journal_stop(handle); 3603} 3604 3605static void ext4_falloc_update_inode(struct inode *inode, 3606 int mode, loff_t new_size, int update_ctime) 3607{ 3608 struct timespec now; 3609 3610 if (update_ctime) { 3611 now = current_fs_time(inode->i_sb); 3612 if (!timespec_equal(&inode->i_ctime, &now)) 3613 inode->i_ctime = now; 3614 } 3615 /* 3616 * Update only when preallocation was requested beyond 3617 * the file size. 3618 */ 3619 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 3620 if (new_size > i_size_read(inode)) 3621 i_size_write(inode, new_size); 3622 if (new_size > EXT4_I(inode)->i_disksize) 3623 ext4_update_i_disksize(inode, new_size); 3624 } else { 3625 /* 3626 * Mark that we allocate beyond EOF so the subsequent truncate 3627 * can proceed even if the new size is the same as i_size. 3628 */ 3629 if (new_size > i_size_read(inode)) 3630 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3631 } 3632 3633} 3634 3635/* 3636 * preallocate space for a file. This implements ext4's fallocate file 3637 * operation, which gets called from sys_fallocate system call. 3638 * For block-mapped files, posix_fallocate should fall back to the method 3639 * of writing zeroes to the required new blocks (the same behavior which is 3640 * expected for file systems which do not support fallocate() system call). 
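 * For example (assuming fd refers to an extent-mapped ext4 file), fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) preallocates 16 MiB as uninitialized extents without updating i_size.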
3641 */ 3642long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 3643{ 3644 struct inode *inode = file->f_path.dentry->d_inode; 3645 handle_t *handle; 3646 loff_t new_size; 3647 unsigned int max_blocks; 3648 int ret = 0; 3649 int ret2 = 0; 3650 int retries = 0; 3651 struct ext4_map_blocks map; 3652 unsigned int credits, blkbits = inode->i_blkbits; 3653 3654 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 3655 if (mode & ~FALLOC_FL_KEEP_SIZE) 3656 return -EOPNOTSUPP; 3657 3658 /* 3659 * currently supporting (pre)allocate mode for extent-based 3660 * files _only_ 3661 */ 3662 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3663 return -EOPNOTSUPP; 3664 3665 trace_ext4_fallocate_enter(inode, offset, len, mode); 3666 map.m_lblk = offset >> blkbits; 3667 /* 3668 * We can't just convert len to max_blocks because, if for example 3669 * blocksize = 4096, offset = 3072 and len = 2048, the range spans two blocks 3670 */ 3671 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3672 - map.m_lblk; 3673 /* 3674 * credits to insert 1 extent into the extent tree 3675 */ 3676 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3677 mutex_lock(&inode->i_mutex); 3678 ret = inode_newsize_ok(inode, (len + offset)); 3679 if (ret) { 3680 mutex_unlock(&inode->i_mutex); 3681 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 3682 return ret; 3683 } 3684retry: 3685 while (ret >= 0 && ret < max_blocks) { 3686 map.m_lblk = map.m_lblk + ret; 3687 map.m_len = max_blocks = max_blocks - ret; 3688 handle = ext4_journal_start(inode, credits); 3689 if (IS_ERR(handle)) { 3690 ret = PTR_ERR(handle); 3691 break; 3692 } 3693 ret = ext4_map_blocks(handle, inode, &map, 3694 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3695 if (ret <= 0) { 3696#ifdef EXT4FS_DEBUG 3697 WARN_ON(ret <= 0); 3698 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3699 "returned error inode#%lu, block=%u, " 3700 "max_blocks=%u", __func__, 3701 inode->i_ino, map.m_lblk, max_blocks); 3702#endif 3703 ext4_mark_inode_dirty(handle, inode); 3704 ret2 = ext4_journal_stop(handle); 3705 break; 3706 } 3707 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3708 blkbits) >> blkbits)) 3709 new_size = offset + len; 3710 else 3711 new_size = (map.m_lblk + ret) << blkbits; 3712 3713 ext4_falloc_update_inode(inode, mode, new_size, 3714 (map.m_flags & EXT4_MAP_NEW)); 3715 ext4_mark_inode_dirty(handle, inode); 3716 ret2 = ext4_journal_stop(handle); 3717 if (ret2) 3718 break; 3719 } 3720 if (ret == -ENOSPC && 3721 ext4_should_retry_alloc(inode->i_sb, &retries)) { 3722 ret = 0; 3723 goto retry; 3724 } 3725 mutex_unlock(&inode->i_mutex); 3726 trace_ext4_fallocate_exit(inode, offset, max_blocks, 3727 ret > 0 ? ret2 : ret); 3728 return ret > 0 ? ret2 : ret; 3729} 3730 3731/* 3732 * This function converts a range of blocks to written extents. 3733 * The caller of this function will pass the start offset and the size; 3734 * all unwritten extents within this range will be converted to 3735 * written extents. 3736 * 3737 * This function is called from the direct IO end_io callback 3738 * function, to convert the fallocated extents after IO is completed. 3739 * Returns 0 on success. 
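 * The offset-to-block conversion mirrors ext4_fallocate(): e.g. with blocksize = 4096, offset = 3072 and len = 2048 the range touches logical blocks 0 and 1, so max_blocks is 2 even though len >> blkbits == 0.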
3740 */ 3741int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 3742 ssize_t len) 3743{ 3744 handle_t *handle; 3745 unsigned int max_blocks; 3746 int ret = 0; 3747 int ret2 = 0; 3748 struct ext4_map_blocks map; 3749 unsigned int credits, blkbits = inode->i_blkbits; 3750 3751 map.m_lblk = offset >> blkbits; 3752 /* 3753 * We can't just convert len to max_blocks because 3754 * If blocksize = 4096 offset = 3072 and len = 2048 3755 */ 3756 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - 3757 map.m_lblk); 3758 /* 3759 * credits to insert 1 extent into extent tree 3760 */ 3761 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3762 while (ret >= 0 && ret < max_blocks) { 3763 map.m_lblk += ret; 3764 map.m_len = (max_blocks -= ret); 3765 handle = ext4_journal_start(inode, credits); 3766 if (IS_ERR(handle)) { 3767 ret = PTR_ERR(handle); 3768 break; 3769 } 3770 ret = ext4_map_blocks(handle, inode, &map, 3771 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3772 if (ret <= 0) { 3773 WARN_ON(ret <= 0); 3774 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3775 "returned error inode#%lu, block=%u, " 3776 "max_blocks=%u", __func__, 3777 inode->i_ino, map.m_lblk, map.m_len); 3778 } 3779 ext4_mark_inode_dirty(handle, inode); 3780 ret2 = ext4_journal_stop(handle); 3781 if (ret <= 0 || ret2 ) 3782 break; 3783 } 3784 return ret > 0 ? ret2 : ret; 3785} 3786 3787/* 3788 * Callback function called for each extent to gather FIEMAP information. 3789 */ 3790static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3791 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3792 void *data) 3793{ 3794 __u64 logical; 3795 __u64 physical; 3796 __u64 length; 3797 loff_t size; 3798 __u32 flags = 0; 3799 int ret = 0; 3800 struct fiemap_extent_info *fieinfo = data; 3801 unsigned char blksize_bits; 3802 3803 blksize_bits = inode->i_sb->s_blocksize_bits; 3804 logical = (__u64)newex->ec_block << blksize_bits; 3805 3806 if (newex->ec_start == 0) { 3807 /* 3808 * No extent in extent-tree contains block @newex->ec_start, 3809 * then the block may stay in 1)a hole or 2)delayed-extent. 3810 * 3811 * Holes or delayed-extents are processed as follows. 3812 * 1. lookup dirty pages with specified range in pagecache. 3813 * If no page is got, then there is no delayed-extent and 3814 * return with EXT_CONTINUE. 3815 * 2. find the 1st mapped buffer, 3816 * 3. check if the mapped buffer is both in the request range 3817 * and a delayed buffer. If not, there is no delayed-extent, 3818 * then return. 3819 * 4. a delayed-extent is found, the extent will be collected. 3820 */ 3821 ext4_lblk_t end = 0; 3822 pgoff_t last_offset; 3823 pgoff_t offset; 3824 pgoff_t index; 3825 struct page **pages = NULL; 3826 struct buffer_head *bh = NULL; 3827 struct buffer_head *head = NULL; 3828 unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); 3829 3830 pages = kmalloc(PAGE_SIZE, GFP_KERNEL); 3831 if (pages == NULL) 3832 return -ENOMEM; 3833 3834 offset = logical >> PAGE_SHIFT; 3835repeat: 3836 last_offset = offset; 3837 head = NULL; 3838 ret = find_get_pages_tag(inode->i_mapping, &offset, 3839 PAGECACHE_TAG_DIRTY, nr_pages, pages); 3840 3841 if (!(flags & FIEMAP_EXTENT_DELALLOC)) { 3842 /* First time, try to find a mapped buffer. */ 3843 if (ret == 0) { 3844out: 3845 for (index = 0; index < ret; index++) 3846 page_cache_release(pages[index]); 3847 /* just a hole. */ 3848 kfree(pages); 3849 return EXT_CONTINUE; 3850 } 3851 3852 /* Try to find the 1st mapped buffer. 
/*
 * Callback function called for each extent to gather FIEMAP information.
 */
static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
		       void *data)
{
	__u64	logical;
	__u64	physical;
	__u64	length;
	loff_t	size;
	__u32	flags = 0;
	int	ret = 0;
	struct fiemap_extent_info *fieinfo = data;
	unsigned char blksize_bits;

	blksize_bits = inode->i_sb->s_blocksize_bits;
	logical = (__u64)newex->ec_block << blksize_bits;

	if (newex->ec_start == 0) {
		/*
		 * No extent in the extent tree contains block
		 * @newex->ec_block, so the block may lie in 1) a hole or
		 * 2) a delayed extent.
		 *
		 * Holes and delayed extents are processed as follows.
		 * 1. Look up dirty pages in the page cache for the given
		 *    range.  If no page is found, there is no delayed
		 *    extent; return EXT_CONTINUE.
		 * 2. Find the first mapped buffer.
		 * 3. Check whether the mapped buffer is both inside the
		 *    request range and a delayed buffer.  If not, there
		 *    is no delayed extent; return.
		 * 4. A delayed extent was found; collect it.
		 */
		ext4_lblk_t	end = 0;
		pgoff_t		last_offset;
		pgoff_t		offset;
		pgoff_t		index;
		struct page	**pages = NULL;
		struct buffer_head *bh = NULL;
		struct buffer_head *head = NULL;
		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);

		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (pages == NULL)
			return -ENOMEM;

		offset = logical >> PAGE_SHIFT;
repeat:
		last_offset = offset;
		head = NULL;
		ret = find_get_pages_tag(inode->i_mapping, &offset,
					PAGECACHE_TAG_DIRTY, nr_pages, pages);

		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
			/* First time, try to find a mapped buffer. */
			if (ret == 0) {
out:
				for (index = 0; index < ret; index++)
					page_cache_release(pages[index]);
				/* just a hole. */
				kfree(pages);
				return EXT_CONTINUE;
			}

			/* Try to find the 1st mapped buffer. */
			end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
				  blksize_bits;
			if (!page_has_buffers(pages[0]))
				goto out;
			head = page_buffers(pages[0]);
			if (!head)
				goto out;

			bh = head;
			do {
				if (buffer_mapped(bh)) {
					/* get the 1st mapped buffer. */
					if (end > newex->ec_block +
						newex->ec_len)
						/* The buffer is out of
						 * the request range.
						 */
						goto out;
					goto found_mapped_buffer;
				}
				bh = bh->b_this_page;
				end++;
			} while (bh != head);

			/* No mapped buffer found. */
			goto out;
		} else {
			/* Find contiguous delayed buffers. */
			if (ret > 0 && pages[0]->index == last_offset)
				head = page_buffers(pages[0]);
			bh = head;
		}

found_mapped_buffer:
		if (bh != NULL && buffer_delay(bh)) {
			/* 1st or contiguous delayed buffer found. */
			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
				/*
				 * 1st delayed buffer found, record
				 * the start of extent.
				 */
				flags |= FIEMAP_EXTENT_DELALLOC;
				newex->ec_block = end;
				logical = (__u64)end << blksize_bits;
			}
			/* Find contiguous delayed buffers. */
			do {
				if (!buffer_delay(bh))
					goto found_delayed_extent;
				bh = bh->b_this_page;
				end++;
			} while (bh != head);

			for (index = 1; index < ret; index++) {
				if (!page_has_buffers(pages[index])) {
					bh = NULL;
					break;
				}
				head = page_buffers(pages[index]);
				if (!head) {
					bh = NULL;
					break;
				}
				if (pages[index]->index !=
					pages[0]->index + index) {
					/* Blocks are not contiguous. */
					bh = NULL;
					break;
				}
				bh = head;
				do {
					if (!buffer_delay(bh))
						/* Delayed-extent ends. */
						goto found_delayed_extent;
					bh = bh->b_this_page;
					end++;
				} while (bh != head);
			}
		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
			/* a hole found. */
			goto out;

found_delayed_extent:
		newex->ec_len = min(end - newex->ec_block,
				    (ext4_lblk_t)EXT_INIT_MAX_LEN);
		if (ret == nr_pages && bh != NULL &&
			newex->ec_len < EXT_INIT_MAX_LEN &&
			buffer_delay(bh)) {
			/* The delayed extent may continue into the next
			 * batch of pages; release this batch and keep
			 * scanning. */
			for (index = 0; index < ret; index++)
				page_cache_release(pages[index]);
			goto repeat;
		}

		for (index = 0; index < ret; index++)
			page_cache_release(pages[index]);
		kfree(pages);
	}

	physical = (__u64)newex->ec_start << blksize_bits;
	length = (__u64)newex->ec_len << blksize_bits;

	if (ex && ext4_ext_is_uninitialized(ex))
		flags |= FIEMAP_EXTENT_UNWRITTEN;

	size = i_size_read(inode);
	if (logical + length >= size)
		flags |= FIEMAP_EXTENT_LAST;

	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
					length, flags);
	if (ret < 0)
		return ret;
	if (ret == 1)
		return EXT_BREAK;
	return EXT_CONTINUE;
}
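/*
 * Illustrative summary of the callback contract (an assumption drawn
 * from the return values used above, not a quoted specification):
 * ext4_ext_walk_space() invokes this callback once per extent or gap.
 * Returning EXT_CONTINUE asks for the next region, EXT_BREAK stops the
 * walk early (fiemap_fill_next_extent() returns 1 once the caller's
 * extent array is full), and a negative value aborts the walk with
 * that error.
 */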
/* fiemap flags we can handle are specified here */
#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)

static int ext4_xattr_fiemap(struct inode *inode,
				struct fiemap_extent_info *fieinfo)
{
	__u64 physical = 0;
	__u64 length;
	__u32 flags = FIEMAP_EXTENT_LAST;
	int blockbits = inode->i_sb->s_blocksize_bits;
	int error = 0;

	/* in-inode? */
	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
		struct ext4_iloc iloc;
		int offset;	/* offset of xattr in inode */

		error = ext4_get_inode_loc(inode, &iloc);
		if (error)
			return error;
		physical = iloc.bh->b_blocknr << blockbits;
		offset = EXT4_GOOD_OLD_INODE_SIZE +
				EXT4_I(inode)->i_extra_isize;
		physical += offset;
		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
		flags |= FIEMAP_EXTENT_DATA_INLINE;
		brelse(iloc.bh);
	} else { /* external block */
		physical = EXT4_I(inode)->i_file_acl << blockbits;
		length = inode->i_sb->s_blocksize;
	}

	if (physical)
		error = fiemap_fill_next_extent(fieinfo, 0, physical,
						length, flags);
	return (error < 0 ? error : 0);
}

int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len)
{
	ext4_lblk_t start_blk;
	int error = 0;

	/* fall back to the generic helper if not in extents format */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		return generic_block_fiemap(inode, fieinfo, start, len,
			ext4_get_block);

	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
		return -EBADR;

	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
		error = ext4_xattr_fiemap(inode, fieinfo);
	} else {
		ext4_lblk_t len_blks;
		__u64 last_blk;

		start_blk = start >> inode->i_sb->s_blocksize_bits;
		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
		if (last_blk >= EXT_MAX_BLOCK)
			last_blk = EXT_MAX_BLOCK-1;
		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;

		/*
		 * Walk the extent tree gathering extent information.
		 * ext4_ext_fiemap_cb will push extents back to user.
		 */
		error = ext4_ext_walk_space(inode, start_blk, len_blks,
					  ext4_ext_fiemap_cb, fieinfo);
	}

	return error;
}
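/*
 * Illustrative sketch (not part of the original source): the user-space
 * view of the interface served above.  A FIEMAP query over a whole file
 * looks roughly like this; the 32-extent buffer size is an arbitrary
 * choice for the example.
 *
 *	#include <stdlib.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *	#include <linux/fiemap.h>
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   32 * sizeof(struct fiemap_extent));
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;	// map to end of file
 *	fm->fm_flags = FIEMAP_FLAG_SYNC;	// flush dirty data first
 *	fm->fm_extent_count = 32;
 *	ioctl(fd, FS_IOC_FIEMAP, fm);
 *	// fm->fm_mapped_extents now holds the number of extents filled.
 */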