extents.c revision 56055d3ae4cc7fa6d2b10885f20269de8a989ed7
1/* 2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com 3 * Written by Alex Tomas <alex@clusterfs.com> 4 * 5 * Architecture independence: 6 * Copyright (c) 2005, Bull S.A. 7 * Written by Pierre Peiffer <pierre.peiffer@bull.net> 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License version 2 as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public Licens 19 * along with this program; if not, write to the Free Software 20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 21 */ 22 23/* 24 * Extents support for EXT4 25 * 26 * TODO: 27 * - ext4*_error() should be used in some situations 28 * - analyze all BUG()/BUG_ON(), use -EIO where appropriate 29 * - smart tree reduction 30 */ 31 32#include <linux/module.h> 33#include <linux/fs.h> 34#include <linux/time.h> 35#include <linux/ext4_jbd2.h> 36#include <linux/jbd.h> 37#include <linux/highuid.h> 38#include <linux/pagemap.h> 39#include <linux/quotaops.h> 40#include <linux/string.h> 41#include <linux/slab.h> 42#include <linux/falloc.h> 43#include <linux/ext4_fs_extents.h> 44#include <asm/uaccess.h> 45 46 47/* 48 * ext_pblock: 49 * combine low and high parts of physical block number into ext4_fsblk_t 50 */ 51static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) 52{ 53 ext4_fsblk_t block; 54 55 block = le32_to_cpu(ex->ee_start); 56 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; 57 return block; 58} 59 60/* 61 * idx_pblock: 62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t 63 */ 64static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) 65{ 66 ext4_fsblk_t block; 67 68 block = le32_to_cpu(ix->ei_leaf); 69 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; 70 return block; 71} 72 73/* 74 * ext4_ext_store_pblock: 75 * stores a large physical block number into an extent struct, 76 * breaking it into parts 77 */ 78static void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) 79{ 80 ex->ee_start = cpu_to_le32((unsigned long) (pb & 0xffffffff)); 81 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 82} 83 84/* 85 * ext4_idx_store_pblock: 86 * stores a large physical block number into an index struct, 87 * breaking it into parts 88 */ 89static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) 90{ 91 ix->ei_leaf = cpu_to_le32((unsigned long) (pb & 0xffffffff)); 92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 93} 94 95static int ext4_ext_check_header(const char *function, struct inode *inode, 96 struct ext4_extent_header *eh) 97{ 98 const char *error_msg = NULL; 99 100 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { 101 error_msg = "invalid magic"; 102 goto corrupted; 103 } 104 if (unlikely(eh->eh_max == 0)) { 105 error_msg = "invalid eh_max"; 106 goto corrupted; 107 } 108 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { 109 error_msg = "invalid eh_entries"; 110 goto corrupted; 111 } 112 return 0; 113 114corrupted: 115 ext4_error(inode->i_sb, function, 116 "bad header in inode #%lu: %s - magic %x, " 117 "entries %u, max %u, depth %u", 118 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 119 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 120 le16_to_cpu(eh->eh_depth)); 121 122 return -EIO; 123} 124 125static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) 126{ 127 int err; 128 129 if (handle->h_buffer_credits > needed) 130 return handle; 131 if (!ext4_journal_extend(handle, needed)) 132 return handle; 133 err = ext4_journal_restart(handle, needed); 134 135 return handle; 136} 137 138/* 139 * could return: 140 * - EROFS 141 * - ENOMEM 142 */ 143static int ext4_ext_get_access(handle_t *handle, struct inode *inode, 144 struct ext4_ext_path *path) 145{ 146 if (path->p_bh) { 147 /* path points to block */ 148 return ext4_journal_get_write_access(handle, path->p_bh); 149 } 150 /* path points to leaf/index in inode body */ 151 /* we use in-core data, no need to protect them */ 152 return 0; 153} 154 155/* 156 * could return: 157 * - EROFS 158 * - ENOMEM 159 * - EIO 160 */ 161static int ext4_ext_dirty(handle_t *handle, struct inode *inode, 162 struct ext4_ext_path *path) 163{ 164 int err; 165 if (path->p_bh) { 166 /* path points to block */ 167 err = ext4_journal_dirty_metadata(handle, path->p_bh); 168 } else { 169 /* path points to leaf/index in inode body */ 170 err = ext4_mark_inode_dirty(handle, inode); 171 } 172 return err; 173} 174 175static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, 176 struct ext4_ext_path *path, 177 ext4_fsblk_t block) 178{ 179 struct ext4_inode_info *ei = EXT4_I(inode); 180 ext4_fsblk_t bg_start; 181 ext4_grpblk_t colour; 182 int depth; 183 184 if (path) { 185 struct ext4_extent *ex; 186 depth = path->p_depth; 187 188 /* try to predict block placement */ 189 ex = path[depth].p_ext; 190 if (ex) 191 return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); 192 193 /* it looks like index is empty; 194 * try to find starting block from index itself */ 195 if (path[depth].p_bh) 196 return path[depth].p_bh->b_blocknr; 197 } 198 199 /* OK. use inode's group */ 200 bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + 201 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); 202 colour = (current->pid % 16) * 203 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 204 return bg_start + colour + block; 205} 206 207static ext4_fsblk_t 208ext4_ext_new_block(handle_t *handle, struct inode *inode, 209 struct ext4_ext_path *path, 210 struct ext4_extent *ex, int *err) 211{ 212 ext4_fsblk_t goal, newblock; 213 214 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 215 newblock = ext4_new_block(handle, inode, goal, err); 216 return newblock; 217} 218 219static int ext4_ext_space_block(struct inode *inode) 220{ 221 int size; 222 223 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 224 / sizeof(struct ext4_extent); 225#ifdef AGGRESSIVE_TEST 226 if (size > 6) 227 size = 6; 228#endif 229 return size; 230} 231 232static int ext4_ext_space_block_idx(struct inode *inode) 233{ 234 int size; 235 236 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 237 / sizeof(struct ext4_extent_idx); 238#ifdef AGGRESSIVE_TEST 239 if (size > 5) 240 size = 5; 241#endif 242 return size; 243} 244 245static int ext4_ext_space_root(struct inode *inode) 246{ 247 int size; 248 249 size = sizeof(EXT4_I(inode)->i_data); 250 size -= sizeof(struct ext4_extent_header); 251 size /= sizeof(struct ext4_extent); 252#ifdef AGGRESSIVE_TEST 253 if (size > 3) 254 size = 3; 255#endif 256 return size; 257} 258 259static int ext4_ext_space_root_idx(struct inode *inode) 260{ 261 int size; 262 263 size = sizeof(EXT4_I(inode)->i_data); 264 size -= sizeof(struct ext4_extent_header); 265 size /= sizeof(struct ext4_extent_idx); 266#ifdef AGGRESSIVE_TEST 267 if (size > 4) 268 size = 4; 269#endif 270 return size; 271} 272 273#ifdef EXT_DEBUG 274static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 275{ 276 int k, l = path->p_depth; 277 278 ext_debug("path:"); 279 for (k = 0; k <= l; k++, path++) { 280 if (path->p_idx) { 281 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 282 idx_pblock(path->p_idx)); 283 } else if (path->p_ext) { 284 ext_debug(" %d:%d:%llu ", 285 le32_to_cpu(path->p_ext->ee_block), 286 ext4_ext_get_actual_len(path->p_ext), 287 ext_pblock(path->p_ext)); 288 } else 289 ext_debug(" []"); 290 } 291 ext_debug("\n"); 292} 293 294static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) 295{ 296 int depth = ext_depth(inode); 297 struct ext4_extent_header *eh; 298 struct ext4_extent *ex; 299 int i; 300 301 if (!path) 302 return; 303 304 eh = path[depth].p_hdr; 305 ex = EXT_FIRST_EXTENT(eh); 306 307 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 308 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), 309 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 310 } 311 ext_debug("\n"); 312} 313#else 314#define ext4_ext_show_path(inode,path) 315#define ext4_ext_show_leaf(inode,path) 316#endif 317 318static void ext4_ext_drop_refs(struct ext4_ext_path *path) 319{ 320 int depth = path->p_depth; 321 int i; 322 323 for (i = 0; i <= depth; i++, path++) 324 if (path->p_bh) { 325 brelse(path->p_bh); 326 path->p_bh = NULL; 327 } 328} 329 330/* 331 * ext4_ext_binsearch_idx: 332 * binary search for the closest index of the given block 333 */ 334static void 335ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block) 336{ 337 struct ext4_extent_header *eh = path->p_hdr; 338 struct ext4_extent_idx *r, *l, *m; 339 340 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC); 341 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max)); 342 BUG_ON(le16_to_cpu(eh->eh_entries) <= 0); 343 344 ext_debug("binsearch for %d(idx): ", block); 345 346 l = EXT_FIRST_INDEX(eh) + 1; 347 r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1; 348 while (l <= r) { 349 m = l + (r - l) / 2; 350 if (block < le32_to_cpu(m->ei_block)) 351 r = m - 1; 352 else 353 l = m + 1; 354 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block, 355 m, m->ei_block, r, r->ei_block); 356 } 357 358 path->p_idx = l - 1; 359 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), 360 idx_block(path->p_idx)); 361 362#ifdef CHECK_BINSEARCH 363 { 364 struct ext4_extent_idx *chix, *ix; 365 int k; 366 367 chix = ix = EXT_FIRST_INDEX(eh); 368 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { 369 if (k != 0 && 370 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { 371 printk("k=%d, ix=0x%p, first=0x%p\n", k, 372 ix, EXT_FIRST_INDEX(eh)); 373 printk("%u <= %u\n", 374 le32_to_cpu(ix->ei_block), 375 le32_to_cpu(ix[-1].ei_block)); 376 } 377 BUG_ON(k && le32_to_cpu(ix->ei_block) 378 <= le32_to_cpu(ix[-1].ei_block)); 379 if (block < le32_to_cpu(ix->ei_block)) 380 break; 381 chix = ix; 382 } 383 BUG_ON(chix != path->p_idx); 384 } 385#endif 386 387} 388 389/* 390 * ext4_ext_binsearch: 391 * binary search for closest extent of the given block 392 */ 393static void 394ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) 395{ 396 struct ext4_extent_header *eh = path->p_hdr; 397 struct ext4_extent *r, *l, *m; 398 399 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC); 400 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max)); 401 402 if (eh->eh_entries == 0) { 403 /* 404 * this leaf is empty: 405 * we get such a leaf in split/add case 406 */ 407 return; 408 } 409 410 ext_debug("binsearch for %d: ", block); 411 412 l = EXT_FIRST_EXTENT(eh) + 1; 413 r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1; 414 415 while (l <= r) { 416 m = l + (r - l) / 2; 417 if (block < le32_to_cpu(m->ee_block)) 418 r = m - 1; 419 else 420 l = m + 1; 421 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block, 422 m, m->ee_block, r, r->ee_block); 423 } 424 425 path->p_ext = l - 1; 426 ext_debug(" -> %d:%llu:%d ", 427 le32_to_cpu(path->p_ext->ee_block), 428 ext_pblock(path->p_ext), 429 ext4_ext_get_actual_len(path->p_ext)); 430 431#ifdef CHECK_BINSEARCH 432 { 433 struct ext4_extent *chex, *ex; 434 int k; 435 436 chex = ex = EXT_FIRST_EXTENT(eh); 437 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { 438 BUG_ON(k && le32_to_cpu(ex->ee_block) 439 <= le32_to_cpu(ex[-1].ee_block)); 440 if (block < le32_to_cpu(ex->ee_block)) 441 break; 442 chex = ex; 443 } 444 BUG_ON(chex != path->p_ext); 445 } 446#endif 447 448} 449 450int ext4_ext_tree_init(handle_t *handle, struct inode *inode) 451{ 452 struct ext4_extent_header *eh; 453 454 eh = ext_inode_hdr(inode); 455 eh->eh_depth = 0; 456 eh->eh_entries = 0; 457 eh->eh_magic = EXT4_EXT_MAGIC; 458 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); 459 ext4_mark_inode_dirty(handle, inode); 460 ext4_ext_invalidate_cache(inode); 461 return 0; 462} 463 464struct ext4_ext_path * 465ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path) 466{ 467 struct ext4_extent_header *eh; 468 struct buffer_head *bh; 469 short int depth, i, ppos = 0, alloc = 0; 470 471 eh = ext_inode_hdr(inode); 472 BUG_ON(eh == NULL); 473 if (ext4_ext_check_header(__FUNCTION__, inode, eh)) 474 return ERR_PTR(-EIO); 475 476 i = depth = ext_depth(inode); 477 478 /* account possible depth increase */ 479 if (!path) { 480 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), 481 GFP_NOFS); 482 if (!path) 483 return ERR_PTR(-ENOMEM); 484 alloc = 1; 485 } 486 path[0].p_hdr = eh; 487 488 /* walk through the tree */ 489 while (i) { 490 ext_debug("depth %d: num %d, max %d\n", 491 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 492 ext4_ext_binsearch_idx(inode, path + ppos, block); 493 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 494 path[ppos].p_depth = i; 495 path[ppos].p_ext = NULL; 496 497 bh = sb_bread(inode->i_sb, path[ppos].p_block); 498 if (!bh) 499 goto err; 500 501 eh = ext_block_hdr(bh); 502 ppos++; 503 BUG_ON(ppos > depth); 504 path[ppos].p_bh = bh; 505 path[ppos].p_hdr = eh; 506 i--; 507 508 if (ext4_ext_check_header(__FUNCTION__, inode, eh)) 509 goto err; 510 } 511 512 path[ppos].p_depth = i; 513 path[ppos].p_hdr = eh; 514 path[ppos].p_ext = NULL; 515 path[ppos].p_idx = NULL; 516 517 if (ext4_ext_check_header(__FUNCTION__, inode, eh)) 518 goto err; 519 520 /* find extent */ 521 ext4_ext_binsearch(inode, path + ppos, block); 522 523 ext4_ext_show_path(inode, path); 524 525 return path; 526 527err: 528 ext4_ext_drop_refs(path); 529 if (alloc) 530 kfree(path); 531 return ERR_PTR(-EIO); 532} 533 534/* 535 * ext4_ext_insert_index: 536 * insert new index [@logical;@ptr] into the block at @curp; 537 * check where to insert: before @curp or after @curp 538 */ 539static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 540 struct ext4_ext_path *curp, 541 int logical, ext4_fsblk_t ptr) 542{ 543 struct ext4_extent_idx *ix; 544 int len, err; 545 546 err = ext4_ext_get_access(handle, inode, curp); 547 if (err) 548 return err; 549 550 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); 551 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; 552 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 553 /* insert after */ 554 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { 555 len = (len - 1) * sizeof(struct ext4_extent_idx); 556 len = len < 0 ? 0 : len; 557 ext_debug("insert new index %d after: %d. " 558 "move %d from 0x%p to 0x%p\n", 559 logical, ptr, len, 560 (curp->p_idx + 1), (curp->p_idx + 2)); 561 memmove(curp->p_idx + 2, curp->p_idx + 1, len); 562 } 563 ix = curp->p_idx + 1; 564 } else { 565 /* insert before */ 566 len = len * sizeof(struct ext4_extent_idx); 567 len = len < 0 ? 0 : len; 568 ext_debug("insert new index %d before: %d. " 569 "move %d from 0x%p to 0x%p\n", 570 logical, ptr, len, 571 curp->p_idx, (curp->p_idx + 1)); 572 memmove(curp->p_idx + 1, curp->p_idx, len); 573 ix = curp->p_idx; 574 } 575 576 ix->ei_block = cpu_to_le32(logical); 577 ext4_idx_store_pblock(ix, ptr); 578 curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1); 579 580 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) 581 > le16_to_cpu(curp->p_hdr->eh_max)); 582 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); 583 584 err = ext4_ext_dirty(handle, inode, curp); 585 ext4_std_error(inode->i_sb, err); 586 587 return err; 588} 589 590/* 591 * ext4_ext_split: 592 * inserts new subtree into the path, using free index entry 593 * at depth @at: 594 * - allocates all needed blocks (new leaf and all intermediate index blocks) 595 * - makes decision where to split 596 * - moves remaining extents and index entries (right to the split point) 597 * into the newly allocated blocks 598 * - initializes subtree 599 */ 600static int ext4_ext_split(handle_t *handle, struct inode *inode, 601 struct ext4_ext_path *path, 602 struct ext4_extent *newext, int at) 603{ 604 struct buffer_head *bh = NULL; 605 int depth = ext_depth(inode); 606 struct ext4_extent_header *neh; 607 struct ext4_extent_idx *fidx; 608 struct ext4_extent *ex; 609 int i = at, k, m, a; 610 ext4_fsblk_t newblock, oldblock; 611 __le32 border; 612 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ 613 int err = 0; 614 615 /* make decision: where to split? */ 616 /* FIXME: now decision is simplest: at current extent */ 617 618 /* if current leaf will be split, then we should use 619 * border from split point */ 620 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); 621 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 622 border = path[depth].p_ext[1].ee_block; 623 ext_debug("leaf will be split." 624 " next leaf starts at %d\n", 625 le32_to_cpu(border)); 626 } else { 627 border = newext->ee_block; 628 ext_debug("leaf will be added." 629 " next leaf starts at %d\n", 630 le32_to_cpu(border)); 631 } 632 633 /* 634 * If error occurs, then we break processing 635 * and mark filesystem read-only. index won't 636 * be inserted and tree will be in consistent 637 * state. Next mount will repair buffers too. 638 */ 639 640 /* 641 * Get array to track all allocated blocks. 642 * We need this to handle errors and free blocks 643 * upon them. 644 */ 645 ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); 646 if (!ablocks) 647 return -ENOMEM; 648 649 /* allocate all needed blocks */ 650 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 651 for (a = 0; a < depth - at; a++) { 652 newblock = ext4_ext_new_block(handle, inode, path, newext, &err); 653 if (newblock == 0) 654 goto cleanup; 655 ablocks[a] = newblock; 656 } 657 658 /* initialize new leaf */ 659 newblock = ablocks[--a]; 660 BUG_ON(newblock == 0); 661 bh = sb_getblk(inode->i_sb, newblock); 662 if (!bh) { 663 err = -EIO; 664 goto cleanup; 665 } 666 lock_buffer(bh); 667 668 err = ext4_journal_get_create_access(handle, bh); 669 if (err) 670 goto cleanup; 671 672 neh = ext_block_hdr(bh); 673 neh->eh_entries = 0; 674 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 675 neh->eh_magic = EXT4_EXT_MAGIC; 676 neh->eh_depth = 0; 677 ex = EXT_FIRST_EXTENT(neh); 678 679 /* move remainder of path[depth] to the new leaf */ 680 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); 681 /* start copy from next extent */ 682 /* TODO: we could do it by single memmove */ 683 m = 0; 684 path[depth].p_ext++; 685 while (path[depth].p_ext <= 686 EXT_MAX_EXTENT(path[depth].p_hdr)) { 687 ext_debug("move %d:%llu:%d in new leaf %llu\n", 688 le32_to_cpu(path[depth].p_ext->ee_block), 689 ext_pblock(path[depth].p_ext), 690 ext4_ext_get_actual_len(path[depth].p_ext), 691 newblock); 692 /*memmove(ex++, path[depth].p_ext++, 693 sizeof(struct ext4_extent)); 694 neh->eh_entries++;*/ 695 path[depth].p_ext++; 696 m++; 697 } 698 if (m) { 699 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); 700 neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m); 701 } 702 703 set_buffer_uptodate(bh); 704 unlock_buffer(bh); 705 706 err = ext4_journal_dirty_metadata(handle, bh); 707 if (err) 708 goto cleanup; 709 brelse(bh); 710 bh = NULL; 711 712 /* correct old leaf */ 713 if (m) { 714 err = ext4_ext_get_access(handle, inode, path + depth); 715 if (err) 716 goto cleanup; 717 path[depth].p_hdr->eh_entries = 718 cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m); 719 err = ext4_ext_dirty(handle, inode, path + depth); 720 if (err) 721 goto cleanup; 722 723 } 724 725 /* create intermediate indexes */ 726 k = depth - at - 1; 727 BUG_ON(k < 0); 728 if (k) 729 ext_debug("create %d intermediate indices\n", k); 730 /* insert new index into current index block */ 731 /* current depth stored in i var */ 732 i = depth - 1; 733 while (k--) { 734 oldblock = newblock; 735 newblock = ablocks[--a]; 736 bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock); 737 if (!bh) { 738 err = -EIO; 739 goto cleanup; 740 } 741 lock_buffer(bh); 742 743 err = ext4_journal_get_create_access(handle, bh); 744 if (err) 745 goto cleanup; 746 747 neh = ext_block_hdr(bh); 748 neh->eh_entries = cpu_to_le16(1); 749 neh->eh_magic = EXT4_EXT_MAGIC; 750 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 751 neh->eh_depth = cpu_to_le16(depth - i); 752 fidx = EXT_FIRST_INDEX(neh); 753 fidx->ei_block = border; 754 ext4_idx_store_pblock(fidx, oldblock); 755 756 ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i, 757 newblock, (unsigned long) le32_to_cpu(border), 758 oldblock); 759 /* copy indexes */ 760 m = 0; 761 path[i].p_idx++; 762 763 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 764 EXT_MAX_INDEX(path[i].p_hdr)); 765 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != 766 EXT_LAST_INDEX(path[i].p_hdr)); 767 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 768 ext_debug("%d: move %d:%d in new index %llu\n", i, 769 le32_to_cpu(path[i].p_idx->ei_block), 770 idx_pblock(path[i].p_idx), 771 newblock); 772 /*memmove(++fidx, path[i].p_idx++, 773 sizeof(struct ext4_extent_idx)); 774 neh->eh_entries++; 775 BUG_ON(neh->eh_entries > neh->eh_max);*/ 776 path[i].p_idx++; 777 m++; 778 } 779 if (m) { 780 memmove(++fidx, path[i].p_idx - m, 781 sizeof(struct ext4_extent_idx) * m); 782 neh->eh_entries = 783 cpu_to_le16(le16_to_cpu(neh->eh_entries) + m); 784 } 785 set_buffer_uptodate(bh); 786 unlock_buffer(bh); 787 788 err = ext4_journal_dirty_metadata(handle, bh); 789 if (err) 790 goto cleanup; 791 brelse(bh); 792 bh = NULL; 793 794 /* correct old index */ 795 if (m) { 796 err = ext4_ext_get_access(handle, inode, path + i); 797 if (err) 798 goto cleanup; 799 path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m); 800 err = ext4_ext_dirty(handle, inode, path + i); 801 if (err) 802 goto cleanup; 803 } 804 805 i--; 806 } 807 808 /* insert new index */ 809 err = ext4_ext_insert_index(handle, inode, path + at, 810 le32_to_cpu(border), newblock); 811 812cleanup: 813 if (bh) { 814 if (buffer_locked(bh)) 815 unlock_buffer(bh); 816 brelse(bh); 817 } 818 819 if (err) { 820 /* free all allocated blocks in error case */ 821 for (i = 0; i < depth; i++) { 822 if (!ablocks[i]) 823 continue; 824 ext4_free_blocks(handle, inode, ablocks[i], 1); 825 } 826 } 827 kfree(ablocks); 828 829 return err; 830} 831 832/* 833 * ext4_ext_grow_indepth: 834 * implements tree growing procedure: 835 * - allocates new block 836 * - moves top-level data (index block or leaf) into the new block 837 * - initializes new top-level, creating index that points to the 838 * just created block 839 */ 840static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 841 struct ext4_ext_path *path, 842 struct ext4_extent *newext) 843{ 844 struct ext4_ext_path *curp = path; 845 struct ext4_extent_header *neh; 846 struct ext4_extent_idx *fidx; 847 struct buffer_head *bh; 848 ext4_fsblk_t newblock; 849 int err = 0; 850 851 newblock = ext4_ext_new_block(handle, inode, path, newext, &err); 852 if (newblock == 0) 853 return err; 854 855 bh = sb_getblk(inode->i_sb, newblock); 856 if (!bh) { 857 err = -EIO; 858 ext4_std_error(inode->i_sb, err); 859 return err; 860 } 861 lock_buffer(bh); 862 863 err = ext4_journal_get_create_access(handle, bh); 864 if (err) { 865 unlock_buffer(bh); 866 goto out; 867 } 868 869 /* move top-level index/leaf into new block */ 870 memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); 871 872 /* set size of new block */ 873 neh = ext_block_hdr(bh); 874 /* old root could have indexes or leaves 875 * so calculate e_max right way */ 876 if (ext_depth(inode)) 877 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 878 else 879 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 880 neh->eh_magic = EXT4_EXT_MAGIC; 881 set_buffer_uptodate(bh); 882 unlock_buffer(bh); 883 884 err = ext4_journal_dirty_metadata(handle, bh); 885 if (err) 886 goto out; 887 888 /* create index in new top-level index: num,max,pointer */ 889 err = ext4_ext_get_access(handle, inode, curp); 890 if (err) 891 goto out; 892 893 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; 894 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); 895 curp->p_hdr->eh_entries = cpu_to_le16(1); 896 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); 897 /* FIXME: it works, but actually path[0] can be index */ 898 curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; 899 ext4_idx_store_pblock(curp->p_idx, newblock); 900 901 neh = ext_inode_hdr(inode); 902 fidx = EXT_FIRST_INDEX(neh); 903 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 904 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 905 le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); 906 907 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 908 err = ext4_ext_dirty(handle, inode, curp); 909out: 910 brelse(bh); 911 912 return err; 913} 914 915/* 916 * ext4_ext_create_new_leaf: 917 * finds empty index and adds new leaf. 918 * if no free index is found, then it requests in-depth growing. 919 */ 920static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 921 struct ext4_ext_path *path, 922 struct ext4_extent *newext) 923{ 924 struct ext4_ext_path *curp; 925 int depth, i, err = 0; 926 927repeat: 928 i = depth = ext_depth(inode); 929 930 /* walk up to the tree and look for free index entry */ 931 curp = path + depth; 932 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { 933 i--; 934 curp--; 935 } 936 937 /* we use already allocated block for index block, 938 * so subsequent data blocks should be contiguous */ 939 if (EXT_HAS_FREE_INDEX(curp)) { 940 /* if we found index with free entry, then use that 941 * entry: create all needed subtree and add new leaf */ 942 err = ext4_ext_split(handle, inode, path, newext, i); 943 944 /* refill path */ 945 ext4_ext_drop_refs(path); 946 path = ext4_ext_find_extent(inode, 947 le32_to_cpu(newext->ee_block), 948 path); 949 if (IS_ERR(path)) 950 err = PTR_ERR(path); 951 } else { 952 /* tree is full, time to grow in depth */ 953 err = ext4_ext_grow_indepth(handle, inode, path, newext); 954 if (err) 955 goto out; 956 957 /* refill path */ 958 ext4_ext_drop_refs(path); 959 path = ext4_ext_find_extent(inode, 960 le32_to_cpu(newext->ee_block), 961 path); 962 if (IS_ERR(path)) { 963 err = PTR_ERR(path); 964 goto out; 965 } 966 967 /* 968 * only first (depth 0 -> 1) produces free space; 969 * in all other cases we have to split the grown tree 970 */ 971 depth = ext_depth(inode); 972 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { 973 /* now we need to split */ 974 goto repeat; 975 } 976 } 977 978out: 979 return err; 980} 981 982/* 983 * ext4_ext_next_allocated_block: 984 * returns allocated block in subsequent extent or EXT_MAX_BLOCK. 985 * NOTE: it considers block number from index entry as 986 * allocated block. Thus, index entries have to be consistent 987 * with leaves. 988 */ 989static unsigned long 990ext4_ext_next_allocated_block(struct ext4_ext_path *path) 991{ 992 int depth; 993 994 BUG_ON(path == NULL); 995 depth = path->p_depth; 996 997 if (depth == 0 && path->p_ext == NULL) 998 return EXT_MAX_BLOCK; 999 1000 while (depth >= 0) { 1001 if (depth == path->p_depth) { 1002 /* leaf */ 1003 if (path[depth].p_ext != 1004 EXT_LAST_EXTENT(path[depth].p_hdr)) 1005 return le32_to_cpu(path[depth].p_ext[1].ee_block); 1006 } else { 1007 /* index */ 1008 if (path[depth].p_idx != 1009 EXT_LAST_INDEX(path[depth].p_hdr)) 1010 return le32_to_cpu(path[depth].p_idx[1].ei_block); 1011 } 1012 depth--; 1013 } 1014 1015 return EXT_MAX_BLOCK; 1016} 1017 1018/* 1019 * ext4_ext_next_leaf_block: 1020 * returns first allocated block from next leaf or EXT_MAX_BLOCK 1021 */ 1022static unsigned ext4_ext_next_leaf_block(struct inode *inode, 1023 struct ext4_ext_path *path) 1024{ 1025 int depth; 1026 1027 BUG_ON(path == NULL); 1028 depth = path->p_depth; 1029 1030 /* zero-tree has no leaf blocks at all */ 1031 if (depth == 0) 1032 return EXT_MAX_BLOCK; 1033 1034 /* go to index block */ 1035 depth--; 1036 1037 while (depth >= 0) { 1038 if (path[depth].p_idx != 1039 EXT_LAST_INDEX(path[depth].p_hdr)) 1040 return le32_to_cpu(path[depth].p_idx[1].ei_block); 1041 depth--; 1042 } 1043 1044 return EXT_MAX_BLOCK; 1045} 1046 1047/* 1048 * ext4_ext_correct_indexes: 1049 * if leaf gets modified and modified extent is first in the leaf, 1050 * then we have to correct all indexes above. 1051 * TODO: do we need to correct tree in all cases? 1052 */ 1053int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, 1054 struct ext4_ext_path *path) 1055{ 1056 struct ext4_extent_header *eh; 1057 int depth = ext_depth(inode); 1058 struct ext4_extent *ex; 1059 __le32 border; 1060 int k, err = 0; 1061 1062 eh = path[depth].p_hdr; 1063 ex = path[depth].p_ext; 1064 BUG_ON(ex == NULL); 1065 BUG_ON(eh == NULL); 1066 1067 if (depth == 0) { 1068 /* there is no tree at all */ 1069 return 0; 1070 } 1071 1072 if (ex != EXT_FIRST_EXTENT(eh)) { 1073 /* we correct tree if first leaf got modified only */ 1074 return 0; 1075 } 1076 1077 /* 1078 * TODO: we need correction if border is smaller than current one 1079 */ 1080 k = depth - 1; 1081 border = path[depth].p_ext->ee_block; 1082 err = ext4_ext_get_access(handle, inode, path + k); 1083 if (err) 1084 return err; 1085 path[k].p_idx->ei_block = border; 1086 err = ext4_ext_dirty(handle, inode, path + k); 1087 if (err) 1088 return err; 1089 1090 while (k--) { 1091 /* change all left-side indexes */ 1092 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) 1093 break; 1094 err = ext4_ext_get_access(handle, inode, path + k); 1095 if (err) 1096 break; 1097 path[k].p_idx->ei_block = border; 1098 err = ext4_ext_dirty(handle, inode, path + k); 1099 if (err) 1100 break; 1101 } 1102 1103 return err; 1104} 1105 1106static int 1107ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, 1108 struct ext4_extent *ex2) 1109{ 1110 unsigned short ext1_ee_len, ext2_ee_len; 1111 1112 /* 1113 * Make sure that either both extents are uninitialized, or 1114 * both are _not_. 1115 */ 1116 if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) 1117 return 0; 1118 1119 ext1_ee_len = ext4_ext_get_actual_len(ex1); 1120 ext2_ee_len = ext4_ext_get_actual_len(ex2); 1121 1122 if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != 1123 le32_to_cpu(ex2->ee_block)) 1124 return 0; 1125 1126 /* 1127 * To allow future support for preallocated extents to be added 1128 * as an RO_COMPAT feature, refuse to merge to extents if 1129 * this can result in the top bit of ee_len being set. 1130 */ 1131 if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN) 1132 return 0; 1133#ifdef AGGRESSIVE_TEST 1134 if (le16_to_cpu(ex1->ee_len) >= 4) 1135 return 0; 1136#endif 1137 1138 if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) 1139 return 1; 1140 return 0; 1141} 1142 1143/* 1144 * This function tries to merge the "ex" extent to the next extent in the tree. 1145 * It always tries to merge towards right. If you want to merge towards 1146 * left, pass "ex - 1" as argument instead of "ex". 1147 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1148 * 1 if they got merged. 1149 */ 1150int ext4_ext_try_to_merge(struct inode *inode, 1151 struct ext4_ext_path *path, 1152 struct ext4_extent *ex) 1153{ 1154 struct ext4_extent_header *eh; 1155 unsigned int depth, len; 1156 int merge_done = 0; 1157 int uninitialized = 0; 1158 1159 depth = ext_depth(inode); 1160 BUG_ON(path[depth].p_hdr == NULL); 1161 eh = path[depth].p_hdr; 1162 1163 while (ex < EXT_LAST_EXTENT(eh)) { 1164 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1165 break; 1166 /* merge with next extent! */ 1167 if (ext4_ext_is_uninitialized(ex)) 1168 uninitialized = 1; 1169 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1170 + ext4_ext_get_actual_len(ex + 1)); 1171 if (uninitialized) 1172 ext4_ext_mark_uninitialized(ex); 1173 1174 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1175 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1176 * sizeof(struct ext4_extent); 1177 memmove(ex + 1, ex + 2, len); 1178 } 1179 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1); 1180 merge_done = 1; 1181 WARN_ON(eh->eh_entries == 0); 1182 if (!eh->eh_entries) 1183 ext4_error(inode->i_sb, "ext4_ext_try_to_merge", 1184 "inode#%lu, eh->eh_entries = 0!", inode->i_ino); 1185 } 1186 1187 return merge_done; 1188} 1189 1190/* 1191 * check if a portion of the "newext" extent overlaps with an 1192 * existing extent. 1193 * 1194 * If there is an overlap discovered, it updates the length of the newext 1195 * such that there will be no overlap, and then returns 1. 1196 * If there is no overlap found, it returns 0. 1197 */ 1198unsigned int ext4_ext_check_overlap(struct inode *inode, 1199 struct ext4_extent *newext, 1200 struct ext4_ext_path *path) 1201{ 1202 unsigned long b1, b2; 1203 unsigned int depth, len1; 1204 unsigned int ret = 0; 1205 1206 b1 = le32_to_cpu(newext->ee_block); 1207 len1 = ext4_ext_get_actual_len(newext); 1208 depth = ext_depth(inode); 1209 if (!path[depth].p_ext) 1210 goto out; 1211 b2 = le32_to_cpu(path[depth].p_ext->ee_block); 1212 1213 /* 1214 * get the next allocated block if the extent in the path 1215 * is before the requested block(s) 1216 */ 1217 if (b2 < b1) { 1218 b2 = ext4_ext_next_allocated_block(path); 1219 if (b2 == EXT_MAX_BLOCK) 1220 goto out; 1221 } 1222 1223 /* check for wrap through zero */ 1224 if (b1 + len1 < b1) { 1225 len1 = EXT_MAX_BLOCK - b1; 1226 newext->ee_len = cpu_to_le16(len1); 1227 ret = 1; 1228 } 1229 1230 /* check for overlap */ 1231 if (b1 + len1 > b2) { 1232 newext->ee_len = cpu_to_le16(b2 - b1); 1233 ret = 1; 1234 } 1235out: 1236 return ret; 1237} 1238 1239/* 1240 * ext4_ext_insert_extent: 1241 * tries to merge requsted extent into the existing extent or 1242 * inserts requested extent as new one into the tree, 1243 * creating new leaf in the no-space case. 1244 */ 1245int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1246 struct ext4_ext_path *path, 1247 struct ext4_extent *newext) 1248{ 1249 struct ext4_extent_header * eh; 1250 struct ext4_extent *ex, *fex; 1251 struct ext4_extent *nearex; /* nearest extent */ 1252 struct ext4_ext_path *npath = NULL; 1253 int depth, len, err, next; 1254 unsigned uninitialized = 0; 1255 1256 BUG_ON(ext4_ext_get_actual_len(newext) == 0); 1257 depth = ext_depth(inode); 1258 ex = path[depth].p_ext; 1259 BUG_ON(path[depth].p_hdr == NULL); 1260 1261 /* try to insert block into found extent and return */ 1262 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1263 ext_debug("append %d block to %d:%d (from %llu)\n", 1264 ext4_ext_get_actual_len(newext), 1265 le32_to_cpu(ex->ee_block), 1266 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 1267 err = ext4_ext_get_access(handle, inode, path + depth); 1268 if (err) 1269 return err; 1270 1271 /* 1272 * ext4_can_extents_be_merged should have checked that either 1273 * both extents are uninitialized, or both aren't. Thus we 1274 * need to check only one of them here. 1275 */ 1276 if (ext4_ext_is_uninitialized(ex)) 1277 uninitialized = 1; 1278 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1279 + ext4_ext_get_actual_len(newext)); 1280 if (uninitialized) 1281 ext4_ext_mark_uninitialized(ex); 1282 eh = path[depth].p_hdr; 1283 nearex = ex; 1284 goto merge; 1285 } 1286 1287repeat: 1288 depth = ext_depth(inode); 1289 eh = path[depth].p_hdr; 1290 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) 1291 goto has_space; 1292 1293 /* probably next leaf has space for us? */ 1294 fex = EXT_LAST_EXTENT(eh); 1295 next = ext4_ext_next_leaf_block(inode, path); 1296 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) 1297 && next != EXT_MAX_BLOCK) { 1298 ext_debug("next leaf block - %d\n", next); 1299 BUG_ON(npath != NULL); 1300 npath = ext4_ext_find_extent(inode, next, NULL); 1301 if (IS_ERR(npath)) 1302 return PTR_ERR(npath); 1303 BUG_ON(npath->p_depth != path->p_depth); 1304 eh = npath[depth].p_hdr; 1305 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 1306 ext_debug("next leaf isnt full(%d)\n", 1307 le16_to_cpu(eh->eh_entries)); 1308 path = npath; 1309 goto repeat; 1310 } 1311 ext_debug("next leaf has no free space(%d,%d)\n", 1312 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 1313 } 1314 1315 /* 1316 * There is no free space in the found leaf. 1317 * We're gonna add a new leaf in the tree. 1318 */ 1319 err = ext4_ext_create_new_leaf(handle, inode, path, newext); 1320 if (err) 1321 goto cleanup; 1322 depth = ext_depth(inode); 1323 eh = path[depth].p_hdr; 1324 1325has_space: 1326 nearex = path[depth].p_ext; 1327 1328 err = ext4_ext_get_access(handle, inode, path + depth); 1329 if (err) 1330 goto cleanup; 1331 1332 if (!nearex) { 1333 /* there is no extent in this leaf, create first one */ 1334 ext_debug("first extent in the leaf: %d:%llu:%d\n", 1335 le32_to_cpu(newext->ee_block), 1336 ext_pblock(newext), 1337 ext4_ext_get_actual_len(newext)); 1338 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1339 } else if (le32_to_cpu(newext->ee_block) 1340 > le32_to_cpu(nearex->ee_block)) { 1341/* BUG_ON(newext->ee_block == nearex->ee_block); */ 1342 if (nearex != EXT_LAST_EXTENT(eh)) { 1343 len = EXT_MAX_EXTENT(eh) - nearex; 1344 len = (len - 1) * sizeof(struct ext4_extent); 1345 len = len < 0 ? 0 : len; 1346 ext_debug("insert %d:%llu:%d after: nearest 0x%p, " 1347 "move %d from 0x%p to 0x%p\n", 1348 le32_to_cpu(newext->ee_block), 1349 ext_pblock(newext), 1350 ext4_ext_get_actual_len(newext), 1351 nearex, len, nearex + 1, nearex + 2); 1352 memmove(nearex + 2, nearex + 1, len); 1353 } 1354 path[depth].p_ext = nearex + 1; 1355 } else { 1356 BUG_ON(newext->ee_block == nearex->ee_block); 1357 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); 1358 len = len < 0 ? 0 : len; 1359 ext_debug("insert %d:%llu:%d before: nearest 0x%p, " 1360 "move %d from 0x%p to 0x%p\n", 1361 le32_to_cpu(newext->ee_block), 1362 ext_pblock(newext), 1363 ext4_ext_get_actual_len(newext), 1364 nearex, len, nearex + 1, nearex + 2); 1365 memmove(nearex + 1, nearex, len); 1366 path[depth].p_ext = nearex; 1367 } 1368 1369 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1); 1370 nearex = path[depth].p_ext; 1371 nearex->ee_block = newext->ee_block; 1372 nearex->ee_start = newext->ee_start; 1373 nearex->ee_start_hi = newext->ee_start_hi; 1374 nearex->ee_len = newext->ee_len; 1375 1376merge: 1377 /* try to merge extents to the right */ 1378 ext4_ext_try_to_merge(inode, path, nearex); 1379 1380 /* try to merge extents to the left */ 1381 1382 /* time to correct all indexes above */ 1383 err = ext4_ext_correct_indexes(handle, inode, path); 1384 if (err) 1385 goto cleanup; 1386 1387 err = ext4_ext_dirty(handle, inode, path + depth); 1388 1389cleanup: 1390 if (npath) { 1391 ext4_ext_drop_refs(npath); 1392 kfree(npath); 1393 } 1394 ext4_ext_tree_changed(inode); 1395 ext4_ext_invalidate_cache(inode); 1396 return err; 1397} 1398 1399int ext4_ext_walk_space(struct inode *inode, unsigned long block, 1400 unsigned long num, ext_prepare_callback func, 1401 void *cbdata) 1402{ 1403 struct ext4_ext_path *path = NULL; 1404 struct ext4_ext_cache cbex; 1405 struct ext4_extent *ex; 1406 unsigned long next, start = 0, end = 0; 1407 unsigned long last = block + num; 1408 int depth, exists, err = 0; 1409 1410 BUG_ON(func == NULL); 1411 BUG_ON(inode == NULL); 1412 1413 while (block < last && block != EXT_MAX_BLOCK) { 1414 num = last - block; 1415 /* find extent for this block */ 1416 path = ext4_ext_find_extent(inode, block, path); 1417 if (IS_ERR(path)) { 1418 err = PTR_ERR(path); 1419 path = NULL; 1420 break; 1421 } 1422 1423 depth = ext_depth(inode); 1424 BUG_ON(path[depth].p_hdr == NULL); 1425 ex = path[depth].p_ext; 1426 next = ext4_ext_next_allocated_block(path); 1427 1428 exists = 0; 1429 if (!ex) { 1430 /* there is no extent yet, so try to allocate 1431 * all requested space */ 1432 start = block; 1433 end = block + num; 1434 } else if (le32_to_cpu(ex->ee_block) > block) { 1435 /* need to allocate space before found extent */ 1436 start = block; 1437 end = le32_to_cpu(ex->ee_block); 1438 if (block + num < end) 1439 end = block + num; 1440 } else if (block >= le32_to_cpu(ex->ee_block) 1441 + ext4_ext_get_actual_len(ex)) { 1442 /* need to allocate space after found extent */ 1443 start = block; 1444 end = block + num; 1445 if (end >= next) 1446 end = next; 1447 } else if (block >= le32_to_cpu(ex->ee_block)) { 1448 /* 1449 * some part of requested space is covered 1450 * by found extent 1451 */ 1452 start = block; 1453 end = le32_to_cpu(ex->ee_block) 1454 + ext4_ext_get_actual_len(ex); 1455 if (block + num < end) 1456 end = block + num; 1457 exists = 1; 1458 } else { 1459 BUG(); 1460 } 1461 BUG_ON(end <= start); 1462 1463 if (!exists) { 1464 cbex.ec_block = start; 1465 cbex.ec_len = end - start; 1466 cbex.ec_start = 0; 1467 cbex.ec_type = EXT4_EXT_CACHE_GAP; 1468 } else { 1469 cbex.ec_block = le32_to_cpu(ex->ee_block); 1470 cbex.ec_len = ext4_ext_get_actual_len(ex); 1471 cbex.ec_start = ext_pblock(ex); 1472 cbex.ec_type = EXT4_EXT_CACHE_EXTENT; 1473 } 1474 1475 BUG_ON(cbex.ec_len == 0); 1476 err = func(inode, path, &cbex, cbdata); 1477 ext4_ext_drop_refs(path); 1478 1479 if (err < 0) 1480 break; 1481 if (err == EXT_REPEAT) 1482 continue; 1483 else if (err == EXT_BREAK) { 1484 err = 0; 1485 break; 1486 } 1487 1488 if (ext_depth(inode) != depth) { 1489 /* depth was changed. we have to realloc path */ 1490 kfree(path); 1491 path = NULL; 1492 } 1493 1494 block = cbex.ec_block + cbex.ec_len; 1495 } 1496 1497 if (path) { 1498 ext4_ext_drop_refs(path); 1499 kfree(path); 1500 } 1501 1502 return err; 1503} 1504 1505static void 1506ext4_ext_put_in_cache(struct inode *inode, __u32 block, 1507 __u32 len, __u32 start, int type) 1508{ 1509 struct ext4_ext_cache *cex; 1510 BUG_ON(len == 0); 1511 cex = &EXT4_I(inode)->i_cached_extent; 1512 cex->ec_type = type; 1513 cex->ec_block = block; 1514 cex->ec_len = len; 1515 cex->ec_start = start; 1516} 1517 1518/* 1519 * ext4_ext_put_gap_in_cache: 1520 * calculate boundaries of the gap that the requested block fits into 1521 * and cache this gap 1522 */ 1523static void 1524ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 1525 unsigned long block) 1526{ 1527 int depth = ext_depth(inode); 1528 unsigned long lblock, len; 1529 struct ext4_extent *ex; 1530 1531 ex = path[depth].p_ext; 1532 if (ex == NULL) { 1533 /* there is no extent yet, so gap is [0;-] */ 1534 lblock = 0; 1535 len = EXT_MAX_BLOCK; 1536 ext_debug("cache gap(whole file):"); 1537 } else if (block < le32_to_cpu(ex->ee_block)) { 1538 lblock = block; 1539 len = le32_to_cpu(ex->ee_block) - block; 1540 ext_debug("cache gap(before): %lu [%lu:%lu]", 1541 (unsigned long) block, 1542 (unsigned long) le32_to_cpu(ex->ee_block), 1543 (unsigned long) ext4_ext_get_actual_len(ex)); 1544 } else if (block >= le32_to_cpu(ex->ee_block) 1545 + ext4_ext_get_actual_len(ex)) { 1546 lblock = le32_to_cpu(ex->ee_block) 1547 + ext4_ext_get_actual_len(ex); 1548 len = ext4_ext_next_allocated_block(path); 1549 ext_debug("cache gap(after): [%lu:%lu] %lu", 1550 (unsigned long) le32_to_cpu(ex->ee_block), 1551 (unsigned long) ext4_ext_get_actual_len(ex), 1552 (unsigned long) block); 1553 BUG_ON(len == lblock); 1554 len = len - lblock; 1555 } else { 1556 lblock = len = 0; 1557 BUG(); 1558 } 1559 1560 ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len); 1561 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); 1562} 1563 1564static int 1565ext4_ext_in_cache(struct inode *inode, unsigned long block, 1566 struct ext4_extent *ex) 1567{ 1568 struct ext4_ext_cache *cex; 1569 1570 cex = &EXT4_I(inode)->i_cached_extent; 1571 1572 /* has cache valid data? */ 1573 if (cex->ec_type == EXT4_EXT_CACHE_NO) 1574 return EXT4_EXT_CACHE_NO; 1575 1576 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && 1577 cex->ec_type != EXT4_EXT_CACHE_EXTENT); 1578 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { 1579 ex->ee_block = cpu_to_le32(cex->ec_block); 1580 ext4_ext_store_pblock(ex, cex->ec_start); 1581 ex->ee_len = cpu_to_le16(cex->ec_len); 1582 ext_debug("%lu cached by %lu:%lu:%llu\n", 1583 (unsigned long) block, 1584 (unsigned long) cex->ec_block, 1585 (unsigned long) cex->ec_len, 1586 cex->ec_start); 1587 return cex->ec_type; 1588 } 1589 1590 /* not in cache */ 1591 return EXT4_EXT_CACHE_NO; 1592} 1593 1594/* 1595 * ext4_ext_rm_idx: 1596 * removes index from the index block. 1597 * It's used in truncate case only, thus all requests are for 1598 * last index in the block only. 1599 */ 1600int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 1601 struct ext4_ext_path *path) 1602{ 1603 struct buffer_head *bh; 1604 int err; 1605 ext4_fsblk_t leaf; 1606 1607 /* free index block */ 1608 path--; 1609 leaf = idx_pblock(path->p_idx); 1610 BUG_ON(path->p_hdr->eh_entries == 0); 1611 err = ext4_ext_get_access(handle, inode, path); 1612 if (err) 1613 return err; 1614 path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1); 1615 err = ext4_ext_dirty(handle, inode, path); 1616 if (err) 1617 return err; 1618 ext_debug("index is empty, remove it, free block %llu\n", leaf); 1619 bh = sb_find_get_block(inode->i_sb, leaf); 1620 ext4_forget(handle, 1, inode, bh, leaf); 1621 ext4_free_blocks(handle, inode, leaf, 1); 1622 return err; 1623} 1624 1625/* 1626 * ext4_ext_calc_credits_for_insert: 1627 * This routine returns max. credits that the extent tree can consume. 1628 * It should be OK for low-performance paths like ->writepage() 1629 * To allow many writing processes to fit into a single transaction, 1630 * the caller should calculate credits under truncate_mutex and 1631 * pass the actual path. 1632 */ 1633int ext4_ext_calc_credits_for_insert(struct inode *inode, 1634 struct ext4_ext_path *path) 1635{ 1636 int depth, needed; 1637 1638 if (path) { 1639 /* probably there is space in leaf? */ 1640 depth = ext_depth(inode); 1641 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 1642 < le16_to_cpu(path[depth].p_hdr->eh_max)) 1643 return 1; 1644 } 1645 1646 /* 1647 * given 32-bit logical block (4294967296 blocks), max. tree 1648 * can be 4 levels in depth -- 4 * 340^4 == 53453440000. 1649 * Let's also add one more level for imbalance. 1650 */ 1651 depth = 5; 1652 1653 /* allocation of new data block(s) */ 1654 needed = 2; 1655 1656 /* 1657 * tree can be full, so it would need to grow in depth: 1658 * we need one credit to modify old root, credits for 1659 * new root will be added in split accounting 1660 */ 1661 needed += 1; 1662 1663 /* 1664 * Index split can happen, we would need: 1665 * allocate intermediate indexes (bitmap + group) 1666 * + change two blocks at each level, but root (already included) 1667 */ 1668 needed += (depth * 2) + (depth * 2); 1669 1670 /* any allocation modifies superblock */ 1671 needed += 1; 1672 1673 return needed; 1674} 1675 1676static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 1677 struct ext4_extent *ex, 1678 unsigned long from, unsigned long to) 1679{ 1680 struct buffer_head *bh; 1681 unsigned short ee_len = ext4_ext_get_actual_len(ex); 1682 int i; 1683 1684#ifdef EXTENTS_STATS 1685 { 1686 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1687 spin_lock(&sbi->s_ext_stats_lock); 1688 sbi->s_ext_blocks += ee_len; 1689 sbi->s_ext_extents++; 1690 if (ee_len < sbi->s_ext_min) 1691 sbi->s_ext_min = ee_len; 1692 if (ee_len > sbi->s_ext_max) 1693 sbi->s_ext_max = ee_len; 1694 if (ext_depth(inode) > sbi->s_depth_max) 1695 sbi->s_depth_max = ext_depth(inode); 1696 spin_unlock(&sbi->s_ext_stats_lock); 1697 } 1698#endif 1699 if (from >= le32_to_cpu(ex->ee_block) 1700 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 1701 /* tail removal */ 1702 unsigned long num; 1703 ext4_fsblk_t start; 1704 num = le32_to_cpu(ex->ee_block) + ee_len - from; 1705 start = ext_pblock(ex) + ee_len - num; 1706 ext_debug("free last %lu blocks starting %llu\n", num, start); 1707 for (i = 0; i < num; i++) { 1708 bh = sb_find_get_block(inode->i_sb, start + i); 1709 ext4_forget(handle, 0, inode, bh, start + i); 1710 } 1711 ext4_free_blocks(handle, inode, start, num); 1712 } else if (from == le32_to_cpu(ex->ee_block) 1713 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 1714 printk("strange request: removal %lu-%lu from %u:%u\n", 1715 from, to, le32_to_cpu(ex->ee_block), ee_len); 1716 } else { 1717 printk("strange request: removal(2) %lu-%lu from %u:%u\n", 1718 from, to, le32_to_cpu(ex->ee_block), ee_len); 1719 } 1720 return 0; 1721} 1722 1723static int 1724ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 1725 struct ext4_ext_path *path, unsigned long start) 1726{ 1727 int err = 0, correct_index = 0; 1728 int depth = ext_depth(inode), credits; 1729 struct ext4_extent_header *eh; 1730 unsigned a, b, block, num; 1731 unsigned long ex_ee_block; 1732 unsigned short ex_ee_len; 1733 unsigned uninitialized = 0; 1734 struct ext4_extent *ex; 1735 1736 ext_debug("truncate since %lu in leaf\n", start); 1737 if (!path[depth].p_hdr) 1738 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 1739 eh = path[depth].p_hdr; 1740 BUG_ON(eh == NULL); 1741 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max)); 1742 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC); 1743 1744 /* find where to start removing */ 1745 ex = EXT_LAST_EXTENT(eh); 1746 1747 ex_ee_block = le32_to_cpu(ex->ee_block); 1748 if (ext4_ext_is_uninitialized(ex)) 1749 uninitialized = 1; 1750 ex_ee_len = ext4_ext_get_actual_len(ex); 1751 1752 while (ex >= EXT_FIRST_EXTENT(eh) && 1753 ex_ee_block + ex_ee_len > start) { 1754 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); 1755 path[depth].p_ext = ex; 1756 1757 a = ex_ee_block > start ? ex_ee_block : start; 1758 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? 1759 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; 1760 1761 ext_debug(" border %u:%u\n", a, b); 1762 1763 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { 1764 block = 0; 1765 num = 0; 1766 BUG(); 1767 } else if (a != ex_ee_block) { 1768 /* remove tail of the extent */ 1769 block = ex_ee_block; 1770 num = a - block; 1771 } else if (b != ex_ee_block + ex_ee_len - 1) { 1772 /* remove head of the extent */ 1773 block = a; 1774 num = b - a; 1775 /* there is no "make a hole" API yet */ 1776 BUG(); 1777 } else { 1778 /* remove whole extent: excellent! */ 1779 block = ex_ee_block; 1780 num = 0; 1781 BUG_ON(a != ex_ee_block); 1782 BUG_ON(b != ex_ee_block + ex_ee_len - 1); 1783 } 1784 1785 /* at present, extent can't cross block group: */ 1786 /* leaf + bitmap + group desc + sb + inode */ 1787 credits = 5; 1788 if (ex == EXT_FIRST_EXTENT(eh)) { 1789 correct_index = 1; 1790 credits += (ext_depth(inode)) + 1; 1791 } 1792#ifdef CONFIG_QUOTA 1793 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 1794#endif 1795 1796 handle = ext4_ext_journal_restart(handle, credits); 1797 if (IS_ERR(handle)) { 1798 err = PTR_ERR(handle); 1799 goto out; 1800 } 1801 1802 err = ext4_ext_get_access(handle, inode, path + depth); 1803 if (err) 1804 goto out; 1805 1806 err = ext4_remove_blocks(handle, inode, ex, a, b); 1807 if (err) 1808 goto out; 1809 1810 if (num == 0) { 1811 /* this extent is removed; mark slot entirely unused */ 1812 ext4_ext_store_pblock(ex, 0); 1813 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1); 1814 } 1815 1816 ex->ee_block = cpu_to_le32(block); 1817 ex->ee_len = cpu_to_le16(num); 1818 if (uninitialized) 1819 ext4_ext_mark_uninitialized(ex); 1820 1821 err = ext4_ext_dirty(handle, inode, path + depth); 1822 if (err) 1823 goto out; 1824 1825 ext_debug("new extent: %u:%u:%llu\n", block, num, 1826 ext_pblock(ex)); 1827 ex--; 1828 ex_ee_block = le32_to_cpu(ex->ee_block); 1829 ex_ee_len = ext4_ext_get_actual_len(ex); 1830 } 1831 1832 if (correct_index && eh->eh_entries) 1833 err = ext4_ext_correct_indexes(handle, inode, path); 1834 1835 /* if this leaf is free, then we should 1836 * remove it from index block above */ 1837 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 1838 err = ext4_ext_rm_idx(handle, inode, path + depth); 1839 1840out: 1841 return err; 1842} 1843 1844/* 1845 * ext4_ext_more_to_rm: 1846 * returns 1 if current index has to be freed (even partial) 1847 */ 1848static int 1849ext4_ext_more_to_rm(struct ext4_ext_path *path) 1850{ 1851 BUG_ON(path->p_idx == NULL); 1852 1853 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 1854 return 0; 1855 1856 /* 1857 * if truncate on deeper level happened, it wasn't partial, 1858 * so we have to consider current index for truncation 1859 */ 1860 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 1861 return 0; 1862 return 1; 1863} 1864 1865int ext4_ext_remove_space(struct inode *inode, unsigned long start) 1866{ 1867 struct super_block *sb = inode->i_sb; 1868 int depth = ext_depth(inode); 1869 struct ext4_ext_path *path; 1870 handle_t *handle; 1871 int i = 0, err = 0; 1872 1873 ext_debug("truncate since %lu\n", start); 1874 1875 /* probably first extent we're gonna free will be last in block */ 1876 handle = ext4_journal_start(inode, depth + 1); 1877 if (IS_ERR(handle)) 1878 return PTR_ERR(handle); 1879 1880 ext4_ext_invalidate_cache(inode); 1881 1882 /* 1883 * We start scanning from right side, freeing all the blocks 1884 * after i_size and walking into the tree depth-wise. 1885 */ 1886 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL); 1887 if (path == NULL) { 1888 ext4_journal_stop(handle); 1889 return -ENOMEM; 1890 } 1891 path[0].p_hdr = ext_inode_hdr(inode); 1892 if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) { 1893 err = -EIO; 1894 goto out; 1895 } 1896 path[0].p_depth = depth; 1897 1898 while (i >= 0 && err == 0) { 1899 if (i == depth) { 1900 /* this is leaf block */ 1901 err = ext4_ext_rm_leaf(handle, inode, path, start); 1902 /* root level has p_bh == NULL, brelse() eats this */ 1903 brelse(path[i].p_bh); 1904 path[i].p_bh = NULL; 1905 i--; 1906 continue; 1907 } 1908 1909 /* this is index block */ 1910 if (!path[i].p_hdr) { 1911 ext_debug("initialize header\n"); 1912 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 1913 if (ext4_ext_check_header(__FUNCTION__, inode, 1914 path[i].p_hdr)) { 1915 err = -EIO; 1916 goto out; 1917 } 1918 } 1919 1920 BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries) 1921 > le16_to_cpu(path[i].p_hdr->eh_max)); 1922 BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC); 1923 1924 if (!path[i].p_idx) { 1925 /* this level hasn't been touched yet */ 1926 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 1927 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 1928 ext_debug("init index ptr: hdr 0x%p, num %d\n", 1929 path[i].p_hdr, 1930 le16_to_cpu(path[i].p_hdr->eh_entries)); 1931 } else { 1932 /* we were already here, see at next index */ 1933 path[i].p_idx--; 1934 } 1935 1936 ext_debug("level %d - index, first 0x%p, cur 0x%p\n", 1937 i, EXT_FIRST_INDEX(path[i].p_hdr), 1938 path[i].p_idx); 1939 if (ext4_ext_more_to_rm(path + i)) { 1940 /* go to the next level */ 1941 ext_debug("move to level %d (block %llu)\n", 1942 i + 1, idx_pblock(path[i].p_idx)); 1943 memset(path + i + 1, 0, sizeof(*path)); 1944 path[i+1].p_bh = 1945 sb_bread(sb, idx_pblock(path[i].p_idx)); 1946 if (!path[i+1].p_bh) { 1947 /* should we reset i_size? */ 1948 err = -EIO; 1949 break; 1950 } 1951 1952 /* save actual number of indexes since this 1953 * number is changed at the next iteration */ 1954 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 1955 i++; 1956 } else { 1957 /* we finished processing this index, go up */ 1958 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 1959 /* index is empty, remove it; 1960 * handle must be already prepared by the 1961 * truncatei_leaf() */ 1962 err = ext4_ext_rm_idx(handle, inode, path + i); 1963 } 1964 /* root level has p_bh == NULL, brelse() eats this */ 1965 brelse(path[i].p_bh); 1966 path[i].p_bh = NULL; 1967 i--; 1968 ext_debug("return to level %d\n", i); 1969 } 1970 } 1971 1972 /* TODO: flexible tree reduction should be here */ 1973 if (path->p_hdr->eh_entries == 0) { 1974 /* 1975 * truncate to zero freed all the tree, 1976 * so we need to correct eh_depth 1977 */ 1978 err = ext4_ext_get_access(handle, inode, path); 1979 if (err == 0) { 1980 ext_inode_hdr(inode)->eh_depth = 0; 1981 ext_inode_hdr(inode)->eh_max = 1982 cpu_to_le16(ext4_ext_space_root(inode)); 1983 err = ext4_ext_dirty(handle, inode, path); 1984 } 1985 } 1986out: 1987 ext4_ext_tree_changed(inode); 1988 ext4_ext_drop_refs(path); 1989 kfree(path); 1990 ext4_journal_stop(handle); 1991 1992 return err; 1993} 1994 1995/* 1996 * called at mount time 1997 */ 1998void ext4_ext_init(struct super_block *sb) 1999{ 2000 /* 2001 * possible initialization would be here 2002 */ 2003 2004 if (test_opt(sb, EXTENTS)) { 2005 printk("EXT4-fs: file extents enabled"); 2006#ifdef AGGRESSIVE_TEST 2007 printk(", aggressive tests"); 2008#endif 2009#ifdef CHECK_BINSEARCH 2010 printk(", check binsearch"); 2011#endif 2012#ifdef EXTENTS_STATS 2013 printk(", stats"); 2014#endif 2015 printk("\n"); 2016#ifdef EXTENTS_STATS 2017 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2018 EXT4_SB(sb)->s_ext_min = 1 << 30; 2019 EXT4_SB(sb)->s_ext_max = 0; 2020#endif 2021 } 2022} 2023 2024/* 2025 * called at umount time 2026 */ 2027void ext4_ext_release(struct super_block *sb) 2028{ 2029 if (!test_opt(sb, EXTENTS)) 2030 return; 2031 2032#ifdef EXTENTS_STATS 2033 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 2034 struct ext4_sb_info *sbi = EXT4_SB(sb); 2035 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 2036 sbi->s_ext_blocks, sbi->s_ext_extents, 2037 sbi->s_ext_blocks / sbi->s_ext_extents); 2038 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 2039 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 2040 } 2041#endif 2042} 2043 2044/* 2045 * This function is called by ext4_ext_get_blocks() if someone tries to write 2046 * to an uninitialized extent. It may result in splitting the uninitialized 2047 * extent into multiple extents (upto three - one initialized and two 2048 * uninitialized). 2049 * There are three possibilities: 2050 * a> There is no split required: Entire extent should be initialized 2051 * b> Splits in two extents: Write is happening at either end of the extent 2052 * c> Splits in three extents: Somone is writing in middle of the extent 2053 */ 2054int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, 2055 struct ext4_ext_path *path, 2056 ext4_fsblk_t iblock, 2057 unsigned long max_blocks) 2058{ 2059 struct ext4_extent *ex, newex; 2060 struct ext4_extent *ex1 = NULL; 2061 struct ext4_extent *ex2 = NULL; 2062 struct ext4_extent *ex3 = NULL; 2063 struct ext4_extent_header *eh; 2064 unsigned int allocated, ee_block, ee_len, depth; 2065 ext4_fsblk_t newblock; 2066 int err = 0; 2067 int ret = 0; 2068 2069 depth = ext_depth(inode); 2070 eh = path[depth].p_hdr; 2071 ex = path[depth].p_ext; 2072 ee_block = le32_to_cpu(ex->ee_block); 2073 ee_len = ext4_ext_get_actual_len(ex); 2074 allocated = ee_len - (iblock - ee_block); 2075 newblock = iblock - ee_block + ext_pblock(ex); 2076 ex2 = ex; 2077 2078 /* ex1: ee_block to iblock - 1 : uninitialized */ 2079 if (iblock > ee_block) { 2080 ex1 = ex; 2081 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2082 ext4_ext_mark_uninitialized(ex1); 2083 ex2 = &newex; 2084 } 2085 /* 2086 * for sanity, update the length of the ex2 extent before 2087 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2088 * overlap of blocks. 2089 */ 2090 if (!ex1 && allocated > max_blocks) 2091 ex2->ee_len = cpu_to_le16(max_blocks); 2092 /* ex3: to ee_block + ee_len : uninitialised */ 2093 if (allocated > max_blocks) { 2094 unsigned int newdepth; 2095 ex3 = &newex; 2096 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2097 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2098 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2099 ext4_ext_mark_uninitialized(ex3); 2100 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2101 if (err) 2102 goto out; 2103 /* 2104 * The depth, and hence eh & ex might change 2105 * as part of the insert above. 2106 */ 2107 newdepth = ext_depth(inode); 2108 if (newdepth != depth) { 2109 depth = newdepth; 2110 path = ext4_ext_find_extent(inode, iblock, NULL); 2111 if (IS_ERR(path)) { 2112 err = PTR_ERR(path); 2113 path = NULL; 2114 goto out; 2115 } 2116 eh = path[depth].p_hdr; 2117 ex = path[depth].p_ext; 2118 if (ex2 != &newex) 2119 ex2 = ex; 2120 } 2121 allocated = max_blocks; 2122 } 2123 /* 2124 * If there was a change of depth as part of the 2125 * insertion of ex3 above, we need to update the length 2126 * of the ex1 extent again here 2127 */ 2128 if (ex1 && ex1 != ex) { 2129 ex1 = ex; 2130 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2131 ext4_ext_mark_uninitialized(ex1); 2132 ex2 = &newex; 2133 } 2134 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2135 ex2->ee_block = cpu_to_le32(iblock); 2136 ex2->ee_start = cpu_to_le32(newblock); 2137 ext4_ext_store_pblock(ex2, newblock); 2138 ex2->ee_len = cpu_to_le16(allocated); 2139 if (ex2 != ex) 2140 goto insert; 2141 err = ext4_ext_get_access(handle, inode, path + depth); 2142 if (err) 2143 goto out; 2144 /* 2145 * New (initialized) extent starts from the first block 2146 * in the current extent. i.e., ex2 == ex 2147 * We have to see if it can be merged with the extent 2148 * on the left. 2149 */ 2150 if (ex2 > EXT_FIRST_EXTENT(eh)) { 2151 /* 2152 * To merge left, pass "ex2 - 1" to try_to_merge(), 2153 * since it merges towards right _only_. 2154 */ 2155 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); 2156 if (ret) { 2157 err = ext4_ext_correct_indexes(handle, inode, path); 2158 if (err) 2159 goto out; 2160 depth = ext_depth(inode); 2161 ex2--; 2162 } 2163 } 2164 /* 2165 * Try to Merge towards right. This might be required 2166 * only when the whole extent is being written to. 2167 * i.e. ex2 == ex and ex3 == NULL. 2168 */ 2169 if (!ex3) { 2170 ret = ext4_ext_try_to_merge(inode, path, ex2); 2171 if (ret) { 2172 err = ext4_ext_correct_indexes(handle, inode, path); 2173 if (err) 2174 goto out; 2175 } 2176 } 2177 /* Mark modified extent as dirty */ 2178 err = ext4_ext_dirty(handle, inode, path + depth); 2179 goto out; 2180insert: 2181 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2182out: 2183 return err ? err : allocated; 2184} 2185 2186int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2187 ext4_fsblk_t iblock, 2188 unsigned long max_blocks, struct buffer_head *bh_result, 2189 int create, int extend_disksize) 2190{ 2191 struct ext4_ext_path *path = NULL; 2192 struct ext4_extent_header *eh; 2193 struct ext4_extent newex, *ex; 2194 ext4_fsblk_t goal, newblock; 2195 int err = 0, depth, ret; 2196 unsigned long allocated = 0; 2197 2198 __clear_bit(BH_New, &bh_result->b_state); 2199 ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock, 2200 max_blocks, (unsigned) inode->i_ino); 2201 mutex_lock(&EXT4_I(inode)->truncate_mutex); 2202 2203 /* check in cache */ 2204 goal = ext4_ext_in_cache(inode, iblock, &newex); 2205 if (goal) { 2206 if (goal == EXT4_EXT_CACHE_GAP) { 2207 if (!create) { 2208 /* 2209 * block isn't allocated yet and 2210 * user doesn't want to allocate it 2211 */ 2212 goto out2; 2213 } 2214 /* we should allocate requested block */ 2215 } else if (goal == EXT4_EXT_CACHE_EXTENT) { 2216 /* block is already allocated */ 2217 newblock = iblock 2218 - le32_to_cpu(newex.ee_block) 2219 + ext_pblock(&newex); 2220 /* number of remaining blocks in the extent */ 2221 allocated = le16_to_cpu(newex.ee_len) - 2222 (iblock - le32_to_cpu(newex.ee_block)); 2223 goto out; 2224 } else { 2225 BUG(); 2226 } 2227 } 2228 2229 /* find extent for this block */ 2230 path = ext4_ext_find_extent(inode, iblock, NULL); 2231 if (IS_ERR(path)) { 2232 err = PTR_ERR(path); 2233 path = NULL; 2234 goto out2; 2235 } 2236 2237 depth = ext_depth(inode); 2238 2239 /* 2240 * consistent leaf must not be empty; 2241 * this situation is possible, though, _during_ tree modification; 2242 * this is why assert can't be put in ext4_ext_find_extent() 2243 */ 2244 BUG_ON(path[depth].p_ext == NULL && depth != 0); 2245 eh = path[depth].p_hdr; 2246 2247 ex = path[depth].p_ext; 2248 if (ex) { 2249 unsigned long ee_block = le32_to_cpu(ex->ee_block); 2250 ext4_fsblk_t ee_start = ext_pblock(ex); 2251 unsigned short ee_len; 2252 2253 /* 2254 * Uninitialized extents are treated as holes, except that 2255 * we split out initialized portions during a write. 2256 */ 2257 ee_len = ext4_ext_get_actual_len(ex); 2258 /* if found extent covers block, simply return it */ 2259 if (iblock >= ee_block && iblock < ee_block + ee_len) { 2260 newblock = iblock - ee_block + ee_start; 2261 /* number of remaining blocks in the extent */ 2262 allocated = ee_len - (iblock - ee_block); 2263 ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, 2264 ee_block, ee_len, newblock); 2265 2266 /* Do not put uninitialized extent in the cache */ 2267 if (!ext4_ext_is_uninitialized(ex)) { 2268 ext4_ext_put_in_cache(inode, ee_block, 2269 ee_len, ee_start, 2270 EXT4_EXT_CACHE_EXTENT); 2271 goto out; 2272 } 2273 if (create == EXT4_CREATE_UNINITIALIZED_EXT) 2274 goto out; 2275 if (!create) 2276 goto out2; 2277 2278 ret = ext4_ext_convert_to_initialized(handle, inode, 2279 path, iblock, 2280 max_blocks); 2281 if (ret <= 0) 2282 goto out2; 2283 else 2284 allocated = ret; 2285 goto outnew; 2286 } 2287 } 2288 2289 /* 2290 * requested block isn't allocated yet; 2291 * we couldn't try to create block if create flag is zero 2292 */ 2293 if (!create) { 2294 /* 2295 * put just found gap into cache to speed up 2296 * subsequent requests 2297 */ 2298 ext4_ext_put_gap_in_cache(inode, path, iblock); 2299 goto out2; 2300 } 2301 /* 2302 * Okay, we need to do block allocation. Lazily initialize the block 2303 * allocation info here if necessary. 2304 */ 2305 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) 2306 ext4_init_block_alloc_info(inode); 2307 2308 /* allocate new block */ 2309 goal = ext4_ext_find_goal(inode, path, iblock); 2310 2311 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 2312 newex.ee_block = cpu_to_le32(iblock); 2313 newex.ee_len = cpu_to_le16(max_blocks); 2314 err = ext4_ext_check_overlap(inode, &newex, path); 2315 if (err) 2316 allocated = le16_to_cpu(newex.ee_len); 2317 else 2318 allocated = max_blocks; 2319 newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err); 2320 if (!newblock) 2321 goto out2; 2322 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2323 goal, newblock, allocated); 2324 2325 /* try to insert new extent into found leaf and return */ 2326 ext4_ext_store_pblock(&newex, newblock); 2327 newex.ee_len = cpu_to_le16(allocated); 2328 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ 2329 ext4_ext_mark_uninitialized(&newex); 2330 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2331 if (err) { 2332 /* free data blocks we just allocated */ 2333 ext4_free_blocks(handle, inode, ext_pblock(&newex), 2334 le16_to_cpu(newex.ee_len)); 2335 goto out2; 2336 } 2337 2338 if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) 2339 EXT4_I(inode)->i_disksize = inode->i_size; 2340 2341 /* previous routine could use block we allocated */ 2342 newblock = ext_pblock(&newex); 2343outnew: 2344 __set_bit(BH_New, &bh_result->b_state); 2345 2346 /* Cache only when it is _not_ an uninitialized extent */ 2347 if (create != EXT4_CREATE_UNINITIALIZED_EXT) 2348 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 2349 EXT4_EXT_CACHE_EXTENT); 2350out: 2351 if (allocated > max_blocks) 2352 allocated = max_blocks; 2353 ext4_ext_show_leaf(inode, path); 2354 __set_bit(BH_Mapped, &bh_result->b_state); 2355 bh_result->b_bdev = inode->i_sb->s_bdev; 2356 bh_result->b_blocknr = newblock; 2357out2: 2358 if (path) { 2359 ext4_ext_drop_refs(path); 2360 kfree(path); 2361 } 2362 mutex_unlock(&EXT4_I(inode)->truncate_mutex); 2363 2364 return err ? err : allocated; 2365} 2366 2367void ext4_ext_truncate(struct inode * inode, struct page *page) 2368{ 2369 struct address_space *mapping = inode->i_mapping; 2370 struct super_block *sb = inode->i_sb; 2371 unsigned long last_block; 2372 handle_t *handle; 2373 int err = 0; 2374 2375 /* 2376 * probably first extent we're gonna free will be last in block 2377 */ 2378 err = ext4_writepage_trans_blocks(inode) + 3; 2379 handle = ext4_journal_start(inode, err); 2380 if (IS_ERR(handle)) { 2381 if (page) { 2382 clear_highpage(page); 2383 flush_dcache_page(page); 2384 unlock_page(page); 2385 page_cache_release(page); 2386 } 2387 return; 2388 } 2389 2390 if (page) 2391 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 2392 2393 mutex_lock(&EXT4_I(inode)->truncate_mutex); 2394 ext4_ext_invalidate_cache(inode); 2395 2396 /* 2397 * TODO: optimization is possible here. 2398 * Probably we need not scan at all, 2399 * because page truncation is enough. 2400 */ 2401 if (ext4_orphan_add(handle, inode)) 2402 goto out_stop; 2403 2404 /* we have to know where to truncate from in crash case */ 2405 EXT4_I(inode)->i_disksize = inode->i_size; 2406 ext4_mark_inode_dirty(handle, inode); 2407 2408 last_block = (inode->i_size + sb->s_blocksize - 1) 2409 >> EXT4_BLOCK_SIZE_BITS(sb); 2410 err = ext4_ext_remove_space(inode, last_block); 2411 2412 /* In a multi-transaction truncate, we only make the final 2413 * transaction synchronous. 2414 */ 2415 if (IS_SYNC(inode)) 2416 handle->h_sync = 1; 2417 2418out_stop: 2419 /* 2420 * If this was a simple ftruncate() and the file will remain alive, 2421 * then we need to clear up the orphan record which we created above. 2422 * However, if this was a real unlink then we were called by 2423 * ext4_delete_inode(), and we allow that function to clean up the 2424 * orphan info for us. 2425 */ 2426 if (inode->i_nlink) 2427 ext4_orphan_del(handle, inode); 2428 2429 mutex_unlock(&EXT4_I(inode)->truncate_mutex); 2430 ext4_journal_stop(handle); 2431} 2432 2433/* 2434 * ext4_ext_writepage_trans_blocks: 2435 * calculate max number of blocks we could modify 2436 * in order to allocate new block for an inode 2437 */ 2438int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) 2439{ 2440 int needed; 2441 2442 needed = ext4_ext_calc_credits_for_insert(inode, NULL); 2443 2444 /* caller wants to allocate num blocks, but note it includes sb */ 2445 needed = needed * num - (num - 1); 2446 2447#ifdef CONFIG_QUOTA 2448 needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2449#endif 2450 2451 return needed; 2452} 2453 2454/* 2455 * preallocate space for a file. This implements ext4's fallocate inode 2456 * operation, which gets called from sys_fallocate system call. 2457 * For block-mapped files, posix_fallocate should fall back to the method 2458 * of writing zeroes to the required new blocks (the same behavior which is 2459 * expected for file systems which do not support fallocate() system call). 2460 */ 2461long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 2462{ 2463 handle_t *handle; 2464 ext4_fsblk_t block, max_blocks; 2465 ext4_fsblk_t nblocks = 0; 2466 int ret = 0; 2467 int ret2 = 0; 2468 int retries = 0; 2469 struct buffer_head map_bh; 2470 unsigned int credits, blkbits = inode->i_blkbits; 2471 2472 /* 2473 * currently supporting (pre)allocate mode for extent-based 2474 * files _only_ 2475 */ 2476 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 2477 return -EOPNOTSUPP; 2478 2479 /* preallocation to directories is currently not supported */ 2480 if (S_ISDIR(inode->i_mode)) 2481 return -ENODEV; 2482 2483 block = offset >> blkbits; 2484 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 2485 - block; 2486 2487 /* 2488 * credits to insert 1 extent into extent tree + buffers to be able to 2489 * modify 1 super block, 1 block bitmap and 1 group descriptor. 2490 */ 2491 credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; 2492retry: 2493 while (ret >= 0 && ret < max_blocks) { 2494 block = block + ret; 2495 max_blocks = max_blocks - ret; 2496 handle = ext4_journal_start(inode, credits); 2497 if (IS_ERR(handle)) { 2498 ret = PTR_ERR(handle); 2499 break; 2500 } 2501 2502 ret = ext4_ext_get_blocks(handle, inode, block, 2503 max_blocks, &map_bh, 2504 EXT4_CREATE_UNINITIALIZED_EXT, 0); 2505 WARN_ON(!ret); 2506 if (!ret) { 2507 ext4_error(inode->i_sb, "ext4_fallocate", 2508 "ext4_ext_get_blocks returned 0! inode#%lu" 2509 ", block=%llu, max_blocks=%llu", 2510 inode->i_ino, block, max_blocks); 2511 ret = -EIO; 2512 ext4_mark_inode_dirty(handle, inode); 2513 ret2 = ext4_journal_stop(handle); 2514 break; 2515 } 2516 if (ret > 0) { 2517 /* check wrap through sign-bit/zero here */ 2518 if ((block + ret) < 0 || (block + ret) < block) { 2519 ret = -EIO; 2520 ext4_mark_inode_dirty(handle, inode); 2521 ret2 = ext4_journal_stop(handle); 2522 break; 2523 } 2524 if (buffer_new(&map_bh) && ((block + ret) > 2525 (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits) 2526 >> blkbits))) 2527 nblocks = nblocks + ret; 2528 } 2529 2530 /* Update ctime if new blocks get allocated */ 2531 if (nblocks) { 2532 struct timespec now; 2533 2534 now = current_fs_time(inode->i_sb); 2535 if (!timespec_equal(&inode->i_ctime, &now)) 2536 inode->i_ctime = now; 2537 } 2538 2539 ext4_mark_inode_dirty(handle, inode); 2540 ret2 = ext4_journal_stop(handle); 2541 if (ret2) 2542 break; 2543 } 2544 2545 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2546 goto retry; 2547 2548 /* 2549 * Time to update the file size. 2550 * Update only when preallocation was requested beyond the file size. 2551 */ 2552 if (!(mode & FALLOC_FL_KEEP_SIZE) && 2553 (offset + len) > i_size_read(inode)) { 2554 if (ret > 0) { 2555 /* 2556 * if no error, we assume preallocation succeeded 2557 * completely 2558 */ 2559 mutex_lock(&inode->i_mutex); 2560 i_size_write(inode, offset + len); 2561 EXT4_I(inode)->i_disksize = i_size_read(inode); 2562 mutex_unlock(&inode->i_mutex); 2563 } else if (ret < 0 && nblocks) { 2564 /* Handle partial allocation scenario */ 2565 loff_t newsize; 2566 2567 mutex_lock(&inode->i_mutex); 2568 newsize = (nblocks << blkbits) + i_size_read(inode); 2569 i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits)); 2570 EXT4_I(inode)->i_disksize = i_size_read(inode); 2571 mutex_unlock(&inode->i_mutex); 2572 } 2573 } 2574 2575 return ret > 0 ? ret2 : ret; 2576} 2577