extents.c revision ee4bd0d963b75cbad9bfb59b547146671c7a655a
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
					due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */

#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */

static __le32 ext4_extent_block_csum(struct inode *inode,
				     struct ext4_extent_header *eh)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
			   EXT4_EXTENT_TAIL_OFFSET(eh));
	return cpu_to_le32(csum);
}

static int ext4_extent_block_csum_verify(struct inode *inode,
					 struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return 1;

	et = find_ext4_extent_tail(eh);
	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
		return 0;
	return 1;
}

static void ext4_extent_block_csum_set(struct inode *inode,
				       struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return;

	et = find_ext4_extent_tail(eh);
	et->et_checksum = ext4_extent_block_csum(inode, eh);
}

static int ext4_split_extent(handle_t *handle,
				struct inode *inode,
				struct ext4_ext_path **ppath,
				struct ext4_map_blocks *map,
				int split_flag,
				int flags);

static int ext4_split_extent_at(handle_t *handle,
			     struct inode *inode,
			     struct ext4_ext_path **ppath,
			     ext4_lblk_t split,
			     int split_flag,
			     int flags);

static int ext4_find_delayed_extent(struct inode *inode,
				    struct extent_status *newes);

static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits > needed)
		return 0;
	err = ext4_journal_extend(handle, needed);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	if (err == 0)
		err = -EAGAIN;

	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	if (path->p_bh) {
		/* path points to block */
		BUFFER_TRACE(path->p_bh, "get_write_access");
		return ext4_journal_get_write_access(handle, path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return 0;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
		     struct inode *inode, struct ext4_ext_path *path)
{
	int err;

	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (path->p_bh) {
		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
		/* path points to block */
		err = __ext4_handle_dirty_metadata(where, line, handle,
						   inode, path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	if (path) {
		int depth = path->p_depth;
		struct ext4_extent *ex;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * an ELF object sections out-of-order but in a way
		 * that eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file.  However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space.  Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	return ext4_inode_to_goal_block(inode);
}

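/*
 * Worked example for ext4_ext_find_goal() (illustrative numbers, not
 * from the original source): if the nearest extent in the path maps
 * logical block 100 to physical block 5000, then a request for logical
 * block 164 yields the goal 5000 + (164 - 100) = 5064, i.e. the
 * placement is extrapolated linearly from the closest known mapping.
 */
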
/*
 * Allocation for a meta data block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err, unsigned int flags)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
					NULL, err);
	return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 6)
		size = 6;
#endif
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 5)
		size = 5;
#endif
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 3)
		size = 3;
#endif
	return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 4)
		size = 4;
#endif
	return size;
}

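/*
 * Worked example (assuming a 4 KiB block size; illustrative numbers,
 * not from the original source): struct ext4_extent, struct
 * ext4_extent_idx and the header are 12 bytes each, so an external tree
 * block holds (4096 - 12) / 12 = 340 entries, while the root inside the
 * 60-byte i_data area holds only (60 - 12) / 12 = 4 entries.
 */
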
static inline int
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
			   int nofail)
{
	struct ext4_ext_path *path = *ppath;
	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);

	return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
			EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
			(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL : 0));
}

/*
 * Calculate the number of metadata blocks needed
 * to allocate @blocks.
 * Worst case is one block per extent.
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs;

	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
	 */
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		int num = 0;

		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
	 */
	ei->i_da_metadata_calc_len = 1;
	ei->i_da_metadata_calc_last_lblock = lblock;
	return ext_depth(inode) + 1;
}

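/*
 * Worked example (assuming 4 KiB blocks, so idxs = (4096 - 12) / 12 = 340;
 * illustrative, not from the original source): while delayed-allocation
 * blocks stay contiguous most calls return 0; roughly every 340th block
 * of the run charges one new leaf-index block, every 340^2-th block one
 * more for the next level up, and every 340^3-th block a third.  The
 * first block of a discontiguous run falls back to the worst case,
 * ext_depth(inode) + 1.
 */
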
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
	int max;

	if (depth == ext_depth(inode)) {
		if (depth == 0)
			max = ext4_ext_space_root(inode, 1);
		else
			max = ext4_ext_space_root_idx(inode, 1);
	} else {
		if (depth == 0)
			max = ext4_ext_space_block(inode, 1);
		else
			max = ext4_ext_space_block_idx(inode, 1);
	}

	return max;
}

static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
	ext4_fsblk_t block = ext4_ext_pblock(ext);
	int len = ext4_ext_get_actual_len(ext);
	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
	ext4_lblk_t last = lblock + len - 1;

	if (lblock > last)
		return 0;
	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}

static int ext4_valid_extent_idx(struct inode *inode,
				struct ext4_extent_idx *ext_idx)
{
	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}

static int ext4_valid_extent_entries(struct inode *inode,
				struct ext4_extent_header *eh,
				int depth)
{
	unsigned short entries;
	if (eh->eh_entries == 0)
		return 1;

	entries = le16_to_cpu(eh->eh_entries);

	if (depth == 0) {
		/* leaf entries */
		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
		ext4_fsblk_t pblock = 0;
		ext4_lblk_t lblock = 0;
		ext4_lblk_t prev = 0;
		int len = 0;
		while (entries) {
			if (!ext4_valid_extent(inode, ext))
				return 0;

			/* Check for overlapping extents */
			lblock = le32_to_cpu(ext->ee_block);
			len = ext4_ext_get_actual_len(ext);
			if ((lblock <= prev) && prev) {
				pblock = ext4_ext_pblock(ext);
				es->s_last_error_block = cpu_to_le64(pblock);
				return 0;
			}
			ext++;
			entries--;
			prev = lblock + len - 1;
		}
	} else {
		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
		while (entries) {
			if (!ext4_valid_extent_idx(inode, ext_idx))
				return 0;
			ext_idx++;
			entries--;
		}
	}
	return 1;
}

static int __ext4_ext_check(const char *function, unsigned int line,
			    struct inode *inode, struct ext4_extent_header *eh,
			    int depth, ext4_fsblk_t pblk)
{
	const char *error_msg;
	int max = 0;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	if (!ext4_valid_extent_entries(inode, eh, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	/* Verify checksum on non-root extent tree nodes */
	if (ext_depth(inode) != depth &&
	    !ext4_extent_block_csum_verify(inode, eh)) {
		error_msg = "extent tree corrupted";
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error_inode(inode, function, line, 0,
			 "pblk %llu bad header/extent: %s - magic %x, "
			 "entries %u, max %u(%u), depth %u(%u)",
			 (unsigned long long) pblk, error_msg,
			 le16_to_cpu(eh->eh_magic),
			 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
			 max, le16_to_cpu(eh->eh_depth), depth);
	return -EIO;
}

#define ext4_ext_check(inode, eh, depth, pblk)			\
	__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))

int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}

static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
			 struct inode *inode, ext4_fsblk_t pblk, int depth,
			 int flags)
{
	struct buffer_head *bh;
	int err;

	bh = sb_getblk(inode->i_sb, pblk);
	if (unlikely(!bh))
		return ERR_PTR(-ENOMEM);

	if (!bh_uptodate_or_lock(bh)) {
		trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
		err = bh_submit_read(bh);
		if (err < 0)
			goto errout;
	}
	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
		return bh;
	err = __ext4_ext_check(function, line, inode,
			       ext_block_hdr(bh), depth, pblk);
	if (err)
		goto errout;
	set_buffer_verified(bh);
	/*
	 * If this is a leaf block, cache all of its entries
	 */
	if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
		struct ext4_extent_header *eh = ext_block_hdr(bh);
		struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
		ext4_lblk_t prev = 0;
		int i;

		for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
			unsigned int status = EXTENT_STATUS_WRITTEN;
			ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
			int len = ext4_ext_get_actual_len(ex);

			if (prev && (prev != lblk))
				ext4_es_cache_extent(inode, prev,
						     lblk - prev, ~0,
						     EXTENT_STATUS_HOLE);

			if (ext4_ext_is_unwritten(ex))
				status = EXTENT_STATUS_UNWRITTEN;
			ext4_es_cache_extent(inode, lblk, len,
					     ext4_ext_pblock(ex), status);
			prev = lblk + len;
		}
	}
	return bh;
errout:
	put_bh(bh);
	return ERR_PTR(err);

}

#define read_extent_tree_block(inode, pblk, depth, flags)		\
	__read_extent_tree_block(__func__, __LINE__, (inode), (pblk),	\
				 (depth), (flags))

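/*
 * Example of the leaf caching above (illustrative, not from the original
 * source): a leaf with extents [0, +10) and [20, +5) caches two WRITTEN
 * ranges plus the gap [10, 20) as an EXTENT_STATUS_HOLE entry, so later
 * lookups in the hole are answered from the extent status tree without
 * re-reading the leaf block.
 */
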
/*
 * This function is called to cache a file's extent information in the
 * extent status tree
 */
int ext4_ext_precache(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_ext_path *path = NULL;
	struct buffer_head *bh;
	int i = 0, depth, ret = 0;

	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		return 0;	/* not an extent-mapped inode */

	down_read(&ei->i_data_sem);
	depth = ext_depth(inode);

	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
		       GFP_NOFS);
	if (path == NULL) {
		up_read(&ei->i_data_sem);
		return -ENOMEM;
	}

	/* Don't cache anything if there are no external extent blocks */
	if (depth == 0)
		goto out;
	path[0].p_hdr = ext_inode_hdr(inode);
	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
	if (ret)
		goto out;
	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
	while (i >= 0) {
		/*
		 * If this is a leaf block or we've reached the end of
		 * the index block, go up
		 */
		if ((i == depth) ||
		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
			brelse(path[i].p_bh);
			path[i].p_bh = NULL;
			i--;
			continue;
		}
		bh = read_extent_tree_block(inode,
					    ext4_idx_pblock(path[i].p_idx++),
					    depth - i - 1,
					    EXT4_EX_FORCE_CACHE);
		if (IS_ERR(bh)) {
			ret = PTR_ERR(bh);
			break;
		}
		i++;
		path[i].p_bh = bh;
		path[i].p_hdr = ext_block_hdr(bh);
		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
	}
	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
	up_read(&ei->i_data_sem);
	ext4_ext_drop_refs(path);
	kfree(path);
	return ret;
}

#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
	int k, l = path->p_depth;

	ext_debug("path:");
	for (k = 0; k <= l; k++, path++) {
		if (path->p_idx) {
			ext_debug("  %d->%llu",
				  le32_to_cpu(path->p_idx->ei_block),
				  ext4_idx_pblock(path->p_idx));
		} else if (path->p_ext) {
			ext_debug("  %d:[%d]%d:%llu ",
				  le32_to_cpu(path->p_ext->ee_block),
				  ext4_ext_is_unwritten(path->p_ext),
				  ext4_ext_get_actual_len(path->p_ext),
				  ext4_ext_pblock(path->p_ext));
		} else
			ext_debug("  []");
	}
	ext_debug("\n");
}

static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
	int depth = ext_depth(inode);
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int i;

	if (!path)
		return;

	eh = path[depth].p_hdr;
	ex = EXT_FIRST_EXTENT(eh);

	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);

	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
			  ext4_ext_is_unwritten(ex),
			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
	}
	ext_debug("\n");
}

static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
			ext4_fsblk_t newblock, int level)
{
	int depth = ext_depth(inode);
	struct ext4_extent *ex;

	if (depth != level) {
		struct ext4_extent_idx *idx;
		idx = path[level].p_idx;
		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
			ext_debug("%d: move %d:%llu in new index %llu\n", level,
					le32_to_cpu(idx->ei_block),
					ext4_idx_pblock(idx),
					newblock);
			idx++;
		}

		return;
	}

	ex = path[depth].p_ext;
	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
				le32_to_cpu(ex->ee_block),
				ext4_ext_pblock(ex),
				ext4_ext_is_unwritten(ex),
				ext4_ext_get_actual_len(ex),
				newblock);
		ex++;
	}
}

#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#define ext4_ext_show_move(inode, path, newblock, level)
#endif

void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth, i;

	if (!path)
		return;
	depth = path->p_depth;
	for (i = 0; i <= depth; i++, path++)
		if (path->p_bh) {
			brelse(path->p_bh);
			path->p_bh = NULL;
		}
}

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;


	ext_debug("binsearch for %u(idx):  ", block);

	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	path->p_idx = l - 1;
	ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
			if (k != 0 &&
			    le32_to_cpu(ix->ei_block) <=
			    le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					   <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}

/*
 * ext4_ext_binsearch:
 * binary search for the closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u:  ", block);

	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
				m, le32_to_cpu(m->ee_block),
				r, le32_to_cpu(r->ee_block));
	}

	path->p_ext = l - 1;
	ext_debug("  -> %d:%llu:[%d]%d ",
			le32_to_cpu(path->p_ext->ee_block),
			ext4_ext_pblock(path->p_ext),
			ext4_ext_is_unwritten(path->p_ext),
			ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
					  <= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}

int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
	struct ext4_extent_header *eh;

	eh = ext_inode_hdr(inode);
	eh->eh_depth = 0;
	eh->eh_entries = 0;
	eh->eh_magic = EXT4_EXT_MAGIC;
	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
	ext4_mark_inode_dirty(handle, inode);
	return 0;
}

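/*
 * Example of the binary search convention above (illustrative, not from
 * the original source): both searches return the rightmost entry whose
 * start is <= the target block.  For index entries starting at
 * {0, 100, 300}, a lookup of block 250 narrows l/r until l points one
 * past the match and then takes l - 1, i.e. the entry starting at 100.
 */
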
struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
		     struct ext4_ext_path **orig_path, int flags)
{
	struct ext4_extent_header *eh;
	struct buffer_head *bh;
	struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
	short int depth, i, ppos = 0;
	int ret;

	eh = ext_inode_hdr(inode);
	depth = ext_depth(inode);

	if (path) {
		ext4_ext_drop_refs(path);
		if (depth > path[0].p_maxdepth) {
			kfree(path);
			*orig_path = path = NULL;
		}
	}
	if (!path) {
		/* account possible depth increase */
		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
				GFP_NOFS);
		if (unlikely(!path))
			return ERR_PTR(-ENOMEM);
		path[0].p_maxdepth = depth + 1;
	}
	path[0].p_hdr = eh;
	path[0].p_bh = NULL;

	i = depth;
	/* walk through the tree */
	while (i) {
		ext_debug("depth %d: num %d, max %d\n",
			  ppos, le16_to_cpu(eh->eh_entries),
			  le16_to_cpu(eh->eh_max));

		ext4_ext_binsearch_idx(inode, path + ppos, block);
		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
		path[ppos].p_depth = i;
		path[ppos].p_ext = NULL;

		bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
					    flags);
		if (unlikely(IS_ERR(bh))) {
			ret = PTR_ERR(bh);
			goto err;
		}

		eh = ext_block_hdr(bh);
		ppos++;
		if (unlikely(ppos > depth)) {
			put_bh(bh);
			EXT4_ERROR_INODE(inode,
					 "ppos %d > depth %d", ppos, depth);
			ret = -EIO;
			goto err;
		}
		path[ppos].p_bh = bh;
		path[ppos].p_hdr = eh;
	}

	path[ppos].p_depth = i;
	path[ppos].p_ext = NULL;
	path[ppos].p_idx = NULL;

	/* find extent */
	ext4_ext_binsearch(inode, path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].p_ext)
		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	ext4_ext_drop_refs(path);
	kfree(path);
	if (orig_path)
		*orig_path = NULL;
	return ERR_PTR(ret);
}

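/*
 * Shape of the returned path (illustrative, not from the original
 * source): for a depth-2 tree, path[0] holds the root header and the
 * chosen root index, path[1] the interior index block, and path[2] the
 * leaf, with path[2].p_ext pointing at the extent closest to @block.
 * Callers consistently index the leaf level as path[ext_depth(inode)].
 */
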
/*
 * ext4_ext_insert_index:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				 struct ext4_ext_path *curp,
				 int logical, ext4_fsblk_t ptr)
{
	struct ext4_extent_idx *ix;
	int len, err;

	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		return err;

	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d == ei_block %d!",
				 logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EIO;
	}

	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
		EXT4_ERROR_INODE(inode,
				 "eh_entries %d >= eh_max %d!",
				 le16_to_cpu(curp->p_hdr->eh_entries),
				 le16_to_cpu(curp->p_hdr->eh_max));
		return -EIO;
	}

	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
		ext_debug("insert new index %d after: %llu\n", logical, ptr);
		ix = curp->p_idx + 1;
	} else {
		/* insert before */
		ext_debug("insert new index %d before: %llu\n", logical, ptr);
		ix = curp->p_idx;
	}

	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
	BUG_ON(len < 0);
	if (len > 0) {
		ext_debug("insert new index %d: "
				"move %d indices from 0x%p to 0x%p\n",
				logical, len, ix, ix + 1);
		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
	}

	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
		return -EIO;
	}

	ix->ei_block = cpu_to_le32(logical);
	ext4_idx_store_pblock(ix, ptr);
	le16_add_cpu(&curp->p_hdr->eh_entries, 1);

	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
		return -EIO;
	}

	err = ext4_ext_dirty(handle, inode, curp);
	ext4_std_error(inode->i_sb, err);

	return err;
}

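/*
 * Worked example of the shift above (illustrative, not from the original
 * source): with indexes starting at {0, 100, 300} and curp->p_idx at the
 * entry for 100, inserting logical 200 picks ix = p_idx + 1, memmoves
 * the single trailing entry (300) one slot right, and writes the new
 * index into the gap, giving {0, 100, 200, 300}.
 */
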
/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
			  unsigned int flags,
			  struct ext4_ext_path *path,
			  struct ext4_extent *newext, int at)
{
	struct buffer_head *bh = NULL;
	int depth = ext_depth(inode);
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	int i = at, k, m, a;
	ext4_fsblk_t newblock, oldblock;
	__le32 border;
	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
	int err = 0;

	/* make decision: where to split? */
	/* FIXME: now decision is simplest: at current extent */

	/* if current leaf will be split, then we should use
	 * border from split point */
	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
		return -EIO;
	}
	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
		border = path[depth].p_ext[1].ee_block;
		ext_debug("leaf will be split."
				" next leaf starts at %d\n",
				  le32_to_cpu(border));
	} else {
		border = newext->ee_block;
		ext_debug("leaf will be added."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	}

	/*
	 * If an error occurs, then we break processing
	 * and mark the filesystem read-only.  The index won't
	 * be inserted and the tree will be in a consistent
	 * state.  The next mount will repair buffers too.
	 */

	/*
	 * Get an array to track all allocated blocks.
	 * We need this to handle errors and free blocks
	 * upon them.
	 */
	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
	if (!ablocks)
		return -ENOMEM;

	/* allocate all needed blocks */
	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
						   newext, &err, flags);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
	}

	/* initialize new leaf */
	newblock = ablocks[--a];
	if (unlikely(newblock == 0)) {
		EXT4_ERROR_INODE(inode, "newblock == 0!");
		err = -EIO;
		goto cleanup;
	}
	bh = sb_getblk(inode->i_sb, newblock);
	if (unlikely(!bh)) {
		err = -ENOMEM;
		goto cleanup;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err)
		goto cleanup;

	neh = ext_block_hdr(bh);
	neh->eh_entries = 0;
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;

	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
		     path[depth].p_hdr->eh_max)) {
		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
				 path[depth].p_hdr->eh_entries,
				 path[depth].p_hdr->eh_max);
		err = -EIO;
		goto cleanup;
	}
	/* start copy from next extent */
	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
	ext4_ext_show_move(inode, path, newblock, depth);
	if (m) {
		struct ext4_extent *ex;
		ex = EXT_FIRST_EXTENT(neh);
		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
		le16_add_cpu(&neh->eh_entries, m);
	}

	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto cleanup;
	brelse(bh);
	bh = NULL;

	/* correct old leaf */
	if (m) {
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto cleanup;
		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto cleanup;

	}

	/* create intermediate indexes */
	k = depth - at - 1;
	if (unlikely(k < 0)) {
		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
		err = -EIO;
		goto cleanup;
	}
	if (k)
		ext_debug("create %d intermediate indices\n", k);
	/* insert new index into current index block */
	/* current depth stored in i var */
	i = depth - 1;
	while (k--) {
		oldblock = newblock;
		newblock = ablocks[--a];
		bh = sb_getblk(inode->i_sb, newblock);
		if (unlikely(!bh)) {
			err = -ENOMEM;
			goto cleanup;
		}
		lock_buffer(bh);

		err = ext4_journal_get_create_access(handle, bh);
		if (err)
			goto cleanup;

		neh = ext_block_hdr(bh);
		neh->eh_entries = cpu_to_le16(1);
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);

		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
				i, newblock, le32_to_cpu(border), oldblock);

		/* move remainder of path[i] to the new index block */
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
					 le32_to_cpu(path[i].p_ext->ee_block));
			err = -EIO;
			goto cleanup;
		}
		/* start copy indexes */
		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
				EXT_MAX_INDEX(path[i].p_hdr));
		ext4_ext_show_move(inode, path, newblock, i);
		if (m) {
			memmove(++fidx, path[i].p_idx,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
		ext4_extent_block_csum_set(inode, neh);
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto cleanup;
		brelse(bh);
		bh = NULL;

		/* correct old index */
		if (m) {
			err = ext4_ext_get_access(handle, inode, path + i);
			if (err)
				goto cleanup;
			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
			err = ext4_ext_dirty(handle, inode, path + i);
			if (err)
				goto cleanup;
		}

		i--;
	}

	/* insert new index */
	err = ext4_ext_insert_index(handle, inode, path + at,
				    le32_to_cpu(border), newblock);

cleanup:
	if (bh) {
		if (buffer_locked(bh))
			unlock_buffer(bh);
		brelse(bh);
	}

	if (err) {
		/* free all allocated blocks in error case */
		for (i = 0; i < depth; i++) {
			if (!ablocks[i])
				continue;
			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
		}
	}
	kfree(ablocks);

	return err;
}

/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
				 unsigned int flags,
				 struct ext4_extent *newext)
{
	struct ext4_extent_header *neh;
	struct buffer_head *bh;
	ext4_fsblk_t newblock;
	int err = 0;

	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
		newext, &err, flags);
	if (newblock == 0)
		return err;

	bh = sb_getblk(inode->i_sb, newblock);
	if (unlikely(!bh))
		return -ENOMEM;
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	/* move top-level index/leaf into new block */
	memmove(bh->b_data, EXT4_I(inode)->i_data,
		sizeof(EXT4_I(inode)->i_data));

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate eh_max the right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* Update top-level index: num,max,pointer */
	neh = ext_inode_hdr(inode);
	neh->eh_entries = cpu_to_le16(1);
	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
	if (neh->eh_depth == 0) {
		/* Root extent block becomes index block */
		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
		EXT_FIRST_INDEX(neh)->ei_block =
			EXT_FIRST_EXTENT(neh)->ee_block;
	}
	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

	le16_add_cpu(&neh->eh_depth, 1);
	ext4_mark_inode_dirty(handle, inode);
out:
	brelse(bh);

	return err;
}

/*
 * ext4_ext_create_new_leaf:
 * finds empty index and adds new leaf.
 * if no free index is found, then it requests in-depth growing.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
				    unsigned int mb_flags,
				    unsigned int gb_flags,
				    struct ext4_ext_path **ppath,
				    struct ext4_extent *newext)
{
	struct ext4_ext_path *path = *ppath;
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	i = depth = ext_depth(inode);

	/* walk up the tree and look for a free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
		if (err)
			goto out;

		/* refill path */
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    ppath, gb_flags);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
		if (err)
			goto out;

		/* refill path */
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    ppath, gb_flags);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}

/*
 * search the closest allocated block to the left for *logical
 * and returns it at @logical + its physical address at @phys.
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
static int ext4_ext_search_left(struct inode *inode,
				struct ext4_ext_path *path,
				ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
				  depth);
				return -EIO;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext4_ext_pblock(ex) + ee_len - 1;
	return 0;
}

/*
 * search the closest allocated block to the right for *logical
 * and returns it at @logical + its physical address at @phys.
 * if *logical is the largest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
static int ext4_ext_search_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
				 struct ext4_extent **ret_ex)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	ext4_fsblk_t block;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "first_extent(path[%d].p_hdr) != ex",
					 depth);
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
						 "ix != EXT_FIRST_INDEX *logical %d!",
						 *logical);
				return -EIO;
			}
		}
		goto found_extent;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		goto found_extent;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	block = ext4_idx_pblock(ix);
	while (++depth < path->p_depth) {
		/* subtract from p_depth to get proper eh_depth */
		bh = read_extent_tree_block(inode, block,
					    path->p_depth - depth, 0);
		if (IS_ERR(bh))
			return PTR_ERR(bh);
		eh = ext_block_hdr(bh);
		ix = EXT_FIRST_INDEX(eh);
		block = ext4_idx_pblock(ix);
		put_bh(bh);
	}

	bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
	if (IS_ERR(bh))
		return PTR_ERR(bh);
	eh = ext_block_hdr(bh);
	ex = EXT_FIRST_EXTENT(eh);
found_extent:
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext4_ext_pblock(ex);
	*ret_ex = ex;
	if (bh)
		put_bh(bh);
	return 0;
}

/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	if (depth == 0 && path->p_ext == NULL)
		return EXT_MAX_BLOCKS;

	while (depth >= 0) {
		if (depth == path->p_depth) {
			/* leaf */
			if (path[depth].p_ext &&
				path[depth].p_ext !=
					EXT_LAST_EXTENT(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_ext[1].ee_block);
		} else {
			/* index */
			if (path[depth].p_idx !=
					EXT_LAST_INDEX(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_idx[1].ei_block);
		}
		depth--;
	}

	return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	/* a zero-depth tree has no leaf blocks at all */
	if (depth == 0)
		return EXT_MAX_BLOCKS;

	/* go to index block */
	depth--;

	while (depth >= 0) {
		if (path[depth].p_idx !=
				EXT_LAST_INDEX(path[depth].p_hdr))
			return (ext4_lblk_t)
				le32_to_cpu(path[depth].p_idx[1].ei_block);
		depth--;
	}

	return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_correct_indexes:
 * if leaf gets modified and modified extent is first in the leaf,
 * then we have to correct all indexes above.
 * TODO: do we need to correct tree in all cases?
 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	int depth = ext_depth(inode);
	struct ext4_extent *ex;
	__le32 border;
	int k, err = 0;

	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	if (unlikely(ex == NULL || eh == NULL)) {
		EXT4_ERROR_INODE(inode,
				 "ex %p == NULL or eh %p == NULL", ex, eh);
		return -EIO;
	}

	if (depth == 0) {
		/* there is no tree at all */
		return 0;
	}

	if (ex != EXT_FIRST_EXTENT(eh)) {
		/* we correct tree if first leaf got modified only */
		return 0;
	}

	/*
	 * TODO: we need correction if border is smaller than current one
	 */
	k = depth - 1;
	border = path[depth].p_ext->ee_block;
	err = ext4_ext_get_access(handle, inode, path + k);
	if (err)
		return err;
	path[k].p_idx->ei_block = border;
	err = ext4_ext_dirty(handle, inode, path + k);
	if (err)
		return err;

	while (k--) {
		/* change all left-side indexes */
		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
			break;
		err = ext4_ext_get_access(handle, inode, path + k);
		if (err)
			break;
		path[k].p_idx->ei_block = border;
		err = ext4_ext_dirty(handle, inode, path + k);
		if (err)
			break;
	}

	return err;
}

int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
				struct ext4_extent *ex2)
{
	unsigned short ext1_ee_len, ext2_ee_len;

	/*
	 * Make sure that both extents are initialized. We don't merge
	 * unwritten extents so that we can be sure that end_io code has
	 * the extent that was written properly split out and conversion to
	 * initialized is trivial.
	 */
	if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
		return 0;

	ext1_ee_len = ext4_ext_get_actual_len(ex1);
	ext2_ee_len = ext4_ext_get_actual_len(ex2);

	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
			le32_to_cpu(ex2->ee_block))
		return 0;

	/*
	 * To allow future support for preallocated extents to be added
	 * as an RO_COMPAT feature, refuse to merge two extents if
	 * this can result in the top bit of ee_len being set.
	 */
	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
		return 0;
	if (ext4_ext_is_unwritten(ex1) &&
	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
		return 0;
#ifdef AGGRESSIVE_TEST
	if (ext1_ee_len >= 4)
		return 0;
#endif

	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
		return 1;
	return 0;
}

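/*
 * Worked example of the merge test (illustrative, not from the original
 * source): ex1 mapping logical [100, +8) to physical 5000 and ex2
 * mapping [108, +4) to physical 5008 are mergeable when both are
 * written: the logical ranges abut, the physical ranges abut, and the
 * combined length 12 stays below EXT_INIT_MAX_LEN.
 */
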
1758 */ 1759static int ext4_ext_try_to_merge_right(struct inode *inode, 1760 struct ext4_ext_path *path, 1761 struct ext4_extent *ex) 1762{ 1763 struct ext4_extent_header *eh; 1764 unsigned int depth, len; 1765 int merge_done = 0, unwritten; 1766 1767 depth = ext_depth(inode); 1768 BUG_ON(path[depth].p_hdr == NULL); 1769 eh = path[depth].p_hdr; 1770 1771 while (ex < EXT_LAST_EXTENT(eh)) { 1772 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1773 break; 1774 /* merge with next extent! */ 1775 unwritten = ext4_ext_is_unwritten(ex); 1776 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1777 + ext4_ext_get_actual_len(ex + 1)); 1778 if (unwritten) 1779 ext4_ext_mark_unwritten(ex); 1780 1781 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1782 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1783 * sizeof(struct ext4_extent); 1784 memmove(ex + 1, ex + 2, len); 1785 } 1786 le16_add_cpu(&eh->eh_entries, -1); 1787 merge_done = 1; 1788 WARN_ON(eh->eh_entries == 0); 1789 if (!eh->eh_entries) 1790 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); 1791 } 1792 1793 return merge_done; 1794} 1795 1796/* 1797 * This function does a very simple check to see if we can collapse 1798 * an extent tree with a single extent tree leaf block into the inode. 1799 */ 1800static void ext4_ext_try_to_merge_up(handle_t *handle, 1801 struct inode *inode, 1802 struct ext4_ext_path *path) 1803{ 1804 size_t s; 1805 unsigned max_root = ext4_ext_space_root(inode, 0); 1806 ext4_fsblk_t blk; 1807 1808 if ((path[0].p_depth != 1) || 1809 (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || 1810 (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) 1811 return; 1812 1813 /* 1814 * We need to modify the block allocation bitmap and the block 1815 * group descriptor to release the extent tree block. If we 1816 * can't get the journal credits, give up. 1817 */ 1818 if (ext4_journal_extend(handle, 2)) 1819 return; 1820 1821 /* 1822 * Copy the extent data up to the inode 1823 */ 1824 blk = ext4_idx_pblock(path[0].p_idx); 1825 s = le16_to_cpu(path[1].p_hdr->eh_entries) * 1826 sizeof(struct ext4_extent_idx); 1827 s += sizeof(struct ext4_extent_header); 1828 1829 path[1].p_maxdepth = path[0].p_maxdepth; 1830 memcpy(path[0].p_hdr, path[1].p_hdr, s); 1831 path[0].p_depth = 0; 1832 path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + 1833 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); 1834 path[0].p_hdr->eh_max = cpu_to_le16(max_root); 1835 1836 brelse(path[1].p_bh); 1837 ext4_free_blocks(handle, inode, NULL, blk, 1, 1838 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 1839} 1840 1841/* 1842 * This function tries to merge the @ex extent to neighbours in the tree. 1843 * return 1 if merge left else 0. 1844 */ 1845static void ext4_ext_try_to_merge(handle_t *handle, 1846 struct inode *inode, 1847 struct ext4_ext_path *path, 1848 struct ext4_extent *ex) { 1849 struct ext4_extent_header *eh; 1850 unsigned int depth; 1851 int merge_done = 0; 1852 1853 depth = ext_depth(inode); 1854 BUG_ON(path[depth].p_hdr == NULL); 1855 eh = path[depth].p_hdr; 1856 1857 if (ex > EXT_FIRST_EXTENT(eh)) 1858 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); 1859 1860 if (!merge_done) 1861 (void) ext4_ext_try_to_merge_right(inode, path, ex); 1862 1863 ext4_ext_try_to_merge_up(handle, inode, path); 1864} 1865 1866/* 1867 * check if a portion of the "newext" extent overlaps with an 1868 * existing extent. 1869 * 1870 * If there is an overlap discovered, it updates the length of the newext 1871 * such that there will be no overlap, and then returns 1. 
/*
 * check if a portion of the "newext" extent overlaps with an
 * existing extent.
 *
 * If there is an overlap discovered, it updates the length of the newext
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
					   struct inode *inode,
					   struct ext4_extent *newext,
					   struct ext4_ext_path *path)
{
	ext4_lblk_t b1, b2;
	unsigned int depth, len1;
	unsigned int ret = 0;

	b1 = le32_to_cpu(newext->ee_block);
	len1 = ext4_ext_get_actual_len(newext);
	depth = ext_depth(inode);
	if (!path[depth].p_ext)
		goto out;
	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));

	/*
	 * get the next allocated block if the extent in the path
	 * is before the requested block(s)
	 */
	if (b2 < b1) {
		b2 = ext4_ext_next_allocated_block(path);
		if (b2 == EXT_MAX_BLOCKS)
			goto out;
		b2 = EXT4_LBLK_CMASK(sbi, b2);
	}

	/* check for wrap through zero on extent logical start block */
	if (b1 + len1 < b1) {
		len1 = EXT_MAX_BLOCKS - b1;
		newext->ee_len = cpu_to_le16(len1);
		ret = 1;
	}

	/* check for overlap */
	if (b1 + len1 > b2) {
		newext->ee_len = cpu_to_le16(b2 - b1);
		ret = 1;
	}
out:
	return ret;
}

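/*
 * Worked example of the trim above (illustrative, not from the original
 * source): inserting newext at b1 = 50 with len1 = 20 while an existing
 * extent starts at b2 = 60 gives b1 + len1 = 70 > 60, so ee_len is
 * trimmed to b2 - b1 = 10 and the function returns 1; the caller then
 * allocates only the non-overlapping front portion.
 */
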
/*
 * ext4_ext_insert_extent:
 * tries to merge requested extent into the existing extent or
 * inserts requested extent as new one into the tree,
 * creating new leaf in the no-space case.
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path **ppath,
				struct ext4_extent *newext, int gb_flags)
{
	struct ext4_ext_path *path = *ppath;
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *fex;
	struct ext4_extent *nearex; /* nearest extent */
	struct ext4_ext_path *npath = NULL;
	int depth, len, err;
	ext4_lblk_t next;
	int mb_flags = 0, unwritten;

	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
		return -EIO;
	}
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	eh = path[depth].p_hdr;
	if (unlikely(path[depth].p_hdr == NULL)) {
		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
		return -EIO;
	}

	/* try to insert block into found extent and return */
	if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {

		/*
		 * Try to see whether we should rather test the extent on
		 * right from ex, or from the left of ex. This is because
		 * ext4_ext_find_extent() can return either extent on the
		 * left, or on the right from the searched position. This
		 * will make merging more effective.
		 */
		if (ex < EXT_LAST_EXTENT(eh) &&
		    (le32_to_cpu(ex->ee_block) +
		    ext4_ext_get_actual_len(ex) <
		    le32_to_cpu(newext->ee_block))) {
			ex += 1;
			goto prepend;
		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
			   (le32_to_cpu(newext->ee_block) +
			   ext4_ext_get_actual_len(newext) <
			   le32_to_cpu(ex->ee_block)))
			ex -= 1;

		/* Try to append newex to the ex */
		if (ext4_can_extents_be_merged(inode, ex, newext)) {
			ext_debug("append [%d]%d block to %u:[%d]%d"
				  "(from %llu)\n",
				  ext4_ext_is_unwritten(newext),
				  ext4_ext_get_actual_len(newext),
				  le32_to_cpu(ex->ee_block),
				  ext4_ext_is_unwritten(ex),
				  ext4_ext_get_actual_len(ex),
				  ext4_ext_pblock(ex));
			err = ext4_ext_get_access(handle, inode,
						  path + depth);
			if (err)
				return err;
			unwritten = ext4_ext_is_unwritten(ex);
			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
					+ ext4_ext_get_actual_len(newext));
			if (unwritten)
				ext4_ext_mark_unwritten(ex);
			eh = path[depth].p_hdr;
			nearex = ex;
			goto merge;
		}

prepend:
		/* Try to prepend newex to the ex */
		if (ext4_can_extents_be_merged(inode, newext, ex)) {
			ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
				  "(from %llu)\n",
				  le32_to_cpu(newext->ee_block),
				  ext4_ext_is_unwritten(newext),
				  ext4_ext_get_actual_len(newext),
				  le32_to_cpu(ex->ee_block),
				  ext4_ext_is_unwritten(ex),
				  ext4_ext_get_actual_len(ex),
				  ext4_ext_pblock(ex));
			err = ext4_ext_get_access(handle, inode,
						  path + depth);
			if (err)
				return err;

			unwritten = ext4_ext_is_unwritten(ex);
			ex->ee_block = newext->ee_block;
			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
					+ ext4_ext_get_actual_len(newext));
			if (unwritten)
				ext4_ext_mark_unwritten(ex);
			eh = path[depth].p_hdr;
			nearex = ex;
			goto merge;
		}
	}

	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
		goto has_space;

	/* probably next leaf has space for us? */
	fex = EXT_LAST_EXTENT(eh);
	next = EXT_MAX_BLOCKS;
	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
		next = ext4_ext_next_leaf_block(path);
	if (next != EXT_MAX_BLOCKS) {
		ext_debug("next leaf block - %u\n", next);
		BUG_ON(npath != NULL);
		npath = ext4_ext_find_extent(inode, next, NULL, 0);
		if (IS_ERR(npath))
			return PTR_ERR(npath);
		BUG_ON(npath->p_depth != path->p_depth);
		eh = npath[depth].p_hdr;
		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
			ext_debug("next leaf isn't full(%d)\n",
				  le16_to_cpu(eh->eh_entries));
			path = npath;
			goto has_space;
		}
		ext_debug("next leaf has no free space(%d,%d)\n",
			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
	}

	/*
	 * There is no free space in the found leaf.
	 * We're gonna add a new leaf in the tree.
	 */
2055 */
2056	if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2057		mb_flags = EXT4_MB_USE_RESERVED;
2058	err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2059				       ppath, newext);
2060	if (err)
2061		goto cleanup;
2062	depth = ext_depth(inode);
2063	eh = path[depth].p_hdr;
2064
2065has_space:
2066	nearex = path[depth].p_ext;
2067
2068	err = ext4_ext_get_access(handle, inode, path + depth);
2069	if (err)
2070		goto cleanup;
2071
2072	if (!nearex) {
2073		/* there is no extent in this leaf, create first one */
2074		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2075				le32_to_cpu(newext->ee_block),
2076				ext4_ext_pblock(newext),
2077				ext4_ext_is_unwritten(newext),
2078				ext4_ext_get_actual_len(newext));
2079		nearex = EXT_FIRST_EXTENT(eh);
2080	} else {
2081		if (le32_to_cpu(newext->ee_block)
2082			   > le32_to_cpu(nearex->ee_block)) {
2083			/* Insert after */
2084			ext_debug("insert %u:%llu:[%d]%d after: "
2085					"nearest %p\n",
2086					le32_to_cpu(newext->ee_block),
2087					ext4_ext_pblock(newext),
2088					ext4_ext_is_unwritten(newext),
2089					ext4_ext_get_actual_len(newext),
2090					nearex);
2091			nearex++;
2092		} else {
2093			/* Insert before */
2094			BUG_ON(newext->ee_block == nearex->ee_block);
2095			ext_debug("insert %u:%llu:[%d]%d before: "
2096					"nearest %p\n",
2097					le32_to_cpu(newext->ee_block),
2098					ext4_ext_pblock(newext),
2099					ext4_ext_is_unwritten(newext),
2100					ext4_ext_get_actual_len(newext),
2101					nearex);
2102		}
2103		len = EXT_LAST_EXTENT(eh) - nearex + 1;
2104		if (len > 0) {
2105			ext_debug("insert %u:%llu:[%d]%d: "
2106					"move %d extents from 0x%p to 0x%p\n",
2107					le32_to_cpu(newext->ee_block),
2108					ext4_ext_pblock(newext),
2109					ext4_ext_is_unwritten(newext),
2110					ext4_ext_get_actual_len(newext),
2111					len, nearex, nearex + 1);
2112			memmove(nearex + 1, nearex,
2113				len * sizeof(struct ext4_extent));
2114		}
2115	}
2116
2117	le16_add_cpu(&eh->eh_entries, 1);
2118	path[depth].p_ext = nearex;
2119	nearex->ee_block = newext->ee_block;
2120	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2121	nearex->ee_len = newext->ee_len;
2122
2123merge:
2124	/* try to merge extents */
2125	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2126		ext4_ext_try_to_merge(handle, inode, path, nearex);
2127
2128
2129	/* time to correct all indexes above */
2130	err = ext4_ext_correct_indexes(handle, inode, path);
2131	if (err)
2132		goto cleanup;
2133
2134	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2135
2136cleanup:
2137	ext4_ext_drop_refs(npath);
2138	kfree(npath);
2139	return err;
2140}
2141
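/*
 * Editorial aside: a minimal userspace sketch (kept out of the build) of
 * the adjacency rules behind the merge attempts above: two extents can be
 * merged only when they are logically and physically contiguous and the
 * combined length stays under a cap. The toy_* names and the cap value are
 * illustrative only; the real test, ext4_can_extents_be_merged(), also
 * checks the written/unwritten state, which this toy model ignores.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

struct toy_extent {
	uint32_t lblk;	/* first logical block */
	uint64_t pblk;	/* first physical block */
	uint32_t len;	/* number of blocks */
};

static int toy_extents_mergeable(const struct toy_extent *a,
				 const struct toy_extent *b,
				 uint32_t max_len)
{
	if (a->lblk + a->len != b->lblk)
		return 0;	/* not logically adjacent */
	if (a->pblk + a->len != b->pblk)
		return 0;	/* not physically adjacent */
	return a->len + b->len <= max_len;	/* merged length within cap */
}

int main(void)
{
	struct toy_extent x = { 100, 5000, 8 };
	struct toy_extent y = { 108, 5008, 4 };

	/* prints "mergeable: 1"; moving y would break adjacency */
	printf("mergeable: %d\n", toy_extents_mergeable(&x, &y, 32768));
	return 0;
}
#endif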
2142static int ext4_fill_fiemap_extents(struct inode *inode,
2143				    ext4_lblk_t block, ext4_lblk_t num,
2144				    struct fiemap_extent_info *fieinfo)
2145{
2146	struct ext4_ext_path *path = NULL;
2147	struct ext4_extent *ex;
2148	struct extent_status es;
2149	ext4_lblk_t next, next_del, start = 0, end = 0;
2150	ext4_lblk_t last = block + num;
2151	int exists, depth = 0, err = 0;
2152	unsigned int flags = 0;
2153	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2154
2155	while (block < last && block != EXT_MAX_BLOCKS) {
2156		num = last - block;
2157		/* find extent for this block */
2158		down_read(&EXT4_I(inode)->i_data_sem);
2159
2160		path = ext4_ext_find_extent(inode, block, &path, 0);
2161		if (IS_ERR(path)) {
2162			up_read(&EXT4_I(inode)->i_data_sem);
2163			err = PTR_ERR(path);
2164			path = NULL;
2165			break;
2166		}
2167
2168		depth = ext_depth(inode);
2169		if (unlikely(path[depth].p_hdr == NULL)) {
2170			up_read(&EXT4_I(inode)->i_data_sem);
2171			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2172			err = -EIO;
2173			break;
2174		}
2175		ex = path[depth].p_ext;
2176		next = ext4_ext_next_allocated_block(path);
2177
2178		flags = 0;
2179		exists = 0;
2180		if (!ex) {
2181			/* there is no extent yet, so try to allocate
2182			 * all requested space */
2183			start = block;
2184			end = block + num;
2185		} else if (le32_to_cpu(ex->ee_block) > block) {
2186			/* need to allocate space before found extent */
2187			start = block;
2188			end = le32_to_cpu(ex->ee_block);
2189			if (block + num < end)
2190				end = block + num;
2191		} else if (block >= le32_to_cpu(ex->ee_block)
2192					+ ext4_ext_get_actual_len(ex)) {
2193			/* need to allocate space after found extent */
2194			start = block;
2195			end = block + num;
2196			if (end >= next)
2197				end = next;
2198		} else if (block >= le32_to_cpu(ex->ee_block)) {
2199			/*
2200			 * some part of requested space is covered
2201			 * by found extent
2202			 */
2203			start = block;
2204			end = le32_to_cpu(ex->ee_block)
2205				+ ext4_ext_get_actual_len(ex);
2206			if (block + num < end)
2207				end = block + num;
2208			exists = 1;
2209		} else {
2210			BUG();
2211		}
2212		BUG_ON(end <= start);
2213
2214		if (!exists) {
2215			es.es_lblk = start;
2216			es.es_len = end - start;
2217			es.es_pblk = 0;
2218		} else {
2219			es.es_lblk = le32_to_cpu(ex->ee_block);
2220			es.es_len = ext4_ext_get_actual_len(ex);
2221			es.es_pblk = ext4_ext_pblock(ex);
2222			if (ext4_ext_is_unwritten(ex))
2223				flags |= FIEMAP_EXTENT_UNWRITTEN;
2224		}
2225
2226		/*
2227		 * Find delayed extent and update es accordingly. We call
2228		 * it even in !exists case to find out whether es is the
2229		 * last existing extent or not.
2230		 */
2231		next_del = ext4_find_delayed_extent(inode, &es);
2232		if (!exists && next_del) {
2233			exists = 1;
2234			flags |= (FIEMAP_EXTENT_DELALLOC |
2235				  FIEMAP_EXTENT_UNKNOWN);
2236		}
2237		up_read(&EXT4_I(inode)->i_data_sem);
2238
2239		if (unlikely(es.es_len == 0)) {
2240			EXT4_ERROR_INODE(inode, "es.es_len == 0");
2241			err = -EIO;
2242			break;
2243		}
2244
2245		/*
2246		 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2247		 * We need to check next == EXT_MAX_BLOCKS because an
2248		 * extent can carry the unwritten and the delayed status
2249		 * at the same time: when a delayed-allocated range is
2250		 * then allocated by fallocate, the status tree tracks
2251		 * both states in a single extent.
2252		 *
2253		 * So we could return an unwritten and delayed extent
2254		 * whose block is equal to 'next'.
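		 *
		 * Illustrative example: a range written via delayed
		 * allocation at the end of the file and then allocated
		 * with fallocate can be tracked by the status tree as
		 * one extent that is both unwritten and delayed; both
		 * lookups then run off the end of the file and return
		 * EXT_MAX_BLOCKS.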
2255 */ 2256 if (next == next_del && next == EXT_MAX_BLOCKS) { 2257 flags |= FIEMAP_EXTENT_LAST; 2258 if (unlikely(next_del != EXT_MAX_BLOCKS || 2259 next != EXT_MAX_BLOCKS)) { 2260 EXT4_ERROR_INODE(inode, 2261 "next extent == %u, next " 2262 "delalloc extent = %u", 2263 next, next_del); 2264 err = -EIO; 2265 break; 2266 } 2267 } 2268 2269 if (exists) { 2270 err = fiemap_fill_next_extent(fieinfo, 2271 (__u64)es.es_lblk << blksize_bits, 2272 (__u64)es.es_pblk << blksize_bits, 2273 (__u64)es.es_len << blksize_bits, 2274 flags); 2275 if (err < 0) 2276 break; 2277 if (err == 1) { 2278 err = 0; 2279 break; 2280 } 2281 } 2282 2283 block = es.es_lblk + es.es_len; 2284 } 2285 2286 ext4_ext_drop_refs(path); 2287 kfree(path); 2288 return err; 2289} 2290 2291/* 2292 * ext4_ext_put_gap_in_cache: 2293 * calculate boundaries of the gap that the requested block fits into 2294 * and cache this gap 2295 */ 2296static void 2297ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 2298 ext4_lblk_t block) 2299{ 2300 int depth = ext_depth(inode); 2301 unsigned long len = 0; 2302 ext4_lblk_t lblock = 0; 2303 struct ext4_extent *ex; 2304 2305 ex = path[depth].p_ext; 2306 if (ex == NULL) { 2307 /* 2308 * there is no extent yet, so gap is [0;-] and we 2309 * don't cache it 2310 */ 2311 ext_debug("cache gap(whole file):"); 2312 } else if (block < le32_to_cpu(ex->ee_block)) { 2313 lblock = block; 2314 len = le32_to_cpu(ex->ee_block) - block; 2315 ext_debug("cache gap(before): %u [%u:%u]", 2316 block, 2317 le32_to_cpu(ex->ee_block), 2318 ext4_ext_get_actual_len(ex)); 2319 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) 2320 ext4_es_insert_extent(inode, lblock, len, ~0, 2321 EXTENT_STATUS_HOLE); 2322 } else if (block >= le32_to_cpu(ex->ee_block) 2323 + ext4_ext_get_actual_len(ex)) { 2324 ext4_lblk_t next; 2325 lblock = le32_to_cpu(ex->ee_block) 2326 + ext4_ext_get_actual_len(ex); 2327 2328 next = ext4_ext_next_allocated_block(path); 2329 ext_debug("cache gap(after): [%u:%u] %u", 2330 le32_to_cpu(ex->ee_block), 2331 ext4_ext_get_actual_len(ex), 2332 block); 2333 BUG_ON(next == lblock); 2334 len = next - lblock; 2335 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) 2336 ext4_es_insert_extent(inode, lblock, len, ~0, 2337 EXTENT_STATUS_HOLE); 2338 } else { 2339 BUG(); 2340 } 2341 2342 ext_debug(" -> %u:%lu\n", lblock, len); 2343} 2344 2345/* 2346 * ext4_ext_rm_idx: 2347 * removes index from the index block. 
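 *
 * For example (illustrative): with indexes [I0 I1 I2 I3] in a block and
 * p_idx pointing at I1, I2 and I3 are shifted down one slot and
 * eh_entries drops from 4 to 3; the block I1 referenced is then freed.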
2348 */
2349static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2350			struct ext4_ext_path *path, int depth)
2351{
2352	int err;
2353	ext4_fsblk_t leaf;
2354
2355	/* free index block */
2356	depth--;
2357	path = path + depth;
2358	leaf = ext4_idx_pblock(path->p_idx);
2359	if (unlikely(path->p_hdr->eh_entries == 0)) {
2360		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2361		return -EIO;
2362	}
2363	err = ext4_ext_get_access(handle, inode, path);
2364	if (err)
2365		return err;
2366
2367	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2368		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2369		len *= sizeof(struct ext4_extent_idx);
2370		memmove(path->p_idx, path->p_idx + 1, len);
2371	}
2372
2373	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2374	err = ext4_ext_dirty(handle, inode, path);
2375	if (err)
2376		return err;
2377	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2378	trace_ext4_ext_rm_idx(inode, leaf);
2379
2380	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2381			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2382
2383	while (--depth >= 0) {
2384		if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2385			break;
2386		path--;
2387		err = ext4_ext_get_access(handle, inode, path);
2388		if (err)
2389			break;
2390		path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2391		err = ext4_ext_dirty(handle, inode, path);
2392		if (err)
2393			break;
2394	}
2395	return err;
2396}
2397
2398/*
2399 * ext4_ext_calc_credits_for_single_extent:
2400 * This routine returns the max. credits that are needed to insert an extent
2401 * into the extent tree.
2402 * When the actual path is passed, the caller should calculate credits
2403 * under i_data_sem.
2404 */
2405int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2406					    struct ext4_ext_path *path)
2407{
2408	if (path) {
2409		int depth = ext_depth(inode);
2410		int ret = 0;
2411
2412		/* probably there is space in the leaf? */
2413		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2414				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2415
2416			/*
2417			 * There is some space in the leaf, so there is no
2418			 * need to account for the leaf block credit;
2419			 *
2420			 * bitmaps and block group descriptor blocks
2421			 * and other metadata blocks still need to be
2422			 * accounted for.
2423			 */
2424			/* 1 bitmap, 1 block group descriptor */
2425			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2426			return ret;
2427		}
2428	}
2429
2430	return ext4_chunk_trans_blocks(inode, nrblocks);
2431}
2432
2433/*
2434 * How many index/leaf blocks need to change/allocate to add @extents extents?
2435 *
2436 * If we add a single extent, then in the worst case, each tree level's
2437 * index/leaf needs to be changed in case of a tree split.
2438 *
2439 * If more extents are inserted, they could cause the whole tree to split more
2440 * than once, but this is really rare.
2441 */
2442int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2443{
2444	int index;
2445	int depth;
2446
2447	/* If we are converting the inline data, only one is needed here.
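	 * Otherwise (illustrative arithmetic): for a tree of depth 2, a
	 * single inserted extent is charged depth * 2 = 4 index/leaf
	 * blocks, since a split may touch one index and one leaf per
	 * level; several extents are charged depth * 3 = 6 to allow for
	 * repeated splits.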
*/ 2448 if (ext4_has_inline_data(inode)) 2449 return 1; 2450 2451 depth = ext_depth(inode); 2452 2453 if (extents <= 1) 2454 index = depth * 2; 2455 else 2456 index = depth * 3; 2457 2458 return index; 2459} 2460 2461static inline int get_default_free_blocks_flags(struct inode *inode) 2462{ 2463 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2464 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; 2465 else if (ext4_should_journal_data(inode)) 2466 return EXT4_FREE_BLOCKS_FORGET; 2467 return 0; 2468} 2469 2470static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2471 struct ext4_extent *ex, 2472 long long *partial_cluster, 2473 ext4_lblk_t from, ext4_lblk_t to) 2474{ 2475 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2476 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2477 ext4_fsblk_t pblk; 2478 int flags = get_default_free_blocks_flags(inode); 2479 2480 /* 2481 * For bigalloc file systems, we never free a partial cluster 2482 * at the beginning of the extent. Instead, we make a note 2483 * that we tried freeing the cluster, and check to see if we 2484 * need to free it on a subsequent call to ext4_remove_blocks, 2485 * or at the end of the ext4_truncate() operation. 2486 */ 2487 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; 2488 2489 trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); 2490 /* 2491 * If we have a partial cluster, and it's different from the 2492 * cluster of the last block, we need to explicitly free the 2493 * partial cluster here. 2494 */ 2495 pblk = ext4_ext_pblock(ex) + ee_len - 1; 2496 if ((*partial_cluster > 0) && 2497 (EXT4_B2C(sbi, pblk) != *partial_cluster)) { 2498 ext4_free_blocks(handle, inode, NULL, 2499 EXT4_C2B(sbi, *partial_cluster), 2500 sbi->s_cluster_ratio, flags); 2501 *partial_cluster = 0; 2502 } 2503 2504#ifdef EXTENTS_STATS 2505 { 2506 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2507 spin_lock(&sbi->s_ext_stats_lock); 2508 sbi->s_ext_blocks += ee_len; 2509 sbi->s_ext_extents++; 2510 if (ee_len < sbi->s_ext_min) 2511 sbi->s_ext_min = ee_len; 2512 if (ee_len > sbi->s_ext_max) 2513 sbi->s_ext_max = ee_len; 2514 if (ext_depth(inode) > sbi->s_depth_max) 2515 sbi->s_depth_max = ext_depth(inode); 2516 spin_unlock(&sbi->s_ext_stats_lock); 2517 } 2518#endif 2519 if (from >= le32_to_cpu(ex->ee_block) 2520 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2521 /* tail removal */ 2522 ext4_lblk_t num; 2523 unsigned int unaligned; 2524 2525 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2526 pblk = ext4_ext_pblock(ex) + ee_len - num; 2527 /* 2528 * Usually we want to free partial cluster at the end of the 2529 * extent, except for the situation when the cluster is still 2530 * used by any other extent (partial_cluster is negative). 2531 */ 2532 if (*partial_cluster < 0 && 2533 -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) 2534 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; 2535 2536 ext_debug("free last %u blocks starting %llu partial %lld\n", 2537 num, pblk, *partial_cluster); 2538 ext4_free_blocks(handle, inode, NULL, pblk, num, flags); 2539 /* 2540 * If the block range to be freed didn't start at the 2541 * beginning of a cluster, and we removed the entire 2542 * extent and the cluster is not used by any other extent, 2543 * save the partial cluster here, since we might need to 2544 * delete if we determine that the truncate operation has 2545 * removed all of the blocks in the cluster. 
2546	 *
2547	 * On the other hand, if we did not manage to free the whole
2548	 * extent, we have to mark the cluster as used (store negative
2549	 * cluster number in partial_cluster).
2550	 */
2551	unaligned = EXT4_PBLK_COFF(sbi, pblk);
2552	if (unaligned && (ee_len == num) &&
2553	    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
2554		*partial_cluster = EXT4_B2C(sbi, pblk);
2555	else if (unaligned)
2556		*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
2557	else if (*partial_cluster > 0)
2558		*partial_cluster = 0;
2559	} else
2560		ext4_error(sbi->s_sb, "strange request: removal(2) "
2561			   "%u-%u from %u:%u\n",
2562			   from, to, le32_to_cpu(ex->ee_block), ee_len);
2563	return 0;
2564}
2565
2566
2567/*
2568 * ext4_ext_rm_leaf() removes the extents associated with the
2569 * blocks appearing between "start" and "end", and splits the extents
2570 * if "start" and "end" appear in the same extent.
2571 *
2572 * @handle: The journal handle
2573 * @inode:  The file's inode
2574 * @path:   The path to the leaf
2575 * @partial_cluster: The cluster which we'll have to free if all extents
2576 *                   have been released from it; it goes negative if
2577 *                   the cluster is still used.
2578 * @start:  The first block to remove
2579 * @end:   The last block to remove
2580 */
2581static int
2582ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2583		 struct ext4_ext_path *path,
2584		 long long *partial_cluster,
2585		 ext4_lblk_t start, ext4_lblk_t end)
2586{
2587	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2588	int err = 0, correct_index = 0;
2589	int depth = ext_depth(inode), credits;
2590	struct ext4_extent_header *eh;
2591	ext4_lblk_t a, b;
2592	unsigned num;
2593	ext4_lblk_t ex_ee_block;
2594	unsigned short ex_ee_len;
2595	unsigned unwritten = 0;
2596	struct ext4_extent *ex;
2597	ext4_fsblk_t pblk;
2598
2599	/* the header must already have been checked in ext4_ext_remove_space() */
2600	ext_debug("truncate since %u in leaf to %u\n", start, end);
2601	if (!path[depth].p_hdr)
2602		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2603	eh = path[depth].p_hdr;
2604	if (unlikely(path[depth].p_hdr == NULL)) {
2605		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2606		return -EIO;
2607	}
2608	/* find where to start removing */
2609	ex = path[depth].p_ext;
2610	if (!ex)
2611		ex = EXT_LAST_EXTENT(eh);
2612
2613	ex_ee_block = le32_to_cpu(ex->ee_block);
2614	ex_ee_len = ext4_ext_get_actual_len(ex);
2615
2616	/*
2617	 * If we're starting with an extent other than the last one in the
2618	 * node, we need to see if it shares a cluster with the extent to
2619	 * the right (towards the end of the file). If its leftmost cluster
2620	 * is this extent's rightmost cluster and it is not cluster aligned,
2621	 * we'll mark it as a partial that is not to be deallocated.
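	 *
	 * Illustrative example with a cluster ratio of 16: if ex ends on
	 * physical block 35 and the extent to its right starts on physical
	 * block 36, both blocks fall in cluster 2 (blocks 32..47), so
	 * partial_cluster is set to -2 and cluster 2 will not be freed
	 * while ex is being removed.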
2622 */ 2623 2624 if (ex != EXT_LAST_EXTENT(eh)) { 2625 ext4_fsblk_t current_pblk, right_pblk; 2626 long long current_cluster, right_cluster; 2627 2628 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; 2629 current_cluster = (long long)EXT4_B2C(sbi, current_pblk); 2630 right_pblk = ext4_ext_pblock(ex + 1); 2631 right_cluster = (long long)EXT4_B2C(sbi, right_pblk); 2632 if (current_cluster == right_cluster && 2633 EXT4_PBLK_COFF(sbi, right_pblk)) 2634 *partial_cluster = -right_cluster; 2635 } 2636 2637 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2638 2639 while (ex >= EXT_FIRST_EXTENT(eh) && 2640 ex_ee_block + ex_ee_len > start) { 2641 2642 if (ext4_ext_is_unwritten(ex)) 2643 unwritten = 1; 2644 else 2645 unwritten = 0; 2646 2647 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, 2648 unwritten, ex_ee_len); 2649 path[depth].p_ext = ex; 2650 2651 a = ex_ee_block > start ? ex_ee_block : start; 2652 b = ex_ee_block+ex_ee_len - 1 < end ? 2653 ex_ee_block+ex_ee_len - 1 : end; 2654 2655 ext_debug(" border %u:%u\n", a, b); 2656 2657 /* If this extent is beyond the end of the hole, skip it */ 2658 if (end < ex_ee_block) { 2659 /* 2660 * We're going to skip this extent and move to another, 2661 * so if this extent is not cluster aligned we have 2662 * to mark the current cluster as used to avoid 2663 * accidentally freeing it later on 2664 */ 2665 pblk = ext4_ext_pblock(ex); 2666 if (EXT4_PBLK_COFF(sbi, pblk)) 2667 *partial_cluster = 2668 -((long long)EXT4_B2C(sbi, pblk)); 2669 ex--; 2670 ex_ee_block = le32_to_cpu(ex->ee_block); 2671 ex_ee_len = ext4_ext_get_actual_len(ex); 2672 continue; 2673 } else if (b != ex_ee_block + ex_ee_len - 1) { 2674 EXT4_ERROR_INODE(inode, 2675 "can not handle truncate %u:%u " 2676 "on extent %u:%u", 2677 start, end, ex_ee_block, 2678 ex_ee_block + ex_ee_len - 1); 2679 err = -EIO; 2680 goto out; 2681 } else if (a != ex_ee_block) { 2682 /* remove tail of the extent */ 2683 num = a - ex_ee_block; 2684 } else { 2685 /* remove whole extent: excellent! */ 2686 num = 0; 2687 } 2688 /* 2689 * 3 for leaf, sb, and inode plus 2 (bmap and group 2690 * descriptor) for each block group; assume two block 2691 * groups plus ex_ee_len/blocks_per_block_group for 2692 * the worst case 2693 */ 2694 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2695 if (ex == EXT_FIRST_EXTENT(eh)) { 2696 correct_index = 1; 2697 credits += (ext_depth(inode)) + 1; 2698 } 2699 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2700 2701 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2702 if (err) 2703 goto out; 2704 2705 err = ext4_ext_get_access(handle, inode, path + depth); 2706 if (err) 2707 goto out; 2708 2709 err = ext4_remove_blocks(handle, inode, ex, partial_cluster, 2710 a, b); 2711 if (err) 2712 goto out; 2713 2714 if (num == 0) 2715 /* this extent is removed; mark slot entirely unused */ 2716 ext4_ext_store_pblock(ex, 0); 2717 2718 ex->ee_len = cpu_to_le16(num); 2719 /* 2720 * Do not mark unwritten if all the blocks in the 2721 * extent have been removed. 
2722 */ 2723 if (unwritten && num) 2724 ext4_ext_mark_unwritten(ex); 2725 /* 2726 * If the extent was completely released, 2727 * we need to remove it from the leaf 2728 */ 2729 if (num == 0) { 2730 if (end != EXT_MAX_BLOCKS - 1) { 2731 /* 2732 * For hole punching, we need to scoot all the 2733 * extents up when an extent is removed so that 2734 * we dont have blank extents in the middle 2735 */ 2736 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * 2737 sizeof(struct ext4_extent)); 2738 2739 /* Now get rid of the one at the end */ 2740 memset(EXT_LAST_EXTENT(eh), 0, 2741 sizeof(struct ext4_extent)); 2742 } 2743 le16_add_cpu(&eh->eh_entries, -1); 2744 } else if (*partial_cluster > 0) 2745 *partial_cluster = 0; 2746 2747 err = ext4_ext_dirty(handle, inode, path + depth); 2748 if (err) 2749 goto out; 2750 2751 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, 2752 ext4_ext_pblock(ex)); 2753 ex--; 2754 ex_ee_block = le32_to_cpu(ex->ee_block); 2755 ex_ee_len = ext4_ext_get_actual_len(ex); 2756 } 2757 2758 if (correct_index && eh->eh_entries) 2759 err = ext4_ext_correct_indexes(handle, inode, path); 2760 2761 /* 2762 * If there's a partial cluster and at least one extent remains in 2763 * the leaf, free the partial cluster if it isn't shared with the 2764 * current extent. If there's a partial cluster and no extents 2765 * remain in the leaf, it can't be freed here. It can only be 2766 * freed when it's possible to determine if it's not shared with 2767 * any other extent - when the next leaf is processed or when space 2768 * removal is complete. 2769 */ 2770 if (*partial_cluster > 0 && eh->eh_entries && 2771 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2772 *partial_cluster)) { 2773 int flags = get_default_free_blocks_flags(inode); 2774 2775 ext4_free_blocks(handle, inode, NULL, 2776 EXT4_C2B(sbi, *partial_cluster), 2777 sbi->s_cluster_ratio, flags); 2778 *partial_cluster = 0; 2779 } 2780 2781 /* if this leaf is free, then we should 2782 * remove it from index block above */ 2783 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2784 err = ext4_ext_rm_idx(handle, inode, path, depth); 2785 2786out: 2787 return err; 2788} 2789 2790/* 2791 * ext4_ext_more_to_rm: 2792 * returns 1 if current index has to be freed (even partial) 2793 */ 2794static int 2795ext4_ext_more_to_rm(struct ext4_ext_path *path) 2796{ 2797 BUG_ON(path->p_idx == NULL); 2798 2799 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2800 return 0; 2801 2802 /* 2803 * if truncate on deeper level happened, it wasn't partial, 2804 * so we have to consider current index for truncation 2805 */ 2806 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2807 return 0; 2808 return 1; 2809} 2810 2811int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2812 ext4_lblk_t end) 2813{ 2814 struct super_block *sb = inode->i_sb; 2815 int depth = ext_depth(inode); 2816 struct ext4_ext_path *path = NULL; 2817 long long partial_cluster = 0; 2818 handle_t *handle; 2819 int i = 0, err = 0; 2820 2821 ext_debug("truncate since %u to %u\n", start, end); 2822 2823 /* probably first extent we're gonna free will be last in block */ 2824 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); 2825 if (IS_ERR(handle)) 2826 return PTR_ERR(handle); 2827 2828again: 2829 trace_ext4_ext_remove_space(inode, start, end, depth); 2830 2831 /* 2832 * Check if we are removing extents inside the extent tree. 
If that
2833 * is the case, we are going to punch a hole inside the extent tree
2834 * so we have to check whether we need to split the extent covering
2835 * the last block to remove so we can easily remove the part of it
2836 * in ext4_ext_rm_leaf().
2837 */
2838	if (end < EXT_MAX_BLOCKS - 1) {
2839		struct ext4_extent *ex;
2840		ext4_lblk_t ee_block;
2841
2842		/* find extent for this block */
2843		path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2844		if (IS_ERR(path)) {
2845			ext4_journal_stop(handle);
2846			return PTR_ERR(path);
2847		}
2848		depth = ext_depth(inode);
2849		/* The leaf may be missing an extent only if the inode has no blocks at all */
2850		ex = path[depth].p_ext;
2851		if (!ex) {
2852			if (depth) {
2853				EXT4_ERROR_INODE(inode,
2854						 "path[%d].p_hdr == NULL",
2855						 depth);
2856				err = -EIO;
2857			}
2858			goto out;
2859		}
2860
2861		ee_block = le32_to_cpu(ex->ee_block);
2862
2863		/*
2864		 * See if the last block is inside the extent, if so split
2865		 * the extent at 'end' block so we can easily remove the
2866		 * tail of the first part of the split extent in
2867		 * ext4_ext_rm_leaf().
2868		 */
2869		if (end >= ee_block &&
2870		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2871			/*
2872			 * Split the extent in two so that 'end' is the last
2873			 * block in the first new extent. Also we should not
2874			 * fail removing space due to ENOSPC, so try to use
2875			 * a reserved block if that happens.
2876			 */
2877			err = ext4_force_split_extent_at(handle, inode, &path,
2878							 end + 1, 1);
2879			if (err < 0)
2880				goto out;
2881		}
2882	}
2883	/*
2884	 * We start scanning from the right side, freeing all the blocks
2885	 * after i_size and walking into the tree depth-wise.
2886	 */
2887	depth = ext_depth(inode);
2888	if (path) {
2889		int k = i = depth;
2890		while (--k > 0)
2891			path[k].p_block =
2892				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2893	} else {
2894		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
2895			       GFP_NOFS);
2896		if (path == NULL) {
2897			ext4_journal_stop(handle);
2898			return -ENOMEM;
2899		}
2900		path[0].p_maxdepth = path[0].p_depth = depth;
2901		path[0].p_hdr = ext_inode_hdr(inode);
2902		i = 0;
2903
2904		if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2905			err = -EIO;
2906			goto out;
2907		}
2908	}
2909	err = 0;
2910
2911	while (i >= 0 && err == 0) {
2912		if (i == depth) {
2913			/* this is a leaf block */
2914			err = ext4_ext_rm_leaf(handle, inode, path,
2915					       &partial_cluster, start,
2916					       end);
2917			/* root level has p_bh == NULL, brelse() eats this */
2918			brelse(path[i].p_bh);
2919			path[i].p_bh = NULL;
2920			i--;
2921			continue;
2922		}
2923
2924		/* this is an index block */
2925		if (!path[i].p_hdr) {
2926			ext_debug("initialize header\n");
2927			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2928		}
2929
2930		if (!path[i].p_idx) {
2931			/* this level hasn't been touched yet */
2932			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2933			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2934			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2935				  path[i].p_hdr,
2936				  le16_to_cpu(path[i].p_hdr->eh_entries));
2937		} else {
2938			/* we were already here, move on to the next index */
2939			path[i].p_idx--;
2940		}
2941
2942		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2943			  i, EXT_FIRST_INDEX(path[i].p_hdr),
2944			  path[i].p_idx);
2945		if (ext4_ext_more_to_rm(path + i)) {
2946			struct buffer_head *bh;
2947			/* go to the next level */
2948			ext_debug("move to level %d (block %llu)\n",
2949				  i + 1, ext4_idx_pblock(path[i].p_idx));
2950			memset(path + i + 1, 0, sizeof(*path));
2951			bh =
read_extent_tree_block(inode, 2952 ext4_idx_pblock(path[i].p_idx), depth - i - 1, 2953 EXT4_EX_NOCACHE); 2954 if (IS_ERR(bh)) { 2955 /* should we reset i_size? */ 2956 err = PTR_ERR(bh); 2957 break; 2958 } 2959 /* Yield here to deal with large extent trees. 2960 * Should be a no-op if we did IO above. */ 2961 cond_resched(); 2962 if (WARN_ON(i + 1 > depth)) { 2963 err = -EIO; 2964 break; 2965 } 2966 path[i + 1].p_bh = bh; 2967 2968 /* save actual number of indexes since this 2969 * number is changed at the next iteration */ 2970 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 2971 i++; 2972 } else { 2973 /* we finished processing this index, go up */ 2974 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 2975 /* index is empty, remove it; 2976 * handle must be already prepared by the 2977 * truncatei_leaf() */ 2978 err = ext4_ext_rm_idx(handle, inode, path, i); 2979 } 2980 /* root level has p_bh == NULL, brelse() eats this */ 2981 brelse(path[i].p_bh); 2982 path[i].p_bh = NULL; 2983 i--; 2984 ext_debug("return to level %d\n", i); 2985 } 2986 } 2987 2988 trace_ext4_ext_remove_space_done(inode, start, end, depth, 2989 partial_cluster, path->p_hdr->eh_entries); 2990 2991 /* If we still have something in the partial cluster and we have removed 2992 * even the first extent, then we should free the blocks in the partial 2993 * cluster as well. */ 2994 if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { 2995 int flags = get_default_free_blocks_flags(inode); 2996 2997 ext4_free_blocks(handle, inode, NULL, 2998 EXT4_C2B(EXT4_SB(sb), partial_cluster), 2999 EXT4_SB(sb)->s_cluster_ratio, flags); 3000 partial_cluster = 0; 3001 } 3002 3003 /* TODO: flexible tree reduction should be here */ 3004 if (path->p_hdr->eh_entries == 0) { 3005 /* 3006 * truncate to zero freed all the tree, 3007 * so we need to correct eh_depth 3008 */ 3009 err = ext4_ext_get_access(handle, inode, path); 3010 if (err == 0) { 3011 ext_inode_hdr(inode)->eh_depth = 0; 3012 ext_inode_hdr(inode)->eh_max = 3013 cpu_to_le16(ext4_ext_space_root(inode, 0)); 3014 err = ext4_ext_dirty(handle, inode, path); 3015 } 3016 } 3017out: 3018 ext4_ext_drop_refs(path); 3019 kfree(path); 3020 path = NULL; 3021 if (err == -EAGAIN) 3022 goto again; 3023 ext4_journal_stop(handle); 3024 3025 return err; 3026} 3027 3028/* 3029 * called at mount time 3030 */ 3031void ext4_ext_init(struct super_block *sb) 3032{ 3033 /* 3034 * possible initialization would be here 3035 */ 3036 3037 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 3038#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 3039 printk(KERN_INFO "EXT4-fs: file extents enabled" 3040#ifdef AGGRESSIVE_TEST 3041 ", aggressive tests" 3042#endif 3043#ifdef CHECK_BINSEARCH 3044 ", check binsearch" 3045#endif 3046#ifdef EXTENTS_STATS 3047 ", stats" 3048#endif 3049 "\n"); 3050#endif 3051#ifdef EXTENTS_STATS 3052 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 3053 EXT4_SB(sb)->s_ext_min = 1 << 30; 3054 EXT4_SB(sb)->s_ext_max = 0; 3055#endif 3056 } 3057} 3058 3059/* 3060 * called at umount time 3061 */ 3062void ext4_ext_release(struct super_block *sb) 3063{ 3064 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) 3065 return; 3066 3067#ifdef EXTENTS_STATS 3068 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 3069 struct ext4_sb_info *sbi = EXT4_SB(sb); 3070 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 3071 sbi->s_ext_blocks, sbi->s_ext_extents, 3072 sbi->s_ext_blocks / sbi->s_ext_extents); 3073 
printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3074			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3075	}
3076#endif
3077}
3078
3079static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3080{
3081	ext4_lblk_t  ee_block;
3082	ext4_fsblk_t ee_pblock;
3083	unsigned int ee_len;
3084
3085	ee_block  = le32_to_cpu(ex->ee_block);
3086	ee_len    = ext4_ext_get_actual_len(ex);
3087	ee_pblock = ext4_ext_pblock(ex);
3088
3089	if (ee_len == 0)
3090		return 0;
3091
3092	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3093				     EXTENT_STATUS_WRITTEN);
3094}
3095
3096/* FIXME!! we need to try to merge to left or right after zero-out */
3097static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3098{
3099	ext4_fsblk_t ee_pblock;
3100	unsigned int ee_len;
3101	int ret;
3102
3103	ee_len    = ext4_ext_get_actual_len(ex);
3104	ee_pblock = ext4_ext_pblock(ex);
3105
3106	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
3107	if (ret > 0)
3108		ret = 0;
3109
3110	return ret;
3111}
3112
3113/*
3114 * ext4_split_extent_at() splits an extent at a given block.
3115 *
3116 * @handle: the journal handle
3117 * @inode: the file inode
3118 * @path: the path to the extent
3119 * @split: the logical block where the extent is split.
3120 * @split_flag: indicates if the extent could be zeroed out if the split
3121 *		fails, and the states (initialized or unwritten) of the new extents.
3122 * @flags: flags used to insert the new extent into the extent tree.
3123 *
3124 *
3125 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the states
3126 * of which are determined by split_flag.
3127 *
3128 * There are two cases:
3129 *  a> the extent is split into two extents.
3130 *  b> no split is needed, and the extent is just marked.
3131 *
3132 * return 0 on success.
3133 */
3134static int ext4_split_extent_at(handle_t *handle,
3135			     struct inode *inode,
3136			     struct ext4_ext_path **ppath,
3137			     ext4_lblk_t split,
3138			     int split_flag,
3139			     int flags)
3140{
3141	struct ext4_ext_path *path = *ppath;
3142	ext4_fsblk_t newblock;
3143	ext4_lblk_t ee_block;
3144	struct ext4_extent *ex, newex, orig_ex, zero_ex;
3145	struct ext4_extent *ex2 = NULL;
3146	unsigned int ee_len, depth;
3147	int err = 0;
3148
3149	BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3150	       (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3151
3152	ext_debug("ext4_split_extent_at: inode %lu, logical "
3153		"block %llu\n", inode->i_ino, (unsigned long long)split);
3154
3155	ext4_ext_show_leaf(inode, path);
3156
3157	depth = ext_depth(inode);
3158	ex = path[depth].p_ext;
3159	ee_block = le32_to_cpu(ex->ee_block);
3160	ee_len = ext4_ext_get_actual_len(ex);
3161	newblock = split - ee_block + ext4_ext_pblock(ex);
3162
3163	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3164	BUG_ON(!ext4_ext_is_unwritten(ex) &&
3165	       split_flag & (EXT4_EXT_MAY_ZEROOUT |
3166			     EXT4_EXT_MARK_UNWRIT1 |
3167			     EXT4_EXT_MARK_UNWRIT2));
3168
3169	err = ext4_ext_get_access(handle, inode, path + depth);
3170	if (err)
3171		goto out;
3172
3173	if (split == ee_block) {
3174		/*
3175		 * case b: block @split is the block that the extent begins with,
3176		 * so we just change the state of the extent, and splitting
3177		 * is not needed.
3178 */
3179		if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3180			ext4_ext_mark_unwritten(ex);
3181		else
3182			ext4_ext_mark_initialized(ex);
3183
3184		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3185			ext4_ext_try_to_merge(handle, inode, path, ex);
3186
3187		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3188		goto out;
3189	}
3190
3191	/* case a */
3192	memcpy(&orig_ex, ex, sizeof(orig_ex));
3193	ex->ee_len = cpu_to_le16(split - ee_block);
3194	if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3195		ext4_ext_mark_unwritten(ex);
3196
3197	/*
3198	 * path may lead to a new leaf, not to the original leaf any more,
3199	 * after ext4_ext_insert_extent() returns.
3200	 */
3201	err = ext4_ext_dirty(handle, inode, path + depth);
3202	if (err)
3203		goto fix_extent_len;
3204
3205	ex2 = &newex;
3206	ex2->ee_block = cpu_to_le32(split);
3207	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3208	ext4_ext_store_pblock(ex2, newblock);
3209	if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3210		ext4_ext_mark_unwritten(ex2);
3211
3212	err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3213	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3214		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3215			if (split_flag & EXT4_EXT_DATA_VALID1) {
3216				err = ext4_ext_zeroout(inode, ex2);
3217				zero_ex.ee_block = ex2->ee_block;
3218				zero_ex.ee_len = cpu_to_le16(
3219						ext4_ext_get_actual_len(ex2));
3220				ext4_ext_store_pblock(&zero_ex,
3221						      ext4_ext_pblock(ex2));
3222			} else {
3223				err = ext4_ext_zeroout(inode, ex);
3224				zero_ex.ee_block = ex->ee_block;
3225				zero_ex.ee_len = cpu_to_le16(
3226						ext4_ext_get_actual_len(ex));
3227				ext4_ext_store_pblock(&zero_ex,
3228						      ext4_ext_pblock(ex));
3229			}
3230		} else {
3231			err = ext4_ext_zeroout(inode, &orig_ex);
3232			zero_ex.ee_block = orig_ex.ee_block;
3233			zero_ex.ee_len = cpu_to_le16(
3234						ext4_ext_get_actual_len(&orig_ex));
3235			ext4_ext_store_pblock(&zero_ex,
3236					      ext4_ext_pblock(&orig_ex));
3237		}
3238
3239		if (err)
3240			goto fix_extent_len;
3241		/* update the extent length and mark as initialized */
3242		ex->ee_len = cpu_to_le16(ee_len);
3243		ext4_ext_try_to_merge(handle, inode, path, ex);
3244		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3245		if (err)
3246			goto fix_extent_len;
3247
3248		/* update extent status tree */
3249		err = ext4_zeroout_es(inode, &zero_ex);
3250
3251		goto out;
3252	} else if (err)
3253		goto fix_extent_len;
3254
3255out:
3256	ext4_ext_show_leaf(inode, path);
3257	return err;
3258
3259fix_extent_len:
3260	ex->ee_len = orig_ex.ee_len;
3261	ext4_ext_dirty(handle, inode, path + path->p_depth);
3262	return err;
3263}
3264
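/*
 * Editorial aside: a minimal userspace sketch (kept out of the build) of
 * the split arithmetic used above: the first half keeps the original
 * start, the second half starts at 'split' with the matching physical
 * offset, just as newblock is computed in ext4_split_extent_at(). The
 * toy_* names are illustrative only.
 */
#if 0
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_extent {
	uint32_t lblk;	/* first logical block */
	uint64_t pblk;	/* first physical block */
	uint32_t len;	/* number of blocks */
};

static void toy_split(const struct toy_extent *ex, uint32_t split,
		      struct toy_extent *lo, struct toy_extent *hi)
{
	assert(split > ex->lblk && split < ex->lblk + ex->len);

	lo->lblk = ex->lblk;
	lo->pblk = ex->pblk;
	lo->len  = split - ex->lblk;

	hi->lblk = split;
	hi->pblk = ex->pblk + (split - ex->lblk);	/* "newblock" */
	hi->len  = ex->len - lo->len;
}

int main(void)
{
	struct toy_extent ex = { 100, 5000, 20 }, lo, hi;

	toy_split(&ex, 108, &lo, &hi);
	/* prints: lo 100:5000+8, hi 108:5008+12 */
	printf("lo %u:%llu+%u, hi %u:%llu+%u\n",
	       lo.lblk, (unsigned long long)lo.pblk, lo.len,
	       hi.lblk, (unsigned long long)hi.pblk, hi.len);
	return 0;
}
#endif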
3265/*
3266 * ext4_split_extent() splits an extent and marks the extent covered
3267 * by @map as split_flag indicates.
3268 *
3269 * It may result in splitting the extent into multiple extents (up to three)
3270 * There are three possibilities:
3271 *   a> There is no split required
3272 *   b> Splits in two extents: Split is happening at either end of the extent
3273 *   c> Splits in three extents: Someone is splitting in the middle of the extent
3274 *
3275 */
3276static int ext4_split_extent(handle_t *handle,
3277			      struct inode *inode,
3278			      struct ext4_ext_path **ppath,
3279			      struct ext4_map_blocks *map,
3280			      int split_flag,
3281			      int flags)
3282{
3283	struct ext4_ext_path *path = *ppath;
3284	ext4_lblk_t ee_block;
3285	struct ext4_extent *ex;
3286	unsigned int ee_len, depth;
3287	int err = 0;
3288	int unwritten;
3289	int split_flag1, flags1;
3290	int allocated = map->m_len;
3291
3292	depth = ext_depth(inode);
3293	ex = path[depth].p_ext;
3294	ee_block = le32_to_cpu(ex->ee_block);
3295	ee_len = ext4_ext_get_actual_len(ex);
3296	unwritten = ext4_ext_is_unwritten(ex);
3297
3298	if (map->m_lblk + map->m_len < ee_block + ee_len) {
3299		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3300		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3301		if (unwritten)
3302			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3303				       EXT4_EXT_MARK_UNWRIT2;
3304		if (split_flag & EXT4_EXT_DATA_VALID2)
3305			split_flag1 |= EXT4_EXT_DATA_VALID1;
3306		err = ext4_split_extent_at(handle, inode, ppath,
3307				map->m_lblk + map->m_len, split_flag1, flags1);
3308		if (err)
3309			goto out;
3310	} else {
3311		allocated = ee_len - (map->m_lblk - ee_block);
3312	}
3313	/*
3314	 * An updated path is required because the previous
3315	 * ext4_split_extent_at() may have resulted in a split of the
3316	 * original leaf or an extent zeroout.
	 */
3317	path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
3318	if (IS_ERR(path))
3319		return PTR_ERR(path);
3320	depth = ext_depth(inode);
3321	ex = path[depth].p_ext;
3322	if (!ex) {
3323		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3324				 (unsigned long) map->m_lblk);
3325		return -EIO;
3326	}
3327	unwritten = ext4_ext_is_unwritten(ex);
3328	split_flag1 = 0;
3329
3330	if (map->m_lblk >= ee_block) {
3331		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3332		if (unwritten) {
3333			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3334			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3335						     EXT4_EXT_MARK_UNWRIT2);
3336		}
3337		err = ext4_split_extent_at(handle, inode, ppath,
3338				map->m_lblk, split_flag1, flags);
3339		if (err)
3340			goto out;
3341	}
3342
3343	ext4_ext_show_leaf(inode, path);
3344out:
3345	return err ? err : allocated;
3346}
3347
3348/*
3349 * This function is called by ext4_ext_map_blocks() if someone tries to write
3350 * to an unwritten extent. It may result in splitting the unwritten
3351 * extent into multiple extents (up to three - one initialized and two
3352 * unwritten).
3353 * There are three possibilities:
3354 *   a> There is no split required: Entire extent should be initialized
3355 *   b> Splits in two extents: Write is happening at either end of the extent
3356 *   c> Splits in three extents: Someone is writing in the middle of the extent
3357 *
3358 * Pre-conditions:
3359 *  - The extent pointed to by 'path' is unwritten.
3360 *  - The extent pointed to by 'path' contains a superset
3361 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3362 *
3363 * Post-conditions on success:
3364 *  - the returned value is the number of blocks beyond map->m_lblk
3365 *    that are allocated and initialized.
3366 *    It is guaranteed to be >= map->m_len.
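 *
 * Worked example (illustrative numbers): for an unwritten extent
 * covering blocks 100..119 and a write to blocks 100..103, the four
 * written blocks can be shifted into an initialized left neighbour
 * that ends at block 99, and the function returns allocated == 4.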
3367 */ 3368static int ext4_ext_convert_to_initialized(handle_t *handle, 3369 struct inode *inode, 3370 struct ext4_map_blocks *map, 3371 struct ext4_ext_path **ppath, 3372 int flags) 3373{ 3374 struct ext4_ext_path *path = *ppath; 3375 struct ext4_sb_info *sbi; 3376 struct ext4_extent_header *eh; 3377 struct ext4_map_blocks split_map; 3378 struct ext4_extent zero_ex; 3379 struct ext4_extent *ex, *abut_ex; 3380 ext4_lblk_t ee_block, eof_block; 3381 unsigned int ee_len, depth, map_len = map->m_len; 3382 int allocated = 0, max_zeroout = 0; 3383 int err = 0; 3384 int split_flag = 0; 3385 3386 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 3387 "block %llu, max_blocks %u\n", inode->i_ino, 3388 (unsigned long long)map->m_lblk, map_len); 3389 3390 sbi = EXT4_SB(inode->i_sb); 3391 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3392 inode->i_sb->s_blocksize_bits; 3393 if (eof_block < map->m_lblk + map_len) 3394 eof_block = map->m_lblk + map_len; 3395 3396 depth = ext_depth(inode); 3397 eh = path[depth].p_hdr; 3398 ex = path[depth].p_ext; 3399 ee_block = le32_to_cpu(ex->ee_block); 3400 ee_len = ext4_ext_get_actual_len(ex); 3401 zero_ex.ee_len = 0; 3402 3403 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); 3404 3405 /* Pre-conditions */ 3406 BUG_ON(!ext4_ext_is_unwritten(ex)); 3407 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); 3408 3409 /* 3410 * Attempt to transfer newly initialized blocks from the currently 3411 * unwritten extent to its neighbor. This is much cheaper 3412 * than an insertion followed by a merge as those involve costly 3413 * memmove() calls. Transferring to the left is the common case in 3414 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) 3415 * followed by append writes. 3416 * 3417 * Limitations of the current logic: 3418 * - L1: we do not deal with writes covering the whole extent. 3419 * This would require removing the extent if the transfer 3420 * is possible. 3421 * - L2: we only attempt to merge with an extent stored in the 3422 * same extent tree node. 3423 */ 3424 if ((map->m_lblk == ee_block) && 3425 /* See if we can merge left */ 3426 (map_len < ee_len) && /*L1*/ 3427 (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ 3428 ext4_lblk_t prev_lblk; 3429 ext4_fsblk_t prev_pblk, ee_pblk; 3430 unsigned int prev_len; 3431 3432 abut_ex = ex - 1; 3433 prev_lblk = le32_to_cpu(abut_ex->ee_block); 3434 prev_len = ext4_ext_get_actual_len(abut_ex); 3435 prev_pblk = ext4_ext_pblock(abut_ex); 3436 ee_pblk = ext4_ext_pblock(ex); 3437 3438 /* 3439 * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3440 * upon those conditions: 3441 * - C1: abut_ex is initialized, 3442 * - C2: abut_ex is logically abutting ex, 3443 * - C3: abut_ex is physically abutting ex, 3444 * - C4: abut_ex can receive the additional blocks without 3445 * overflowing the (initialized) length limit. 
3446 */ 3447 if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ 3448 ((prev_lblk + prev_len) == ee_block) && /*C2*/ 3449 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ 3450 (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3451 err = ext4_ext_get_access(handle, inode, path + depth); 3452 if (err) 3453 goto out; 3454 3455 trace_ext4_ext_convert_to_initialized_fastpath(inode, 3456 map, ex, abut_ex); 3457 3458 /* Shift the start of ex by 'map_len' blocks */ 3459 ex->ee_block = cpu_to_le32(ee_block + map_len); 3460 ext4_ext_store_pblock(ex, ee_pblk + map_len); 3461 ex->ee_len = cpu_to_le16(ee_len - map_len); 3462 ext4_ext_mark_unwritten(ex); /* Restore the flag */ 3463 3464 /* Extend abut_ex by 'map_len' blocks */ 3465 abut_ex->ee_len = cpu_to_le16(prev_len + map_len); 3466 3467 /* Result: number of initialized blocks past m_lblk */ 3468 allocated = map_len; 3469 } 3470 } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && 3471 (map_len < ee_len) && /*L1*/ 3472 ex < EXT_LAST_EXTENT(eh)) { /*L2*/ 3473 /* See if we can merge right */ 3474 ext4_lblk_t next_lblk; 3475 ext4_fsblk_t next_pblk, ee_pblk; 3476 unsigned int next_len; 3477 3478 abut_ex = ex + 1; 3479 next_lblk = le32_to_cpu(abut_ex->ee_block); 3480 next_len = ext4_ext_get_actual_len(abut_ex); 3481 next_pblk = ext4_ext_pblock(abut_ex); 3482 ee_pblk = ext4_ext_pblock(ex); 3483 3484 /* 3485 * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3486 * upon those conditions: 3487 * - C1: abut_ex is initialized, 3488 * - C2: abut_ex is logically abutting ex, 3489 * - C3: abut_ex is physically abutting ex, 3490 * - C4: abut_ex can receive the additional blocks without 3491 * overflowing the (initialized) length limit. 3492 */ 3493 if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ 3494 ((map->m_lblk + map_len) == next_lblk) && /*C2*/ 3495 ((ee_pblk + ee_len) == next_pblk) && /*C3*/ 3496 (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3497 err = ext4_ext_get_access(handle, inode, path + depth); 3498 if (err) 3499 goto out; 3500 3501 trace_ext4_ext_convert_to_initialized_fastpath(inode, 3502 map, ex, abut_ex); 3503 3504 /* Shift the start of abut_ex by 'map_len' blocks */ 3505 abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); 3506 ext4_ext_store_pblock(abut_ex, next_pblk - map_len); 3507 ex->ee_len = cpu_to_le16(ee_len - map_len); 3508 ext4_ext_mark_unwritten(ex); /* Restore the flag */ 3509 3510 /* Extend abut_ex by 'map_len' blocks */ 3511 abut_ex->ee_len = cpu_to_le16(next_len + map_len); 3512 3513 /* Result: number of initialized blocks past m_lblk */ 3514 allocated = map_len; 3515 } 3516 } 3517 if (allocated) { 3518 /* Mark the block containing both extents as dirty */ 3519 ext4_ext_dirty(handle, inode, path + depth); 3520 3521 /* Update path to point to the right extent */ 3522 path[depth].p_ext = abut_ex; 3523 goto out; 3524 } else 3525 allocated = ee_len - (map->m_lblk - ee_block); 3526 3527 WARN_ON(map->m_lblk < ee_block); 3528 /* 3529 * It is safe to convert extent to initialized via explicit 3530 * zeroout only if extent is fully inside i_size or new_size. 3531 */ 3532 split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0;
3533
3534	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3535		max_zeroout = sbi->s_extent_max_zeroout_kb >>
3536			(inode->i_sb->s_blocksize_bits - 10);
3537
3538	/* If the extent is smaller than s_extent_max_zeroout_kb, zero it out directly */
3539	if (max_zeroout && (ee_len <= max_zeroout)) {
3540		err = ext4_ext_zeroout(inode, ex);
3541		if (err)
3542			goto out;
3543		zero_ex.ee_block = ex->ee_block;
3544		zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
3545		ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
3546
3547		err = ext4_ext_get_access(handle, inode, path + depth);
3548		if (err)
3549			goto out;
3550		ext4_ext_mark_initialized(ex);
3551		ext4_ext_try_to_merge(handle, inode, path, ex);
3552		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3553		goto out;
3554	}
3555
3556	/*
3557	 * four cases:
3558	 * 1. split the extent into three extents.
3559	 * 2. split the extent into two extents, zeroout the first half.
3560	 * 3. split the extent into two extents, zeroout the second half.
3561	 * 4. split the extent into two extents without zeroout.
3562	 */
3563	split_map.m_lblk = map->m_lblk;
3564	split_map.m_len = map->m_len;
3565
3566	if (max_zeroout && (allocated > map->m_len)) {
3567		if (allocated <= max_zeroout) {
3568			/* case 3 */
3569			zero_ex.ee_block =
3570					 cpu_to_le32(map->m_lblk);
3571			zero_ex.ee_len = cpu_to_le16(allocated);
3572			ext4_ext_store_pblock(&zero_ex,
3573				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3574			err = ext4_ext_zeroout(inode, &zero_ex);
3575			if (err)
3576				goto out;
3577			split_map.m_lblk = map->m_lblk;
3578			split_map.m_len = allocated;
3579		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
3580			/* case 2 */
3581			if (map->m_lblk != ee_block) {
3582				zero_ex.ee_block = ex->ee_block;
3583				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3584							ee_block);
3585				ext4_ext_store_pblock(&zero_ex,
3586						      ext4_ext_pblock(ex));
3587				err = ext4_ext_zeroout(inode, &zero_ex);
3588				if (err)
3589					goto out;
3590			}
3591
3592			split_map.m_lblk = ee_block;
3593			split_map.m_len = map->m_lblk - ee_block + map->m_len;
3594			allocated = map->m_len;
3595		}
3596	}
3597
3598	allocated = ext4_split_extent(handle, inode, ppath,
3599				      &split_map, split_flag, flags);
3600	if (allocated < 0)
3601		err = allocated;
3602
3603out:
3604	/* If we got a failure, don't zero out the status tree */
3605	if (!err)
3606		err = ext4_zeroout_es(inode, &zero_ex);
3607	return err ? err : allocated;
3608}
3609
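/*
 * Editorial aside: a minimal userspace sketch (kept out of the build) of
 * the unit conversion used above, where a zeroout limit given in KiB is
 * turned into a block count by shifting with (blocksize_bits - 10),
 * since 1 KiB == 1 << 10 bytes. Names and numbers are illustrative only.
 */
#if 0
#include <stdio.h>

static unsigned int zeroout_kb_to_blocks(unsigned int limit_kb,
					 unsigned int blocksize_bits)
{
	/* e.g. 32 KiB with 4 KiB blocks (bits == 12): 32 >> 2 == 8 blocks */
	return limit_kb >> (blocksize_bits - 10);
}

int main(void)
{
	printf("%u\n", zeroout_kb_to_blocks(32, 12));	/* prints 8 */
	return 0;
}
#endif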
3610/*
3611 * This function is called by ext4_ext_map_blocks() from
3612 * ext4_get_blocks_dio_write() when DIO is used to write
3613 * to an unwritten extent.
3614 *
3615 * Writing to an unwritten extent may result in splitting the unwritten
3616 * extent into multiple initialized/unwritten extents (up to three)
3617 * There are three possibilities:
3618 *   a> There is no split required: Entire extent should be unwritten
3619 *   b> Splits in two extents: Write is happening at either end of the extent
3620 *   c> Splits in three extents: Someone is writing in the middle of the extent
3621 *
3622 * This works the same way in the case of initialized -> unwritten conversion.
3623 *
3624 * One or more index blocks may be needed if the extent tree grows after
3625 * the unwritten extent is split. To prevent ENOSPC from occurring at IO
3626 * completion time, we split the unwritten extent before the DIO submits
3627 * the IO. The unwritten extent split at this time becomes (at most)
3628 * three unwritten extents. After the IO completes, the part
3629 * being filled is converted to initialized by the end_io callback
3630 * via ext4_convert_unwritten_extents().
3631 *
3632 * Returns the size of the unwritten extent to be written on success.
3633 */
3634static int ext4_split_convert_extents(handle_t *handle,
3635					struct inode *inode,
3636					struct ext4_map_blocks *map,
3637					struct ext4_ext_path **ppath,
3638					int flags)
3639{
3640	struct ext4_ext_path *path = *ppath;
3641	ext4_lblk_t eof_block;
3642	ext4_lblk_t ee_block;
3643	struct ext4_extent *ex;
3644	unsigned int ee_len;
3645	int split_flag = 0, depth;
3646
3647	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3648		  __func__, inode->i_ino,
3649		  (unsigned long long)map->m_lblk, map->m_len);
3650
3651	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3652		inode->i_sb->s_blocksize_bits;
3653	if (eof_block < map->m_lblk + map->m_len)
3654		eof_block = map->m_lblk + map->m_len;
3655	/*
3656	 * It is safe to convert the extent to initialized via explicit
3657	 * zeroout only if the extent is fully inside i_size or new_size.
3658	 */
3659	depth = ext_depth(inode);
3660	ex = path[depth].p_ext;
3661	ee_block = le32_to_cpu(ex->ee_block);
3662	ee_len = ext4_ext_get_actual_len(ex);
3663
3664	/* Convert to unwritten */
3665	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3666		split_flag |= EXT4_EXT_DATA_VALID1;
3667	/* Convert to initialized */
3668	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3669		split_flag |= ee_block + ee_len <= eof_block ?
3670			      EXT4_EXT_MAY_ZEROOUT : 0;
3671		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3672	}
3673	flags |= EXT4_GET_BLOCKS_PRE_IO;
3674	return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3675}
3676
3677static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3678						struct inode *inode,
3679						struct ext4_map_blocks *map,
3680						struct ext4_ext_path **ppath)
3681{
3682	struct ext4_ext_path *path = *ppath;
3683	struct ext4_extent *ex;
3684	ext4_lblk_t ee_block;
3685	unsigned int ee_len;
3686	int depth;
3687	int err = 0;
3688
3689	depth = ext_depth(inode);
3690	ex = path[depth].p_ext;
3691	ee_block = le32_to_cpu(ex->ee_block);
3692	ee_len = ext4_ext_get_actual_len(ex);
3693
3694	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3695		"block %llu, max_blocks %u\n", inode->i_ino,
3696		  (unsigned long long)ee_block, ee_len);
3697
3698	/* If the extent is larger than requested, it is a clear sign that we
3699	 * still have some extent state machine issues left. So extent_split
3700	 * is still required.
3701	 * TODO: Once all related issues are fixed, this situation will be
3702	 * illegal.
3703 */ 3704 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3705#ifdef EXT4_DEBUG 3706 ext4_warning("Inode (%ld) finished: extent logical block %llu," 3707 " len %u; IO logical block %llu, len %u\n", 3708 inode->i_ino, (unsigned long long)ee_block, ee_len, 3709 (unsigned long long)map->m_lblk, map->m_len); 3710#endif 3711 err = ext4_split_convert_extents(handle, inode, map, ppath, 3712 EXT4_GET_BLOCKS_CONVERT); 3713 if (err < 0) 3714 return err; 3715 path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); 3716 if (IS_ERR(path)) 3717 return PTR_ERR(path); 3718 depth = ext_depth(inode); 3719 ex = path[depth].p_ext; 3720 } 3721 3722 err = ext4_ext_get_access(handle, inode, path + depth); 3723 if (err) 3724 goto out; 3725 /* first mark the extent as initialized */ 3726 ext4_ext_mark_initialized(ex); 3727 3728 /* note: ext4_ext_correct_indexes() isn't needed here because 3729 * borders are not changed 3730 */ 3731 ext4_ext_try_to_merge(handle, inode, path, ex); 3732 3733 /* Mark modified extent as dirty */ 3734 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3735out: 3736 ext4_ext_show_leaf(inode, path); 3737 return err; 3738} 3739 3740static void unmap_underlying_metadata_blocks(struct block_device *bdev, 3741 sector_t block, int count) 3742{ 3743 int i; 3744 for (i = 0; i < count; i++) 3745 unmap_underlying_metadata(bdev, block + i); 3746} 3747 3748/* 3749 * Handle EOFBLOCKS_FL flag, clearing it if necessary 3750 */ 3751static int check_eofblocks_fl(handle_t *handle, struct inode *inode, 3752 ext4_lblk_t lblk, 3753 struct ext4_ext_path *path, 3754 unsigned int len) 3755{ 3756 int i, depth; 3757 struct ext4_extent_header *eh; 3758 struct ext4_extent *last_ex; 3759 3760 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 3761 return 0; 3762 3763 depth = ext_depth(inode); 3764 eh = path[depth].p_hdr; 3765 3766 /* 3767 * We're going to remove EOFBLOCKS_FL entirely in future so we 3768 * do not care for this case anymore. Simply remove the flag 3769 * if there are no extents. 3770 */ 3771 if (unlikely(!eh->eh_entries)) 3772 goto out; 3773 last_ex = EXT_LAST_EXTENT(eh); 3774 /* 3775 * We should clear the EOFBLOCKS_FL flag if we are writing the 3776 * last block in the last extent in the file. We test this by 3777 * first checking to see if the caller to 3778 * ext4_ext_get_blocks() was interested in the last block (or 3779 * a block beyond the last block) in the current extent. If 3780 * this turns out to be false, we can bail out from this 3781 * function immediately. 3782 */ 3783 if (lblk + len < le32_to_cpu(last_ex->ee_block) + 3784 ext4_ext_get_actual_len(last_ex)) 3785 return 0; 3786 /* 3787 * If the caller does appear to be planning to write at or 3788 * beyond the end of the current extent, we then test to see 3789 * if the current extent is the last extent in the file, by 3790 * checking to make sure it was reached via the rightmost node 3791 * at each level of the tree. 3792 */ 3793 for (i = depth-1; i >= 0; i--) 3794 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3795 return 0; 3796out: 3797 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3798 return ext4_mark_inode_dirty(handle, inode); 3799} 3800 3801/** 3802 * ext4_find_delalloc_range: find delayed allocated block in the given range. 3803 * 3804 * Return 1 if there is a delalloc block in the range, otherwise 0. 
3805 */ 3806int ext4_find_delalloc_range(struct inode *inode, 3807 ext4_lblk_t lblk_start, 3808 ext4_lblk_t lblk_end) 3809{ 3810 struct extent_status es; 3811 3812 ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); 3813 if (es.es_len == 0) 3814 return 0; /* there is no delay extent in this tree */ 3815 else if (es.es_lblk <= lblk_start && 3816 lblk_start < es.es_lblk + es.es_len) 3817 return 1; 3818 else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) 3819 return 1; 3820 else 3821 return 0; 3822} 3823 3824int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) 3825{ 3826 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3827 ext4_lblk_t lblk_start, lblk_end; 3828 lblk_start = EXT4_LBLK_CMASK(sbi, lblk); 3829 lblk_end = lblk_start + sbi->s_cluster_ratio - 1; 3830 3831 return ext4_find_delalloc_range(inode, lblk_start, lblk_end); 3832} 3833 3834/** 3835 * Determines how many complete clusters (out of those specified by the 'map') 3836 * are under delalloc and were reserved quota for. 3837 * This function is called when we are writing out the blocks that were 3838 * originally written with their allocation delayed, but then the space was 3839 * allocated using fallocate() before the delayed allocation could be resolved. 3840 * The cases to look for are: 3841 * ('=' indicated delayed allocated blocks 3842 * '-' indicates non-delayed allocated blocks) 3843 * (a) partial clusters towards beginning and/or end outside of allocated range 3844 * are not delalloc'ed. 3845 * Ex: 3846 * |----c---=|====c====|====c====|===-c----| 3847 * |++++++ allocated ++++++| 3848 * ==> 4 complete clusters in above example 3849 * 3850 * (b) partial cluster (outside of allocated range) towards either end is 3851 * marked for delayed allocation. In this case, we will exclude that 3852 * cluster. 3853 * Ex: 3854 * |----====c========|========c========| 3855 * |++++++ allocated ++++++| 3856 * ==> 1 complete clusters in above example 3857 * 3858 * Ex: 3859 * |================c================| 3860 * |++++++ allocated ++++++| 3861 * ==> 0 complete clusters in above example 3862 * 3863 * The ext4_da_update_reserve_space will be called only if we 3864 * determine here that there were some "entire" clusters that span 3865 * this 'allocated' range. 3866 * In the non-bigalloc case, this function will just end up returning num_blks 3867 * without ever calling ext4_find_delalloc_range. 3868 */ 3869static unsigned int 3870get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, 3871 unsigned int num_blks) 3872{ 3873 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3874 ext4_lblk_t alloc_cluster_start, alloc_cluster_end; 3875 ext4_lblk_t lblk_from, lblk_to, c_offset; 3876 unsigned int allocated_clusters = 0; 3877 3878 alloc_cluster_start = EXT4_B2C(sbi, lblk_start); 3879 alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); 3880 3881 /* max possible clusters for this allocation */ 3882 allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; 3883 3884 trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); 3885 3886 /* Check towards left side */ 3887 c_offset = EXT4_LBLK_COFF(sbi, lblk_start); 3888 if (c_offset) { 3889 lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); 3890 lblk_to = lblk_from + c_offset - 1; 3891 3892 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3893 allocated_clusters--; 3894 } 3895 3896 /* Now check towards right. 
*/ 3897 c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); 3898 if (allocated_clusters && c_offset) { 3899 lblk_from = lblk_start + num_blks; 3900 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; 3901 3902 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3903 allocated_clusters--; 3904 } 3905 3906 return allocated_clusters; 3907} 3908 3909static int 3910convert_initialized_extent(handle_t *handle, struct inode *inode, 3911 struct ext4_map_blocks *map, 3912 struct ext4_ext_path **ppath, int flags, 3913 unsigned int allocated, ext4_fsblk_t newblock) 3914{ 3915 struct ext4_ext_path *path = *ppath; 3916 struct ext4_extent *ex; 3917 ext4_lblk_t ee_block; 3918 unsigned int ee_len; 3919 int depth; 3920 int err = 0; 3921 3922 /* 3923 * Make sure that the extent is no bigger than we support with 3924 * unwritten extent 3925 */ 3926 if (map->m_len > EXT_UNWRITTEN_MAX_LEN) 3927 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; 3928 3929 depth = ext_depth(inode); 3930 ex = path[depth].p_ext; 3931 ee_block = le32_to_cpu(ex->ee_block); 3932 ee_len = ext4_ext_get_actual_len(ex); 3933 3934 ext_debug("%s: inode %lu, logical" 3935 "block %llu, max_blocks %u\n", __func__, inode->i_ino, 3936 (unsigned long long)ee_block, ee_len); 3937 3938 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3939 err = ext4_split_convert_extents(handle, inode, map, ppath, 3940 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); 3941 if (err < 0) 3942 return err; 3943 path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); 3944 if (IS_ERR(path)) 3945 return PTR_ERR(path); 3946 depth = ext_depth(inode); 3947 ex = path[depth].p_ext; 3948 if (!ex) { 3949 EXT4_ERROR_INODE(inode, "unexpected hole at %lu", 3950 (unsigned long) map->m_lblk); 3951 return -EIO; 3952 } 3953 } 3954 3955 err = ext4_ext_get_access(handle, inode, path + depth); 3956 if (err) 3957 return err; 3958 /* first mark the extent as unwritten */ 3959 ext4_ext_mark_unwritten(ex); 3960 3961 /* note: ext4_ext_correct_indexes() isn't needed here because 3962 * borders are not changed 3963 */ 3964 ext4_ext_try_to_merge(handle, inode, path, ex); 3965 3966 /* Mark modified extent as dirty */ 3967 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3968 if (err) 3969 return err; 3970 ext4_ext_show_leaf(inode, path); 3971 3972 ext4_update_inode_fsync_trans(handle, inode, 1); 3973 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); 3974 if (err) 3975 return err; 3976 map->m_flags |= EXT4_MAP_UNWRITTEN; 3977 if (allocated > map->m_len) 3978 allocated = map->m_len; 3979 map->m_len = allocated; 3980 return allocated; 3981} 3982 3983static int 3984ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, 3985 struct ext4_map_blocks *map, 3986 struct ext4_ext_path **ppath, int flags, 3987 unsigned int allocated, ext4_fsblk_t newblock) 3988{ 3989 struct ext4_ext_path *path = *ppath; 3990 int ret = 0; 3991 int err = 0; 3992 ext4_io_end_t *io = ext4_inode_aio(inode); 3993 3994 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " 3995 "block %llu, max_blocks %u, flags %x, allocated %u\n", 3996 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3997 flags, allocated); 3998 ext4_ext_show_leaf(inode, path); 3999 4000 /* 4001 * When writing into unwritten space, we should not fail to 4002 * allocate metadata blocks for the new extent block if needed. 
4003 */ 4004 flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; 4005 4006 trace_ext4_ext_handle_unwritten_extents(inode, map, flags, 4007 allocated, newblock); 4008 4009 /* get_block() before submit the IO, split the extent */ 4010 if (flags & EXT4_GET_BLOCKS_PRE_IO) { 4011 ret = ext4_split_convert_extents(handle, inode, map, ppath, 4012 flags | EXT4_GET_BLOCKS_CONVERT); 4013 if (ret <= 0) 4014 goto out; 4015 /* 4016 * Flag the inode(non aio case) or end_io struct (aio case) 4017 * that this IO needs to conversion to written when IO is 4018 * completed 4019 */ 4020 if (io) 4021 ext4_set_io_unwritten_flag(inode, io); 4022 else 4023 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 4024 map->m_flags |= EXT4_MAP_UNWRITTEN; 4025 goto out; 4026 } 4027 /* IO end_io complete, convert the filled extent to written */ 4028 if (flags & EXT4_GET_BLOCKS_CONVERT) { 4029 ret = ext4_convert_unwritten_extents_endio(handle, inode, map, 4030 ppath); 4031 if (ret >= 0) { 4032 ext4_update_inode_fsync_trans(handle, inode, 1); 4033 err = check_eofblocks_fl(handle, inode, map->m_lblk, 4034 path, map->m_len); 4035 } else 4036 err = ret; 4037 map->m_flags |= EXT4_MAP_MAPPED; 4038 map->m_pblk = newblock; 4039 if (allocated > map->m_len) 4040 allocated = map->m_len; 4041 map->m_len = allocated; 4042 goto out2; 4043 } 4044 /* buffered IO case */ 4045 /* 4046 * repeat fallocate creation request 4047 * we already have an unwritten extent 4048 */ 4049 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { 4050 map->m_flags |= EXT4_MAP_UNWRITTEN; 4051 goto map_out; 4052 } 4053 4054 /* buffered READ or buffered write_begin() lookup */ 4055 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 4056 /* 4057 * We have blocks reserved already. We 4058 * return allocated blocks so that delalloc 4059 * won't do block reservation for us. But 4060 * the buffer head will be unmapped so that 4061 * a read from the block returns 0s. 4062 */ 4063 map->m_flags |= EXT4_MAP_UNWRITTEN; 4064 goto out1; 4065 } 4066 4067 /* buffered write, writepage time, convert*/ 4068 ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); 4069 if (ret >= 0) 4070 ext4_update_inode_fsync_trans(handle, inode, 1); 4071out: 4072 if (ret <= 0) { 4073 err = ret; 4074 goto out2; 4075 } else 4076 allocated = ret; 4077 map->m_flags |= EXT4_MAP_NEW; 4078 /* 4079 * if we allocated more blocks than requested 4080 * we need to make sure we unmap the extra block 4081 * allocated. The actual needed block will get 4082 * unmapped later when we find the buffer_head marked 4083 * new. 4084 */ 4085 if (allocated > map->m_len) { 4086 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 4087 newblock + map->m_len, 4088 allocated - map->m_len); 4089 allocated = map->m_len; 4090 } 4091 map->m_len = allocated; 4092 4093 /* 4094 * If we have done fallocate with the offset that is already 4095 * delayed allocated, we would have block reservation 4096 * and quota reservation done in the delayed write path. 4097 * But fallocate would have already updated quota and block 4098 * count for this offset. 
So cancel these reservations.
4099	 */
4100	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4101		unsigned int reserved_clusters;
4102		reserved_clusters = get_reserved_cluster_alloc(inode,
4103				map->m_lblk, map->m_len);
4104		if (reserved_clusters)
4105			ext4_da_update_reserve_space(inode,
4106						     reserved_clusters,
4107						     0);
4108	}
4109
4110map_out:
4111	map->m_flags |= EXT4_MAP_MAPPED;
4112	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4113		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4114					 map->m_len);
4115		if (err < 0)
4116			goto out2;
4117	}
4118out1:
4119	if (allocated > map->m_len)
4120		allocated = map->m_len;
4121	ext4_ext_show_leaf(inode, path);
4122	map->m_pblk = newblock;
4123	map->m_len = allocated;
4124out2:
4125	return err ? err : allocated;
4126}
4127
4128/*
4129 * get_implied_cluster_alloc - check to see if the requested
4130 * allocation (in the map structure) overlaps with a cluster already
4131 * allocated in an extent.
4132 *	@sb	The filesystem superblock structure
4133 *	@map	The requested lblk->pblk mapping
4134 *	@ex	The extent structure which might contain an implied
4135 *		cluster allocation
4136 *
4137 * This function is called by ext4_ext_map_blocks() after we failed to
4138 * find blocks that were already in the inode's extent tree.  Hence,
4139 * we know that the beginning of the requested region cannot overlap
4140 * the extent from the inode's extent tree.  There are three cases we
4141 * want to catch.  The first is this case:
4142 *
4143 *		 |--- cluster # N--|
4144 *    |--- extent ---|	|---- requested region ---|
4145 *			|==========|
4146 *
4147 * The second case that we need to test for is this one:
4148 *
4149 *   |--------- cluster # N ----------------|
4150 *	   |--- requested region --|   |------- extent ----|
4151 *	   |=======================|
4152 *
4153 * The third case is when the requested region lies between two extents
4154 * within the same cluster:
4155 *          |------------- cluster # N-------------|
4156 * |----- ex -----|                  |---- ex_right ----|
4157 *                  |------ requested region ------|
4158 *                  |================|
4159 *
4160 * In each of the above cases, we need to set map->m_pblk and
4161 * map->m_len so they correspond to the extent labelled as
4162 * "|====|" from cluster #N, since it is already in use for data in
4163 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
4164 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4165 * as a new "allocated" block region.  Otherwise, we will return 0 and
4166 * ext4_ext_map_blocks() will then allocate one or more new clusters
4167 * by calling ext4_mb_new_blocks().
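 *
 * For intuition, the cluster macros used below reduce to shifts and
 * masks; assuming s_cluster_ratio == 16 (so cluster bits == 4):
 *
 *	EXT4_B2C(sbi, lblk)		== lblk >> 4	(block -> cluster)
 *	EXT4_LBLK_COFF(sbi, lblk)	== lblk & 15	(offset in cluster)
 *	EXT4_PBLK_CMASK(sbi, pblk)	== pblk & ~15	(round down to
 *							 cluster start)
 *
 * so a request at logical block 35 sits in cluster 2, at offset 3.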
4168 */
4169static int get_implied_cluster_alloc(struct super_block *sb,
4170				     struct ext4_map_blocks *map,
4171				     struct ext4_extent *ex,
4172				     struct ext4_ext_path *path)
4173{
4174	struct ext4_sb_info *sbi = EXT4_SB(sb);
4175	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4176	ext4_lblk_t ex_cluster_start, ex_cluster_end;
4177	ext4_lblk_t rr_cluster_start;
4178	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4179	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4180	unsigned short ee_len = ext4_ext_get_actual_len(ex);
4181
4182	/* The extent passed in that we are trying to match */
4183	ex_cluster_start = EXT4_B2C(sbi, ee_block);
4184	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4185
4186	/* The requested region passed into ext4_map_blocks() */
4187	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4188
4189	if ((rr_cluster_start == ex_cluster_end) ||
4190	    (rr_cluster_start == ex_cluster_start)) {
4191		if (rr_cluster_start == ex_cluster_end)
4192			ee_start += ee_len - 1;
4193		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4194		map->m_len = min(map->m_len,
4195				 (unsigned) sbi->s_cluster_ratio - c_offset);
4196		/*
4197		 * Check for and handle this case:
4198		 *
4199		 *   |--------- cluster # N-------------|
4200		 *		       |------- extent ----|
4201		 *	   |--- requested region ---|
4202		 *	   |===========|
4203		 */
4204
4205		if (map->m_lblk < ee_block)
4206			map->m_len = min(map->m_len, ee_block - map->m_lblk);
4207
4208		/*
4209		 * Check for the case where there is already another allocated
4210		 * block to the right of 'ex' but before the end of the cluster.
4211		 *
4212		 *	    |------------- cluster # N-------------|
4213		 * |----- ex -----|		     |---- ex_right ----|
4214		 *		  |------ requested region ------|
4215		 *		  |================|
4216		 */
4217		if (map->m_lblk > ee_block) {
4218			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4219			map->m_len = min(map->m_len, next - map->m_lblk);
4220		}
4221
4222		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4223		return 1;
4224	}
4225
4226	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4227	return 0;
4228}
4229
4230
4231/*
4232 * Block allocation/map/preallocation routine for extent-based files
4233 *
4234 *
4235 * Need to be called with
4236 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4237 * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4238 *
4239 * return > 0, number of blocks already mapped/allocated
4240 *		if create == 0 and these are pre-allocated blocks
4241 *			buffer head is unmapped
4242 *		otherwise blocks are mapped
4243 *
4244 * return = 0, if plain lookup failed (blocks have not been allocated)
4245 *		buffer head is unmapped
4246 *
4247 * return < 0, error case.
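 *
 * A sketch of how a caller consumes this contract (hypothetical code,
 * shown only to make the return convention concrete):
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };
 *	int ret;
 *
 *	down_read(&EXT4_I(inode)->i_data_sem);
 *	ret = ext4_ext_map_blocks(handle, inode, &map, 0);
 *	up_read(&EXT4_I(inode)->i_data_sem);
 *
 * after which ret > 0 means map.m_pblk .. map.m_pblk + ret - 1 back the
 * request, ret == 0 means the lookup hit a hole, and ret < 0 is -errno.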
4248 */ 4249int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 4250 struct ext4_map_blocks *map, int flags) 4251{ 4252 struct ext4_ext_path *path = NULL; 4253 struct ext4_extent newex, *ex, *ex2; 4254 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4255 ext4_fsblk_t newblock = 0; 4256 int free_on_err = 0, err = 0, depth, ret; 4257 unsigned int allocated = 0, offset = 0; 4258 unsigned int allocated_clusters = 0; 4259 struct ext4_allocation_request ar; 4260 ext4_io_end_t *io = ext4_inode_aio(inode); 4261 ext4_lblk_t cluster_offset; 4262 int set_unwritten = 0; 4263 4264 ext_debug("blocks %u/%u requested for inode %lu\n", 4265 map->m_lblk, map->m_len, inode->i_ino); 4266 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 4267 4268 /* find extent for this block */ 4269 path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); 4270 if (IS_ERR(path)) { 4271 err = PTR_ERR(path); 4272 path = NULL; 4273 goto out2; 4274 } 4275 4276 depth = ext_depth(inode); 4277 4278 /* 4279 * consistent leaf must not be empty; 4280 * this situation is possible, though, _during_ tree modification; 4281 * this is why assert can't be put in ext4_ext_find_extent() 4282 */ 4283 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 4284 EXT4_ERROR_INODE(inode, "bad extent address " 4285 "lblock: %lu, depth: %d pblock %lld", 4286 (unsigned long) map->m_lblk, depth, 4287 path[depth].p_block); 4288 err = -EIO; 4289 goto out2; 4290 } 4291 4292 ex = path[depth].p_ext; 4293 if (ex) { 4294 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 4295 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4296 unsigned short ee_len; 4297 4298 4299 /* 4300 * unwritten extents are treated as holes, except that 4301 * we split out initialized portions during a write. 4302 */ 4303 ee_len = ext4_ext_get_actual_len(ex); 4304 4305 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); 4306 4307 /* if found extent covers block, simply return it */ 4308 if (in_range(map->m_lblk, ee_block, ee_len)) { 4309 newblock = map->m_lblk - ee_block + ee_start; 4310 /* number of remaining blocks in the extent */ 4311 allocated = ee_len - (map->m_lblk - ee_block); 4312 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 4313 ee_block, ee_len, newblock); 4314 4315 /* 4316 * If the extent is initialized check whether the 4317 * caller wants to convert it to unwritten. 4318 */ 4319 if ((!ext4_ext_is_unwritten(ex)) && 4320 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { 4321 allocated = convert_initialized_extent( 4322 handle, inode, map, &path, 4323 flags, allocated, newblock); 4324 goto out2; 4325 } else if (!ext4_ext_is_unwritten(ex)) 4326 goto out; 4327 4328 ret = ext4_ext_handle_unwritten_extents( 4329 handle, inode, map, &path, flags, 4330 allocated, newblock); 4331 if (ret < 0) 4332 err = ret; 4333 else 4334 allocated = ret; 4335 goto out2; 4336 } 4337 } 4338 4339 if ((sbi->s_cluster_ratio > 1) && 4340 ext4_find_delalloc_cluster(inode, map->m_lblk)) 4341 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4342 4343 /* 4344 * requested block isn't allocated yet; 4345 * we couldn't try to create block if create flag is zero 4346 */ 4347 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 4348 /* 4349 * put just found gap into cache to speed up 4350 * subsequent requests 4351 */ 4352 if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) 4353 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 4354 goto out2; 4355 } 4356 4357 /* 4358 * Okay, we need to do block allocation. 
4359 */ 4360 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; 4361 newex.ee_block = cpu_to_le32(map->m_lblk); 4362 cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); 4363 4364 /* 4365 * If we are doing bigalloc, check to see if the extent returned 4366 * by ext4_ext_find_extent() implies a cluster we can use. 4367 */ 4368 if (cluster_offset && ex && 4369 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { 4370 ar.len = allocated = map->m_len; 4371 newblock = map->m_pblk; 4372 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4373 goto got_allocated_blocks; 4374 } 4375 4376 /* find neighbour allocated blocks */ 4377 ar.lleft = map->m_lblk; 4378 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 4379 if (err) 4380 goto out2; 4381 ar.lright = map->m_lblk; 4382 ex2 = NULL; 4383 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); 4384 if (err) 4385 goto out2; 4386 4387 /* Check if the extent after searching to the right implies a 4388 * cluster we can use. */ 4389 if ((sbi->s_cluster_ratio > 1) && ex2 && 4390 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { 4391 ar.len = allocated = map->m_len; 4392 newblock = map->m_pblk; 4393 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4394 goto got_allocated_blocks; 4395 } 4396 4397 /* 4398 * See if request is beyond maximum number of blocks we can have in 4399 * a single extent. For an initialized extent this limit is 4400 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is 4401 * EXT_UNWRITTEN_MAX_LEN. 4402 */ 4403 if (map->m_len > EXT_INIT_MAX_LEN && 4404 !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) 4405 map->m_len = EXT_INIT_MAX_LEN; 4406 else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && 4407 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) 4408 map->m_len = EXT_UNWRITTEN_MAX_LEN; 4409 4410 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 4411 newex.ee_len = cpu_to_le16(map->m_len); 4412 err = ext4_ext_check_overlap(sbi, inode, &newex, path); 4413 if (err) 4414 allocated = ext4_ext_get_actual_len(&newex); 4415 else 4416 allocated = map->m_len; 4417 4418 /* allocate new block */ 4419 ar.inode = inode; 4420 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 4421 ar.logical = map->m_lblk; 4422 /* 4423 * We calculate the offset from the beginning of the cluster 4424 * for the logical block number, since when we allocate a 4425 * physical cluster, the physical block should start at the 4426 * same offset from the beginning of the cluster. This is 4427 * needed so that future calls to get_implied_cluster_alloc() 4428 * work correctly. 
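 *
 * Worked example, assuming s_cluster_ratio == 4 and map->m_lblk == 10:
 * the request sits in cluster 2 ([8..11]) at offset 2, so below we get
 *
 *	offset	   == 2
 *	ar.len	   == EXT4_NUM_B2C(sbi, 2 + allocated)
 *	ar.goal	   -= 2, ar.logical -= 2
 *
 * i.e. the physical allocation starts at the same in-cluster offset as
 * the logical block.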
4429 */ 4430 offset = EXT4_LBLK_COFF(sbi, map->m_lblk); 4431 ar.len = EXT4_NUM_B2C(sbi, offset+allocated); 4432 ar.goal -= offset; 4433 ar.logical -= offset; 4434 if (S_ISREG(inode->i_mode)) 4435 ar.flags = EXT4_MB_HINT_DATA; 4436 else 4437 /* disable in-core preallocation for non-regular files */ 4438 ar.flags = 0; 4439 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) 4440 ar.flags |= EXT4_MB_HINT_NOPREALLOC; 4441 newblock = ext4_mb_new_blocks(handle, &ar, &err); 4442 if (!newblock) 4443 goto out2; 4444 ext_debug("allocate new block: goal %llu, found %llu/%u\n", 4445 ar.goal, newblock, allocated); 4446 free_on_err = 1; 4447 allocated_clusters = ar.len; 4448 ar.len = EXT4_C2B(sbi, ar.len) - offset; 4449 if (ar.len > allocated) 4450 ar.len = allocated; 4451 4452got_allocated_blocks: 4453 /* try to insert new extent into found leaf and return */ 4454 ext4_ext_store_pblock(&newex, newblock + offset); 4455 newex.ee_len = cpu_to_le16(ar.len); 4456 /* Mark unwritten */ 4457 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ 4458 ext4_ext_mark_unwritten(&newex); 4459 map->m_flags |= EXT4_MAP_UNWRITTEN; 4460 /* 4461 * io_end structure was created for every IO write to an 4462 * unwritten extent. To avoid unnecessary conversion, 4463 * here we flag the IO that really needs the conversion. 4464 * For non asycn direct IO case, flag the inode state 4465 * that we need to perform conversion when IO is done. 4466 */ 4467 if (flags & EXT4_GET_BLOCKS_PRE_IO) 4468 set_unwritten = 1; 4469 } 4470 4471 err = 0; 4472 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) 4473 err = check_eofblocks_fl(handle, inode, map->m_lblk, 4474 path, ar.len); 4475 if (!err) 4476 err = ext4_ext_insert_extent(handle, inode, &path, 4477 &newex, flags); 4478 4479 if (!err && set_unwritten) { 4480 if (io) 4481 ext4_set_io_unwritten_flag(inode, io); 4482 else 4483 ext4_set_inode_state(inode, 4484 EXT4_STATE_DIO_UNWRITTEN); 4485 } 4486 4487 if (err && free_on_err) { 4488 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 4489 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; 4490 /* free data blocks we just allocated */ 4491 /* not a good idea to call discard here directly, 4492 * but otherwise we'd need to call it every free() */ 4493 ext4_discard_preallocations(inode); 4494 ext4_free_blocks(handle, inode, NULL, newblock, 4495 EXT4_C2B(sbi, allocated_clusters), fb_flags); 4496 goto out2; 4497 } 4498 4499 /* previous routine could use block we allocated */ 4500 newblock = ext4_ext_pblock(&newex); 4501 allocated = ext4_ext_get_actual_len(&newex); 4502 if (allocated > map->m_len) 4503 allocated = map->m_len; 4504 map->m_flags |= EXT4_MAP_NEW; 4505 4506 /* 4507 * Update reserved blocks/metadata blocks after successful 4508 * block allocation which had been deferred till now. 4509 */ 4510 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 4511 unsigned int reserved_clusters; 4512 /* 4513 * Check how many clusters we had reserved this allocated range 4514 */ 4515 reserved_clusters = get_reserved_cluster_alloc(inode, 4516 map->m_lblk, allocated); 4517 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { 4518 if (reserved_clusters) { 4519 /* 4520 * We have clusters reserved for this range. 4521 * But since we are not doing actual allocation 4522 * and are simply using blocks from previously 4523 * allocated cluster, we should release the 4524 * reservation and not claim quota. 
4525 */ 4526 ext4_da_update_reserve_space(inode, 4527 reserved_clusters, 0); 4528 } 4529 } else { 4530 BUG_ON(allocated_clusters < reserved_clusters); 4531 if (reserved_clusters < allocated_clusters) { 4532 struct ext4_inode_info *ei = EXT4_I(inode); 4533 int reservation = allocated_clusters - 4534 reserved_clusters; 4535 /* 4536 * It seems we claimed few clusters outside of 4537 * the range of this allocation. We should give 4538 * it back to the reservation pool. This can 4539 * happen in the following case: 4540 * 4541 * * Suppose s_cluster_ratio is 4 (i.e., each 4542 * cluster has 4 blocks. Thus, the clusters 4543 * are [0-3],[4-7],[8-11]... 4544 * * First comes delayed allocation write for 4545 * logical blocks 10 & 11. Since there were no 4546 * previous delayed allocated blocks in the 4547 * range [8-11], we would reserve 1 cluster 4548 * for this write. 4549 * * Next comes write for logical blocks 3 to 8. 4550 * In this case, we will reserve 2 clusters 4551 * (for [0-3] and [4-7]; and not for [8-11] as 4552 * that range has a delayed allocated blocks. 4553 * Thus total reserved clusters now becomes 3. 4554 * * Now, during the delayed allocation writeout 4555 * time, we will first write blocks [3-8] and 4556 * allocate 3 clusters for writing these 4557 * blocks. Also, we would claim all these 4558 * three clusters above. 4559 * * Now when we come here to writeout the 4560 * blocks [10-11], we would expect to claim 4561 * the reservation of 1 cluster we had made 4562 * (and we would claim it since there are no 4563 * more delayed allocated blocks in the range 4564 * [8-11]. But our reserved cluster count had 4565 * already gone to 0. 4566 * 4567 * Thus, at the step 4 above when we determine 4568 * that there are still some unwritten delayed 4569 * allocated blocks outside of our current 4570 * block range, we should increment the 4571 * reserved clusters count so that when the 4572 * remaining blocks finally gets written, we 4573 * could claim them. 4574 */ 4575 dquot_reserve_block(inode, 4576 EXT4_C2B(sbi, reservation)); 4577 spin_lock(&ei->i_block_reservation_lock); 4578 ei->i_reserved_data_blocks += reservation; 4579 spin_unlock(&ei->i_block_reservation_lock); 4580 } 4581 /* 4582 * We will claim quota for all newly allocated blocks. 4583 * We're updating the reserved space *after* the 4584 * correction above so we do not accidentally free 4585 * all the metadata reservation because we might 4586 * actually need it later on. 4587 */ 4588 ext4_da_update_reserve_space(inode, allocated_clusters, 4589 1); 4590 } 4591 } 4592 4593 /* 4594 * Cache the extent and update transaction to commit on fdatasync only 4595 * when it is _not_ an unwritten extent. 4596 */ 4597 if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) 4598 ext4_update_inode_fsync_trans(handle, inode, 1); 4599 else 4600 ext4_update_inode_fsync_trans(handle, inode, 0); 4601out: 4602 if (allocated > map->m_len) 4603 allocated = map->m_len; 4604 ext4_ext_show_leaf(inode, path); 4605 map->m_flags |= EXT4_MAP_MAPPED; 4606 map->m_pblk = newblock; 4607 map->m_len = allocated; 4608out2: 4609 ext4_ext_drop_refs(path); 4610 kfree(path); 4611 4612 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4613 err ? err : allocated); 4614 ext4_es_lru_add(inode); 4615 return err ? err : allocated; 4616} 4617 4618void ext4_ext_truncate(handle_t *handle, struct inode *inode) 4619{ 4620 struct super_block *sb = inode->i_sb; 4621 ext4_lblk_t last_block; 4622 int err = 0; 4623 4624 /* 4625 * TODO: optimization is possible here. 
4626 * Probably we need not scan at all, 4627 * because page truncation is enough. 4628 */ 4629 4630 /* we have to know where to truncate from in crash case */ 4631 EXT4_I(inode)->i_disksize = inode->i_size; 4632 ext4_mark_inode_dirty(handle, inode); 4633 4634 last_block = (inode->i_size + sb->s_blocksize - 1) 4635 >> EXT4_BLOCK_SIZE_BITS(sb); 4636retry: 4637 err = ext4_es_remove_extent(inode, last_block, 4638 EXT_MAX_BLOCKS - last_block); 4639 if (err == -ENOMEM) { 4640 cond_resched(); 4641 congestion_wait(BLK_RW_ASYNC, HZ/50); 4642 goto retry; 4643 } 4644 if (err) { 4645 ext4_std_error(inode->i_sb, err); 4646 return; 4647 } 4648 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4649 ext4_std_error(inode->i_sb, err); 4650} 4651 4652static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, 4653 ext4_lblk_t len, loff_t new_size, 4654 int flags, int mode) 4655{ 4656 struct inode *inode = file_inode(file); 4657 handle_t *handle; 4658 int ret = 0; 4659 int ret2 = 0; 4660 int retries = 0; 4661 struct ext4_map_blocks map; 4662 unsigned int credits; 4663 loff_t epos; 4664 4665 map.m_lblk = offset; 4666 map.m_len = len; 4667 /* 4668 * Don't normalize the request if it can fit in one extent so 4669 * that it doesn't get unnecessarily split into multiple 4670 * extents. 4671 */ 4672 if (len <= EXT_UNWRITTEN_MAX_LEN) 4673 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; 4674 4675 /* 4676 * credits to insert 1 extent into extent tree 4677 */ 4678 credits = ext4_chunk_trans_blocks(inode, len); 4679 4680retry: 4681 while (ret >= 0 && len) { 4682 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4683 credits); 4684 if (IS_ERR(handle)) { 4685 ret = PTR_ERR(handle); 4686 break; 4687 } 4688 ret = ext4_map_blocks(handle, inode, &map, flags); 4689 if (ret <= 0) { 4690 ext4_debug("inode #%lu: block %u: len %u: " 4691 "ext4_ext_map_blocks returned %d", 4692 inode->i_ino, map.m_lblk, 4693 map.m_len, ret); 4694 ext4_mark_inode_dirty(handle, inode); 4695 ret2 = ext4_journal_stop(handle); 4696 break; 4697 } 4698 map.m_lblk += ret; 4699 map.m_len = len = len - ret; 4700 epos = (loff_t)map.m_lblk << inode->i_blkbits; 4701 inode->i_ctime = ext4_current_time(inode); 4702 if (new_size) { 4703 if (epos > new_size) 4704 epos = new_size; 4705 if (ext4_update_inode_size(inode, epos) & 0x1) 4706 inode->i_mtime = inode->i_ctime; 4707 } else { 4708 if (epos > inode->i_size) 4709 ext4_set_inode_flag(inode, 4710 EXT4_INODE_EOFBLOCKS); 4711 } 4712 ext4_mark_inode_dirty(handle, inode); 4713 ret2 = ext4_journal_stop(handle); 4714 if (ret2) 4715 break; 4716 } 4717 if (ret == -ENOSPC && 4718 ext4_should_retry_alloc(inode->i_sb, &retries)) { 4719 ret = 0; 4720 goto retry; 4721 } 4722 4723 return ret > 0 ? ret2 : ret; 4724} 4725 4726static long ext4_zero_range(struct file *file, loff_t offset, 4727 loff_t len, int mode) 4728{ 4729 struct inode *inode = file_inode(file); 4730 handle_t *handle = NULL; 4731 unsigned int max_blocks; 4732 loff_t new_size = 0; 4733 int ret = 0; 4734 int flags; 4735 int credits; 4736 int partial_begin, partial_end; 4737 loff_t start, end; 4738 ext4_lblk_t lblk; 4739 struct address_space *mapping = inode->i_mapping; 4740 unsigned int blkbits = inode->i_blkbits; 4741 4742 trace_ext4_zero_range(inode, offset, len, mode); 4743 4744 if (!S_ISREG(inode->i_mode)) 4745 return -EINVAL; 4746 4747 /* Call ext4_force_commit to flush all data in case of data=journal. 
 */
4748	if (ext4_should_journal_data(inode)) {
4749		ret = ext4_force_commit(inode->i_sb);
4750		if (ret)
4751			return ret;
4752	}
4753
4754	/*
4755	 * Write out all dirty pages to avoid race conditions
4756	 * Then release them.
4757	 */
4758	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4759		ret = filemap_write_and_wait_range(mapping, offset,
4760						   offset + len - 1);
4761		if (ret)
4762			return ret;
4763	}
4764
4765	/*
4766	 * Round up offset. This is not fallocate, we need to zero out
4767	 * blocks, so convert interior block aligned part of the range to
4768	 * unwritten and possibly manually zero out unaligned parts of the
4769	 * range.
4770	 */
4771	start = round_up(offset, 1 << blkbits);
4772	end = round_down((offset + len), 1 << blkbits);
4773
4774	if (start < offset || end > offset + len)
4775		return -EINVAL;
4776	partial_begin = offset & ((1 << blkbits) - 1);
4777	partial_end = (offset + len) & ((1 << blkbits) - 1);
4778
4779	lblk = start >> blkbits;
4780	max_blocks = (end >> blkbits);
4781	if (max_blocks < lblk)
4782		max_blocks = 0;
4783	else
4784		max_blocks -= lblk;
4785
4786	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
4787		EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4788		EXT4_EX_NOCACHE;
4789	if (mode & FALLOC_FL_KEEP_SIZE)
4790		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4791
4792	mutex_lock(&inode->i_mutex);
4793
4794	/*
4795	 * Indirect files do not support unwritten extents
4796	 */
4797	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4798		ret = -EOPNOTSUPP;
4799		goto out_mutex;
4800	}
4801
4802	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4803	     offset + len > i_size_read(inode)) {
4804		new_size = offset + len;
4805		ret = inode_newsize_ok(inode, new_size);
4806		if (ret)
4807			goto out_mutex;
4808		/*
4809		 * If we have a partial block after EOF we have to allocate
4810		 * the entire block.
4811		 */
4812		if (partial_end)
4813			max_blocks += 1;
4814	}
4815
4816	if (max_blocks > 0) {
4817
4818		/* Now release the pages and zero block aligned part of pages */
4819		truncate_pagecache_range(inode, start, end - 1);
4820		inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4821
4822		/* Wait for all existing dio workers; newcomers will block on i_mutex */
4823		ext4_inode_block_unlocked_dio(inode);
4824		inode_dio_wait(inode);
4825
4826		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4827					     flags, mode);
4828		if (ret)
4829			goto out_dio;
4830		/*
4831		 * Remove entire range from the extent status tree.
4832		 *
4833		 * ext4_es_remove_extent(inode, lblk, max_blocks) is
4834		 * NOT sufficient. I'm not sure why this is the case,
4835		 * but let's be conservative and remove the extent
4836		 * status tree for the entire inode. There should be
4837		 * no outstanding delalloc extents thanks to the
4838		 * filemap_write_and_wait_range() call above.
4839 */ 4840 ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 4841 if (ret) 4842 goto out_dio; 4843 } 4844 if (!partial_begin && !partial_end) 4845 goto out_dio; 4846 4847 /* 4848 * In worst case we have to writeout two nonadjacent unwritten 4849 * blocks and update the inode 4850 */ 4851 credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; 4852 if (ext4_should_journal_data(inode)) 4853 credits += 2; 4854 handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); 4855 if (IS_ERR(handle)) { 4856 ret = PTR_ERR(handle); 4857 ext4_std_error(inode->i_sb, ret); 4858 goto out_dio; 4859 } 4860 4861 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4862 if (new_size) { 4863 ext4_update_inode_size(inode, new_size); 4864 } else { 4865 /* 4866 * Mark that we allocate beyond EOF so the subsequent truncate 4867 * can proceed even if the new size is the same as i_size. 4868 */ 4869 if ((offset + len) > i_size_read(inode)) 4870 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4871 } 4872 ext4_mark_inode_dirty(handle, inode); 4873 4874 /* Zero out partial block at the edges of the range */ 4875 ret = ext4_zero_partial_blocks(handle, inode, offset, len); 4876 4877 if (file->f_flags & O_SYNC) 4878 ext4_handle_sync(handle); 4879 4880 ext4_journal_stop(handle); 4881out_dio: 4882 ext4_inode_resume_unlocked_dio(inode); 4883out_mutex: 4884 mutex_unlock(&inode->i_mutex); 4885 return ret; 4886} 4887 4888/* 4889 * preallocate space for a file. This implements ext4's fallocate file 4890 * operation, which gets called from sys_fallocate system call. 4891 * For block-mapped files, posix_fallocate should fall back to the method 4892 * of writing zeroes to the required new blocks (the same behavior which is 4893 * expected for file systems which do not support fallocate() system call). 
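 *
 * For reference, the userspace calls that end up here look like
 * (illustrative only):
 *
 *	fallocate(fd, 0, 0, 16 << 20);			   - grows i_size
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20);   - i_size kept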
4894 */ 4895long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 4896{ 4897 struct inode *inode = file_inode(file); 4898 loff_t new_size = 0; 4899 unsigned int max_blocks; 4900 int ret = 0; 4901 int flags; 4902 ext4_lblk_t lblk; 4903 unsigned int blkbits = inode->i_blkbits; 4904 4905 /* Return error if mode is not supported */ 4906 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 4907 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) 4908 return -EOPNOTSUPP; 4909 4910 if (mode & FALLOC_FL_PUNCH_HOLE) 4911 return ext4_punch_hole(inode, offset, len); 4912 4913 ret = ext4_convert_inline_data(inode); 4914 if (ret) 4915 return ret; 4916 4917 /* 4918 * currently supporting (pre)allocate mode for extent-based 4919 * files _only_ 4920 */ 4921 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4922 return -EOPNOTSUPP; 4923 4924 if (mode & FALLOC_FL_COLLAPSE_RANGE) 4925 return ext4_collapse_range(inode, offset, len); 4926 4927 if (mode & FALLOC_FL_ZERO_RANGE) 4928 return ext4_zero_range(file, offset, len, mode); 4929 4930 trace_ext4_fallocate_enter(inode, offset, len, mode); 4931 lblk = offset >> blkbits; 4932 /* 4933 * We can't just convert len to max_blocks because 4934 * If blocksize = 4096 offset = 3072 and len = 2048 4935 */ 4936 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 4937 - lblk; 4938 4939 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; 4940 if (mode & FALLOC_FL_KEEP_SIZE) 4941 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4942 4943 mutex_lock(&inode->i_mutex); 4944 4945 if (!(mode & FALLOC_FL_KEEP_SIZE) && 4946 offset + len > i_size_read(inode)) { 4947 new_size = offset + len; 4948 ret = inode_newsize_ok(inode, new_size); 4949 if (ret) 4950 goto out; 4951 } 4952 4953 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4954 flags, mode); 4955 if (ret) 4956 goto out; 4957 4958 if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { 4959 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, 4960 EXT4_I(inode)->i_sync_tid); 4961 } 4962out: 4963 mutex_unlock(&inode->i_mutex); 4964 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4965 return ret; 4966} 4967 4968/* 4969 * This function convert a range of blocks to written extents 4970 * The caller of this function will pass the start offset and the size. 4971 * all unwritten extents within this range will be converted to 4972 * written extents. 4973 * 4974 * This function is called from the direct IO end io call back 4975 * function, to convert the fallocated extents after IO is completed. 4976 * Returns 0 on success. 4977 */ 4978int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, 4979 loff_t offset, ssize_t len) 4980{ 4981 unsigned int max_blocks; 4982 int ret = 0; 4983 int ret2 = 0; 4984 struct ext4_map_blocks map; 4985 unsigned int credits, blkbits = inode->i_blkbits; 4986 4987 map.m_lblk = offset >> blkbits; 4988 /* 4989 * We can't just convert len to max_blocks because 4990 * If blocksize = 4096 offset = 3072 and len = 2048 4991 */ 4992 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - 4993 map.m_lblk); 4994 /* 4995 * This is somewhat ugly but the idea is clear: When transaction is 4996 * reserved, everything goes into it. Otherwise we rather start several 4997 * smaller transactions for conversion of each extent separately. 
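 *
 * (To make the block-count arithmetic above concrete: with blocksize
 * 4096, offset == 3072 and len == 2048 the byte range spans blocks 0
 * and 1, so map.m_lblk == 0 and max_blocks == 2, even though
 * len >> blkbits == 0.)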
4998 */ 4999 if (handle) { 5000 handle = ext4_journal_start_reserved(handle, 5001 EXT4_HT_EXT_CONVERT); 5002 if (IS_ERR(handle)) 5003 return PTR_ERR(handle); 5004 credits = 0; 5005 } else { 5006 /* 5007 * credits to insert 1 extent into extent tree 5008 */ 5009 credits = ext4_chunk_trans_blocks(inode, max_blocks); 5010 } 5011 while (ret >= 0 && ret < max_blocks) { 5012 map.m_lblk += ret; 5013 map.m_len = (max_blocks -= ret); 5014 if (credits) { 5015 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 5016 credits); 5017 if (IS_ERR(handle)) { 5018 ret = PTR_ERR(handle); 5019 break; 5020 } 5021 } 5022 ret = ext4_map_blocks(handle, inode, &map, 5023 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 5024 if (ret <= 0) 5025 ext4_warning(inode->i_sb, 5026 "inode #%lu: block %u: len %u: " 5027 "ext4_ext_map_blocks returned %d", 5028 inode->i_ino, map.m_lblk, 5029 map.m_len, ret); 5030 ext4_mark_inode_dirty(handle, inode); 5031 if (credits) 5032 ret2 = ext4_journal_stop(handle); 5033 if (ret <= 0 || ret2) 5034 break; 5035 } 5036 if (!credits) 5037 ret2 = ext4_journal_stop(handle); 5038 return ret > 0 ? ret2 : ret; 5039} 5040 5041/* 5042 * If newes is not existing extent (newes->ec_pblk equals zero) find 5043 * delayed extent at start of newes and update newes accordingly and 5044 * return start of the next delayed extent. 5045 * 5046 * If newes is existing extent (newes->ec_pblk is not equal zero) 5047 * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed 5048 * extent found. Leave newes unmodified. 5049 */ 5050static int ext4_find_delayed_extent(struct inode *inode, 5051 struct extent_status *newes) 5052{ 5053 struct extent_status es; 5054 ext4_lblk_t block, next_del; 5055 5056 if (newes->es_pblk == 0) { 5057 ext4_es_find_delayed_extent_range(inode, newes->es_lblk, 5058 newes->es_lblk + newes->es_len - 1, &es); 5059 5060 /* 5061 * No extent in extent-tree contains block @newes->es_pblk, 5062 * then the block may stay in 1)a hole or 2)delayed-extent. 5063 */ 5064 if (es.es_len == 0) 5065 /* A hole found. */ 5066 return 0; 5067 5068 if (es.es_lblk > newes->es_lblk) { 5069 /* A hole found. */ 5070 newes->es_len = min(es.es_lblk - newes->es_lblk, 5071 newes->es_len); 5072 return 0; 5073 } 5074 5075 newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; 5076 } 5077 5078 block = newes->es_lblk + newes->es_len; 5079 ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); 5080 if (es.es_len == 0) 5081 next_del = EXT_MAX_BLOCKS; 5082 else 5083 next_del = es.es_lblk; 5084 5085 return next_del; 5086} 5087/* fiemap flags we can handle specified here */ 5088#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 5089 5090static int ext4_xattr_fiemap(struct inode *inode, 5091 struct fiemap_extent_info *fieinfo) 5092{ 5093 __u64 physical = 0; 5094 __u64 length; 5095 __u32 flags = FIEMAP_EXTENT_LAST; 5096 int blockbits = inode->i_sb->s_blocksize_bits; 5097 int error = 0; 5098 5099 /* in-inode? 
*/ 5100 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 5101 struct ext4_iloc iloc; 5102 int offset; /* offset of xattr in inode */ 5103 5104 error = ext4_get_inode_loc(inode, &iloc); 5105 if (error) 5106 return error; 5107 physical = (__u64)iloc.bh->b_blocknr << blockbits; 5108 offset = EXT4_GOOD_OLD_INODE_SIZE + 5109 EXT4_I(inode)->i_extra_isize; 5110 physical += offset; 5111 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 5112 flags |= FIEMAP_EXTENT_DATA_INLINE; 5113 brelse(iloc.bh); 5114 } else { /* external block */ 5115 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; 5116 length = inode->i_sb->s_blocksize; 5117 } 5118 5119 if (physical) 5120 error = fiemap_fill_next_extent(fieinfo, 0, physical, 5121 length, flags); 5122 return (error < 0 ? error : 0); 5123} 5124 5125int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5126 __u64 start, __u64 len) 5127{ 5128 ext4_lblk_t start_blk; 5129 int error = 0; 5130 5131 if (ext4_has_inline_data(inode)) { 5132 int has_inline = 1; 5133 5134 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); 5135 5136 if (has_inline) 5137 return error; 5138 } 5139 5140 if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { 5141 error = ext4_ext_precache(inode); 5142 if (error) 5143 return error; 5144 } 5145 5146 /* fallback to generic here if not in extents fmt */ 5147 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5148 return generic_block_fiemap(inode, fieinfo, start, len, 5149 ext4_get_block); 5150 5151 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 5152 return -EBADR; 5153 5154 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 5155 error = ext4_xattr_fiemap(inode, fieinfo); 5156 } else { 5157 ext4_lblk_t len_blks; 5158 __u64 last_blk; 5159 5160 start_blk = start >> inode->i_sb->s_blocksize_bits; 5161 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; 5162 if (last_blk >= EXT_MAX_BLOCKS) 5163 last_blk = EXT_MAX_BLOCKS-1; 5164 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 5165 5166 /* 5167 * Walk the extent tree gathering extent information 5168 * and pushing extents back to the user. 5169 */ 5170 error = ext4_fill_fiemap_extents(inode, start_blk, 5171 len_blks, fieinfo); 5172 } 5173 ext4_es_lru_add(inode); 5174 return error; 5175} 5176 5177/* 5178 * ext4_access_path: 5179 * Function to access the path buffer for marking it dirty. 5180 * It also checks if there are sufficient credits left in the journal handle 5181 * to update path. 5182 */ 5183static int 5184ext4_access_path(handle_t *handle, struct inode *inode, 5185 struct ext4_ext_path *path) 5186{ 5187 int credits, err; 5188 5189 if (!ext4_handle_valid(handle)) 5190 return 0; 5191 5192 /* 5193 * Check if need to extend journal credits 5194 * 3 for leaf, sb, and inode plus 2 (bmap and group 5195 * descriptor) for each block group; assume two block 5196 * groups 5197 */ 5198 if (handle->h_buffer_credits < 7) { 5199 credits = ext4_writepage_trans_blocks(inode); 5200 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 5201 /* EAGAIN is success */ 5202 if (err && err != -EAGAIN) 5203 return err; 5204 } 5205 5206 err = ext4_ext_get_access(handle, inode, path); 5207 return err; 5208} 5209 5210/* 5211 * ext4_ext_shift_path_extents: 5212 * Shift the extents of a path structure lying between path[depth].p_ext 5213 * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift 5214 * from starting block for each extent. 
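 *
 * e.g. with shift == 4, an extent covering logical blocks [12..19]
 * becomes [8..15]; each extent is updated in place via
 *
 *	le32_add_cpu(&extent->ee_block, -4);
 *
 * followed by an attempt to merge it with the extent to its left.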
5215 */ 5216static int 5217ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, 5218 struct inode *inode, handle_t *handle, 5219 ext4_lblk_t *start) 5220{ 5221 int depth, err = 0; 5222 struct ext4_extent *ex_start, *ex_last; 5223 bool update = 0; 5224 depth = path->p_depth; 5225 5226 while (depth >= 0) { 5227 if (depth == path->p_depth) { 5228 ex_start = path[depth].p_ext; 5229 if (!ex_start) 5230 return -EIO; 5231 5232 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); 5233 if (!ex_last) 5234 return -EIO; 5235 5236 err = ext4_access_path(handle, inode, path + depth); 5237 if (err) 5238 goto out; 5239 5240 if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) 5241 update = 1; 5242 5243 *start = le32_to_cpu(ex_last->ee_block) + 5244 ext4_ext_get_actual_len(ex_last); 5245 5246 while (ex_start <= ex_last) { 5247 le32_add_cpu(&ex_start->ee_block, -shift); 5248 /* Try to merge to the left. */ 5249 if ((ex_start > 5250 EXT_FIRST_EXTENT(path[depth].p_hdr)) && 5251 ext4_ext_try_to_merge_right(inode, 5252 path, ex_start - 1)) 5253 ex_last--; 5254 else 5255 ex_start++; 5256 } 5257 err = ext4_ext_dirty(handle, inode, path + depth); 5258 if (err) 5259 goto out; 5260 5261 if (--depth < 0 || !update) 5262 break; 5263 } 5264 5265 /* Update index too */ 5266 err = ext4_access_path(handle, inode, path + depth); 5267 if (err) 5268 goto out; 5269 5270 le32_add_cpu(&path[depth].p_idx->ei_block, -shift); 5271 err = ext4_ext_dirty(handle, inode, path + depth); 5272 if (err) 5273 goto out; 5274 5275 /* we are done if current index is not a starting index */ 5276 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) 5277 break; 5278 5279 depth--; 5280 } 5281 5282out: 5283 return err; 5284} 5285 5286/* 5287 * ext4_ext_shift_extents: 5288 * All the extents which lies in the range from start to the last allocated 5289 * block for the file are shifted downwards by shift blocks. 5290 * On success, 0 is returned, error otherwise. 5291 */ 5292static int 5293ext4_ext_shift_extents(struct inode *inode, handle_t *handle, 5294 ext4_lblk_t start, ext4_lblk_t shift) 5295{ 5296 struct ext4_ext_path *path; 5297 int ret = 0, depth; 5298 struct ext4_extent *extent; 5299 ext4_lblk_t stop_block; 5300 ext4_lblk_t ex_start, ex_end; 5301 5302 /* Let path point to the last extent */ 5303 path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); 5304 if (IS_ERR(path)) 5305 return PTR_ERR(path); 5306 5307 depth = path->p_depth; 5308 extent = path[depth].p_ext; 5309 if (!extent) 5310 goto out; 5311 5312 stop_block = le32_to_cpu(extent->ee_block) + 5313 ext4_ext_get_actual_len(extent); 5314 5315 /* Nothing to shift, if hole is at the end of file */ 5316 if (start >= stop_block) 5317 goto out; 5318 5319 /* 5320 * Don't start shifting extents until we make sure the hole is big 5321 * enough to accomodate the shift. 
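 *
 * i.e. the extent preceding 'start' must end at least 'shift' blocks
 * before it. With start == 100 and shift == 4, a preceding extent with
 * ex_end == 98 fails the check below, since 100 - 98 < 4 and the
 * shifted extents would collide with it.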
5322	 */
5323	path = ext4_ext_find_extent(inode, start - 1, &path, 0);
5324	if (IS_ERR(path))
5325		return PTR_ERR(path);
5326	depth = path->p_depth;
5327	extent = path[depth].p_ext;
5328	if (extent) {
5329		ex_start = le32_to_cpu(extent->ee_block);
5330		ex_end = le32_to_cpu(extent->ee_block) +
5331			 ext4_ext_get_actual_len(extent);
5332	} else {
5333		ex_start = 0;
5334		ex_end = 0;
5335	}
5336
5337	if ((start == ex_start && shift > ex_start) ||
5338	    (shift > start - ex_end))
5339		return -EINVAL;
5340
5341	/* It's safe to start updating extents */
5342	while (start < stop_block) {
5343		path = ext4_ext_find_extent(inode, start, &path, 0);
5344		if (IS_ERR(path))
5345			return PTR_ERR(path);
5346		depth = path->p_depth;
5347		extent = path[depth].p_ext;
5348		if (!extent) {
5349			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5350					 (unsigned long) start);
5351			return -EIO;
5352		}
5353		if (start > le32_to_cpu(extent->ee_block)) {
5354			/* Hole, move to the next extent */
5355			if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5356				path[depth].p_ext++;
5357			} else {
5358				start = ext4_ext_next_allocated_block(path);
5359				continue;
5360			}
5361		}
5362		ret = ext4_ext_shift_path_extents(path, shift, inode,
5363						  handle, &start);
5364		if (ret)
5365			break;
5366	}
5367out:
5368	ext4_ext_drop_refs(path);
5369	kfree(path);
5370	return ret;
5371}
5372
5373/*
5374 * ext4_collapse_range:
5375 * This implements fallocate's collapse-range functionality for ext4.
5376 * Returns 0 on success, non-zero on error.
5377 */
5378int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5379{
5380	struct super_block *sb = inode->i_sb;
5381	ext4_lblk_t punch_start, punch_stop;
5382	handle_t *handle;
5383	unsigned int credits;
5384	loff_t new_size, ioffset;
5385	int ret;
5386
5387	/* Collapse range works only on fs block size aligned offsets. */
5388	if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5389	    len & (EXT4_CLUSTER_SIZE(sb) - 1))
5390		return -EINVAL;
5391
5392	if (!S_ISREG(inode->i_mode))
5393		return -EINVAL;
5394
5395	trace_ext4_collapse_range(inode, offset, len);
5396
5397	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5398	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5399
5400	/* Call ext4_force_commit to flush all data in case of data=journal. */
5401	if (ext4_should_journal_data(inode)) {
5402		ret = ext4_force_commit(inode->i_sb);
5403		if (ret)
5404			return ret;
5405	}
5406
5407	/*
5408	 * Need to round down offset to be aligned with page size boundary
5409	 * for page size > block size.
5410 */ 5411 ioffset = round_down(offset, PAGE_SIZE); 5412 5413 /* Write out all dirty pages */ 5414 ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, 5415 LLONG_MAX); 5416 if (ret) 5417 return ret; 5418 5419 /* Take mutex lock */ 5420 mutex_lock(&inode->i_mutex); 5421 5422 /* 5423 * There is no need to overlap collapse range with EOF, in which case 5424 * it is effectively a truncate operation 5425 */ 5426 if (offset + len >= i_size_read(inode)) { 5427 ret = -EINVAL; 5428 goto out_mutex; 5429 } 5430 5431 /* Currently just for extent based files */ 5432 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5433 ret = -EOPNOTSUPP; 5434 goto out_mutex; 5435 } 5436 5437 truncate_pagecache(inode, ioffset); 5438 5439 /* Wait for existing dio to complete */ 5440 ext4_inode_block_unlocked_dio(inode); 5441 inode_dio_wait(inode); 5442 5443 credits = ext4_writepage_trans_blocks(inode); 5444 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5445 if (IS_ERR(handle)) { 5446 ret = PTR_ERR(handle); 5447 goto out_dio; 5448 } 5449 5450 down_write(&EXT4_I(inode)->i_data_sem); 5451 ext4_discard_preallocations(inode); 5452 5453 ret = ext4_es_remove_extent(inode, punch_start, 5454 EXT_MAX_BLOCKS - punch_start); 5455 if (ret) { 5456 up_write(&EXT4_I(inode)->i_data_sem); 5457 goto out_stop; 5458 } 5459 5460 ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); 5461 if (ret) { 5462 up_write(&EXT4_I(inode)->i_data_sem); 5463 goto out_stop; 5464 } 5465 ext4_discard_preallocations(inode); 5466 5467 ret = ext4_ext_shift_extents(inode, handle, punch_stop, 5468 punch_stop - punch_start); 5469 if (ret) { 5470 up_write(&EXT4_I(inode)->i_data_sem); 5471 goto out_stop; 5472 } 5473 5474 new_size = i_size_read(inode) - len; 5475 i_size_write(inode, new_size); 5476 EXT4_I(inode)->i_disksize = new_size; 5477 5478 up_write(&EXT4_I(inode)->i_data_sem); 5479 if (IS_SYNC(inode)) 5480 ext4_handle_sync(handle); 5481 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 5482 ext4_mark_inode_dirty(handle, inode); 5483 5484out_stop: 5485 ext4_journal_stop(handle); 5486out_dio: 5487 ext4_inode_resume_unlocked_dio(inode); 5488out_mutex: 5489 mutex_unlock(&inode->i_mutex); 5490 return ret; 5491} 5492 5493/** 5494 * ext4_swap_extents - Swap extents between two inodes 5495 * 5496 * @inode1: First inode 5497 * @inode2: Second inode 5498 * @lblk1: Start block for first inode 5499 * @lblk2: Start block for second inode 5500 * @count: Number of blocks to swap 5501 * @mark_unwritten: Mark second inode's extents as unwritten after swap 5502 * @erp: Pointer to save error value 5503 * 5504 * This helper routine does exactly what is promise "swap extents". All other 5505 * stuff such as page-cache locking consistency, bh mapping consistency or 5506 * extent's data copying must be performed by caller. 
 * Locking:
5508 *		i_mutex is held for both inodes
5509 *		i_data_sem is locked for write for both inodes
5510 * Assumptions:
5511 *		All pages from requested range are locked for both inodes
5512 */
5513int
5514ext4_swap_extents(handle_t *handle, struct inode *inode1,
5515		     struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5516		  ext4_lblk_t count, int unwritten, int *erp)
5517{
5518	struct ext4_ext_path *path1 = NULL;
5519	struct ext4_ext_path *path2 = NULL;
5520	int replaced_count = 0;
5521
5522	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5523	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5524	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
5525	BUG_ON(!mutex_is_locked(&inode2->i_mutex));
5526
5527	*erp = ext4_es_remove_extent(inode1, lblk1, count);
5528	if (unlikely(*erp))
5529		return 0;
5530	*erp = ext4_es_remove_extent(inode2, lblk2, count);
5531	if (unlikely(*erp))
5532		return 0;
5533
5534	while (count) {
5535		struct ext4_extent *ex1, *ex2, tmp_ex;
5536		ext4_lblk_t e1_blk, e2_blk;
5537		int e1_len, e2_len, len;
5538		int split = 0;
5539
5540		path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5541		if (unlikely(IS_ERR(path1))) {
5542			*erp = PTR_ERR(path1);
5543			path1 = NULL;
5544		finish:
5545			count = 0;
5546			goto repeat;
5547		}
5548		path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5549		if (unlikely(IS_ERR(path2))) {
5550			*erp = PTR_ERR(path2);
5551			path2 = NULL;
5552			goto finish;
5553		}
5554		ex1 = path1[path1->p_depth].p_ext;
5555		ex2 = path2[path2->p_depth].p_ext;
5556		/* Do we have something to swap? */
5557		if (unlikely(!ex2 || !ex1))
5558			goto finish;
5559
5560		e1_blk = le32_to_cpu(ex1->ee_block);
5561		e2_blk = le32_to_cpu(ex2->ee_block);
5562		e1_len = ext4_ext_get_actual_len(ex1);
5563		e2_len = ext4_ext_get_actual_len(ex2);
5564
5565		/* Hole handling */
5566		if (!in_range(lblk1, e1_blk, e1_len) ||
5567		    !in_range(lblk2, e2_blk, e2_len)) {
5568			ext4_lblk_t next1, next2;
5569
5570			/* if hole after extent, then go to next extent */
5571			next1 = ext4_ext_next_allocated_block(path1);
5572			next2 = ext4_ext_next_allocated_block(path2);
5573			/* If hole before extent, then shift to that extent */
5574			if (e1_blk > lblk1)
5575				next1 = e1_blk;
5576			if (e2_blk > lblk2)
5577				next2 = e2_blk;
5578			/* Do we have something to swap? */
5579			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5580				goto finish;
5581			/* Move to the rightmost boundary */
5582			len = next1 - lblk1;
5583			if (len < next2 - lblk2)
5584				len = next2 - lblk2;
5585			if (len > count)
5586				len = count;
5587			lblk1 += len;
5588			lblk2 += len;
5589			count -= len;
5590			goto repeat;
5591		}
5592
5593		/* Prepare left boundary */
5594		if (e1_blk < lblk1) {
5595			split = 1;
5596			*erp = ext4_force_split_extent_at(handle, inode1,
5597						&path1, lblk1, 0);
5598			if (unlikely(*erp))
5599				goto finish;
5600		}
5601		if (e2_blk < lblk2) {
5602			split = 1;
5603			*erp = ext4_force_split_extent_at(handle, inode2,
5604						&path2, lblk2, 0);
5605			if (unlikely(*erp))
5606				goto finish;
5607		}
5608		/* ext4_split_extent_at() may result in leaf extent split,
5609		 * path must be revalidated.
 */
5610		if (split)
5611			goto repeat;
5612
5613		/* Prepare right boundary */
5614		len = count;
5615		if (len > e1_blk + e1_len - lblk1)
5616			len = e1_blk + e1_len - lblk1;
5617		if (len > e2_blk + e2_len - lblk2)
5618			len = e2_blk + e2_len - lblk2;
5619
5620		if (len != e1_len) {
5621			split = 1;
5622			*erp = ext4_force_split_extent_at(handle, inode1,
5623						&path1, lblk1 + len, 0);
5624			if (unlikely(*erp))
5625				goto finish;
5626		}
5627		if (len != e2_len) {
5628			split = 1;
5629			*erp = ext4_force_split_extent_at(handle, inode2,
5630						&path2, lblk2 + len, 0);
5631			if (*erp)
5632				goto finish;
5633		}
5634		/* ext4_split_extent_at() may result in leaf extent split,
5635		 * path must be revalidated. */
5636		if (split)
5637			goto repeat;
5638
5639		BUG_ON(e2_len != e1_len);
5640		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5641		if (unlikely(*erp))
5642			goto finish;
5643		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5644		if (unlikely(*erp))
5645			goto finish;
5646
5647		/* Both extents are fully inside boundaries. Swap them now */
5648		tmp_ex = *ex1;
5649		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5650		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5651		ex1->ee_len = cpu_to_le16(e2_len);
5652		ex2->ee_len = cpu_to_le16(e1_len);
5653		if (unwritten)
5654			ext4_ext_mark_unwritten(ex2);
5655		if (ext4_ext_is_unwritten(&tmp_ex))
5656			ext4_ext_mark_unwritten(ex1);
5657
5658		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5659		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5660		*erp = ext4_ext_dirty(handle, inode2, path2 +
5661				      path2->p_depth);
5662		if (unlikely(*erp))
5663			goto finish;
5664		*erp = ext4_ext_dirty(handle, inode1, path1 +
5665				      path1->p_depth);
5666		/*
5667		 * Looks scary, eh? The second inode already points to the
5668		 * new blocks and has been successfully dirtied; but if an
5669		 * error happens here it can only be a journal error, in
5670		 * which case the whole transaction will be aborted anyway.
5671		 */
5672		if (unlikely(*erp))
5673			goto finish;
5674		lblk1 += len;
5675		lblk2 += len;
5676		replaced_count += len;
5677		count -= len;
5678
5679	repeat:
5680		ext4_ext_drop_refs(path1);
5681		kfree(path1);
5682		ext4_ext_drop_refs(path2);
5683		kfree(path2);
5684		path1 = path2 = NULL;
5685	}
5686	return replaced_count;
5687}
5688