extents.c revision 91dd8c114499e9818f2d5919ef0b9eee61810220
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
					due to ENOSPC */
#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */

#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */
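/*
 * Illustrative note (added commentary, not part of the original file):
 * a split of an uninitialized extent typically passes a combination
 * such as
 *
 *	EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNINIT1 | EXT4_EXT_MARK_UNINIT2
 *
 * so that both halves stay uninitialized and a split that fails with
 * ENOSPC can fall back to zeroing out the range instead.
 */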
static __le32 ext4_extent_block_csum(struct inode *inode,
				     struct ext4_extent_header *eh)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
			   EXT4_EXTENT_TAIL_OFFSET(eh));
	return cpu_to_le32(csum);
}

static int ext4_extent_block_csum_verify(struct inode *inode,
					 struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return 1;

	et = find_ext4_extent_tail(eh);
	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
		return 0;
	return 1;
}

static void ext4_extent_block_csum_set(struct inode *inode,
				       struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return;

	et = find_ext4_extent_tail(eh);
	et->et_checksum = ext4_extent_block_csum(inode, eh);
}

static int ext4_split_extent(handle_t *handle,
				struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_map_blocks *map,
				int split_flag,
				int flags);

static int ext4_split_extent_at(handle_t *handle,
			     struct inode *inode,
			     struct ext4_ext_path *path,
			     ext4_lblk_t split,
			     int split_flag,
			     int flags);

static int ext4_find_delayed_extent(struct inode *inode,
				    struct ext4_ext_cache *newex);

static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits > needed)
		return 0;
	err = ext4_journal_extend(handle, needed);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	if (err == 0)
		err = -EAGAIN;

	return err;
}
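/*
 * Usage sketch (assumed caller behaviour, added commentary):
 *
 *	err = ext4_ext_truncate_extend_restart(handle, inode, credits);
 *	if (err == -EAGAIN)
 *		goto again;	(re-read the path: the restart may
 *				 have invalidated it)
 *	else if (err)
 *		goto out;
 *
 * i.e. -EAGAIN means the transaction was successfully restarted and
 * the caller should replay its work against a fresh path.
 */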
/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	if (path->p_bh) {
		/* path points to block */
		return ext4_journal_get_write_access(handle, path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return 0;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
#define ext4_ext_dirty(handle, inode, path) \
		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
static int __ext4_ext_dirty(const char *where, unsigned int line,
			    handle_t *handle, struct inode *inode,
			    struct ext4_ext_path *path)
{
	int err;
	if (path->p_bh) {
		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
		/* path points to block */
		err = __ext4_handle_dirty_metadata(where, line, handle,
						   inode, path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	if (path) {
		int depth = path->p_depth;
		struct ext4_extent *ex;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * an ELF object sections out-of-order but in a way
		 * that eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file.  However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space.  Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	return ext4_inode_to_goal_block(inode);
}

/*
 * Allocation for a metadata block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err, unsigned int flags)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
					NULL, err);
	return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 6)
		size = 6;
#endif
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 5)
		size = 5;
#endif
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 3)
		size = 3;
#endif
	return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 4)
		size = 4;
#endif
	return size;
}
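/*
 * Worked example (added commentary, assuming 4KiB blocks): the header
 * and both entry types are 12 bytes each, so an index or leaf block
 * holds (4096 - 12) / 12 = 340 entries, with 4 bytes left over for the
 * ext4_extent_tail checksum on metadata_csum filesystems, while the
 * 60-byte i_data root holds (60 - 12) / 12 = 4 entries.
 */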
/*
 * Calculate the number of metadata blocks needed
 * to allocate @blocks
 * Worst case is one block per extent
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs;

	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
	 */
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		int num = 0;

		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
	 */
	ei->i_da_metadata_calc_len = 1;
	ei->i_da_metadata_calc_last_lblock = lblock;
	return ext_depth(inode) + 1;
}
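/*
 * Worked example (added commentary, 4KiB blocks, so idxs = 340): while
 * delayed-allocation blocks stay contiguous, most of them charge no new
 * metadata; every 340th block charges one block (a new leaf), every
 * 340^2-th a second-level index as well, and so on.  A non-contiguous
 * block restarts the estimate at the worst case, ext_depth(inode) + 1.
 */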
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
	int max;

	if (depth == ext_depth(inode)) {
		if (depth == 0)
			max = ext4_ext_space_root(inode, 1);
		else
			max = ext4_ext_space_root_idx(inode, 1);
	} else {
		if (depth == 0)
			max = ext4_ext_space_block(inode, 1);
		else
			max = ext4_ext_space_block_idx(inode, 1);
	}

	return max;
}

static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
	ext4_fsblk_t block = ext4_ext_pblock(ext);
	int len = ext4_ext_get_actual_len(ext);

	if (len == 0)
		return 0;
	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}

static int ext4_valid_extent_idx(struct inode *inode,
				struct ext4_extent_idx *ext_idx)
{
	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}

static int ext4_valid_extent_entries(struct inode *inode,
				struct ext4_extent_header *eh,
				int depth)
{
	unsigned short entries;
	if (eh->eh_entries == 0)
		return 1;

	entries = le16_to_cpu(eh->eh_entries);

	if (depth == 0) {
		/* leaf entries */
		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
		while (entries) {
			if (!ext4_valid_extent(inode, ext))
				return 0;
			ext++;
			entries--;
		}
	} else {
		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
		while (entries) {
			if (!ext4_valid_extent_idx(inode, ext_idx))
				return 0;
			ext_idx++;
			entries--;
		}
	}
	return 1;
}

static int __ext4_ext_check(const char *function, unsigned int line,
			    struct inode *inode, struct ext4_extent_header *eh,
			    int depth)
{
	const char *error_msg;
	int max = 0;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	if (!ext4_valid_extent_entries(inode, eh, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	/* Verify checksum on non-root extent tree nodes */
	if (ext_depth(inode) != depth &&
	    !ext4_extent_block_csum_verify(inode, eh)) {
		error_msg = "extent tree corrupted";
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error_inode(inode, function, line, 0,
			"bad header/extent: %s - magic %x, "
			"entries %u, max %u(%u), depth %u(%u)",
			error_msg, le16_to_cpu(eh->eh_magic),
			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
			max, le16_to_cpu(eh->eh_depth), depth);

	return -EIO;
}

#define ext4_ext_check(inode, eh, depth)	\
	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)

int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
}

static int __ext4_ext_check_block(const char *function, unsigned int line,
				  struct inode *inode,
				  struct ext4_extent_header *eh,
				  int depth,
				  struct buffer_head *bh)
{
	int ret;

	if (buffer_verified(bh))
		return 0;
	ret = ext4_ext_check(inode, eh, depth);
	if (ret)
		return ret;
	set_buffer_verified(bh);
	return ret;
}

#define ext4_ext_check_block(inode, eh, depth, bh)	\
	__ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
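/*
 * Note (added commentary): the ext_depth(inode) != depth test in
 * __ext4_ext_check() above skips checksum verification for the root
 * node, which lives in the inode body and is already covered by the
 * inode checksum; only nodes stored in separate blocks carry an
 * ext4_extent_tail.
 */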
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
	int k, l = path->p_depth;

	ext_debug("path:");
	for (k = 0; k <= l; k++, path++) {
		if (path->p_idx) {
			ext_debug("  %d->%llu",
				  le32_to_cpu(path->p_idx->ei_block),
				  ext4_idx_pblock(path->p_idx));
		} else if (path->p_ext) {
			ext_debug("  %d:[%d]%d:%llu ",
				  le32_to_cpu(path->p_ext->ee_block),
				  ext4_ext_is_uninitialized(path->p_ext),
				  ext4_ext_get_actual_len(path->p_ext),
				  ext4_ext_pblock(path->p_ext));
		} else
			ext_debug("  []");
	}
	ext_debug("\n");
}

static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
	int depth = ext_depth(inode);
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int i;

	if (!path)
		return;

	eh = path[depth].p_hdr;
	ex = EXT_FIRST_EXTENT(eh);

	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);

	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
			  ext4_ext_is_uninitialized(ex),
			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
	}
	ext_debug("\n");
}

static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
			ext4_fsblk_t newblock, int level)
{
	int depth = ext_depth(inode);
	struct ext4_extent *ex;

	if (depth != level) {
		struct ext4_extent_idx *idx;
		idx = path[level].p_idx;
		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
			ext_debug("%d: move %d:%llu in new index %llu\n", level,
					le32_to_cpu(idx->ei_block),
					ext4_idx_pblock(idx),
					newblock);
			idx++;
		}

		return;
	}

	ex = path[depth].p_ext;
	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
				le32_to_cpu(ex->ee_block),
				ext4_ext_pblock(ex),
				ext4_ext_is_uninitialized(ex),
				ext4_ext_get_actual_len(ex),
				newblock);
		ex++;
	}
}

#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#define ext4_ext_show_move(inode, path, newblock, level)
#endif

void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth = path->p_depth;
	int i;

	for (i = 0; i <= depth; i++, path++)
		if (path->p_bh) {
			brelse(path->p_bh);
			path->p_bh = NULL;
		}
}

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;


	ext_debug("binsearch for %u(idx):  ", block);

	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	path->p_idx = l - 1;
	ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
			if (k != 0 && le32_to_cpu(ix->ei_block) <=
					le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					   <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}
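/*
 * Invariant of the binary search above (and of ext4_ext_binsearch()
 * below, added commentary): l starts at the second entry, so l - 1
 * always points at a valid entry, and the loop terminates with p_idx
 * (resp. p_ext) at the rightmost entry whose starting block is <= the
 * requested block.
 */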
/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u:  ", block);

	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
				m, le32_to_cpu(m->ee_block),
				r, le32_to_cpu(r->ee_block));
	}

	path->p_ext = l - 1;
	ext_debug("  -> %d:%llu:[%d]%d ",
			le32_to_cpu(path->p_ext->ee_block),
			ext4_ext_pblock(path->p_ext),
			ext4_ext_is_uninitialized(path->p_ext),
			ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
					  <= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}

int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
	struct ext4_extent_header *eh;

	eh = ext_inode_hdr(inode);
	eh->eh_depth = 0;
	eh->eh_entries = 0;
	eh->eh_magic = EXT4_EXT_MAGIC;
	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
	ext4_mark_inode_dirty(handle, inode);
	ext4_ext_invalidate_cache(inode);
	return 0;
}

struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
					struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	struct buffer_head *bh;
	short int depth, i, ppos = 0, alloc = 0;

	eh = ext_inode_hdr(inode);
	depth = ext_depth(inode);

	/* account possible depth increase */
	if (!path) {
		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
				GFP_NOFS);
		if (!path)
			return ERR_PTR(-ENOMEM);
		alloc = 1;
	}
	path[0].p_hdr = eh;
	path[0].p_bh = NULL;

	i = depth;
	/* walk through the tree */
	while (i) {
		ext_debug("depth %d: num %d, max %d\n",
			  ppos, le16_to_cpu(eh->eh_entries),
			  le16_to_cpu(eh->eh_max));

		ext4_ext_binsearch_idx(inode, path + ppos, block);
		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
		path[ppos].p_depth = i;
		path[ppos].p_ext = NULL;

		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
		if (unlikely(!bh))
			goto err;
		if (!bh_uptodate_or_lock(bh)) {
			trace_ext4_ext_load_extent(inode, block,
						path[ppos].p_block);
			if (bh_submit_read(bh) < 0) {
				put_bh(bh);
				goto err;
			}
		}
		eh = ext_block_hdr(bh);
		ppos++;
		if (unlikely(ppos > depth)) {
			put_bh(bh);
			EXT4_ERROR_INODE(inode,
					 "ppos %d > depth %d", ppos, depth);
			goto err;
		}
		path[ppos].p_bh = bh;
		path[ppos].p_hdr = eh;
		i--;

		if (ext4_ext_check_block(inode, eh, i, bh))
			goto err;
	}

	path[ppos].p_depth = i;
	path[ppos].p_ext = NULL;
	path[ppos].p_idx = NULL;

	/* find extent */
	ext4_ext_binsearch(inode, path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].p_ext)
		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	ext4_ext_drop_refs(path);
	if (alloc)
		kfree(path);
	return ERR_PTR(-EIO);
}
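/*
 * Illustrative path[] layout for a depth-2 tree after a successful
 * ext4_ext_find_extent() (added commentary):
 *
 *	path[0]: root in i_data,      p_idx set, p_bh == NULL
 *	path[1]: interior index node, p_idx set, p_bh held
 *	path[2]: leaf,                p_ext set (NULL for an empty leaf)
 *
 * ext4_ext_drop_refs() must eventually be called to release the bhs.
 */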
/*
 * ext4_ext_insert_index:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				 struct ext4_ext_path *curp,
				 int logical, ext4_fsblk_t ptr)
{
	struct ext4_extent_idx *ix;
	int len, err;

	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		return err;

	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d == ei_block %d!",
				 logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EIO;
	}

	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
		EXT4_ERROR_INODE(inode,
				 "eh_entries %d >= eh_max %d!",
				 le16_to_cpu(curp->p_hdr->eh_entries),
				 le16_to_cpu(curp->p_hdr->eh_max));
		return -EIO;
	}

	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
		ext_debug("insert new index %d after: %llu\n", logical, ptr);
		ix = curp->p_idx + 1;
	} else {
		/* insert before */
		ext_debug("insert new index %d before: %llu\n", logical, ptr);
		ix = curp->p_idx;
	}

	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
	BUG_ON(len < 0);
	if (len > 0) {
		ext_debug("insert new index %d: "
				"move %d indices from 0x%p to 0x%p\n",
				logical, len, ix, ix + 1);
		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
	}

	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
		return -EIO;
	}

	ix->ei_block = cpu_to_le32(logical);
	ext4_idx_store_pblock(ix, ptr);
	le16_add_cpu(&curp->p_hdr->eh_entries, 1);

	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
		return -EIO;
	}

	err = ext4_ext_dirty(handle, inode, curp);
	ext4_std_error(inode->i_sb, err);

	return err;
}
/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
			  unsigned int flags,
			  struct ext4_ext_path *path,
			  struct ext4_extent *newext, int at)
{
	struct buffer_head *bh = NULL;
	int depth = ext_depth(inode);
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	int i = at, k, m, a;
	ext4_fsblk_t newblock, oldblock;
	__le32 border;
	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
	int err = 0;

	/* make decision: where to split? */
	/* FIXME: now decision is simplest: at current extent */

	/* if current leaf will be split, then we should use
	 * border from split point */
	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
		return -EIO;
	}
	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
		border = path[depth].p_ext[1].ee_block;
		ext_debug("leaf will be split."
				" next leaf starts at %d\n",
				  le32_to_cpu(border));
	} else {
		border = newext->ee_block;
		ext_debug("leaf will be added."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	}

	/*
	 * If error occurs, then we break processing
	 * and mark filesystem read-only. index won't
	 * be inserted and tree will be in consistent
	 * state. Next mount will repair buffers too.
	 */

	/*
	 * Get array to track all allocated blocks.
	 * We need this to handle errors and free blocks
	 * upon them.
	 */
	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
	if (!ablocks)
		return -ENOMEM;

	/* allocate all needed blocks */
	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
						   newext, &err, flags);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
	}

	/* initialize new leaf */
	newblock = ablocks[--a];
	if (unlikely(newblock == 0)) {
		EXT4_ERROR_INODE(inode, "newblock == 0!");
		err = -EIO;
		goto cleanup;
	}
	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		goto cleanup;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err)
		goto cleanup;

	neh = ext_block_hdr(bh);
	neh->eh_entries = 0;
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;

	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
		     path[depth].p_hdr->eh_max)) {
		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
				 path[depth].p_hdr->eh_entries,
				 path[depth].p_hdr->eh_max);
		err = -EIO;
		goto cleanup;
	}
	/* start copy from next extent */
	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
	ext4_ext_show_move(inode, path, newblock, depth);
	if (m) {
		struct ext4_extent *ex;
		ex = EXT_FIRST_EXTENT(neh);
		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
		le16_add_cpu(&neh->eh_entries, m);
	}

	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto cleanup;
	brelse(bh);
	bh = NULL;

	/* correct old leaf */
	if (m) {
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto cleanup;
		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto cleanup;

	}

	/* create intermediate indexes */
	k = depth - at - 1;
	if (unlikely(k < 0)) {
		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
		err = -EIO;
		goto cleanup;
	}
	if (k)
		ext_debug("create %d intermediate indices\n", k);
	/* insert new index into current index block */
	/* current depth stored in i var */
	i = depth - 1;
	while (k--) {
		oldblock = newblock;
		newblock = ablocks[--a];
		bh = sb_getblk(inode->i_sb, newblock);
		if (!bh) {
			err = -EIO;
			goto cleanup;
		}
		lock_buffer(bh);

		err = ext4_journal_get_create_access(handle, bh);
		if (err)
			goto cleanup;

		neh = ext_block_hdr(bh);
		neh->eh_entries = cpu_to_le16(1);
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);

		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
				i, newblock, le32_to_cpu(border), oldblock);

		/* move remainder of path[i] to the new index block */
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
					 le32_to_cpu(path[i].p_ext->ee_block));
			err = -EIO;
			goto cleanup;
		}
		/* start copy indexes */
		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
				EXT_MAX_INDEX(path[i].p_hdr));
		ext4_ext_show_move(inode, path, newblock, i);
		if (m) {
			memmove(++fidx, path[i].p_idx,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
		ext4_extent_block_csum_set(inode, neh);
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto cleanup;
		brelse(bh);
		bh = NULL;

		/* correct old index */
		if (m) {
			err = ext4_ext_get_access(handle, inode, path + i);
			if (err)
				goto cleanup;
			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
			err = ext4_ext_dirty(handle, inode, path + i);
			if (err)
				goto cleanup;
		}

		i--;
	}

	/* insert new index */
	err = ext4_ext_insert_index(handle, inode, path + at,
				    le32_to_cpu(border), newblock);

cleanup:
	if (bh) {
		if (buffer_locked(bh))
			unlock_buffer(bh);
		brelse(bh);
	}

	if (err) {
		/* free all allocated blocks in error case */
		for (i = 0; i < depth; i++) {
			if (!ablocks[i])
				continue;
			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
		}
	}
	kfree(ablocks);

	return err;
}
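/*
 * Sketch of a depth-1 split at the current extent (illustrative,
 * added commentary):
 *
 *	before:   root -> [e1 e2 e3 e4]
 *	after:    root -> [e1 e2] [e3 e4]
 *
 * Entries right of the split point are moved into a newly allocated
 * leaf, and one new index entry for that leaf is inserted at depth @at
 * by ext4_ext_insert_index().
 */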
/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
				 unsigned int flags,
				 struct ext4_extent *newext)
{
	struct ext4_extent_header *neh;
	struct buffer_head *bh;
	ext4_fsblk_t newblock;
	int err = 0;

	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
		newext, &err, flags);
	if (newblock == 0)
		return err;

	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		ext4_std_error(inode->i_sb, err);
		return err;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	/* move top-level index/leaf into new block */
	memmove(bh->b_data, EXT4_I(inode)->i_data,
		sizeof(EXT4_I(inode)->i_data));

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate eh_max the right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* Update top-level index: num,max,pointer */
	neh = ext_inode_hdr(inode);
	neh->eh_entries = cpu_to_le16(1);
	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
	if (neh->eh_depth == 0) {
		/* Root extent block becomes index block */
		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
		EXT_FIRST_INDEX(neh)->ei_block =
			EXT_FIRST_EXTENT(neh)->ee_block;
	}
	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

	le16_add_cpu(&neh->eh_depth, 1);
	ext4_mark_inode_dirty(handle, inode);
out:
	brelse(bh);

	return err;
}
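/*
 * Note (added commentary): after ext4_ext_grow_indepth() the old
 * root's contents live in the new block and the root in i_data shrinks
 * to a single index entry, so the depth grows by exactly one.  Only
 * the first grow (depth 0 -> 1) yields free leaf space directly;
 * deeper trees still need a split afterwards, which is why
 * ext4_ext_create_new_leaf() below may loop back to splitting.
 */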
/*
 * ext4_ext_create_new_leaf:
 * finds empty index and adds new leaf.
 * if no free index is found, then it requests in-depth growing.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
				    unsigned int flags,
				    struct ext4_ext_path *path,
				    struct ext4_extent *newext)
{
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	i = depth = ext_depth(inode);

	/* walk up the tree and look for free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, flags, path, newext, i);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, flags, newext);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}
/*
 * search the closest allocated block to the left for *logical
 * and returns it at @logical + its physical address at @phys
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
static int ext4_ext_search_left(struct inode *inode,
				struct ext4_ext_path *path,
				ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
				  depth);
				return -EIO;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext4_ext_pblock(ex) + ee_len - 1;
	return 0;
}

/*
 * search the closest allocated block to the right for *logical
 * and returns it at @logical + its physical address at @phys
 * if *logical is the largest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
static int ext4_ext_search_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
				 struct ext4_extent **ret_ex)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	ext4_fsblk_t block;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "first_extent(path[%d].p_hdr) != ex",
					 depth);
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
						 "ix != EXT_FIRST_INDEX *logical %d!",
						 *logical);
				return -EIO;
			}
		}
		goto found_extent;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		goto found_extent;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	block = ext4_idx_pblock(ix);
	while (++depth < path->p_depth) {
		bh = sb_bread(inode->i_sb, block);
		if (bh == NULL)
			return -EIO;
		eh = ext_block_hdr(bh);
		/* subtract from p_depth to get proper eh_depth */
		if (ext4_ext_check_block(inode, eh,
					 path->p_depth - depth, bh)) {
			put_bh(bh);
			return -EIO;
		}
		ix = EXT_FIRST_INDEX(eh);
		block = ext4_idx_pblock(ix);
		put_bh(bh);
	}

	bh = sb_bread(inode->i_sb, block);
	if (bh == NULL)
		return -EIO;
	eh = ext_block_hdr(bh);
	if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
		put_bh(bh);
		return -EIO;
	}
	ex = EXT_FIRST_EXTENT(eh);
found_extent:
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext4_ext_pblock(ex);
	*ret_ex = ex;
	if (bh)
		put_bh(bh);
	return 0;
}
/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
static ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	if (depth == 0 && path->p_ext == NULL)
		return EXT_MAX_BLOCKS;

	while (depth >= 0) {
		if (depth == path->p_depth) {
			/* leaf */
			if (path[depth].p_ext &&
				path[depth].p_ext !=
					EXT_LAST_EXTENT(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_ext[1].ee_block);
		} else {
			/* index */
			if (path[depth].p_idx !=
					EXT_LAST_INDEX(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_idx[1].ei_block);
		}
		depth--;
	}

	return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	/* zero-tree has no leaf blocks at all */
	if (depth == 0)
		return EXT_MAX_BLOCKS;

	/* go to index block */
	depth--;

	while (depth >= 0) {
		if (path[depth].p_idx !=
				EXT_LAST_INDEX(path[depth].p_hdr))
			return (ext4_lblk_t)
				le32_to_cpu(path[depth].p_idx[1].ei_block);
		depth--;
	}

	return EXT_MAX_BLOCKS;
}
/*
 * ext4_ext_correct_indexes:
 * if leaf gets modified and modified extent is first in the leaf,
 * then we have to correct all indexes above.
 * TODO: do we need to correct tree in all cases?
 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	int depth = ext_depth(inode);
	struct ext4_extent *ex;
	__le32 border;
	int k, err = 0;

	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	if (unlikely(ex == NULL || eh == NULL)) {
		EXT4_ERROR_INODE(inode,
				 "ex %p == NULL or eh %p == NULL", ex, eh);
		return -EIO;
	}

	if (depth == 0) {
		/* there is no tree at all */
		return 0;
	}

	if (ex != EXT_FIRST_EXTENT(eh)) {
		/* we correct tree if first leaf got modified only */
		return 0;
	}

	/*
	 * TODO: we need correction if border is smaller than current one
	 */
	k = depth - 1;
	border = path[depth].p_ext->ee_block;
	err = ext4_ext_get_access(handle, inode, path + k);
	if (err)
		return err;
	path[k].p_idx->ei_block = border;
	err = ext4_ext_dirty(handle, inode, path + k);
	if (err)
		return err;

	while (k--) {
		/* change all left-side indexes */
		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
			break;
		err = ext4_ext_get_access(handle, inode, path + k);
		if (err)
			break;
		path[k].p_idx->ei_block = border;
		err = ext4_ext_dirty(handle, inode, path + k);
		if (err)
			break;
	}

	return err;
}
int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
				struct ext4_extent *ex2)
{
	unsigned short ext1_ee_len, ext2_ee_len, max_len;

	/*
	 * Make sure that either both extents are uninitialized, or
	 * both are _not_.
	 */
	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
		return 0;

	if (ext4_ext_is_uninitialized(ex1))
		max_len = EXT_UNINIT_MAX_LEN;
	else
		max_len = EXT_INIT_MAX_LEN;

	ext1_ee_len = ext4_ext_get_actual_len(ex1);
	ext2_ee_len = ext4_ext_get_actual_len(ex2);

	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
			le32_to_cpu(ex2->ee_block))
		return 0;

	/*
	 * To allow future support for preallocated extents to be added
	 * as an RO_COMPAT feature, refuse to merge two extents if
	 * this can result in the top bit of ee_len being set.
	 */
	if (ext1_ee_len + ext2_ee_len > max_len)
		return 0;
#ifdef AGGRESSIVE_TEST
	if (ext1_ee_len >= 4)
		return 0;
#endif

	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
		return 1;
	return 0;
}
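/*
 * Length-limit example (added commentary): an initialized extent can
 * cover at most EXT_INIT_MAX_LEN (32768) blocks and an uninitialized
 * one EXT_UNINIT_MAX_LEN (32767), since the top bit of ee_len marks
 * the uninitialized state.  Two adjacent 20000-block initialized
 * extents are therefore never merged, even though they are logically
 * and physically contiguous.
 */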
/*
 * This function tries to merge the "ex" extent to the next extent in the tree.
 * It always tries to merge towards right. If you want to merge towards
 * left, pass "ex - 1" as argument instead of "ex".
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
static int ext4_ext_try_to_merge_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 struct ext4_extent *ex)
{
	struct ext4_extent_header *eh;
	unsigned int depth, len;
	int merge_done = 0;
	int uninitialized = 0;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	while (ex < EXT_LAST_EXTENT(eh)) {
		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
			break;
		/* merge with next extent! */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
				+ ext4_ext_get_actual_len(ex + 1));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);

		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
			len = (EXT_LAST_EXTENT(eh) - ex - 1)
				* sizeof(struct ext4_extent);
			memmove(ex + 1, ex + 2, len);
		}
		le16_add_cpu(&eh->eh_entries, -1);
		merge_done = 1;
		WARN_ON(eh->eh_entries == 0);
		if (!eh->eh_entries)
			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
	}

	return merge_done;
}

/*
 * This function does a very simple check to see if we can collapse
 * an extent tree with a single extent tree leaf block into the inode.
 */
static void ext4_ext_try_to_merge_up(handle_t *handle,
				     struct inode *inode,
				     struct ext4_ext_path *path)
{
	size_t s;
	unsigned max_root = ext4_ext_space_root(inode, 0);
	ext4_fsblk_t blk;

	if ((path[0].p_depth != 1) ||
	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
		return;

	/*
	 * We need to modify the block allocation bitmap and the block
	 * group descriptor to release the extent tree block.  If we
	 * can't get the journal credits, give up.
	 */
	if (ext4_journal_extend(handle, 2))
		return;

	/*
	 * Copy the extent data up to the inode
	 */
	blk = ext4_idx_pblock(path[0].p_idx);
	s = le16_to_cpu(path[1].p_hdr->eh_entries) *
		sizeof(struct ext4_extent_idx);
	s += sizeof(struct ext4_extent_header);

	memcpy(path[0].p_hdr, path[1].p_hdr, s);
	path[0].p_depth = 0;
	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
	path[0].p_hdr->eh_max = cpu_to_le16(max_root);

	brelse(path[1].p_bh);
	ext4_free_blocks(handle, inode, NULL, blk, 1,
			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
}

/*
 * This function tries to merge the @ex extent to neighbours in the tree:
 * first towards the left, then towards the right, and finally tries to
 * collapse a single-leaf tree up into the inode.
 */
static void ext4_ext_try_to_merge(handle_t *handle,
				  struct inode *inode,
				  struct ext4_ext_path *path,
				  struct ext4_extent *ex) {
	struct ext4_extent_header *eh;
	unsigned int depth;
	int merge_done = 0;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	if (ex > EXT_FIRST_EXTENT(eh))
		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);

	if (!merge_done)
		(void) ext4_ext_try_to_merge_right(inode, path, ex);

	ext4_ext_try_to_merge_up(handle, inode, path);
}
/*
 * check if a portion of the "newext" extent overlaps with an
 * existing extent.
 *
 * If there is an overlap discovered, it updates the length of the newext
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
					   struct inode *inode,
					   struct ext4_extent *newext,
					   struct ext4_ext_path *path)
{
	ext4_lblk_t b1, b2;
	unsigned int depth, len1;
	unsigned int ret = 0;

	b1 = le32_to_cpu(newext->ee_block);
	len1 = ext4_ext_get_actual_len(newext);
	depth = ext_depth(inode);
	if (!path[depth].p_ext)
		goto out;
	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
	b2 &= ~(sbi->s_cluster_ratio - 1);

	/*
	 * get the next allocated block if the extent in the path
	 * is before the requested block(s)
	 */
	if (b2 < b1) {
		b2 = ext4_ext_next_allocated_block(path);
		if (b2 == EXT_MAX_BLOCKS)
			goto out;
		b2 &= ~(sbi->s_cluster_ratio - 1);
	}

	/* check for wrap through zero on extent logical start block*/
	if (b1 + len1 < b1) {
		len1 = EXT_MAX_BLOCKS - b1;
		newext->ee_len = cpu_to_le16(len1);
		ret = 1;
	}

	/* check for overlap */
	if (b1 + len1 > b2) {
		newext->ee_len = cpu_to_le16(b2 - b1);
		ret = 1;
	}
out:
	return ret;
}
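/*
 * Example (illustrative, added commentary): if newext covers logical
 * blocks 100..199 and an existing extent starts at block 150,
 * ext4_ext_check_overlap() trims newext->ee_len to 50 so the
 * subsequent insert cannot overlap.  On bigalloc filesystems b2 is
 * first rounded down to a cluster boundary via sbi->s_cluster_ratio.
 */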
/*
 * ext4_ext_insert_extent:
 * tries to merge requested extent into the existing extent or
 * inserts requested extent as new one into the tree,
 * creating new leaf in the no-space case.
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int flag)
{
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *fex;
	struct ext4_extent *nearex; /* nearest extent */
	struct ext4_ext_path *npath = NULL;
	int depth, len, err;
	ext4_lblk_t next;
	unsigned uninitialized = 0;
	int flags = 0;

	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
		return -EIO;
	}
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	if (unlikely(path[depth].p_hdr == NULL)) {
		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
		return -EIO;
	}

	/* try to insert block into found extent and return */
	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
		&& ext4_can_extents_be_merged(inode, ex, newext)) {
		ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
			  ext4_ext_is_uninitialized(newext),
			  ext4_ext_get_actual_len(newext),
			  le32_to_cpu(ex->ee_block),
			  ext4_ext_is_uninitialized(ex),
			  ext4_ext_get_actual_len(ex),
			  ext4_ext_pblock(ex));
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			return err;

		/*
		 * ext4_can_extents_be_merged should have checked that either
		 * both extents are uninitialized, or both aren't. Thus we
		 * need to check only one of them here.
		 */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
					+ ext4_ext_get_actual_len(newext));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);
		eh = path[depth].p_hdr;
		nearex = ex;
		goto merge;
	}

	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
		goto has_space;

	/* probably next leaf has space for us? */
	fex = EXT_LAST_EXTENT(eh);
	next = EXT_MAX_BLOCKS;
	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
		next = ext4_ext_next_leaf_block(path);
	if (next != EXT_MAX_BLOCKS) {
		ext_debug("next leaf block - %u\n", next);
		BUG_ON(npath != NULL);
		npath = ext4_ext_find_extent(inode, next, NULL);
		if (IS_ERR(npath))
			return PTR_ERR(npath);
		BUG_ON(npath->p_depth != path->p_depth);
		eh = npath[depth].p_hdr;
		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
			ext_debug("next leaf isn't full(%d)\n",
				  le16_to_cpu(eh->eh_entries));
			path = npath;
			goto has_space;
		}
		ext_debug("next leaf has no free space(%d,%d)\n",
			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
	}

	/*
	 * There is no free space in the found leaf.
	 * We're gonna add a new leaf in the tree.
	 */
	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
		flags = EXT4_MB_USE_ROOT_BLOCKS;
	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
	if (err)
		goto cleanup;
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;

has_space:
	nearex = path[depth].p_ext;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto cleanup;

	if (!nearex) {
		/* there is no extent in this leaf, create first one */
		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
				le32_to_cpu(newext->ee_block),
				ext4_ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext));
		nearex = EXT_FIRST_EXTENT(eh);
	} else {
		if (le32_to_cpu(newext->ee_block)
			   > le32_to_cpu(nearex->ee_block)) {
			/* Insert after */
			ext_debug("insert %u:%llu:[%d]%d before: "
					"nearest %p\n",
					le32_to_cpu(newext->ee_block),
					ext4_ext_pblock(newext),
					ext4_ext_is_uninitialized(newext),
					ext4_ext_get_actual_len(newext),
					nearex);
			nearex++;
		} else {
			/* Insert before */
			BUG_ON(newext->ee_block == nearex->ee_block);
			ext_debug("insert %u:%llu:[%d]%d after: "
					"nearest %p\n",
					le32_to_cpu(newext->ee_block),
					ext4_ext_pblock(newext),
					ext4_ext_is_uninitialized(newext),
					ext4_ext_get_actual_len(newext),
					nearex);
		}
		len = EXT_LAST_EXTENT(eh) - nearex + 1;
		if (len > 0) {
			ext_debug("insert %u:%llu:[%d]%d: "
					"move %d extents from 0x%p to 0x%p\n",
					le32_to_cpu(newext->ee_block),
					ext4_ext_pblock(newext),
					ext4_ext_is_uninitialized(newext),
					ext4_ext_get_actual_len(newext),
					len, nearex, nearex + 1);
			memmove(nearex + 1, nearex,
				len * sizeof(struct ext4_extent));
		}
	}

	le16_add_cpu(&eh->eh_entries, 1);
	path[depth].p_ext = nearex;
	nearex->ee_block = newext->ee_block;
	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
	nearex->ee_len = newext->ee_len;

merge:
	/* try to merge extents */
	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
		ext4_ext_try_to_merge(handle, inode, path, nearex);


	/* time to correct all indexes above */
	err = ext4_ext_correct_indexes(handle, inode, path);
	if (err)
		goto cleanup;

	err = ext4_ext_dirty(handle, inode, path + path->p_depth);

cleanup:
	if (npath) {
		ext4_ext_drop_refs(npath);
		kfree(npath);
	}
	ext4_ext_invalidate_cache(inode);
	return err;
}
2060 */ 2061 next_del = ext4_find_delayed_extent(inode, &cbex); 2062 if (!exists && next_del) { 2063 exists = 1; 2064 flags |= FIEMAP_EXTENT_DELALLOC; 2065 } 2066 up_read(&EXT4_I(inode)->i_data_sem); 2067 2068 if (unlikely(cbex.ec_len == 0)) { 2069 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); 2070 err = -EIO; 2071 break; 2072 } 2073 2074 /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ 2075 if (next == next_del) { 2076 flags |= FIEMAP_EXTENT_LAST; 2077 if (unlikely(next_del != EXT_MAX_BLOCKS || 2078 next != EXT_MAX_BLOCKS)) { 2079 EXT4_ERROR_INODE(inode, 2080 "next extent == %u, next " 2081 "delalloc extent = %u", 2082 next, next_del); 2083 err = -EIO; 2084 break; 2085 } 2086 } 2087 2088 if (exists) { 2089 err = fiemap_fill_next_extent(fieinfo, 2090 (__u64)cbex.ec_block << blksize_bits, 2091 (__u64)cbex.ec_start << blksize_bits, 2092 (__u64)cbex.ec_len << blksize_bits, 2093 flags); 2094 if (err < 0) 2095 break; 2096 if (err == 1) { 2097 err = 0; 2098 break; 2099 } 2100 } 2101 2102 block = cbex.ec_block + cbex.ec_len; 2103 } 2104 2105 if (path) { 2106 ext4_ext_drop_refs(path); 2107 kfree(path); 2108 } 2109 2110 return err; 2111} 2112 2113static void 2114ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 2115 __u32 len, ext4_fsblk_t start) 2116{ 2117 struct ext4_ext_cache *cex; 2118 BUG_ON(len == 0); 2119 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2120 trace_ext4_ext_put_in_cache(inode, block, len, start); 2121 cex = &EXT4_I(inode)->i_cached_extent; 2122 cex->ec_block = block; 2123 cex->ec_len = len; 2124 cex->ec_start = start; 2125 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2126} 2127 2128/* 2129 * ext4_ext_put_gap_in_cache: 2130 * calculate boundaries of the gap that the requested block fits into 2131 * and cache this gap 2132 */ 2133static void 2134ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 2135 ext4_lblk_t block) 2136{ 2137 int depth = ext_depth(inode); 2138 unsigned long len; 2139 ext4_lblk_t lblock; 2140 struct ext4_extent *ex; 2141 2142 ex = path[depth].p_ext; 2143 if (ex == NULL) { 2144 /* there is no extent yet, so gap is [0;-] */ 2145 lblock = 0; 2146 len = EXT_MAX_BLOCKS; 2147 ext_debug("cache gap(whole file):"); 2148 } else if (block < le32_to_cpu(ex->ee_block)) { 2149 lblock = block; 2150 len = le32_to_cpu(ex->ee_block) - block; 2151 ext_debug("cache gap(before): %u [%u:%u]", 2152 block, 2153 le32_to_cpu(ex->ee_block), 2154 ext4_ext_get_actual_len(ex)); 2155 } else if (block >= le32_to_cpu(ex->ee_block) 2156 + ext4_ext_get_actual_len(ex)) { 2157 ext4_lblk_t next; 2158 lblock = le32_to_cpu(ex->ee_block) 2159 + ext4_ext_get_actual_len(ex); 2160 2161 next = ext4_ext_next_allocated_block(path); 2162 ext_debug("cache gap(after): [%u:%u] %u", 2163 le32_to_cpu(ex->ee_block), 2164 ext4_ext_get_actual_len(ex), 2165 block); 2166 BUG_ON(next == lblock); 2167 len = next - lblock; 2168 } else { 2169 lblock = len = 0; 2170 BUG(); 2171 } 2172 2173 ext_debug(" -> %u:%lu\n", lblock, len); 2174 ext4_ext_put_in_cache(inode, lblock, len, 0); 2175} 2176 2177/* 2178 * ext4_ext_in_cache() 2179 * Checks to see if the given block is in the cache. 2180 * If it is, the cached extent is stored in the given 2181 * cache extent pointer. 
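 * For example (illustrative numbers): with a cached extent of ec_block 100 and ec_len 50, a lookup of block 120 falls inside [100, 150) and returns 1, while a lookup of block 150 misses and returns 0.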
2182 * 2183 * @inode: The files inode 2184 * @block: The block to look for in the cache 2185 * @ex: Pointer where the cached extent will be stored 2186 * if it contains block 2187 * 2188 * Return 0 if cache is invalid; 1 if the cache is valid 2189 */ 2190static int 2191ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, 2192 struct ext4_extent *ex) 2193{ 2194 struct ext4_ext_cache *cex; 2195 struct ext4_sb_info *sbi; 2196 int ret = 0; 2197 2198 /* 2199 * We borrow i_block_reservation_lock to protect i_cached_extent 2200 */ 2201 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2202 cex = &EXT4_I(inode)->i_cached_extent; 2203 sbi = EXT4_SB(inode->i_sb); 2204 2205 /* has cache valid data? */ 2206 if (cex->ec_len == 0) 2207 goto errout; 2208 2209 if (in_range(block, cex->ec_block, cex->ec_len)) { 2210 ex->ee_block = cpu_to_le32(cex->ec_block); 2211 ext4_ext_store_pblock(ex, cex->ec_start); 2212 ex->ee_len = cpu_to_le16(cex->ec_len); 2213 ext_debug("%u cached by %u:%u:%llu\n", 2214 block, 2215 cex->ec_block, cex->ec_len, cex->ec_start); 2216 ret = 1; 2217 } 2218errout: 2219 trace_ext4_ext_in_cache(inode, block, ret); 2220 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2221 return ret; 2222} 2223 2224/* 2225 * ext4_ext_rm_idx: 2226 * removes index from the index block. 2227 */ 2228static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2229 struct ext4_ext_path *path) 2230{ 2231 int err; 2232 ext4_fsblk_t leaf; 2233 2234 /* free index block */ 2235 path--; 2236 leaf = ext4_idx_pblock(path->p_idx); 2237 if (unlikely(path->p_hdr->eh_entries == 0)) { 2238 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2239 return -EIO; 2240 } 2241 err = ext4_ext_get_access(handle, inode, path); 2242 if (err) 2243 return err; 2244 2245 if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { 2246 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; 2247 len *= sizeof(struct ext4_extent_idx); 2248 memmove(path->p_idx, path->p_idx + 1, len); 2249 } 2250 2251 le16_add_cpu(&path->p_hdr->eh_entries, -1); 2252 err = ext4_ext_dirty(handle, inode, path); 2253 if (err) 2254 return err; 2255 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2256 trace_ext4_ext_rm_idx(inode, leaf); 2257 2258 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2259 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2260 return err; 2261} 2262 2263/* 2264 * ext4_ext_calc_credits_for_single_extent: 2265 * This routine returns max. credits that needed to insert an extent 2266 * to the extent tree. 2267 * When pass the actual path, the caller should calculate credits 2268 * under i_data_sem. 2269 */ 2270int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, 2271 struct ext4_ext_path *path) 2272{ 2273 if (path) { 2274 int depth = ext_depth(inode); 2275 int ret = 0; 2276 2277 /* probably there is space in leaf? */ 2278 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 2279 < le16_to_cpu(path[depth].p_hdr->eh_max)) { 2280 2281 /* 2282 * There are some space in the leaf tree, no 2283 * need to account for leaf block credit 2284 * 2285 * bitmaps and block group descriptor blocks 2286 * and other metadata blocks still need to be 2287 * accounted. 2288 */ 2289 /* 1 bitmap, 1 block group descriptor */ 2290 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 2291 return ret; 2292 } 2293 } 2294 2295 return ext4_chunk_trans_blocks(inode, nrblocks); 2296} 2297 2298/* 2299 * How many index/leaf blocks need to change/allocate to modify nrblocks? 
2300 * 2301 * if nrblocks are fit in a single extent (chunk flag is 1), then 2302 * in the worse case, each tree level index/leaf need to be changed 2303 * if the tree split due to insert a new extent, then the old tree 2304 * index/leaf need to be updated too 2305 * 2306 * If the nrblocks are discontiguous, they could cause 2307 * the whole tree split more than once, but this is really rare. 2308 */ 2309int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 2310{ 2311 int index; 2312 int depth = ext_depth(inode); 2313 2314 if (chunk) 2315 index = depth * 2; 2316 else 2317 index = depth * 3; 2318 2319 return index; 2320} 2321 2322static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2323 struct ext4_extent *ex, 2324 ext4_fsblk_t *partial_cluster, 2325 ext4_lblk_t from, ext4_lblk_t to) 2326{ 2327 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2328 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2329 ext4_fsblk_t pblk; 2330 int flags = 0; 2331 2332 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2333 flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; 2334 else if (ext4_should_journal_data(inode)) 2335 flags |= EXT4_FREE_BLOCKS_FORGET; 2336 2337 /* 2338 * For bigalloc file systems, we never free a partial cluster 2339 * at the beginning of the extent. Instead, we make a note 2340 * that we tried freeing the cluster, and check to see if we 2341 * need to free it on a subsequent call to ext4_remove_blocks, 2342 * or at the end of the ext4_truncate() operation. 2343 */ 2344 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; 2345 2346 trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); 2347 /* 2348 * If we have a partial cluster, and it's different from the 2349 * cluster of the last block, we need to explicitly free the 2350 * partial cluster here. 2351 */ 2352 pblk = ext4_ext_pblock(ex) + ee_len - 1; 2353 if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { 2354 ext4_free_blocks(handle, inode, NULL, 2355 EXT4_C2B(sbi, *partial_cluster), 2356 sbi->s_cluster_ratio, flags); 2357 *partial_cluster = 0; 2358 } 2359 2360#ifdef EXTENTS_STATS 2361 { 2362 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2363 spin_lock(&sbi->s_ext_stats_lock); 2364 sbi->s_ext_blocks += ee_len; 2365 sbi->s_ext_extents++; 2366 if (ee_len < sbi->s_ext_min) 2367 sbi->s_ext_min = ee_len; 2368 if (ee_len > sbi->s_ext_max) 2369 sbi->s_ext_max = ee_len; 2370 if (ext_depth(inode) > sbi->s_depth_max) 2371 sbi->s_depth_max = ext_depth(inode); 2372 spin_unlock(&sbi->s_ext_stats_lock); 2373 } 2374#endif 2375 if (from >= le32_to_cpu(ex->ee_block) 2376 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2377 /* tail removal */ 2378 ext4_lblk_t num; 2379 2380 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2381 pblk = ext4_ext_pblock(ex) + ee_len - num; 2382 ext_debug("free last %u blocks starting %llu\n", num, pblk); 2383 ext4_free_blocks(handle, inode, NULL, pblk, num, flags); 2384 /* 2385 * If the block range to be freed didn't start at the 2386 * beginning of a cluster, and we removed the entire 2387 * extent, save the partial cluster here, since we 2388 * might need to delete if we determine that the 2389 * truncate operation has removed all of the blocks in 2390 * the cluster. 
2391 */ 2392 if (pblk & (sbi->s_cluster_ratio - 1) && 2393 (ee_len == num)) 2394 *partial_cluster = EXT4_B2C(sbi, pblk); 2395 else 2396 *partial_cluster = 0; 2397 } else if (from == le32_to_cpu(ex->ee_block) 2398 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2399 /* head removal */ 2400 ext4_lblk_t num; 2401 ext4_fsblk_t start; 2402 2403 num = to - from; 2404 start = ext4_ext_pblock(ex); 2405 2406 ext_debug("free first %u blocks starting %llu\n", num, start); 2407 ext4_free_blocks(handle, inode, NULL, start, num, flags); 2408 2409 } else { 2410 printk(KERN_INFO "strange request: removal(2) " 2411 "%u-%u from %u:%u\n", 2412 from, to, le32_to_cpu(ex->ee_block), ee_len); 2413 } 2414 return 0; 2415} 2416 2417 2418/* 2419 * ext4_ext_rm_leaf() Removes the extents associated with the 2420 * blocks appearing between "start" and "end", and splits the extents 2421 * if "start" and "end" appear in the same extent 2422 * 2423 * @handle: The journal handle 2424 * @inode: The files inode 2425 * @path: The path to the leaf 2426 * @start: The first block to remove 2427 * @end: The last block to remove 2428 */ 2429static int 2430ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2431 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, 2432 ext4_lblk_t start, ext4_lblk_t end) 2433{ 2434 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2435 int err = 0, correct_index = 0; 2436 int depth = ext_depth(inode), credits; 2437 struct ext4_extent_header *eh; 2438 ext4_lblk_t a, b; 2439 unsigned num; 2440 ext4_lblk_t ex_ee_block; 2441 unsigned short ex_ee_len; 2442 unsigned uninitialized = 0; 2443 struct ext4_extent *ex; 2444 2445 /* the header must be checked already in ext4_ext_remove_space() */ 2446 ext_debug("truncate since %u in leaf to %u\n", start, end); 2447 if (!path[depth].p_hdr) 2448 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2449 eh = path[depth].p_hdr; 2450 if (unlikely(path[depth].p_hdr == NULL)) { 2451 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2452 return -EIO; 2453 } 2454 /* find where to start removing */ 2455 ex = EXT_LAST_EXTENT(eh); 2456 2457 ex_ee_block = le32_to_cpu(ex->ee_block); 2458 ex_ee_len = ext4_ext_get_actual_len(ex); 2459 2460 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2461 2462 while (ex >= EXT_FIRST_EXTENT(eh) && 2463 ex_ee_block + ex_ee_len > start) { 2464 2465 if (ext4_ext_is_uninitialized(ex)) 2466 uninitialized = 1; 2467 else 2468 uninitialized = 0; 2469 2470 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, 2471 uninitialized, ex_ee_len); 2472 path[depth].p_ext = ex; 2473 2474 a = ex_ee_block > start ? ex_ee_block : start; 2475 b = ex_ee_block+ex_ee_len - 1 < end ? 2476 ex_ee_block+ex_ee_len - 1 : end; 2477 2478 ext_debug(" border %u:%u\n", a, b); 2479 2480 /* If this extent is beyond the end of the hole, skip it */ 2481 if (end < ex_ee_block) { 2482 ex--; 2483 ex_ee_block = le32_to_cpu(ex->ee_block); 2484 ex_ee_len = ext4_ext_get_actual_len(ex); 2485 continue; 2486 } else if (b != ex_ee_block + ex_ee_len - 1) { 2487 EXT4_ERROR_INODE(inode, 2488 "can not handle truncate %u:%u " 2489 "on extent %u:%u", 2490 start, end, ex_ee_block, 2491 ex_ee_block + ex_ee_len - 1); 2492 err = -EIO; 2493 goto out; 2494 } else if (a != ex_ee_block) { 2495 /* remove tail of the extent */ 2496 num = a - ex_ee_block; 2497 } else { 2498 /* remove whole extent: excellent! 
*/ 2499 num = 0; 2500 } 2501 /* 2502 * 3 for leaf, sb, and inode plus 2 (bmap and group 2503 * descriptor) for each block group; assume two block 2504 * groups plus ex_ee_len/blocks_per_block_group for 2505 * the worst case 2506 */ 2507 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2508 if (ex == EXT_FIRST_EXTENT(eh)) { 2509 correct_index = 1; 2510 credits += (ext_depth(inode)) + 1; 2511 } 2512 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2513 2514 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2515 if (err) 2516 goto out; 2517 2518 err = ext4_ext_get_access(handle, inode, path + depth); 2519 if (err) 2520 goto out; 2521 2522 err = ext4_remove_blocks(handle, inode, ex, partial_cluster, 2523 a, b); 2524 if (err) 2525 goto out; 2526 2527 if (num == 0) 2528 /* this extent is removed; mark slot entirely unused */ 2529 ext4_ext_store_pblock(ex, 0); 2530 2531 ex->ee_len = cpu_to_le16(num); 2532 /* 2533 * Do not mark uninitialized if all the blocks in the 2534 * extent have been removed. 2535 */ 2536 if (uninitialized && num) 2537 ext4_ext_mark_uninitialized(ex); 2538 /* 2539 * If the extent was completely released, 2540 * we need to remove it from the leaf 2541 */ 2542 if (num == 0) { 2543 if (end != EXT_MAX_BLOCKS - 1) { 2544 /* 2545 * For hole punching, we need to scoot all the 2546 * extents up when an extent is removed so that 2547 * we dont have blank extents in the middle 2548 */ 2549 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * 2550 sizeof(struct ext4_extent)); 2551 2552 /* Now get rid of the one at the end */ 2553 memset(EXT_LAST_EXTENT(eh), 0, 2554 sizeof(struct ext4_extent)); 2555 } 2556 le16_add_cpu(&eh->eh_entries, -1); 2557 } else 2558 *partial_cluster = 0; 2559 2560 err = ext4_ext_dirty(handle, inode, path + depth); 2561 if (err) 2562 goto out; 2563 2564 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, 2565 ext4_ext_pblock(ex)); 2566 ex--; 2567 ex_ee_block = le32_to_cpu(ex->ee_block); 2568 ex_ee_len = ext4_ext_get_actual_len(ex); 2569 } 2570 2571 if (correct_index && eh->eh_entries) 2572 err = ext4_ext_correct_indexes(handle, inode, path); 2573 2574 /* 2575 * If there is still a entry in the leaf node, check to see if 2576 * it references the partial cluster. This is the only place 2577 * where it could; if it doesn't, we can free the cluster. 
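 * For example (illustrative, cluster ratio 4): if *partial_cluster records cluster 9 but the last block of the surviving first extent maps to cluster 7, no remaining extent shares cluster 9, so its blocks are freed below.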
2578 */ 2579 if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && 2580 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2581 *partial_cluster)) { 2582 int flags = EXT4_FREE_BLOCKS_FORGET; 2583 2584 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2585 flags |= EXT4_FREE_BLOCKS_METADATA; 2586 2587 ext4_free_blocks(handle, inode, NULL, 2588 EXT4_C2B(sbi, *partial_cluster), 2589 sbi->s_cluster_ratio, flags); 2590 *partial_cluster = 0; 2591 } 2592 2593 /* if this leaf is free, then we should 2594 * remove it from index block above */ 2595 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2596 err = ext4_ext_rm_idx(handle, inode, path + depth); 2597 2598out: 2599 return err; 2600} 2601 2602/* 2603 * ext4_ext_more_to_rm: 2604 * returns 1 if current index has to be freed (even partial) 2605 */ 2606static int 2607ext4_ext_more_to_rm(struct ext4_ext_path *path) 2608{ 2609 BUG_ON(path->p_idx == NULL); 2610 2611 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2612 return 0; 2613 2614 /* 2615 * if truncate on deeper level happened, it wasn't partial, 2616 * so we have to consider current index for truncation 2617 */ 2618 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2619 return 0; 2620 return 1; 2621} 2622 2623static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2624 ext4_lblk_t end) 2625{ 2626 struct super_block *sb = inode->i_sb; 2627 int depth = ext_depth(inode); 2628 struct ext4_ext_path *path = NULL; 2629 ext4_fsblk_t partial_cluster = 0; 2630 handle_t *handle; 2631 int i = 0, err = 0; 2632 2633 ext_debug("truncate since %u to %u\n", start, end); 2634 2635 /* probably first extent we're gonna free will be last in block */ 2636 handle = ext4_journal_start(inode, depth + 1); 2637 if (IS_ERR(handle)) 2638 return PTR_ERR(handle); 2639 2640again: 2641 ext4_ext_invalidate_cache(inode); 2642 2643 trace_ext4_ext_remove_space(inode, start, depth); 2644 2645 /* 2646 * Check if we are removing extents inside the extent tree. If that 2647 * is the case, we are going to punch a hole inside the extent tree 2648 * so we have to check whether we need to split the extent covering 2649 * the last block to remove so we can easily remove the part of it 2650 * in ext4_ext_rm_leaf(). 2651 */ 2652 if (end < EXT_MAX_BLOCKS - 1) { 2653 struct ext4_extent *ex; 2654 ext4_lblk_t ee_block; 2655 2656 /* find extent for this block */ 2657 path = ext4_ext_find_extent(inode, end, NULL); 2658 if (IS_ERR(path)) { 2659 ext4_journal_stop(handle); 2660 return PTR_ERR(path); 2661 } 2662 depth = ext_depth(inode); 2663 /* Leaf not may not exist only if inode has no blocks at all */ 2664 ex = path[depth].p_ext; 2665 if (!ex) { 2666 if (depth) { 2667 EXT4_ERROR_INODE(inode, 2668 "path[%d].p_hdr == NULL", 2669 depth); 2670 err = -EIO; 2671 } 2672 goto out; 2673 } 2674 2675 ee_block = le32_to_cpu(ex->ee_block); 2676 2677 /* 2678 * See if the last block is inside the extent, if so split 2679 * the extent at 'end' block so we can easily remove the 2680 * tail of the first part of the split extent in 2681 * ext4_ext_rm_leaf(). 
2682 */ 2683 if (end >= ee_block && 2684 end < ee_block + ext4_ext_get_actual_len(ex) - 1) { 2685 int split_flag = 0; 2686 2687 if (ext4_ext_is_uninitialized(ex)) 2688 split_flag = EXT4_EXT_MARK_UNINIT1 | 2689 EXT4_EXT_MARK_UNINIT2; 2690 2691 /* 2692 * Split the extent in two so that 'end' is the last 2693 * block in the first new extent 2694 */ 2695 err = ext4_split_extent_at(handle, inode, path, 2696 end + 1, split_flag, 2697 EXT4_GET_BLOCKS_PRE_IO | 2698 EXT4_GET_BLOCKS_PUNCH_OUT_EXT); 2699 2700 if (err < 0) 2701 goto out; 2702 } 2703 } 2704 /* 2705 * We start scanning from right side, freeing all the blocks 2706 * after i_size and walking into the tree depth-wise. 2707 */ 2708 depth = ext_depth(inode); 2709 if (path) { 2710 int k = i = depth; 2711 while (--k > 0) 2712 path[k].p_block = 2713 le16_to_cpu(path[k].p_hdr->eh_entries)+1; 2714 } else { 2715 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), 2716 GFP_NOFS); 2717 if (path == NULL) { 2718 ext4_journal_stop(handle); 2719 return -ENOMEM; 2720 } 2721 path[0].p_depth = depth; 2722 path[0].p_hdr = ext_inode_hdr(inode); 2723 i = 0; 2724 2725 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2726 err = -EIO; 2727 goto out; 2728 } 2729 } 2730 err = 0; 2731 2732 while (i >= 0 && err == 0) { 2733 if (i == depth) { 2734 /* this is leaf block */ 2735 err = ext4_ext_rm_leaf(handle, inode, path, 2736 &partial_cluster, start, 2737 end); 2738 /* root level has p_bh == NULL, brelse() eats this */ 2739 brelse(path[i].p_bh); 2740 path[i].p_bh = NULL; 2741 i--; 2742 continue; 2743 } 2744 2745 /* this is index block */ 2746 if (!path[i].p_hdr) { 2747 ext_debug("initialize header\n"); 2748 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 2749 } 2750 2751 if (!path[i].p_idx) { 2752 /* this level hasn't been touched yet */ 2753 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 2754 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 2755 ext_debug("init index ptr: hdr 0x%p, num %d\n", 2756 path[i].p_hdr, 2757 le16_to_cpu(path[i].p_hdr->eh_entries)); 2758 } else { 2759 /* we were already here, see at next index */ 2760 path[i].p_idx--; 2761 } 2762 2763 ext_debug("level %d - index, first 0x%p, cur 0x%p\n", 2764 i, EXT_FIRST_INDEX(path[i].p_hdr), 2765 path[i].p_idx); 2766 if (ext4_ext_more_to_rm(path + i)) { 2767 struct buffer_head *bh; 2768 /* go to the next level */ 2769 ext_debug("move to level %d (block %llu)\n", 2770 i + 1, ext4_idx_pblock(path[i].p_idx)); 2771 memset(path + i + 1, 0, sizeof(*path)); 2772 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); 2773 if (!bh) { 2774 /* should we reset i_size? 
*/ 2775 err = -EIO; 2776 break; 2777 } 2778 if (WARN_ON(i + 1 > depth)) { 2779 err = -EIO; 2780 break; 2781 } 2782 if (ext4_ext_check_block(inode, ext_block_hdr(bh), 2783 depth - i - 1, bh)) { 2784 err = -EIO; 2785 break; 2786 } 2787 path[i + 1].p_bh = bh; 2788 2789 /* save actual number of indexes since this 2790 * number is changed at the next iteration */ 2791 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 2792 i++; 2793 } else { 2794 /* we finished processing this index, go up */ 2795 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 2796 /* index is empty, remove it; 2797 * handle must be already prepared by the 2798 * truncatei_leaf() */ 2799 err = ext4_ext_rm_idx(handle, inode, path + i); 2800 } 2801 /* root level has p_bh == NULL, brelse() eats this */ 2802 brelse(path[i].p_bh); 2803 path[i].p_bh = NULL; 2804 i--; 2805 ext_debug("return to level %d\n", i); 2806 } 2807 } 2808 2809 trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, 2810 path->p_hdr->eh_entries); 2811 2812 /* If we still have something in the partial cluster and we have removed 2813 * even the first extent, then we should free the blocks in the partial 2814 * cluster as well. */ 2815 if (partial_cluster && path->p_hdr->eh_entries == 0) { 2816 int flags = EXT4_FREE_BLOCKS_FORGET; 2817 2818 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2819 flags |= EXT4_FREE_BLOCKS_METADATA; 2820 2821 ext4_free_blocks(handle, inode, NULL, 2822 EXT4_C2B(EXT4_SB(sb), partial_cluster), 2823 EXT4_SB(sb)->s_cluster_ratio, flags); 2824 partial_cluster = 0; 2825 } 2826 2827 /* TODO: flexible tree reduction should be here */ 2828 if (path->p_hdr->eh_entries == 0) { 2829 /* 2830 * truncate to zero freed all the tree, 2831 * so we need to correct eh_depth 2832 */ 2833 err = ext4_ext_get_access(handle, inode, path); 2834 if (err == 0) { 2835 ext_inode_hdr(inode)->eh_depth = 0; 2836 ext_inode_hdr(inode)->eh_max = 2837 cpu_to_le16(ext4_ext_space_root(inode, 0)); 2838 err = ext4_ext_dirty(handle, inode, path); 2839 } 2840 } 2841out: 2842 ext4_ext_drop_refs(path); 2843 kfree(path); 2844 if (err == -EAGAIN) { 2845 path = NULL; 2846 goto again; 2847 } 2848 ext4_journal_stop(handle); 2849 2850 return err; 2851} 2852 2853/* 2854 * called at mount time 2855 */ 2856void ext4_ext_init(struct super_block *sb) 2857{ 2858 /* 2859 * possible initialization would be here 2860 */ 2861 2862 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2863#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 2864 printk(KERN_INFO "EXT4-fs: file extents enabled" 2865#ifdef AGGRESSIVE_TEST 2866 ", aggressive tests" 2867#endif 2868#ifdef CHECK_BINSEARCH 2869 ", check binsearch" 2870#endif 2871#ifdef EXTENTS_STATS 2872 ", stats" 2873#endif 2874 "\n"); 2875#endif 2876#ifdef EXTENTS_STATS 2877 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2878 EXT4_SB(sb)->s_ext_min = 1 << 30; 2879 EXT4_SB(sb)->s_ext_max = 0; 2880#endif 2881 } 2882} 2883 2884/* 2885 * called at umount time 2886 */ 2887void ext4_ext_release(struct super_block *sb) 2888{ 2889 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) 2890 return; 2891 2892#ifdef EXTENTS_STATS 2893 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 2894 struct ext4_sb_info *sbi = EXT4_SB(sb); 2895 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 2896 sbi->s_ext_blocks, sbi->s_ext_extents, 2897 sbi->s_ext_blocks / sbi->s_ext_extents); 2898 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 2899 
sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 2900	} 2901#endif 2902} 2903 2904/* FIXME!! we need to try to merge to left or right after zero-out */ 2905static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2906{ 2907	ext4_fsblk_t ee_pblock; 2908	unsigned int ee_len; 2909	int ret; 2910 2911	ee_len = ext4_ext_get_actual_len(ex); 2912	ee_pblock = ext4_ext_pblock(ex); 2913 2914	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); 2915	if (ret > 0) 2916	ret = 0; 2917 2918	return ret; 2919} 2920 2921/* 2922 * ext4_split_extent_at() splits an extent at a given block. 2923 * 2924 * @handle: the journal handle 2925 * @inode: the file inode 2926 * @path: the path to the extent 2927 * @split: the logical block where the extent is split. 2928 * @split_flag: indicates if the extent could be zeroed out if the split fails, 2929 *		and the states (init or uninit) of the new extents. 2930 * @flags: flags used to insert the new extent into the extent tree. 2931 * 2932 * 2933 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the states 2934 * of which are determined by split_flag. 2935 * 2936 * There are two cases: 2937 *   a> the extent is split into two extents. 2938 *   b> no split is needed, and the extent is just marked. 2939 * 2940 * return 0 on success. 2941 */ 2942static int ext4_split_extent_at(handle_t *handle, 2943	struct inode *inode, 2944	struct ext4_ext_path *path, 2945	ext4_lblk_t split, 2946	int split_flag, 2947	int flags) 2948{ 2949	ext4_fsblk_t newblock; 2950	ext4_lblk_t ee_block; 2951	struct ext4_extent *ex, newex, orig_ex; 2952	struct ext4_extent *ex2 = NULL; 2953	unsigned int ee_len, depth; 2954	int err = 0; 2955 2956	BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == 2957	(EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); 2958 2959	ext_debug("ext4_split_extent_at: inode %lu, logical " 2960	"block %llu\n", inode->i_ino, (unsigned long long)split); 2961 2962	ext4_ext_show_leaf(inode, path); 2963 2964	depth = ext_depth(inode); 2965	ex = path[depth].p_ext; 2966	ee_block = le32_to_cpu(ex->ee_block); 2967	ee_len = ext4_ext_get_actual_len(ex); 2968	newblock = split - ee_block + ext4_ext_pblock(ex); 2969 2970	BUG_ON(split < ee_block || split >= (ee_block + ee_len)); 2971 2972	err = ext4_ext_get_access(handle, inode, path + depth); 2973	if (err) 2974	goto out; 2975 2976	if (split == ee_block) { 2977	/* 2978	 * case b: block @split is the block that the extent begins with, 2979	 * so we just change the state of the extent, and splitting 2980	 * is not needed.
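	 * For example (illustrative): splitting [100, 199] at block 100 with EXT4_EXT_MARK_UNINIT2 set simply marks the whole extent uninitialized in place.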
2981	 */ 2982	if (split_flag & EXT4_EXT_MARK_UNINIT2) 2983	ext4_ext_mark_uninitialized(ex); 2984	else 2985	ext4_ext_mark_initialized(ex); 2986 2987	if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) 2988	ext4_ext_try_to_merge(handle, inode, path, ex); 2989 2990	err = ext4_ext_dirty(handle, inode, path + path->p_depth); 2991	goto out; 2992	} 2993 2994	/* case a */ 2995	memcpy(&orig_ex, ex, sizeof(orig_ex)); 2996	ex->ee_len = cpu_to_le16(split - ee_block); 2997	if (split_flag & EXT4_EXT_MARK_UNINIT1) 2998	ext4_ext_mark_uninitialized(ex); 2999 3000	/* 3001	 * path may lead to new leaf, not to original leaf any more 3002	 * after ext4_ext_insert_extent() returns, 3003	 */ 3004	err = ext4_ext_dirty(handle, inode, path + depth); 3005	if (err) 3006	goto fix_extent_len; 3007 3008	ex2 = &newex; 3009	ex2->ee_block = cpu_to_le32(split); 3010	ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); 3011	ext4_ext_store_pblock(ex2, newblock); 3012	if (split_flag & EXT4_EXT_MARK_UNINIT2) 3013	ext4_ext_mark_uninitialized(ex2); 3014 3015	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3016	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3017	if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { 3018	if (split_flag & EXT4_EXT_DATA_VALID1) 3019	err = ext4_ext_zeroout(inode, ex2); 3020	else 3021	err = ext4_ext_zeroout(inode, ex); 3022	} else 3023	err = ext4_ext_zeroout(inode, &orig_ex); 3024 3025	if (err) 3026	goto fix_extent_len; 3027	/* update the extent length and mark as initialized */ 3028	ex->ee_len = cpu_to_le16(ee_len); 3029	ext4_ext_try_to_merge(handle, inode, path, ex); 3030	err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3031	goto out; 3032	} else if (err) 3033	goto fix_extent_len; 3034 3035out: 3036	ext4_ext_show_leaf(inode, path); 3037	return err; 3038 3039fix_extent_len: 3040	ex->ee_len = orig_ex.ee_len; 3041	ext4_ext_dirty(handle, inode, path + depth); 3042	return err; 3043} 3044 3045/* 3046 * ext4_split_extent() splits an extent and marks the extent covered 3047 * by @map as split_flag indicates. 3048 * 3049 * It may result in splitting the extent into multiple extents (up to three). 3050 * There are three possibilities: 3051 *   a> There is no split required 3052 *   b> Splits in two extents: Split is happening at either end of the extent 3053 *   c> Splits in three extents: Someone is splitting in the middle of the extent 3054 * 3055 */ 3056static int ext4_split_extent(handle_t *handle, 3057	struct inode *inode, 3058	struct ext4_ext_path *path, 3059	struct ext4_map_blocks *map, 3060	int split_flag, 3061	int flags) 3062{ 3063	ext4_lblk_t ee_block; 3064	struct ext4_extent *ex; 3065	unsigned int ee_len, depth; 3066	int err = 0; 3067	int uninitialized; 3068	int split_flag1, flags1; 3069 3070	depth = ext_depth(inode); 3071	ex = path[depth].p_ext; 3072	ee_block = le32_to_cpu(ex->ee_block); 3073	ee_len = ext4_ext_get_actual_len(ex); 3074	uninitialized = ext4_ext_is_uninitialized(ex); 3075 3076	if (map->m_lblk + map->m_len < ee_block + ee_len) { 3077	split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; 3078	flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; 3079	if (uninitialized) 3080	split_flag1 |= EXT4_EXT_MARK_UNINIT1 | 3081	EXT4_EXT_MARK_UNINIT2; 3082	if (split_flag & EXT4_EXT_DATA_VALID2) 3083	split_flag1 |= EXT4_EXT_DATA_VALID1; 3084	err = ext4_split_extent_at(handle, inode, path, 3085	map->m_lblk + map->m_len, split_flag1, flags1); 3086	if (err) 3087	goto out; 3088	} 3089 3090	ext4_ext_drop_refs(path); 3091	path = ext4_ext_find_extent(inode, map->m_lblk, path); 3092	if
(IS_ERR(path)) 3093	return PTR_ERR(path); 3094 3095	if (map->m_lblk >= ee_block) { 3096	split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | 3097	EXT4_EXT_DATA_VALID2); 3098	if (uninitialized) 3099	split_flag1 |= EXT4_EXT_MARK_UNINIT1; 3100	if (split_flag & EXT4_EXT_MARK_UNINIT2) 3101	split_flag1 |= EXT4_EXT_MARK_UNINIT2; 3102	err = ext4_split_extent_at(handle, inode, path, 3103	map->m_lblk, split_flag1, flags); 3104	if (err) 3105	goto out; 3106	} 3107 3108	ext4_ext_show_leaf(inode, path); 3109out: 3110	return err ? err : map->m_len; 3111} 3112 3113/* 3114 * This function is called by ext4_ext_map_blocks() if someone tries to write 3115 * to an uninitialized extent. It may result in splitting the uninitialized 3116 * extent into multiple extents (up to three - one initialized and two 3117 * uninitialized). 3118 * There are three possibilities: 3119 *   a> There is no split required: Entire extent should be initialized 3120 *   b> Splits in two extents: Write is happening at either end of the extent 3121 *   c> Splits in three extents: Someone is writing in the middle of the extent 3122 * 3123 * Pre-conditions: 3124 *  - The extent pointed to by 'path' is uninitialized. 3125 *  - The extent pointed to by 'path' contains a superset 3126 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len). 3127 * 3128 * Post-conditions on success: 3129 *  - the returned value is the number of blocks beyond map->m_lblk 3130 *    that are allocated and initialized. 3131 *    It is guaranteed to be >= map->m_len. 3132 */ 3133static int ext4_ext_convert_to_initialized(handle_t *handle, 3134	struct inode *inode, 3135	struct ext4_map_blocks *map, 3136	struct ext4_ext_path *path) 3137{ 3138	struct ext4_sb_info *sbi; 3139	struct ext4_extent_header *eh; 3140	struct ext4_map_blocks split_map; 3141	struct ext4_extent zero_ex; 3142	struct ext4_extent *ex; 3143	ext4_lblk_t ee_block, eof_block; 3144	unsigned int ee_len, depth; 3145	int allocated, max_zeroout = 0; 3146	int err = 0; 3147	int split_flag = 0; 3148 3149	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical " 3150	"block %llu, max_blocks %u\n", inode->i_ino, 3151	(unsigned long long)map->m_lblk, map->m_len); 3152 3153	sbi = EXT4_SB(inode->i_sb); 3154	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3155	inode->i_sb->s_blocksize_bits; 3156	if (eof_block < map->m_lblk + map->m_len) 3157	eof_block = map->m_lblk + map->m_len; 3158 3159	depth = ext_depth(inode); 3160	eh = path[depth].p_hdr; 3161	ex = path[depth].p_ext; 3162	ee_block = le32_to_cpu(ex->ee_block); 3163	ee_len = ext4_ext_get_actual_len(ex); 3164	allocated = ee_len - (map->m_lblk - ee_block); 3165 3166	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); 3167 3168	/* Pre-conditions */ 3169	BUG_ON(!ext4_ext_is_uninitialized(ex)); 3170	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); 3171 3172	/* 3173	 * Attempt to transfer newly initialized blocks from the currently 3174	 * uninitialized extent to its left neighbor. This is much cheaper 3175	 * than an insertion followed by a merge as those involve costly 3176	 * memmove() calls. This is the common case in steady state for 3177	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append 3178	 * writes. 3179	 * 3180	 * Limitations of the current logic: 3181	 *  - L1: we only deal with writes at the start of the extent. 3182	 *    The approach could be extended to writes at the end 3183	 *    of the extent but this scenario was deemed less common. 3184	 *  - L2: we do not deal with writes covering the whole extent.
3185	 *    This would require removing the extent if the transfer 3186	 *    is possible. 3187	 *  - L3: we only attempt to merge with an extent stored in the 3188	 *    same extent tree node. 3189	 */ 3190	if ((map->m_lblk == ee_block) &&	/*L1*/ 3191	(map->m_len < ee_len) &&	/*L2*/ 3192	(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/ 3193	struct ext4_extent *prev_ex; 3194	ext4_lblk_t prev_lblk; 3195	ext4_fsblk_t prev_pblk, ee_pblk; 3196	unsigned int prev_len, write_len; 3197 3198	prev_ex = ex - 1; 3199	prev_lblk = le32_to_cpu(prev_ex->ee_block); 3200	prev_len = ext4_ext_get_actual_len(prev_ex); 3201	prev_pblk = ext4_ext_pblock(prev_ex); 3202	ee_pblk = ext4_ext_pblock(ex); 3203	write_len = map->m_len; 3204 3205	/* 3206	 * A transfer of blocks from 'ex' to 'prev_ex' is allowed 3207	 * upon those conditions: 3208	 *  - C1: prev_ex is initialized, 3209	 *  - C2: prev_ex is logically abutting ex, 3210	 *  - C3: prev_ex is physically abutting ex, 3211	 *  - C4: prev_ex can receive the additional blocks without 3212	 *    overflowing the (initialized) length limit. 3213	 */ 3214	if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/ 3215	((prev_lblk + prev_len) == ee_block) &&		/*C2*/ 3216	((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/ 3217	(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/ 3218	err = ext4_ext_get_access(handle, inode, path + depth); 3219	if (err) 3220	goto out; 3221 3222	trace_ext4_ext_convert_to_initialized_fastpath(inode, 3223	map, ex, prev_ex); 3224 3225	/* Shift the start of ex by 'write_len' blocks */ 3226	ex->ee_block = cpu_to_le32(ee_block + write_len); 3227	ext4_ext_store_pblock(ex, ee_pblk + write_len); 3228	ex->ee_len = cpu_to_le16(ee_len - write_len); 3229	ext4_ext_mark_uninitialized(ex); /* Restore the flag */ 3230 3231	/* Extend prev_ex by 'write_len' blocks */ 3232	prev_ex->ee_len = cpu_to_le16(prev_len + write_len); 3233 3234	/* Mark the block containing both extents as dirty */ 3235	ext4_ext_dirty(handle, inode, path + depth); 3236 3237	/* Update path to point to the right extent */ 3238	path[depth].p_ext = prev_ex; 3239 3240	/* Result: number of initialized blocks past m_lblk */ 3241	allocated = write_len; 3242	goto out; 3243	} 3244	} 3245 3246	WARN_ON(map->m_lblk < ee_block); 3247	/* 3248	 * It is safe to convert the extent to initialized via explicit 3249	 * zeroout only if the extent is fully inside i_size or new_size. 3250	 */ 3251	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3252 3253	if (EXT4_EXT_MAY_ZEROOUT & split_flag) 3254	max_zeroout = sbi->s_extent_max_zeroout_kb >> 3255	inode->i_sb->s_blocksize_bits; 3256 3257	/* If the extent is smaller than s_extent_max_zeroout_kb, zero it out directly */ 3258	if (max_zeroout && (ee_len <= max_zeroout)) { 3259	err = ext4_ext_zeroout(inode, ex); 3260	if (err) 3261	goto out; 3262 3263	err = ext4_ext_get_access(handle, inode, path + depth); 3264	if (err) 3265	goto out; 3266	ext4_ext_mark_initialized(ex); 3267	ext4_ext_try_to_merge(handle, inode, path, ex); 3268	err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3269	goto out; 3270	} 3271 3272	/* 3273	 * four cases: 3274	 * 1. split the extent into three extents. 3275	 * 2. split the extent into two extents, zeroout the first half. 3276	 * 3. split the extent into two extents, zeroout the second half. 3277	 * 4. split the extent into two extents without zeroout.
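	 * For example (illustrative): a write of blocks 10-19 into an uninitialized extent [0, 99], with max_zeroout of at least 90 blocks, takes case 3: blocks 10-99 are zeroed out and the extent is split only once, at block 10.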
3278 */ 3279 split_map.m_lblk = map->m_lblk; 3280 split_map.m_len = map->m_len; 3281 3282 if (max_zeroout && (allocated > map->m_len)) { 3283 if (allocated <= max_zeroout) { 3284 /* case 3 */ 3285 zero_ex.ee_block = 3286 cpu_to_le32(map->m_lblk); 3287 zero_ex.ee_len = cpu_to_le16(allocated); 3288 ext4_ext_store_pblock(&zero_ex, 3289 ext4_ext_pblock(ex) + map->m_lblk - ee_block); 3290 err = ext4_ext_zeroout(inode, &zero_ex); 3291 if (err) 3292 goto out; 3293 split_map.m_lblk = map->m_lblk; 3294 split_map.m_len = allocated; 3295 } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { 3296 /* case 2 */ 3297 if (map->m_lblk != ee_block) { 3298 zero_ex.ee_block = ex->ee_block; 3299 zero_ex.ee_len = cpu_to_le16(map->m_lblk - 3300 ee_block); 3301 ext4_ext_store_pblock(&zero_ex, 3302 ext4_ext_pblock(ex)); 3303 err = ext4_ext_zeroout(inode, &zero_ex); 3304 if (err) 3305 goto out; 3306 } 3307 3308 split_map.m_lblk = ee_block; 3309 split_map.m_len = map->m_lblk - ee_block + map->m_len; 3310 allocated = map->m_len; 3311 } 3312 } 3313 3314 allocated = ext4_split_extent(handle, inode, path, 3315 &split_map, split_flag, 0); 3316 if (allocated < 0) 3317 err = allocated; 3318 3319out: 3320 return err ? err : allocated; 3321} 3322 3323/* 3324 * This function is called by ext4_ext_map_blocks() from 3325 * ext4_get_blocks_dio_write() when DIO to write 3326 * to an uninitialized extent. 3327 * 3328 * Writing to an uninitialized extent may result in splitting the uninitialized 3329 * extent into multiple initialized/uninitialized extents (up to three) 3330 * There are three possibilities: 3331 * a> There is no split required: Entire extent should be uninitialized 3332 * b> Splits in two extents: Write is happening at either end of the extent 3333 * c> Splits in three extents: Somone is writing in middle of the extent 3334 * 3335 * One of more index blocks maybe needed if the extent tree grow after 3336 * the uninitialized extent split. To prevent ENOSPC occur at the IO 3337 * complete, we need to split the uninitialized extent before DIO submit 3338 * the IO. The uninitialized extent called at this time will be split 3339 * into three uninitialized extent(at most). After IO complete, the part 3340 * being filled will be convert to initialized by the end_io callback function 3341 * via ext4_convert_unwritten_extents(). 3342 * 3343 * Returns the size of uninitialized extent to be written on success. 3344 */ 3345static int ext4_split_unwritten_extents(handle_t *handle, 3346 struct inode *inode, 3347 struct ext4_map_blocks *map, 3348 struct ext4_ext_path *path, 3349 int flags) 3350{ 3351 ext4_lblk_t eof_block; 3352 ext4_lblk_t ee_block; 3353 struct ext4_extent *ex; 3354 unsigned int ee_len; 3355 int split_flag = 0, depth; 3356 3357 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3358 "block %llu, max_blocks %u\n", inode->i_ino, 3359 (unsigned long long)map->m_lblk, map->m_len); 3360 3361 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3362 inode->i_sb->s_blocksize_bits; 3363 if (eof_block < map->m_lblk + map->m_len) 3364 eof_block = map->m_lblk + map->m_len; 3365 /* 3366 * It is safe to convert extent to initialized via explicit 3367 * zeroout only if extent is fully insde i_size or new_size. 3368 */ 3369 depth = ext_depth(inode); 3370 ex = path[depth].p_ext; 3371 ee_block = le32_to_cpu(ex->ee_block); 3372 ee_len = ext4_ext_get_actual_len(ex); 3373 3374 split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; 3375 split_flag |= EXT4_EXT_MARK_UNINIT2; 3376 if (flags & EXT4_GET_BLOCKS_CONVERT) 3377 split_flag |= EXT4_EXT_DATA_VALID2; 3378 flags |= EXT4_GET_BLOCKS_PRE_IO; 3379 return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3380} 3381 3382static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3383 struct inode *inode, 3384 struct ext4_map_blocks *map, 3385 struct ext4_ext_path *path) 3386{ 3387 struct ext4_extent *ex; 3388 ext4_lblk_t ee_block; 3389 unsigned int ee_len; 3390 int depth; 3391 int err = 0; 3392 3393 depth = ext_depth(inode); 3394 ex = path[depth].p_ext; 3395 ee_block = le32_to_cpu(ex->ee_block); 3396 ee_len = ext4_ext_get_actual_len(ex); 3397 3398 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" 3399 "block %llu, max_blocks %u\n", inode->i_ino, 3400 (unsigned long long)ee_block, ee_len); 3401 3402 /* If extent is larger than requested then split is required */ 3403 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3404 err = ext4_split_unwritten_extents(handle, inode, map, path, 3405 EXT4_GET_BLOCKS_CONVERT); 3406 if (err < 0) 3407 goto out; 3408 ext4_ext_drop_refs(path); 3409 path = ext4_ext_find_extent(inode, map->m_lblk, path); 3410 if (IS_ERR(path)) { 3411 err = PTR_ERR(path); 3412 goto out; 3413 } 3414 depth = ext_depth(inode); 3415 ex = path[depth].p_ext; 3416 } 3417 3418 err = ext4_ext_get_access(handle, inode, path + depth); 3419 if (err) 3420 goto out; 3421 /* first mark the extent as initialized */ 3422 ext4_ext_mark_initialized(ex); 3423 3424 /* note: ext4_ext_correct_indexes() isn't needed here because 3425 * borders are not changed 3426 */ 3427 ext4_ext_try_to_merge(handle, inode, path, ex); 3428 3429 /* Mark modified extent as dirty */ 3430 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3431out: 3432 ext4_ext_show_leaf(inode, path); 3433 return err; 3434} 3435 3436static void unmap_underlying_metadata_blocks(struct block_device *bdev, 3437 sector_t block, int count) 3438{ 3439 int i; 3440 for (i = 0; i < count; i++) 3441 unmap_underlying_metadata(bdev, block + i); 3442} 3443 3444/* 3445 * Handle EOFBLOCKS_FL flag, clearing it if necessary 3446 */ 3447static int check_eofblocks_fl(handle_t *handle, struct inode *inode, 3448 ext4_lblk_t lblk, 3449 struct ext4_ext_path *path, 3450 unsigned int len) 3451{ 3452 int i, depth; 3453 struct ext4_extent_header *eh; 3454 struct ext4_extent *last_ex; 3455 3456 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 3457 return 0; 3458 3459 depth = ext_depth(inode); 3460 eh = path[depth].p_hdr; 3461 3462 /* 3463 * We're going to remove EOFBLOCKS_FL entirely in future so we 3464 * do not care for this case anymore. Simply remove the flag 3465 * if there are no extents. 3466 */ 3467 if (unlikely(!eh->eh_entries)) 3468 goto out; 3469 last_ex = EXT_LAST_EXTENT(eh); 3470 /* 3471 * We should clear the EOFBLOCKS_FL flag if we are writing the 3472 * last block in the last extent in the file. We test this by 3473 * first checking to see if the caller to 3474 * ext4_ext_get_blocks() was interested in the last block (or 3475 * a block beyond the last block) in the current extent. If 3476 * this turns out to be false, we can bail out from this 3477 * function immediately. 
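 * For example (illustrative): if the last extent covers blocks 100-199 and the write maps blocks 150-160, lblk + len is 161, which is below 200, so the flag is left alone and 0 is returned just below.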
3478 */ 3479 if (lblk + len < le32_to_cpu(last_ex->ee_block) + 3480 ext4_ext_get_actual_len(last_ex)) 3481 return 0; 3482 /* 3483 * If the caller does appear to be planning to write at or 3484 * beyond the end of the current extent, we then test to see 3485 * if the current extent is the last extent in the file, by 3486 * checking to make sure it was reached via the rightmost node 3487 * at each level of the tree. 3488 */ 3489 for (i = depth-1; i >= 0; i--) 3490 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3491 return 0; 3492out: 3493 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3494 return ext4_mark_inode_dirty(handle, inode); 3495} 3496 3497/** 3498 * ext4_find_delalloc_range: find delayed allocated block in the given range. 3499 * 3500 * Return 1 if there is a delalloc block in the range, otherwise 0. 3501 */ 3502static int ext4_find_delalloc_range(struct inode *inode, 3503 ext4_lblk_t lblk_start, 3504 ext4_lblk_t lblk_end) 3505{ 3506 struct extent_status es; 3507 3508 es.start = lblk_start; 3509 ext4_es_find_extent(inode, &es); 3510 if (es.len == 0) 3511 return 0; /* there is no delay extent in this tree */ 3512 else if (es.start <= lblk_start && lblk_start < es.start + es.len) 3513 return 1; 3514 else if (lblk_start <= es.start && es.start <= lblk_end) 3515 return 1; 3516 else 3517 return 0; 3518} 3519 3520int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) 3521{ 3522 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3523 ext4_lblk_t lblk_start, lblk_end; 3524 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); 3525 lblk_end = lblk_start + sbi->s_cluster_ratio - 1; 3526 3527 return ext4_find_delalloc_range(inode, lblk_start, lblk_end); 3528} 3529 3530/** 3531 * Determines how many complete clusters (out of those specified by the 'map') 3532 * are under delalloc and were reserved quota for. 3533 * This function is called when we are writing out the blocks that were 3534 * originally written with their allocation delayed, but then the space was 3535 * allocated using fallocate() before the delayed allocation could be resolved. 3536 * The cases to look for are: 3537 * ('=' indicated delayed allocated blocks 3538 * '-' indicates non-delayed allocated blocks) 3539 * (a) partial clusters towards beginning and/or end outside of allocated range 3540 * are not delalloc'ed. 3541 * Ex: 3542 * |----c---=|====c====|====c====|===-c----| 3543 * |++++++ allocated ++++++| 3544 * ==> 4 complete clusters in above example 3545 * 3546 * (b) partial cluster (outside of allocated range) towards either end is 3547 * marked for delayed allocation. In this case, we will exclude that 3548 * cluster. 3549 * Ex: 3550 * |----====c========|========c========| 3551 * |++++++ allocated ++++++| 3552 * ==> 1 complete clusters in above example 3553 * 3554 * Ex: 3555 * |================c================| 3556 * |++++++ allocated ++++++| 3557 * ==> 0 complete clusters in above example 3558 * 3559 * The ext4_da_update_reserve_space will be called only if we 3560 * determine here that there were some "entire" clusters that span 3561 * this 'allocated' range. 3562 * In the non-bigalloc case, this function will just end up returning num_blks 3563 * without ever calling ext4_find_delalloc_range. 
3564 */ 3565static unsigned int 3566get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, 3567 unsigned int num_blks) 3568{ 3569 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3570 ext4_lblk_t alloc_cluster_start, alloc_cluster_end; 3571 ext4_lblk_t lblk_from, lblk_to, c_offset; 3572 unsigned int allocated_clusters = 0; 3573 3574 alloc_cluster_start = EXT4_B2C(sbi, lblk_start); 3575 alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); 3576 3577 /* max possible clusters for this allocation */ 3578 allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; 3579 3580 trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); 3581 3582 /* Check towards left side */ 3583 c_offset = lblk_start & (sbi->s_cluster_ratio - 1); 3584 if (c_offset) { 3585 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); 3586 lblk_to = lblk_from + c_offset - 1; 3587 3588 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3589 allocated_clusters--; 3590 } 3591 3592 /* Now check towards right. */ 3593 c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); 3594 if (allocated_clusters && c_offset) { 3595 lblk_from = lblk_start + num_blks; 3596 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; 3597 3598 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3599 allocated_clusters--; 3600 } 3601 3602 return allocated_clusters; 3603} 3604 3605static int 3606ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3607 struct ext4_map_blocks *map, 3608 struct ext4_ext_path *path, int flags, 3609 unsigned int allocated, ext4_fsblk_t newblock) 3610{ 3611 int ret = 0; 3612 int err = 0; 3613 ext4_io_end_t *io = ext4_inode_aio(inode); 3614 3615 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " 3616 "block %llu, max_blocks %u, flags %x, allocated %u\n", 3617 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3618 flags, allocated); 3619 ext4_ext_show_leaf(inode, path); 3620 3621 trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, 3622 allocated, newblock); 3623 3624 /* get_block() before submit the IO, split the extent */ 3625 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3626 ret = ext4_split_unwritten_extents(handle, inode, map, 3627 path, flags); 3628 if (ret <= 0) 3629 goto out; 3630 /* 3631 * Flag the inode(non aio case) or end_io struct (aio case) 3632 * that this IO needs to conversion to written when IO is 3633 * completed 3634 */ 3635 if (io) 3636 ext4_set_io_unwritten_flag(inode, io); 3637 else 3638 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3639 if (ext4_should_dioread_nolock(inode)) 3640 map->m_flags |= EXT4_MAP_UNINIT; 3641 goto out; 3642 } 3643 /* IO end_io complete, convert the filled extent to written */ 3644 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3645 ret = ext4_convert_unwritten_extents_endio(handle, inode, map, 3646 path); 3647 if (ret >= 0) { 3648 ext4_update_inode_fsync_trans(handle, inode, 1); 3649 err = check_eofblocks_fl(handle, inode, map->m_lblk, 3650 path, map->m_len); 3651 } else 3652 err = ret; 3653 goto out2; 3654 } 3655 /* buffered IO case */ 3656 /* 3657 * repeat fallocate creation request 3658 * we already have an unwritten extent 3659 */ 3660 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3661 goto map_out; 3662 3663 /* buffered READ or buffered write_begin() lookup */ 3664 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3665 /* 3666 * We have blocks reserved already. We 3667 * return allocated blocks so that delalloc 3668 * won't do block reservation for us. 
But 3669 * the buffer head will be unmapped so that 3670 * a read from the block returns 0s. 3671 */ 3672 map->m_flags |= EXT4_MAP_UNWRITTEN; 3673 goto out1; 3674 } 3675 3676 /* buffered write, writepage time, convert*/ 3677 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3678 if (ret >= 0) 3679 ext4_update_inode_fsync_trans(handle, inode, 1); 3680out: 3681 if (ret <= 0) { 3682 err = ret; 3683 goto out2; 3684 } else 3685 allocated = ret; 3686 map->m_flags |= EXT4_MAP_NEW; 3687 /* 3688 * if we allocated more blocks than requested 3689 * we need to make sure we unmap the extra block 3690 * allocated. The actual needed block will get 3691 * unmapped later when we find the buffer_head marked 3692 * new. 3693 */ 3694 if (allocated > map->m_len) { 3695 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3696 newblock + map->m_len, 3697 allocated - map->m_len); 3698 allocated = map->m_len; 3699 } 3700 3701 /* 3702 * If we have done fallocate with the offset that is already 3703 * delayed allocated, we would have block reservation 3704 * and quota reservation done in the delayed write path. 3705 * But fallocate would have already updated quota and block 3706 * count for this offset. So cancel these reservation 3707 */ 3708 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 3709 unsigned int reserved_clusters; 3710 reserved_clusters = get_reserved_cluster_alloc(inode, 3711 map->m_lblk, map->m_len); 3712 if (reserved_clusters) 3713 ext4_da_update_reserve_space(inode, 3714 reserved_clusters, 3715 0); 3716 } 3717 3718map_out: 3719 map->m_flags |= EXT4_MAP_MAPPED; 3720 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { 3721 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, 3722 map->m_len); 3723 if (err < 0) 3724 goto out2; 3725 } 3726out1: 3727 if (allocated > map->m_len) 3728 allocated = map->m_len; 3729 ext4_ext_show_leaf(inode, path); 3730 map->m_pblk = newblock; 3731 map->m_len = allocated; 3732out2: 3733 if (path) { 3734 ext4_ext_drop_refs(path); 3735 kfree(path); 3736 } 3737 return err ? err : allocated; 3738} 3739 3740/* 3741 * get_implied_cluster_alloc - check to see if the requested 3742 * allocation (in the map structure) overlaps with a cluster already 3743 * allocated in an extent. 3744 * @sb The filesystem superblock structure 3745 * @map The requested lblk->pblk mapping 3746 * @ex The extent structure which might contain an implied 3747 * cluster allocation 3748 * 3749 * This function is called by ext4_ext_map_blocks() after we failed to 3750 * find blocks that were already in the inode's extent tree. Hence, 3751 * we know that the beginning of the requested region cannot overlap 3752 * the extent from the inode's extent tree. There are three cases we 3753 * want to catch. 
The first is this case: 3754 * 3755 * |--- cluster # N--| 3756 * |--- extent ---| |---- requested region ---| 3757 * |==========| 3758 * 3759 * The second case that we need to test for is this one: 3760 * 3761 * |--------- cluster # N ----------------| 3762 * |--- requested region --| |------- extent ----| 3763 * |=======================| 3764 * 3765 * The third case is when the requested region lies between two extents 3766 * within the same cluster: 3767 * |------------- cluster # N-------------| 3768 * |----- ex -----| |---- ex_right ----| 3769 * |------ requested region ------| 3770 * |================| 3771 * 3772 * In each of the above cases, we need to set the map->m_pblk and 3773 * map->m_len so it corresponds to the return the extent labelled as 3774 * "|====|" from cluster #N, since it is already in use for data in 3775 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to 3776 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated 3777 * as a new "allocated" block region. Otherwise, we will return 0 and 3778 * ext4_ext_map_blocks() will then allocate one or more new clusters 3779 * by calling ext4_mb_new_blocks(). 3780 */ 3781static int get_implied_cluster_alloc(struct super_block *sb, 3782 struct ext4_map_blocks *map, 3783 struct ext4_extent *ex, 3784 struct ext4_ext_path *path) 3785{ 3786 struct ext4_sb_info *sbi = EXT4_SB(sb); 3787 ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); 3788 ext4_lblk_t ex_cluster_start, ex_cluster_end; 3789 ext4_lblk_t rr_cluster_start; 3790 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3791 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 3792 unsigned short ee_len = ext4_ext_get_actual_len(ex); 3793 3794 /* The extent passed in that we are trying to match */ 3795 ex_cluster_start = EXT4_B2C(sbi, ee_block); 3796 ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); 3797 3798 /* The requested region passed into ext4_map_blocks() */ 3799 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); 3800 3801 if ((rr_cluster_start == ex_cluster_end) || 3802 (rr_cluster_start == ex_cluster_start)) { 3803 if (rr_cluster_start == ex_cluster_end) 3804 ee_start += ee_len - 1; 3805 map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + 3806 c_offset; 3807 map->m_len = min(map->m_len, 3808 (unsigned) sbi->s_cluster_ratio - c_offset); 3809 /* 3810 * Check for and handle this case: 3811 * 3812 * |--------- cluster # N-------------| 3813 * |------- extent ----| 3814 * |--- requested region ---| 3815 * |===========| 3816 */ 3817 3818 if (map->m_lblk < ee_block) 3819 map->m_len = min(map->m_len, ee_block - map->m_lblk); 3820 3821 /* 3822 * Check for the case where there is already another allocated 3823 * block to the right of 'ex' but before the end of the cluster. 3824 * 3825 * |------------- cluster # N-------------| 3826 * |----- ex -----| |---- ex_right ----| 3827 * |------ requested region ------| 3828 * |================| 3829 */ 3830 if (map->m_lblk > ee_block) { 3831 ext4_lblk_t next = ext4_ext_next_allocated_block(path); 3832 map->m_len = min(map->m_len, next - map->m_lblk); 3833 } 3834 3835 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); 3836 return 1; 3837 } 3838 3839 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); 3840 return 0; 3841} 3842 3843 3844/* 3845 * Block allocation/map/preallocation routine for extents based files 3846 * 3847 * 3848 * Need to be called with 3849 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 3850 * (ie, create is zero). 
 * Otherwise, down_write(&EXT4_I(inode)->i_data_sem)
 *
 * return > 0, number of blocks already mapped/allocated
 *		if create == 0 and these are pre-allocated blocks
 *			buffer head is unmapped
 *		otherwise blocks are mapped
 *
 * return = 0, if plain look up failed (blocks have not been allocated)
 *		buffer head is unmapped
 *
 * return < 0, error case.
 */
int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
			struct ext4_map_blocks *map, int flags)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent newex, *ex, *ex2;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_fsblk_t newblock = 0;
	int free_on_err = 0, err = 0, depth;
	unsigned int allocated = 0, offset = 0;
	unsigned int allocated_clusters = 0;
	struct ext4_allocation_request ar;
	ext4_io_end_t *io = ext4_inode_aio(inode);
	ext4_lblk_t cluster_offset;
	int set_unwritten = 0;

	ext_debug("blocks %u/%u requested for inode %lu\n",
		  map->m_lblk, map->m_len, inode->i_ino);
	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);

	/* check in cache */
	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
		if (!newex.ee_start_lo && !newex.ee_start_hi) {
			if ((sbi->s_cluster_ratio > 1) &&
			    ext4_find_delalloc_cluster(inode, map->m_lblk))
				map->m_flags |= EXT4_MAP_FROM_CLUSTER;

			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
				/*
				 * block isn't allocated yet and
				 * user doesn't want to allocate it
				 */
				goto out2;
			}
			/* we should allocate requested block */
		} else {
			/* block is already allocated */
			if (sbi->s_cluster_ratio > 1)
				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
			newblock = map->m_lblk
				   - le32_to_cpu(newex.ee_block)
				   + ext4_ext_pblock(&newex);
			/* number of remaining blocks in the extent */
			allocated = ext4_ext_get_actual_len(&newex) -
				(map->m_lblk - le32_to_cpu(newex.ee_block));
			goto out;
		}
	}

	/* find extent for this block */
	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
	if (IS_ERR(path)) {
		err = PTR_ERR(path);
		path = NULL;
		goto out2;
	}

	depth = ext_depth(inode);

	/*
	 * consistent leaf must not be empty;
	 * this situation is possible, though, _during_ tree modification;
	 * this is why assert can't be put in ext4_ext_find_extent()
	 */
	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
		EXT4_ERROR_INODE(inode, "bad extent address "
				 "lblock: %lu, depth: %d pblock %lld",
				 (unsigned long) map->m_lblk, depth,
				 path[depth].p_block);
		err = -EIO;
		goto out2;
	}

	ex = path[depth].p_ext;
	if (ex) {
		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
		unsigned short ee_len;

		/*
		 * Uninitialized extents are treated as holes, except that
		 * we split out initialized portions during a write.
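		 *
		 * (For instance, assuming a write of "len" blocks at
		 * logical block w inside an uninitialized extent [a, a+n):
		 * the extent is split into up to three pieces, only
		 * [w, w+len) is marked initialized, and the remainder
		 * stays uninitialized and still reads back as zeroes.)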
		 */
		ee_len = ext4_ext_get_actual_len(ex);

		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);

		/* if found extent covers block, simply return it */
		if (in_range(map->m_lblk, ee_block, ee_len)) {
			newblock = map->m_lblk - ee_block + ee_start;
			/* number of remaining blocks in the extent */
			allocated = ee_len - (map->m_lblk - ee_block);
			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
				  ee_block, ee_len, newblock);

			/*
			 * Do not put uninitialized extent
			 * in the cache
			 */
			if (!ext4_ext_is_uninitialized(ex)) {
				ext4_ext_put_in_cache(inode, ee_block,
					ee_len, ee_start);
				goto out;
			}
			allocated = ext4_ext_handle_uninitialized_extents(
				handle, inode, map, path, flags,
				allocated, newblock);
			goto out3;
		}
	}

	if ((sbi->s_cluster_ratio > 1) &&
	    ext4_find_delalloc_cluster(inode, map->m_lblk))
		map->m_flags |= EXT4_MAP_FROM_CLUSTER;

	/*
	 * requested block isn't allocated yet;
	 * we can't try to create the block if the create flag is zero
	 */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
		/*
		 * put just found gap into cache to speed up
		 * subsequent requests
		 */
		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
		goto out2;
	}

	/*
	 * Okay, we need to do block allocation.
	 */
	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
	newex.ee_block = cpu_to_le32(map->m_lblk);
	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);

	/*
	 * If we are doing bigalloc, check to see if the extent returned
	 * by ext4_ext_find_extent() implies a cluster we can use.
	 */
	if (cluster_offset && ex &&
	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
		ar.len = allocated = map->m_len;
		newblock = map->m_pblk;
		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
		goto got_allocated_blocks;
	}

	/* find neighbour allocated blocks */
	ar.lleft = map->m_lblk;
	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
	if (err)
		goto out2;
	ar.lright = map->m_lblk;
	ex2 = NULL;
	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
	if (err)
		goto out2;

	/* Check if the extent after searching to the right implies a
	 * cluster we can use. */
	if ((sbi->s_cluster_ratio > 1) && ex2 &&
	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
		ar.len = allocated = map->m_len;
		newblock = map->m_pblk;
		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
		goto got_allocated_blocks;
	}

	/*
	 * See if request is beyond maximum number of blocks we can have in
	 * a single extent. For an initialized extent this limit is
	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
	 * EXT_UNINIT_MAX_LEN.
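	 * (Concretely: ee_len is a 16-bit field whose top bit marks an
	 * extent uninitialized, so EXT_INIT_MAX_LEN is 32768 blocks while
	 * EXT_UNINIT_MAX_LEN is one less, 32767.  With 4KiB blocks a
	 * single initialized extent can therefore map at most 128MiB.)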
	 */
	if (map->m_len > EXT_INIT_MAX_LEN &&
	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
		map->m_len = EXT_INIT_MAX_LEN;
	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
		map->m_len = EXT_UNINIT_MAX_LEN;

	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
	newex.ee_len = cpu_to_le16(map->m_len);
	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
	if (err)
		allocated = ext4_ext_get_actual_len(&newex);
	else
		allocated = map->m_len;

	/* allocate new block */
	ar.inode = inode;
	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
	ar.logical = map->m_lblk;
	/*
	 * We calculate the offset from the beginning of the cluster
	 * for the logical block number, since when we allocate a
	 * physical cluster, the physical block should start at the
	 * same offset from the beginning of the cluster.  This is
	 * needed so that future calls to get_implied_cluster_alloc()
	 * work correctly.
	 */
	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
	ar.goal -= offset;
	ar.logical -= offset;
	if (S_ISREG(inode->i_mode))
		ar.flags = EXT4_MB_HINT_DATA;
	else
		/* disable in-core preallocation for non-regular files */
		ar.flags = 0;
	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
	newblock = ext4_mb_new_blocks(handle, &ar, &err);
	if (!newblock)
		goto out2;
	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
		  ar.goal, newblock, allocated);
	free_on_err = 1;
	allocated_clusters = ar.len;
	ar.len = EXT4_C2B(sbi, ar.len) - offset;
	if (ar.len > allocated)
		ar.len = allocated;

got_allocated_blocks:
	/* try to insert new extent into found leaf and return */
	ext4_ext_store_pblock(&newex, newblock + offset);
	newex.ee_len = cpu_to_le16(ar.len);
	/* Mark uninitialized */
	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
		ext4_ext_mark_uninitialized(&newex);
		/*
		 * An io_end structure is created for every IO write to an
		 * uninitialized extent.  To avoid unnecessary conversion,
		 * here we flag the IO that really needs the conversion.
		 * For the non-async direct IO case, flag the inode state
		 * that we need to perform conversion when IO is done.
		 */
		if ((flags & EXT4_GET_BLOCKS_PRE_IO))
			set_unwritten = 1;
		if (ext4_should_dioread_nolock(inode))
			map->m_flags |= EXT4_MAP_UNINIT;
	}

	err = 0;
	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
		err = check_eofblocks_fl(handle, inode, map->m_lblk,
					 path, ar.len);
	if (!err)
		err = ext4_ext_insert_extent(handle, inode, path,
					     &newex, flags);

	if (!err && set_unwritten) {
		if (io)
			ext4_set_io_unwritten_flag(inode, io);
		else
			ext4_set_inode_state(inode,
					     EXT4_STATE_DIO_UNWRITTEN);
	}

	if (err && free_on_err) {
		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
		/* free data blocks we just allocated */
		/* not a good idea to call discard here directly,
		 * but otherwise we'd need to call it every free() */
		ext4_discard_preallocations(inode);
		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
				 ext4_ext_get_actual_len(&newex), fb_flags);
		goto out2;
	}

	/* previous routine could use block we allocated */
	newblock = ext4_ext_pblock(&newex);
	allocated = ext4_ext_get_actual_len(&newex);
	if (allocated > map->m_len)
		allocated = map->m_len;
	map->m_flags |= EXT4_MAP_NEW;

	/*
	 * Update reserved blocks/metadata blocks after successful
	 * block allocation which had been deferred till now.
	 */
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
		unsigned int reserved_clusters;
		/*
		 * Check how many clusters we had reserved for this
		 * allocated range
		 */
		reserved_clusters = get_reserved_cluster_alloc(inode,
						map->m_lblk, allocated);
		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
			if (reserved_clusters) {
				/*
				 * We have clusters reserved for this range.
				 * But since we are not doing actual allocation
				 * and are simply using blocks from a
				 * previously allocated cluster, we should
				 * release the reservation and not claim quota.
				 */
				ext4_da_update_reserve_space(inode,
						reserved_clusters, 0);
			}
		} else {
			BUG_ON(allocated_clusters < reserved_clusters);
			/* We will claim quota for all newly allocated blocks.*/
			ext4_da_update_reserve_space(inode, allocated_clusters,
							1);
			if (reserved_clusters < allocated_clusters) {
				struct ext4_inode_info *ei = EXT4_I(inode);
				int reservation = allocated_clusters -
						  reserved_clusters;
				/*
				 * It seems we claimed a few clusters outside
				 * of the range of this allocation.  We should
				 * give them back to the reservation pool.
				 * This can happen in the following case:
				 *
				 * * Suppose s_cluster_ratio is 4 (i.e., each
				 *   cluster has 4 blocks; thus the clusters
				 *   are [0-3], [4-7], [8-11], ...).
				 * * First comes a delayed allocation write
				 *   for logical blocks 10 & 11.  Since there
				 *   were no previous delayed allocated blocks
				 *   in the range [8-11], we would reserve 1
				 *   cluster for this write.
				 * * Next comes a write for logical blocks 3
				 *   to 8.  In this case, we will reserve 2
				 *   clusters (for [0-3] and [4-7]; and not
				 *   for [8-11], as that range already has
				 *   delayed allocated blocks).  The total
				 *   reserved cluster count thus becomes 3.
				 * * Now, during the delayed allocation
				 *   writeout time, we will first write blocks
				 *   [3-8] and allocate 3 clusters for writing
				 *   these blocks.  Also, we would claim all
				 *   of these three clusters.
				 * * Now when we come here to write out blocks
				 *   [10-11], we would expect to claim the
				 *   reservation of 1 cluster we had made (and
				 *   we would claim it, since there are no
				 *   more delayed allocated blocks in the
				 *   range [8-11]).  But our reserved cluster
				 *   count has already gone to 0.
				 *
				 * Thus, at step 4 above, when we determine
				 * that there are still some unwritten delayed
				 * allocated blocks outside of our current
				 * block range, we should increment the
				 * reserved clusters count so that when the
				 * remaining blocks finally get written, we
				 * can claim them.
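				 *
				 * (Working the example through, at the
				 * writeout of blocks [3-8]:
				 * allocated_clusters = 3 while
				 * reserved_clusters = 2, so the code below
				 * re-reserves reservation = 1 cluster on
				 * behalf of the still-delayed blocks
				 * [10-11].)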
				 */
				dquot_reserve_block(inode,
						EXT4_C2B(sbi, reservation));
				spin_lock(&ei->i_block_reservation_lock);
				ei->i_reserved_data_blocks += reservation;
				spin_unlock(&ei->i_block_reservation_lock);
			}
		}
	}

	/*
	 * Cache the extent and update transaction to commit on fdatasync only
	 * when it is _not_ an uninitialized extent.
	 */
	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
		ext4_update_inode_fsync_trans(handle, inode, 1);
	} else
		ext4_update_inode_fsync_trans(handle, inode, 0);
out:
	if (allocated > map->m_len)
		allocated = map->m_len;
	ext4_ext_show_leaf(inode, path);
	map->m_flags |= EXT4_MAP_MAPPED;
	map->m_pblk = newblock;
	map->m_len = allocated;
out2:
	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}

out3:
	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);

	return err ? err : allocated;
}

void ext4_ext_truncate(struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;
	struct super_block *sb = inode->i_sb;
	ext4_lblk_t last_block;
	handle_t *handle;
	loff_t page_len;
	int err = 0;

	/*
	 * finish any pending end_io work so we won't run the risk of
	 * converting any truncated blocks to initialized later
	 */
	ext4_flush_unwritten_io(inode);

	/*
	 * probably the first extent we're going to free will be the last
	 * one in the block
	 */
	err = ext4_writepage_trans_blocks(inode);
	handle = ext4_journal_start(inode, err);
	if (IS_ERR(handle))
		return;

	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
		page_len = PAGE_CACHE_SIZE -
			(inode->i_size & (PAGE_CACHE_SIZE - 1));

		err = ext4_discard_partial_page_buffers(handle,
			mapping, inode->i_size, page_len, 0);

		if (err)
			goto out_stop;
	}

	if (ext4_orphan_add(handle, inode))
		goto out_stop;

	down_write(&EXT4_I(inode)->i_data_sem);
	ext4_ext_invalidate_cache(inode);

	ext4_discard_preallocations(inode);

	/*
	 * TODO: optimization is possible here.
	 * Probably we need not scan at all,
	 * because page truncation is enough.
	 */

	/* we have to know where to truncate from in the crash case */
	EXT4_I(inode)->i_disksize = inode->i_size;
	ext4_mark_inode_dirty(handle, inode);

	last_block = (inode->i_size + sb->s_blocksize - 1)
			>> EXT4_BLOCK_SIZE_BITS(sb);
	err = ext4_es_remove_extent(inode, last_block,
				    EXT_MAX_BLOCKS - last_block);
	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);

	/* In a multi-transaction truncate, we only make the final
	 * transaction synchronous.
	 */
	if (IS_SYNC(inode))
		ext4_handle_sync(handle);

	up_write(&EXT4_I(inode)->i_data_sem);

out_stop:
	/*
	 * If this was a simple ftruncate() and the file will remain alive,
	 * then we need to clean up the orphan record which we created above.
	 * However, if this was a real unlink then we were called by
	 * ext4_delete_inode(), and we allow that function to clean up the
	 * orphan info for us.
	 */
	if (inode->i_nlink)
		ext4_orphan_del(handle, inode);

	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);
}

static void ext4_falloc_update_inode(struct inode *inode,
				     int mode, loff_t new_size, int update_ctime)
{
	struct timespec now;

	if (update_ctime) {
		now = current_fs_time(inode->i_sb);
		if (!timespec_equal(&inode->i_ctime, &now))
			inode->i_ctime = now;
	}
	/*
	 * Update only when preallocation was requested beyond
	 * the file size.
	 */
	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
		if (new_size > i_size_read(inode))
			i_size_write(inode, new_size);
		if (new_size > EXT4_I(inode)->i_disksize)
			ext4_update_i_disksize(inode, new_size);
	} else {
		/*
		 * Mark that we allocate beyond EOF so the subsequent truncate
		 * can proceed even if the new size is the same as i_size.
		 */
		if (new_size > i_size_read(inode))
			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
	}

}

/*
 * Preallocate space for a file.  This implements ext4's fallocate file
 * operation, which gets called from the sys_fallocate system call.
 * For block-mapped files, posix_fallocate should fall back to the method
 * of writing zeroes to the required new blocks (the same behavior which
 * is expected for file systems which do not support the fallocate()
 * system call).
 */
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	handle_t *handle;
	loff_t new_size;
	unsigned int max_blocks;
	int ret = 0;
	int ret2 = 0;
	int retries = 0;
	int flags;
	struct ext4_map_blocks map;
	unsigned int credits, blkbits = inode->i_blkbits;

	/*
	 * currently supporting (pre)allocate mode for extent-based
	 * files _only_
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		return -EOPNOTSUPP;

	/* Return error if mode is not supported */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return ext4_punch_hole(file, offset, len);

	trace_ext4_fallocate_enter(inode, offset, len, mode);
	map.m_lblk = offset >> blkbits;
	/*
	 * We can't just convert len to max_blocks because the request may
	 * not be block-aligned: if, e.g., blocksize = 4096, offset = 3072
	 * and len = 2048, the request spans two blocks.
	 */
	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
		- map.m_lblk;
	/*
	 * credits to insert 1 extent into extent tree
	 */
	credits = ext4_chunk_trans_blocks(inode, max_blocks);
	mutex_lock(&inode->i_mutex);
	ret = inode_newsize_ok(inode, (len + offset));
	if (ret) {
		mutex_unlock(&inode->i_mutex);
		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
		return ret;
	}
	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
	if (mode & FALLOC_FL_KEEP_SIZE)
		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
	/*
	 * Don't normalize the request if it can fit in one extent so
	 * that it doesn't get unnecessarily split into multiple
	 * extents.
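	 *
	 * (Rough illustration, assuming 4KiB blocks:
	 * EXT_UNINIT_MAX_LEN << blkbits is just under 128MiB, so a 100MiB
	 * request is served by a single uninitialized extent and skips
	 * mballoc's normalization, while a 200MiB request is left to be
	 * normalized and split.)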
	 */
	if (len <= EXT_UNINIT_MAX_LEN << blkbits)
		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;

	/* Prevent race condition between unwritten */
	ext4_flush_unwritten_io(inode);
retry:
	while (ret >= 0 && ret < max_blocks) {
		map.m_lblk = map.m_lblk + ret;
		map.m_len = max_blocks = max_blocks - ret;
		handle = ext4_journal_start(inode, credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			break;
		}
		ret = ext4_map_blocks(handle, inode, &map, flags);
		if (ret <= 0) {
#ifdef EXT4FS_DEBUG
			WARN_ON(ret <= 0);
			printk(KERN_ERR "%s: ext4_ext_map_blocks "
			       "returned error inode#%lu, block=%u, "
			       "max_blocks=%u", __func__,
			       inode->i_ino, map.m_lblk, max_blocks);
#endif
			ext4_mark_inode_dirty(handle, inode);
			ret2 = ext4_journal_stop(handle);
			break;
		}
		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
						blkbits) >> blkbits))
			new_size = offset + len;
		else
			new_size = ((loff_t) map.m_lblk + ret) << blkbits;

		ext4_falloc_update_inode(inode, mode, new_size,
					 (map.m_flags & EXT4_MAP_NEW));
		ext4_mark_inode_dirty(handle, inode);
		if ((file->f_flags & O_SYNC) && ret >= max_blocks)
			ext4_handle_sync(handle);
		ret2 = ext4_journal_stop(handle);
		if (ret2)
			break;
	}
	if (ret == -ENOSPC &&
	    ext4_should_retry_alloc(inode->i_sb, &retries)) {
		ret = 0;
		goto retry;
	}
	mutex_unlock(&inode->i_mutex);
	trace_ext4_fallocate_exit(inode, offset, max_blocks,
				  ret > 0 ? ret2 : ret);
	return ret > 0 ? ret2 : ret;
}

/*
 * This function converts a range of blocks to written extents.  The
 * caller of this function will pass the start offset and the size;
 * all unwritten extents within this range will be converted to
 * written extents.
 *
 * This function is called from the direct IO end io callback
 * function, to convert the fallocated extents after IO is completed.
 * Returns 0 on success.
 */
int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
				   ssize_t len)
{
	handle_t *handle;
	unsigned int max_blocks;
	int ret = 0;
	int ret2 = 0;
	struct ext4_map_blocks map;
	unsigned int credits, blkbits = inode->i_blkbits;

	map.m_lblk = offset >> blkbits;
	/*
	 * We can't just convert len to max_blocks because the request may
	 * not be block-aligned: if, e.g., blocksize = 4096, offset = 3072
	 * and len = 2048, the request spans two blocks.
	 */
	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
		      map.m_lblk);
	/*
	 * credits to insert 1 extent into extent tree
	 */
	credits = ext4_chunk_trans_blocks(inode, max_blocks);
	while (ret >= 0 && ret < max_blocks) {
		map.m_lblk += ret;
		map.m_len = (max_blocks -= ret);
		handle = ext4_journal_start(inode, credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			break;
		}
		ret = ext4_map_blocks(handle, inode, &map,
				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
		if (ret <= 0) {
			WARN_ON(ret <= 0);
			ext4_msg(inode->i_sb, KERN_ERR,
				 "%s:%d: inode #%lu: block %u: len %u: "
				 "ext4_ext_map_blocks returned %d",
				 __func__, __LINE__, inode->i_ino, map.m_lblk,
				 map.m_len, ret);
		}
		ext4_mark_inode_dirty(handle, inode);
		ret2 = ext4_journal_stop(handle);
		if (ret <= 0 || ret2)
			break;
	}
	return ret > 0 ?
		ret2 : ret;
}

/*
 * If newex is not an existing extent (newex->ec_start equals zero),
 * find the delayed extent at the start of newex, update newex
 * accordingly, and return the start of the next delayed extent.
 *
 * If newex is an existing extent (newex->ec_start is not equal to
 * zero), return the start of the next delayed extent, or
 * EXT_MAX_BLOCKS if no delayed extent is found.  Leave newex
 * unmodified.
 */
static int ext4_find_delayed_extent(struct inode *inode,
				    struct ext4_ext_cache *newex)
{
	struct extent_status es;
	ext4_lblk_t next_del;

	es.start = newex->ec_block;
	next_del = ext4_es_find_extent(inode, &es);

	if (newex->ec_start == 0) {
		/*
		 * No extent in the extent tree contains block
		 * @newex->ec_block, so the block may lie in 1) a hole or
		 * 2) a delayed extent.
		 */
		if (es.len == 0)
			/* A hole found. */
			return 0;

		if (es.start > newex->ec_block) {
			/* A hole found. */
			newex->ec_len = min(es.start - newex->ec_block,
					    newex->ec_len);
			return 0;
		}

		newex->ec_len = es.start + es.len - newex->ec_block;
	}

	return next_del;
}

/* fiemap flags we can handle are specified here */
#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)

static int ext4_xattr_fiemap(struct inode *inode,
				struct fiemap_extent_info *fieinfo)
{
	__u64 physical = 0;
	__u64 length;
	__u32 flags = FIEMAP_EXTENT_LAST;
	int blockbits = inode->i_sb->s_blocksize_bits;
	int error = 0;

	/* in-inode? */
	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
		struct ext4_iloc iloc;
		int offset;	/* offset of xattr in inode */

		error = ext4_get_inode_loc(inode, &iloc);
		if (error)
			return error;
		physical = iloc.bh->b_blocknr << blockbits;
		offset = EXT4_GOOD_OLD_INODE_SIZE +
				EXT4_I(inode)->i_extra_isize;
		physical += offset;
		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
		flags |= FIEMAP_EXTENT_DATA_INLINE;
		brelse(iloc.bh);
	} else { /* external block */
		physical = EXT4_I(inode)->i_file_acl << blockbits;
		length = inode->i_sb->s_blocksize;
	}

	if (physical)
		error = fiemap_fill_next_extent(fieinfo, 0, physical,
						length, flags);
	return (error < 0 ? error : 0);
}

/*
 * ext4_ext_punch_hole
 *
 * Punches a hole of "length" bytes in a file starting
 * at byte "offset"
 *
 * @file:   The file to punch a hole in
 * @offset: The starting byte offset of the hole
 * @length: The length of the hole
 *
 * Returns the number of blocks removed or negative on err
 */
int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	ext4_lblk_t first_block, stop_block;
	struct address_space *mapping = inode->i_mapping;
	handle_t *handle;
	loff_t first_page, last_page, page_len;
	loff_t first_page_offset, last_page_offset;
	int credits, err = 0;

	/*
	 * Write out all dirty pages to avoid race conditions,
	 * then release them.
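	 *
	 * (Overall flow, assuming 4KiB pages: write back the affected byte
	 * range, drop the whole pages of the hole from the page cache,
	 * zero out the partial head/tail pages, then remove every extent
	 * that lies entirely inside the hole.)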
	 */
	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
		err = filemap_write_and_wait_range(mapping,
			offset, offset + length - 1);

		if (err)
			return err;
	}

	mutex_lock(&inode->i_mutex);
	/* It's not possible to punch a hole in an append-only file */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
		err = -EPERM;
		goto out_mutex;
	}
	if (IS_SWAPFILE(inode)) {
		err = -ETXTBSY;
		goto out_mutex;
	}

	/* No need to punch hole beyond i_size */
	if (offset >= inode->i_size)
		goto out_mutex;

	/*
	 * If the hole extends beyond i_size, set the hole
	 * to end after the page that contains i_size
	 */
	if (offset + length > inode->i_size) {
		length = inode->i_size +
		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
		   offset;
	}

	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	last_page = (offset + length) >> PAGE_CACHE_SHIFT;

	first_page_offset = first_page << PAGE_CACHE_SHIFT;
	last_page_offset = last_page << PAGE_CACHE_SHIFT;

	/* Now release the pages */
	if (last_page_offset > first_page_offset) {
		truncate_pagecache_range(inode, first_page_offset,
					 last_page_offset - 1);
	}

	/* Wait for all existing dio workers; newcomers will block on i_mutex */
	ext4_inode_block_unlocked_dio(inode);
	err = ext4_flush_unwritten_io(inode);
	if (err)
		goto out_dio;
	inode_dio_wait(inode);

	credits = ext4_writepage_trans_blocks(inode);
	handle = ext4_journal_start(inode, credits);
	if (IS_ERR(handle)) {
		err = PTR_ERR(handle);
		goto out_dio;
	}


	/*
	 * Now we need to zero out the non-page-aligned data in the
	 * pages at the start and tail of the hole, and unmap the buffer
	 * heads for the block aligned regions of the page that were
	 * completely zeroed.
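	 *
	 * (Worked example, assuming 4KiB pages: punching offset = 1024,
	 * length = 6144 gives first_page = last_page = 1, so no whole page
	 * is released; instead file bytes [1024, 4095] (tail of the first
	 * page) and [4096, 7167] (head of the second) are zeroed by the
	 * two ext4_discard_partial_page_buffers() calls below.)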
	 */
	if (first_page > last_page) {
		/*
		 * If the file space being truncated is contained within a page
		 * just zero out and unmap the middle of that page
		 */
		err = ext4_discard_partial_page_buffers(handle,
			mapping, offset, length, 0);

		if (err)
			goto out;
	} else {
		/*
		 * zero out and unmap the partial page that contains
		 * the start of the hole
		 */
		page_len = first_page_offset - offset;
		if (page_len > 0) {
			err = ext4_discard_partial_page_buffers(handle, mapping,
						offset, page_len, 0);
			if (err)
				goto out;
		}

		/*
		 * zero out and unmap the partial page that contains
		 * the end of the hole
		 */
		page_len = offset + length - last_page_offset;
		if (page_len > 0) {
			err = ext4_discard_partial_page_buffers(handle, mapping,
					last_page_offset, page_len, 0);
			if (err)
				goto out;
		}
	}

	/*
	 * If i_size is contained in the last page, we need to
	 * unmap and zero the partial page after i_size
	 */
	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
	    inode->i_size % PAGE_CACHE_SIZE != 0) {

		page_len = PAGE_CACHE_SIZE -
			(inode->i_size & (PAGE_CACHE_SIZE - 1));

		if (page_len > 0) {
			err = ext4_discard_partial_page_buffers(handle,
					mapping, inode->i_size, page_len, 0);

			if (err)
				goto out;
		}
	}

	first_block = (offset + sb->s_blocksize - 1) >>
		EXT4_BLOCK_SIZE_BITS(sb);
	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);

	/* If there are no blocks to remove, return now */
	if (first_block >= stop_block)
		goto out;

	down_write(&EXT4_I(inode)->i_data_sem);
	ext4_ext_invalidate_cache(inode);
	ext4_discard_preallocations(inode);

	err = ext4_es_remove_extent(inode, first_block,
				    stop_block - first_block);
	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);

	ext4_ext_invalidate_cache(inode);
	ext4_discard_preallocations(inode);

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);

	up_write(&EXT4_I(inode)->i_data_sem);

out:
	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);
out_dio:
	ext4_inode_resume_unlocked_dio(inode);
out_mutex:
	mutex_unlock(&inode->i_mutex);
	return err;
}

int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len)
{
	ext4_lblk_t start_blk;
	int error = 0;

	/* fallback to generic here if not in extents fmt */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		return generic_block_fiemap(inode, fieinfo, start, len,
			ext4_get_block);

	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
		return -EBADR;

	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
		error = ext4_xattr_fiemap(inode, fieinfo);
	} else {
		ext4_lblk_t len_blks;
		__u64 last_blk;

		start_blk = start >> inode->i_sb->s_blocksize_bits;
		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
		if (last_blk >= EXT_MAX_BLOCKS)
			last_blk = EXT_MAX_BLOCKS-1;
		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;

		/*
		 * Walk the extent tree gathering extent information
		 * and pushing extents back to the user.
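		 *
		 * (For example, assuming 4KiB blocks: start = 1MiB and
		 * len = 1MiB give start_blk = 256 and
		 * last_blk = (2MiB - 1) >> 12 = 511, so len_blks = 256.)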
		 */
		error = ext4_fill_fiemap_extents(inode, start_blk,
						 len_blks, fieinfo);
	}

	return error;
}
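
/*
 * Usage sketch, for illustration only (not part of the kernel build):
 * the ext4_fiemap() path above is typically reached from userspace via
 * the FS_IOC_FIEMAP ioctl.  A minimal, hedged example; the extent count
 * of 32 is an arbitrary assumption for the sketch:
 *
 *	#include <linux/fiemap.h>
 *	#include <linux/fs.h>
 *	#include <sys/ioctl.h>
 *	#include <stdlib.h>
 *
 *	int dump_extents(int fd)
 *	{
 *		unsigned int n = 32;
 *		struct fiemap *fm;
 *
 *		fm = calloc(1, sizeof(*fm) + n * sizeof(struct fiemap_extent));
 *		if (!fm)
 *			return -1;
 *		fm->fm_start = 0;
 *		fm->fm_length = FIEMAP_MAX_OFFSET;
 *		fm->fm_flags = FIEMAP_FLAG_SYNC;
 *		fm->fm_extent_count = n;
 *		if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
 *			free(fm);
 *			return -1;
 *		}
 *		(on return, fm->fm_mapped_extents entries of
 *		fm->fm_extents[] are valid)
 *		free(fm);
 *		return 0;
 *	}
 */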