aops.c revision 53ef99cad9878f02f27bb30bc304fc42af8bdd6e
1/* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public 17 * License along with this program; if not, write to the 18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 19 * Boston, MA 021110-1307, USA. 20 */ 21 22#include <linux/fs.h> 23#include <linux/slab.h> 24#include <linux/highmem.h> 25#include <linux/pagemap.h> 26#include <asm/byteorder.h> 27#include <linux/swap.h> 28#include <linux/pipe_fs_i.h> 29#include <linux/mpage.h> 30 31#define MLOG_MASK_PREFIX ML_FILE_IO 32#include <cluster/masklog.h> 33 34#include "ocfs2.h" 35 36#include "alloc.h" 37#include "aops.h" 38#include "dlmglue.h" 39#include "extent_map.h" 40#include "file.h" 41#include "inode.h" 42#include "journal.h" 43#include "suballoc.h" 44#include "super.h" 45#include "symlink.h" 46 47#include "buffer_head_io.h" 48 49static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, 50 struct buffer_head *bh_result, int create) 51{ 52 int err = -EIO; 53 int status; 54 struct ocfs2_dinode *fe = NULL; 55 struct buffer_head *bh = NULL; 56 struct buffer_head *buffer_cache_bh = NULL; 57 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 58 void *kaddr; 59 60 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 61 (unsigned long long)iblock, bh_result, create); 62 63 BUG_ON(ocfs2_inode_is_fast_symlink(inode)); 64 65 if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { 66 mlog(ML_ERROR, "block offset > PATH_MAX: %llu", 67 (unsigned long long)iblock); 68 goto bail; 69 } 70 71 status = ocfs2_read_inode_block(inode, &bh); 72 if (status < 0) { 73 mlog_errno(status); 74 goto bail; 75 } 76 fe = (struct ocfs2_dinode *) bh->b_data; 77 78 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, 79 le32_to_cpu(fe->i_clusters))) { 80 mlog(ML_ERROR, "block offset is outside the allocated size: " 81 "%llu\n", (unsigned long long)iblock); 82 goto bail; 83 } 84 85 /* We don't use the page cache to create symlink data, so if 86 * need be, copy it over from the buffer cache. */ 87 if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { 88 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + 89 iblock; 90 buffer_cache_bh = sb_getblk(osb->sb, blkno); 91 if (!buffer_cache_bh) { 92 mlog(ML_ERROR, "couldn't getblock for symlink!\n"); 93 goto bail; 94 } 95 96 /* we haven't locked out transactions, so a commit 97 * could've happened. Since we've got a reference on 98 * the bh, even if it commits while we're doing the 99 * copy, the data is still good. */ 100 if (buffer_jbd(buffer_cache_bh) 101 && ocfs2_inode_is_new(inode)) { 102 kaddr = kmap_atomic(bh_result->b_page, KM_USER0); 103 if (!kaddr) { 104 mlog(ML_ERROR, "couldn't kmap!\n"); 105 goto bail; 106 } 107 memcpy(kaddr + (bh_result->b_size * iblock), 108 buffer_cache_bh->b_data, 109 bh_result->b_size); 110 kunmap_atomic(kaddr, KM_USER0); 111 set_buffer_uptodate(bh_result); 112 } 113 brelse(buffer_cache_bh); 114 } 115 116 map_bh(bh_result, inode->i_sb, 117 le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); 118 119 err = 0; 120 121bail: 122 brelse(bh); 123 124 mlog_exit(err); 125 return err; 126} 127 128static int ocfs2_get_block(struct inode *inode, sector_t iblock, 129 struct buffer_head *bh_result, int create) 130{ 131 int err = 0; 132 unsigned int ext_flags; 133 u64 max_blocks = bh_result->b_size >> inode->i_blkbits; 134 u64 p_blkno, count, past_eof; 135 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 136 137 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 138 (unsigned long long)iblock, bh_result, create); 139 140 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) 141 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", 142 inode, inode->i_ino); 143 144 if (S_ISLNK(inode->i_mode)) { 145 /* this always does I/O for some reason. */ 146 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); 147 goto bail; 148 } 149 150 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count, 151 &ext_flags); 152 if (err) { 153 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 154 "%llu, NULL)\n", err, inode, (unsigned long long)iblock, 155 (unsigned long long)p_blkno); 156 goto bail; 157 } 158 159 if (max_blocks < count) 160 count = max_blocks; 161 162 /* 163 * ocfs2 never allocates in this function - the only time we 164 * need to use BH_New is when we're extending i_size on a file 165 * system which doesn't support holes, in which case BH_New 166 * allows block_prepare_write() to zero. 167 * 168 * If we see this on a sparse file system, then a truncate has 169 * raced us and removed the cluster. In this case, we clear 170 * the buffers dirty and uptodate bits and let the buffer code 171 * ignore it as a hole. 172 */ 173 if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) { 174 clear_buffer_dirty(bh_result); 175 clear_buffer_uptodate(bh_result); 176 goto bail; 177 } 178 179 /* Treat the unwritten extent as a hole for zeroing purposes. */ 180 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 181 map_bh(bh_result, inode->i_sb, p_blkno); 182 183 bh_result->b_size = count << inode->i_blkbits; 184 185 if (!ocfs2_sparse_alloc(osb)) { 186 if (p_blkno == 0) { 187 err = -EIO; 188 mlog(ML_ERROR, 189 "iblock = %llu p_blkno = %llu blkno=(%llu)\n", 190 (unsigned long long)iblock, 191 (unsigned long long)p_blkno, 192 (unsigned long long)OCFS2_I(inode)->ip_blkno); 193 mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); 194 dump_stack(); 195 } 196 197 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 198 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 199 (unsigned long long)past_eof); 200 201 if (create && (iblock >= past_eof)) 202 set_buffer_new(bh_result); 203 } 204 205bail: 206 if (err < 0) 207 err = -EIO; 208 209 mlog_exit(err); 210 return err; 211} 212 213int ocfs2_read_inline_data(struct inode *inode, struct page *page, 214 struct buffer_head *di_bh) 215{ 216 void *kaddr; 217 loff_t size; 218 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 219 220 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { 221 ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", 222 (unsigned long long)OCFS2_I(inode)->ip_blkno); 223 return -EROFS; 224 } 225 226 size = i_size_read(inode); 227 228 if (size > PAGE_CACHE_SIZE || 229 size > ocfs2_max_inline_data(inode->i_sb)) { 230 ocfs2_error(inode->i_sb, 231 "Inode %llu has with inline data has bad size: %Lu", 232 (unsigned long long)OCFS2_I(inode)->ip_blkno, 233 (unsigned long long)size); 234 return -EROFS; 235 } 236 237 kaddr = kmap_atomic(page, KM_USER0); 238 if (size) 239 memcpy(kaddr, di->id2.i_data.id_data, size); 240 /* Clear the remaining part of the page */ 241 memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); 242 flush_dcache_page(page); 243 kunmap_atomic(kaddr, KM_USER0); 244 245 SetPageUptodate(page); 246 247 return 0; 248} 249 250static int ocfs2_readpage_inline(struct inode *inode, struct page *page) 251{ 252 int ret; 253 struct buffer_head *di_bh = NULL; 254 255 BUG_ON(!PageLocked(page)); 256 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); 257 258 ret = ocfs2_read_inode_block(inode, &di_bh); 259 if (ret) { 260 mlog_errno(ret); 261 goto out; 262 } 263 264 ret = ocfs2_read_inline_data(inode, page, di_bh); 265out: 266 unlock_page(page); 267 268 brelse(di_bh); 269 return ret; 270} 271 272static int ocfs2_readpage(struct file *file, struct page *page) 273{ 274 struct inode *inode = page->mapping->host; 275 struct ocfs2_inode_info *oi = OCFS2_I(inode); 276 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; 277 int ret, unlock = 1; 278 279 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 280 281 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); 282 if (ret != 0) { 283 if (ret == AOP_TRUNCATED_PAGE) 284 unlock = 0; 285 mlog_errno(ret); 286 goto out; 287 } 288 289 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 290 ret = AOP_TRUNCATED_PAGE; 291 goto out_inode_unlock; 292 } 293 294 /* 295 * i_size might have just been updated as we grabed the meta lock. We 296 * might now be discovering a truncate that hit on another node. 297 * block_read_full_page->get_block freaks out if it is asked to read 298 * beyond the end of a file, so we check here. Callers 299 * (generic_file_read, vm_ops->fault) are clever enough to check i_size 300 * and notice that the page they just read isn't needed. 301 * 302 * XXX sys_readahead() seems to get that wrong? 303 */ 304 if (start >= i_size_read(inode)) { 305 zero_user(page, 0, PAGE_SIZE); 306 SetPageUptodate(page); 307 ret = 0; 308 goto out_alloc; 309 } 310 311 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 312 ret = ocfs2_readpage_inline(inode, page); 313 else 314 ret = block_read_full_page(page, ocfs2_get_block); 315 unlock = 0; 316 317out_alloc: 318 up_read(&OCFS2_I(inode)->ip_alloc_sem); 319out_inode_unlock: 320 ocfs2_inode_unlock(inode, 0); 321out: 322 if (unlock) 323 unlock_page(page); 324 mlog_exit(ret); 325 return ret; 326} 327 328/* 329 * This is used only for read-ahead. Failures or difficult to handle 330 * situations are safe to ignore. 331 * 332 * Right now, we don't bother with BH_Boundary - in-inode extent lists 333 * are quite large (243 extents on 4k blocks), so most inodes don't 334 * grow out to a tree. If need be, detecting boundary extents could 335 * trivially be added in a future version of ocfs2_get_block(). 336 */ 337static int ocfs2_readpages(struct file *filp, struct address_space *mapping, 338 struct list_head *pages, unsigned nr_pages) 339{ 340 int ret, err = -EIO; 341 struct inode *inode = mapping->host; 342 struct ocfs2_inode_info *oi = OCFS2_I(inode); 343 loff_t start; 344 struct page *last; 345 346 /* 347 * Use the nonblocking flag for the dlm code to avoid page 348 * lock inversion, but don't bother with retrying. 349 */ 350 ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); 351 if (ret) 352 return err; 353 354 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 355 ocfs2_inode_unlock(inode, 0); 356 return err; 357 } 358 359 /* 360 * Don't bother with inline-data. There isn't anything 361 * to read-ahead in that case anyway... 362 */ 363 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 364 goto out_unlock; 365 366 /* 367 * Check whether a remote node truncated this file - we just 368 * drop out in that case as it's not worth handling here. 369 */ 370 last = list_entry(pages->prev, struct page, lru); 371 start = (loff_t)last->index << PAGE_CACHE_SHIFT; 372 if (start >= i_size_read(inode)) 373 goto out_unlock; 374 375 err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); 376 377out_unlock: 378 up_read(&oi->ip_alloc_sem); 379 ocfs2_inode_unlock(inode, 0); 380 381 return err; 382} 383 384/* Note: Because we don't support holes, our allocation has 385 * already happened (allocation writes zeros to the file data) 386 * so we don't have to worry about ordered writes in 387 * ocfs2_writepage. 388 * 389 * ->writepage is called during the process of invalidating the page cache 390 * during blocked lock processing. It can't block on any cluster locks 391 * to during block mapping. It's relying on the fact that the block 392 * mapping can't have disappeared under the dirty pages that it is 393 * being asked to write back. 394 */ 395static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) 396{ 397 int ret; 398 399 mlog_entry("(0x%p)\n", page); 400 401 ret = block_write_full_page(page, ocfs2_get_block, wbc); 402 403 mlog_exit(ret); 404 405 return ret; 406} 407 408/* 409 * This is called from ocfs2_write_zero_page() which has handled it's 410 * own cluster locking and has ensured allocation exists for those 411 * blocks to be written. 412 */ 413int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, 414 unsigned from, unsigned to) 415{ 416 int ret; 417 418 ret = block_prepare_write(page, from, to, ocfs2_get_block); 419 420 return ret; 421} 422 423/* Taken from ext3. We don't necessarily need the full blown 424 * functionality yet, but IMHO it's better to cut and paste the whole 425 * thing so we can avoid introducing our own bugs (and easily pick up 426 * their fixes when they happen) --Mark */ 427int walk_page_buffers( handle_t *handle, 428 struct buffer_head *head, 429 unsigned from, 430 unsigned to, 431 int *partial, 432 int (*fn)( handle_t *handle, 433 struct buffer_head *bh)) 434{ 435 struct buffer_head *bh; 436 unsigned block_start, block_end; 437 unsigned blocksize = head->b_size; 438 int err, ret = 0; 439 struct buffer_head *next; 440 441 for ( bh = head, block_start = 0; 442 ret == 0 && (bh != head || !block_start); 443 block_start = block_end, bh = next) 444 { 445 next = bh->b_this_page; 446 block_end = block_start + blocksize; 447 if (block_end <= from || block_start >= to) { 448 if (partial && !buffer_uptodate(bh)) 449 *partial = 1; 450 continue; 451 } 452 err = (*fn)(handle, bh); 453 if (!ret) 454 ret = err; 455 } 456 return ret; 457} 458 459handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 460 struct page *page, 461 unsigned from, 462 unsigned to) 463{ 464 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 465 handle_t *handle; 466 int ret = 0; 467 468 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 469 if (IS_ERR(handle)) { 470 ret = -ENOMEM; 471 mlog_errno(ret); 472 goto out; 473 } 474 475 if (ocfs2_should_order_data(inode)) { 476 ret = ocfs2_jbd2_file_inode(handle, inode); 477 if (ret < 0) 478 mlog_errno(ret); 479 } 480out: 481 if (ret) { 482 if (!IS_ERR(handle)) 483 ocfs2_commit_trans(osb, handle); 484 handle = ERR_PTR(ret); 485 } 486 return handle; 487} 488 489static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 490{ 491 sector_t status; 492 u64 p_blkno = 0; 493 int err = 0; 494 struct inode *inode = mapping->host; 495 496 mlog_entry("(block = %llu)\n", (unsigned long long)block); 497 498 /* We don't need to lock journal system files, since they aren't 499 * accessed concurrently from multiple nodes. 500 */ 501 if (!INODE_JOURNAL(inode)) { 502 err = ocfs2_inode_lock(inode, NULL, 0); 503 if (err) { 504 if (err != -ENOENT) 505 mlog_errno(err); 506 goto bail; 507 } 508 down_read(&OCFS2_I(inode)->ip_alloc_sem); 509 } 510 511 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) 512 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, 513 NULL); 514 515 if (!INODE_JOURNAL(inode)) { 516 up_read(&OCFS2_I(inode)->ip_alloc_sem); 517 ocfs2_inode_unlock(inode, 0); 518 } 519 520 if (err) { 521 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", 522 (unsigned long long)block); 523 mlog_errno(err); 524 goto bail; 525 } 526 527bail: 528 status = err ? 0 : p_blkno; 529 530 mlog_exit((int)status); 531 532 return status; 533} 534 535/* 536 * TODO: Make this into a generic get_blocks function. 537 * 538 * From do_direct_io in direct-io.c: 539 * "So what we do is to permit the ->get_blocks function to populate 540 * bh.b_size with the size of IO which is permitted at this offset and 541 * this i_blkbits." 542 * 543 * This function is called directly from get_more_blocks in direct-io.c. 544 * 545 * called like this: dio->get_blocks(dio->inode, fs_startblk, 546 * fs_count, map_bh, dio->rw == WRITE); 547 */ 548static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 549 struct buffer_head *bh_result, int create) 550{ 551 int ret; 552 u64 p_blkno, inode_blocks, contig_blocks; 553 unsigned int ext_flags; 554 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 555 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 556 557 /* This function won't even be called if the request isn't all 558 * nicely aligned and of the right size, so there's no need 559 * for us to check any of that. */ 560 561 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 562 563 /* 564 * Any write past EOF is not allowed because we'd be extending. 565 */ 566 if (create && (iblock + max_blocks) > inode_blocks) { 567 ret = -EIO; 568 goto bail; 569 } 570 571 /* This figures out the size of the next contiguous block, and 572 * our logical offset */ 573 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 574 &contig_blocks, &ext_flags); 575 if (ret) { 576 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 577 (unsigned long long)iblock); 578 ret = -EIO; 579 goto bail; 580 } 581 582 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { 583 ocfs2_error(inode->i_sb, 584 "Inode %llu has a hole at block %llu\n", 585 (unsigned long long)OCFS2_I(inode)->ip_blkno, 586 (unsigned long long)iblock); 587 ret = -EROFS; 588 goto bail; 589 } 590 591 /* 592 * get_more_blocks() expects us to describe a hole by clearing 593 * the mapped bit on bh_result(). 594 * 595 * Consider an unwritten extent as a hole. 596 */ 597 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 598 map_bh(bh_result, inode->i_sb, p_blkno); 599 else { 600 /* 601 * ocfs2_prepare_inode_for_write() should have caught 602 * the case where we'd be filling a hole and triggered 603 * a buffered write instead. 604 */ 605 if (create) { 606 ret = -EIO; 607 mlog_errno(ret); 608 goto bail; 609 } 610 611 clear_buffer_mapped(bh_result); 612 } 613 614 /* make sure we don't map more than max_blocks blocks here as 615 that's all the kernel will handle at this point. */ 616 if (max_blocks < contig_blocks) 617 contig_blocks = max_blocks; 618 bh_result->b_size = contig_blocks << blocksize_bits; 619bail: 620 return ret; 621} 622 623/* 624 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 625 * particularly interested in the aio/dio case. Like the core uses 626 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 627 * truncation on another. 628 */ 629static void ocfs2_dio_end_io(struct kiocb *iocb, 630 loff_t offset, 631 ssize_t bytes, 632 void *private) 633{ 634 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 635 int level; 636 637 /* this io's submitter should not have unlocked this before we could */ 638 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 639 640 ocfs2_iocb_clear_rw_locked(iocb); 641 642 level = ocfs2_iocb_rw_locked_level(iocb); 643 if (!level) 644 up_read(&inode->i_alloc_sem); 645 ocfs2_rw_unlock(inode, level); 646} 647 648/* 649 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen 650 * from ext3. PageChecked() bits have been removed as OCFS2 does not 651 * do journalled data. 652 */ 653static void ocfs2_invalidatepage(struct page *page, unsigned long offset) 654{ 655 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 656 657 jbd2_journal_invalidatepage(journal, page, offset); 658} 659 660static int ocfs2_releasepage(struct page *page, gfp_t wait) 661{ 662 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 663 664 if (!page_has_buffers(page)) 665 return 0; 666 return jbd2_journal_try_to_free_buffers(journal, page, wait); 667} 668 669static ssize_t ocfs2_direct_IO(int rw, 670 struct kiocb *iocb, 671 const struct iovec *iov, 672 loff_t offset, 673 unsigned long nr_segs) 674{ 675 struct file *file = iocb->ki_filp; 676 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 677 int ret; 678 679 mlog_entry_void(); 680 681 /* 682 * Fallback to buffered I/O if we see an inode without 683 * extents. 684 */ 685 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 686 return 0; 687 688 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 689 inode->i_sb->s_bdev, iov, offset, 690 nr_segs, 691 ocfs2_direct_IO_get_blocks, 692 ocfs2_dio_end_io); 693 694 mlog_exit(ret); 695 return ret; 696} 697 698static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 699 u32 cpos, 700 unsigned int *start, 701 unsigned int *end) 702{ 703 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; 704 705 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { 706 unsigned int cpp; 707 708 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); 709 710 cluster_start = cpos % cpp; 711 cluster_start = cluster_start << osb->s_clustersize_bits; 712 713 cluster_end = cluster_start + osb->s_clustersize; 714 } 715 716 BUG_ON(cluster_start > PAGE_SIZE); 717 BUG_ON(cluster_end > PAGE_SIZE); 718 719 if (start) 720 *start = cluster_start; 721 if (end) 722 *end = cluster_end; 723} 724 725/* 726 * 'from' and 'to' are the region in the page to avoid zeroing. 727 * 728 * If pagesize > clustersize, this function will avoid zeroing outside 729 * of the cluster boundary. 730 * 731 * from == to == 0 is code for "zero the entire cluster region" 732 */ 733static void ocfs2_clear_page_regions(struct page *page, 734 struct ocfs2_super *osb, u32 cpos, 735 unsigned from, unsigned to) 736{ 737 void *kaddr; 738 unsigned int cluster_start, cluster_end; 739 740 ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); 741 742 kaddr = kmap_atomic(page, KM_USER0); 743 744 if (from || to) { 745 if (from > cluster_start) 746 memset(kaddr + cluster_start, 0, from - cluster_start); 747 if (to < cluster_end) 748 memset(kaddr + to, 0, cluster_end - to); 749 } else { 750 memset(kaddr + cluster_start, 0, cluster_end - cluster_start); 751 } 752 753 kunmap_atomic(kaddr, KM_USER0); 754} 755 756/* 757 * Nonsparse file systems fully allocate before we get to the write 758 * code. This prevents ocfs2_write() from tagging the write as an 759 * allocating one, which means ocfs2_map_page_blocks() might try to 760 * read-in the blocks at the tail of our file. Avoid reading them by 761 * testing i_size against each block offset. 762 */ 763static int ocfs2_should_read_blk(struct inode *inode, struct page *page, 764 unsigned int block_start) 765{ 766 u64 offset = page_offset(page) + block_start; 767 768 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 769 return 1; 770 771 if (i_size_read(inode) > offset) 772 return 1; 773 774 return 0; 775} 776 777/* 778 * Some of this taken from block_prepare_write(). We already have our 779 * mapping by now though, and the entire write will be allocating or 780 * it won't, so not much need to use BH_New. 781 * 782 * This will also skip zeroing, which is handled externally. 783 */ 784int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, 785 struct inode *inode, unsigned int from, 786 unsigned int to, int new) 787{ 788 int ret = 0; 789 struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; 790 unsigned int block_end, block_start; 791 unsigned int bsize = 1 << inode->i_blkbits; 792 793 if (!page_has_buffers(page)) 794 create_empty_buffers(page, bsize, 0); 795 796 head = page_buffers(page); 797 for (bh = head, block_start = 0; bh != head || !block_start; 798 bh = bh->b_this_page, block_start += bsize) { 799 block_end = block_start + bsize; 800 801 clear_buffer_new(bh); 802 803 /* 804 * Ignore blocks outside of our i/o range - 805 * they may belong to unallocated clusters. 806 */ 807 if (block_start >= to || block_end <= from) { 808 if (PageUptodate(page)) 809 set_buffer_uptodate(bh); 810 continue; 811 } 812 813 /* 814 * For an allocating write with cluster size >= page 815 * size, we always write the entire page. 816 */ 817 if (new) 818 set_buffer_new(bh); 819 820 if (!buffer_mapped(bh)) { 821 map_bh(bh, inode->i_sb, *p_blkno); 822 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 823 } 824 825 if (PageUptodate(page)) { 826 if (!buffer_uptodate(bh)) 827 set_buffer_uptodate(bh); 828 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 829 !buffer_new(bh) && 830 ocfs2_should_read_blk(inode, page, block_start) && 831 (block_start < from || block_end > to)) { 832 ll_rw_block(READ, 1, &bh); 833 *wait_bh++=bh; 834 } 835 836 *p_blkno = *p_blkno + 1; 837 } 838 839 /* 840 * If we issued read requests - let them complete. 841 */ 842 while(wait_bh > wait) { 843 wait_on_buffer(*--wait_bh); 844 if (!buffer_uptodate(*wait_bh)) 845 ret = -EIO; 846 } 847 848 if (ret == 0 || !new) 849 return ret; 850 851 /* 852 * If we get -EIO above, zero out any newly allocated blocks 853 * to avoid exposing stale data. 854 */ 855 bh = head; 856 block_start = 0; 857 do { 858 block_end = block_start + bsize; 859 if (block_end <= from) 860 goto next_bh; 861 if (block_start >= to) 862 break; 863 864 zero_user(page, block_start, bh->b_size); 865 set_buffer_uptodate(bh); 866 mark_buffer_dirty(bh); 867 868next_bh: 869 block_start = block_end; 870 bh = bh->b_this_page; 871 } while (bh != head); 872 873 return ret; 874} 875 876#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) 877#define OCFS2_MAX_CTXT_PAGES 1 878#else 879#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) 880#endif 881 882#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) 883 884/* 885 * Describe the state of a single cluster to be written to. 886 */ 887struct ocfs2_write_cluster_desc { 888 u32 c_cpos; 889 u32 c_phys; 890 /* 891 * Give this a unique field because c_phys eventually gets 892 * filled. 893 */ 894 unsigned c_new; 895 unsigned c_unwritten; 896}; 897 898static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) 899{ 900 return d->c_new || d->c_unwritten; 901} 902 903struct ocfs2_write_ctxt { 904 /* Logical cluster position / len of write */ 905 u32 w_cpos; 906 u32 w_clen; 907 908 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; 909 910 /* 911 * This is true if page_size > cluster_size. 912 * 913 * It triggers a set of special cases during write which might 914 * have to deal with allocating writes to partial pages. 915 */ 916 unsigned int w_large_pages; 917 918 /* 919 * Pages involved in this write. 920 * 921 * w_target_page is the page being written to by the user. 922 * 923 * w_pages is an array of pages which always contains 924 * w_target_page, and in the case of an allocating write with 925 * page_size < cluster size, it will contain zero'd and mapped 926 * pages adjacent to w_target_page which need to be written 927 * out in so that future reads from that region will get 928 * zero's. 929 */ 930 struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; 931 unsigned int w_num_pages; 932 struct page *w_target_page; 933 934 /* 935 * ocfs2_write_end() uses this to know what the real range to 936 * write in the target should be. 937 */ 938 unsigned int w_target_from; 939 unsigned int w_target_to; 940 941 /* 942 * We could use journal_current_handle() but this is cleaner, 943 * IMHO -Mark 944 */ 945 handle_t *w_handle; 946 947 struct buffer_head *w_di_bh; 948 949 struct ocfs2_cached_dealloc_ctxt w_dealloc; 950}; 951 952void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) 953{ 954 int i; 955 956 for(i = 0; i < num_pages; i++) { 957 if (pages[i]) { 958 unlock_page(pages[i]); 959 mark_page_accessed(pages[i]); 960 page_cache_release(pages[i]); 961 } 962 } 963} 964 965static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 966{ 967 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 968 969 brelse(wc->w_di_bh); 970 kfree(wc); 971} 972 973static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, 974 struct ocfs2_super *osb, loff_t pos, 975 unsigned len, struct buffer_head *di_bh) 976{ 977 u32 cend; 978 struct ocfs2_write_ctxt *wc; 979 980 wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); 981 if (!wc) 982 return -ENOMEM; 983 984 wc->w_cpos = pos >> osb->s_clustersize_bits; 985 cend = (pos + len - 1) >> osb->s_clustersize_bits; 986 wc->w_clen = cend - wc->w_cpos + 1; 987 get_bh(di_bh); 988 wc->w_di_bh = di_bh; 989 990 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 991 wc->w_large_pages = 1; 992 else 993 wc->w_large_pages = 0; 994 995 ocfs2_init_dealloc_ctxt(&wc->w_dealloc); 996 997 *wcp = wc; 998 999 return 0; 1000} 1001 1002/* 1003 * If a page has any new buffers, zero them out here, and mark them uptodate 1004 * and dirty so they'll be written out (in order to prevent uninitialised 1005 * block data from leaking). And clear the new bit. 1006 */ 1007static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) 1008{ 1009 unsigned int block_start, block_end; 1010 struct buffer_head *head, *bh; 1011 1012 BUG_ON(!PageLocked(page)); 1013 if (!page_has_buffers(page)) 1014 return; 1015 1016 bh = head = page_buffers(page); 1017 block_start = 0; 1018 do { 1019 block_end = block_start + bh->b_size; 1020 1021 if (buffer_new(bh)) { 1022 if (block_end > from && block_start < to) { 1023 if (!PageUptodate(page)) { 1024 unsigned start, end; 1025 1026 start = max(from, block_start); 1027 end = min(to, block_end); 1028 1029 zero_user_segment(page, start, end); 1030 set_buffer_uptodate(bh); 1031 } 1032 1033 clear_buffer_new(bh); 1034 mark_buffer_dirty(bh); 1035 } 1036 } 1037 1038 block_start = block_end; 1039 bh = bh->b_this_page; 1040 } while (bh != head); 1041} 1042 1043/* 1044 * Only called when we have a failure during allocating write to write 1045 * zero's to the newly allocated region. 1046 */ 1047static void ocfs2_write_failure(struct inode *inode, 1048 struct ocfs2_write_ctxt *wc, 1049 loff_t user_pos, unsigned user_len) 1050{ 1051 int i; 1052 unsigned from = user_pos & (PAGE_CACHE_SIZE - 1), 1053 to = user_pos + user_len; 1054 struct page *tmppage; 1055 1056 ocfs2_zero_new_buffers(wc->w_target_page, from, to); 1057 1058 for(i = 0; i < wc->w_num_pages; i++) { 1059 tmppage = wc->w_pages[i]; 1060 1061 if (page_has_buffers(tmppage)) { 1062 if (ocfs2_should_order_data(inode)) 1063 ocfs2_jbd2_file_inode(wc->w_handle, inode); 1064 1065 block_commit_write(tmppage, from, to); 1066 } 1067 } 1068} 1069 1070static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, 1071 struct ocfs2_write_ctxt *wc, 1072 struct page *page, u32 cpos, 1073 loff_t user_pos, unsigned user_len, 1074 int new) 1075{ 1076 int ret; 1077 unsigned int map_from = 0, map_to = 0; 1078 unsigned int cluster_start, cluster_end; 1079 unsigned int user_data_from = 0, user_data_to = 0; 1080 1081 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, 1082 &cluster_start, &cluster_end); 1083 1084 if (page == wc->w_target_page) { 1085 map_from = user_pos & (PAGE_CACHE_SIZE - 1); 1086 map_to = map_from + user_len; 1087 1088 if (new) 1089 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1090 cluster_start, cluster_end, 1091 new); 1092 else 1093 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1094 map_from, map_to, new); 1095 if (ret) { 1096 mlog_errno(ret); 1097 goto out; 1098 } 1099 1100 user_data_from = map_from; 1101 user_data_to = map_to; 1102 if (new) { 1103 map_from = cluster_start; 1104 map_to = cluster_end; 1105 } 1106 } else { 1107 /* 1108 * If we haven't allocated the new page yet, we 1109 * shouldn't be writing it out without copying user 1110 * data. This is likely a math error from the caller. 1111 */ 1112 BUG_ON(!new); 1113 1114 map_from = cluster_start; 1115 map_to = cluster_end; 1116 1117 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1118 cluster_start, cluster_end, new); 1119 if (ret) { 1120 mlog_errno(ret); 1121 goto out; 1122 } 1123 } 1124 1125 /* 1126 * Parts of newly allocated pages need to be zero'd. 1127 * 1128 * Above, we have also rewritten 'to' and 'from' - as far as 1129 * the rest of the function is concerned, the entire cluster 1130 * range inside of a page needs to be written. 1131 * 1132 * We can skip this if the page is up to date - it's already 1133 * been zero'd from being read in as a hole. 1134 */ 1135 if (new && !PageUptodate(page)) 1136 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1137 cpos, user_data_from, user_data_to); 1138 1139 flush_dcache_page(page); 1140 1141out: 1142 return ret; 1143} 1144 1145/* 1146 * This function will only grab one clusters worth of pages. 1147 */ 1148static int ocfs2_grab_pages_for_write(struct address_space *mapping, 1149 struct ocfs2_write_ctxt *wc, 1150 u32 cpos, loff_t user_pos, int new, 1151 struct page *mmap_page) 1152{ 1153 int ret = 0, i; 1154 unsigned long start, target_index, index; 1155 struct inode *inode = mapping->host; 1156 1157 target_index = user_pos >> PAGE_CACHE_SHIFT; 1158 1159 /* 1160 * Figure out how many pages we'll be manipulating here. For 1161 * non allocating write, we just change the one 1162 * page. Otherwise, we'll need a whole clusters worth. 1163 */ 1164 if (new) { 1165 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); 1166 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); 1167 } else { 1168 wc->w_num_pages = 1; 1169 start = target_index; 1170 } 1171 1172 for(i = 0; i < wc->w_num_pages; i++) { 1173 index = start + i; 1174 1175 if (index == target_index && mmap_page) { 1176 /* 1177 * ocfs2_pagemkwrite() is a little different 1178 * and wants us to directly use the page 1179 * passed in. 1180 */ 1181 lock_page(mmap_page); 1182 1183 if (mmap_page->mapping != mapping) { 1184 unlock_page(mmap_page); 1185 /* 1186 * Sanity check - the locking in 1187 * ocfs2_pagemkwrite() should ensure 1188 * that this code doesn't trigger. 1189 */ 1190 ret = -EINVAL; 1191 mlog_errno(ret); 1192 goto out; 1193 } 1194 1195 page_cache_get(mmap_page); 1196 wc->w_pages[i] = mmap_page; 1197 } else { 1198 wc->w_pages[i] = find_or_create_page(mapping, index, 1199 GFP_NOFS); 1200 if (!wc->w_pages[i]) { 1201 ret = -ENOMEM; 1202 mlog_errno(ret); 1203 goto out; 1204 } 1205 } 1206 1207 if (index == target_index) 1208 wc->w_target_page = wc->w_pages[i]; 1209 } 1210out: 1211 return ret; 1212} 1213 1214/* 1215 * Prepare a single cluster for write one cluster into the file. 1216 */ 1217static int ocfs2_write_cluster(struct address_space *mapping, 1218 u32 phys, unsigned int unwritten, 1219 struct ocfs2_alloc_context *data_ac, 1220 struct ocfs2_alloc_context *meta_ac, 1221 struct ocfs2_write_ctxt *wc, u32 cpos, 1222 loff_t user_pos, unsigned user_len) 1223{ 1224 int ret, i, new, should_zero = 0; 1225 u64 v_blkno, p_blkno; 1226 struct inode *inode = mapping->host; 1227 struct ocfs2_extent_tree et; 1228 1229 new = phys == 0 ? 1 : 0; 1230 if (new || unwritten) 1231 should_zero = 1; 1232 1233 if (new) { 1234 u32 tmp_pos; 1235 1236 /* 1237 * This is safe to call with the page locks - it won't take 1238 * any additional semaphores or cluster locks. 1239 */ 1240 tmp_pos = cpos; 1241 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, 1242 &tmp_pos, 1, 0, wc->w_di_bh, 1243 wc->w_handle, data_ac, 1244 meta_ac, NULL); 1245 /* 1246 * This shouldn't happen because we must have already 1247 * calculated the correct meta data allocation required. The 1248 * internal tree allocation code should know how to increase 1249 * transaction credits itself. 1250 * 1251 * If need be, we could handle -EAGAIN for a 1252 * RESTART_TRANS here. 1253 */ 1254 mlog_bug_on_msg(ret == -EAGAIN, 1255 "Inode %llu: EAGAIN return during allocation.\n", 1256 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1257 if (ret < 0) { 1258 mlog_errno(ret); 1259 goto out; 1260 } 1261 } else if (unwritten) { 1262 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); 1263 ret = ocfs2_mark_extent_written(inode, &et, 1264 wc->w_handle, cpos, 1, phys, 1265 meta_ac, &wc->w_dealloc); 1266 if (ret < 0) { 1267 mlog_errno(ret); 1268 goto out; 1269 } 1270 } 1271 1272 if (should_zero) 1273 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); 1274 else 1275 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; 1276 1277 /* 1278 * The only reason this should fail is due to an inability to 1279 * find the extent added. 1280 */ 1281 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1282 NULL); 1283 if (ret < 0) { 1284 ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " 1285 "at logical block %llu", 1286 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1287 (unsigned long long)v_blkno); 1288 goto out; 1289 } 1290 1291 BUG_ON(p_blkno == 0); 1292 1293 for(i = 0; i < wc->w_num_pages; i++) { 1294 int tmpret; 1295 1296 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, 1297 wc->w_pages[i], cpos, 1298 user_pos, user_len, 1299 should_zero); 1300 if (tmpret) { 1301 mlog_errno(tmpret); 1302 if (ret == 0) 1303 tmpret = ret; 1304 } 1305 } 1306 1307 /* 1308 * We only have cleanup to do in case of allocating write. 1309 */ 1310 if (ret && new) 1311 ocfs2_write_failure(inode, wc, user_pos, user_len); 1312 1313out: 1314 1315 return ret; 1316} 1317 1318static int ocfs2_write_cluster_by_desc(struct address_space *mapping, 1319 struct ocfs2_alloc_context *data_ac, 1320 struct ocfs2_alloc_context *meta_ac, 1321 struct ocfs2_write_ctxt *wc, 1322 loff_t pos, unsigned len) 1323{ 1324 int ret, i; 1325 loff_t cluster_off; 1326 unsigned int local_len = len; 1327 struct ocfs2_write_cluster_desc *desc; 1328 struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb); 1329 1330 for (i = 0; i < wc->w_clen; i++) { 1331 desc = &wc->w_desc[i]; 1332 1333 /* 1334 * We have to make sure that the total write passed in 1335 * doesn't extend past a single cluster. 1336 */ 1337 local_len = len; 1338 cluster_off = pos & (osb->s_clustersize - 1); 1339 if ((cluster_off + local_len) > osb->s_clustersize) 1340 local_len = osb->s_clustersize - cluster_off; 1341 1342 ret = ocfs2_write_cluster(mapping, desc->c_phys, 1343 desc->c_unwritten, data_ac, meta_ac, 1344 wc, desc->c_cpos, pos, local_len); 1345 if (ret) { 1346 mlog_errno(ret); 1347 goto out; 1348 } 1349 1350 len -= local_len; 1351 pos += local_len; 1352 } 1353 1354 ret = 0; 1355out: 1356 return ret; 1357} 1358 1359/* 1360 * ocfs2_write_end() wants to know which parts of the target page it 1361 * should complete the write on. It's easiest to compute them ahead of 1362 * time when a more complete view of the write is available. 1363 */ 1364static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, 1365 struct ocfs2_write_ctxt *wc, 1366 loff_t pos, unsigned len, int alloc) 1367{ 1368 struct ocfs2_write_cluster_desc *desc; 1369 1370 wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); 1371 wc->w_target_to = wc->w_target_from + len; 1372 1373 if (alloc == 0) 1374 return; 1375 1376 /* 1377 * Allocating write - we may have different boundaries based 1378 * on page size and cluster size. 1379 * 1380 * NOTE: We can no longer compute one value from the other as 1381 * the actual write length and user provided length may be 1382 * different. 1383 */ 1384 1385 if (wc->w_large_pages) { 1386 /* 1387 * We only care about the 1st and last cluster within 1388 * our range and whether they should be zero'd or not. Either 1389 * value may be extended out to the start/end of a 1390 * newly allocated cluster. 1391 */ 1392 desc = &wc->w_desc[0]; 1393 if (ocfs2_should_zero_cluster(desc)) 1394 ocfs2_figure_cluster_boundaries(osb, 1395 desc->c_cpos, 1396 &wc->w_target_from, 1397 NULL); 1398 1399 desc = &wc->w_desc[wc->w_clen - 1]; 1400 if (ocfs2_should_zero_cluster(desc)) 1401 ocfs2_figure_cluster_boundaries(osb, 1402 desc->c_cpos, 1403 NULL, 1404 &wc->w_target_to); 1405 } else { 1406 wc->w_target_from = 0; 1407 wc->w_target_to = PAGE_CACHE_SIZE; 1408 } 1409} 1410 1411/* 1412 * Populate each single-cluster write descriptor in the write context 1413 * with information about the i/o to be done. 1414 * 1415 * Returns the number of clusters that will have to be allocated, as 1416 * well as a worst case estimate of the number of extent records that 1417 * would have to be created during a write to an unwritten region. 1418 */ 1419static int ocfs2_populate_write_desc(struct inode *inode, 1420 struct ocfs2_write_ctxt *wc, 1421 unsigned int *clusters_to_alloc, 1422 unsigned int *extents_to_split) 1423{ 1424 int ret; 1425 struct ocfs2_write_cluster_desc *desc; 1426 unsigned int num_clusters = 0; 1427 unsigned int ext_flags = 0; 1428 u32 phys = 0; 1429 int i; 1430 1431 *clusters_to_alloc = 0; 1432 *extents_to_split = 0; 1433 1434 for (i = 0; i < wc->w_clen; i++) { 1435 desc = &wc->w_desc[i]; 1436 desc->c_cpos = wc->w_cpos + i; 1437 1438 if (num_clusters == 0) { 1439 /* 1440 * Need to look up the next extent record. 1441 */ 1442 ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, 1443 &num_clusters, &ext_flags); 1444 if (ret) { 1445 mlog_errno(ret); 1446 goto out; 1447 } 1448 1449 /* 1450 * Assume worst case - that we're writing in 1451 * the middle of the extent. 1452 * 1453 * We can assume that the write proceeds from 1454 * left to right, in which case the extent 1455 * insert code is smart enough to coalesce the 1456 * next splits into the previous records created. 1457 */ 1458 if (ext_flags & OCFS2_EXT_UNWRITTEN) 1459 *extents_to_split = *extents_to_split + 2; 1460 } else if (phys) { 1461 /* 1462 * Only increment phys if it doesn't describe 1463 * a hole. 1464 */ 1465 phys++; 1466 } 1467 1468 desc->c_phys = phys; 1469 if (phys == 0) { 1470 desc->c_new = 1; 1471 *clusters_to_alloc = *clusters_to_alloc + 1; 1472 } 1473 if (ext_flags & OCFS2_EXT_UNWRITTEN) 1474 desc->c_unwritten = 1; 1475 1476 num_clusters--; 1477 } 1478 1479 ret = 0; 1480out: 1481 return ret; 1482} 1483 1484static int ocfs2_write_begin_inline(struct address_space *mapping, 1485 struct inode *inode, 1486 struct ocfs2_write_ctxt *wc) 1487{ 1488 int ret; 1489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1490 struct page *page; 1491 handle_t *handle; 1492 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1493 1494 page = find_or_create_page(mapping, 0, GFP_NOFS); 1495 if (!page) { 1496 ret = -ENOMEM; 1497 mlog_errno(ret); 1498 goto out; 1499 } 1500 /* 1501 * If we don't set w_num_pages then this page won't get unlocked 1502 * and freed on cleanup of the write context. 1503 */ 1504 wc->w_pages[0] = wc->w_target_page = page; 1505 wc->w_num_pages = 1; 1506 1507 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1508 if (IS_ERR(handle)) { 1509 ret = PTR_ERR(handle); 1510 mlog_errno(ret); 1511 goto out; 1512 } 1513 1514 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, 1515 OCFS2_JOURNAL_ACCESS_WRITE); 1516 if (ret) { 1517 ocfs2_commit_trans(osb, handle); 1518 1519 mlog_errno(ret); 1520 goto out; 1521 } 1522 1523 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) 1524 ocfs2_set_inode_data_inline(inode, di); 1525 1526 if (!PageUptodate(page)) { 1527 ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); 1528 if (ret) { 1529 ocfs2_commit_trans(osb, handle); 1530 1531 goto out; 1532 } 1533 } 1534 1535 wc->w_handle = handle; 1536out: 1537 return ret; 1538} 1539 1540int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) 1541{ 1542 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1543 1544 if (new_size <= le16_to_cpu(di->id2.i_data.id_count)) 1545 return 1; 1546 return 0; 1547} 1548 1549static int ocfs2_try_to_write_inline_data(struct address_space *mapping, 1550 struct inode *inode, loff_t pos, 1551 unsigned len, struct page *mmap_page, 1552 struct ocfs2_write_ctxt *wc) 1553{ 1554 int ret, written = 0; 1555 loff_t end = pos + len; 1556 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1557 1558 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", 1559 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, 1560 oi->ip_dyn_features); 1561 1562 /* 1563 * Handle inodes which already have inline data 1st. 1564 */ 1565 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1566 if (mmap_page == NULL && 1567 ocfs2_size_fits_inline_data(wc->w_di_bh, end)) 1568 goto do_inline_write; 1569 1570 /* 1571 * The write won't fit - we have to give this inode an 1572 * inline extent list now. 1573 */ 1574 ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh); 1575 if (ret) 1576 mlog_errno(ret); 1577 goto out; 1578 } 1579 1580 /* 1581 * Check whether the inode can accept inline data. 1582 */ 1583 if (oi->ip_clusters != 0 || i_size_read(inode) != 0) 1584 return 0; 1585 1586 /* 1587 * Check whether the write can fit. 1588 */ 1589 if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb)) 1590 return 0; 1591 1592do_inline_write: 1593 ret = ocfs2_write_begin_inline(mapping, inode, wc); 1594 if (ret) { 1595 mlog_errno(ret); 1596 goto out; 1597 } 1598 1599 /* 1600 * This signals to the caller that the data can be written 1601 * inline. 1602 */ 1603 written = 1; 1604out: 1605 return written ? written : ret; 1606} 1607 1608/* 1609 * This function only does anything for file systems which can't 1610 * handle sparse files. 1611 * 1612 * What we want to do here is fill in any hole between the current end 1613 * of allocation and the end of our write. That way the rest of the 1614 * write path can treat it as an non-allocating write, which has no 1615 * special case code for sparse/nonsparse files. 1616 */ 1617static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, 1618 unsigned len, 1619 struct ocfs2_write_ctxt *wc) 1620{ 1621 int ret; 1622 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1623 loff_t newsize = pos + len; 1624 1625 if (ocfs2_sparse_alloc(osb)) 1626 return 0; 1627 1628 if (newsize <= i_size_read(inode)) 1629 return 0; 1630 1631 ret = ocfs2_extend_no_holes(inode, newsize, newsize - len); 1632 if (ret) 1633 mlog_errno(ret); 1634 1635 return ret; 1636} 1637 1638int ocfs2_write_begin_nolock(struct address_space *mapping, 1639 loff_t pos, unsigned len, unsigned flags, 1640 struct page **pagep, void **fsdata, 1641 struct buffer_head *di_bh, struct page *mmap_page) 1642{ 1643 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1644 unsigned int clusters_to_alloc, extents_to_split; 1645 struct ocfs2_write_ctxt *wc; 1646 struct inode *inode = mapping->host; 1647 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1648 struct ocfs2_dinode *di; 1649 struct ocfs2_alloc_context *data_ac = NULL; 1650 struct ocfs2_alloc_context *meta_ac = NULL; 1651 handle_t *handle; 1652 struct ocfs2_extent_tree et; 1653 1654 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1655 if (ret) { 1656 mlog_errno(ret); 1657 return ret; 1658 } 1659 1660 if (ocfs2_supports_inline_data(osb)) { 1661 ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, 1662 mmap_page, wc); 1663 if (ret == 1) { 1664 ret = 0; 1665 goto success; 1666 } 1667 if (ret < 0) { 1668 mlog_errno(ret); 1669 goto out; 1670 } 1671 } 1672 1673 ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); 1674 if (ret) { 1675 mlog_errno(ret); 1676 goto out; 1677 } 1678 1679 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, 1680 &extents_to_split); 1681 if (ret) { 1682 mlog_errno(ret); 1683 goto out; 1684 } 1685 1686 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1687 1688 /* 1689 * We set w_target_from, w_target_to here so that 1690 * ocfs2_write_end() knows which range in the target page to 1691 * write out. An allocation requires that we write the entire 1692 * cluster range. 1693 */ 1694 if (clusters_to_alloc || extents_to_split) { 1695 /* 1696 * XXX: We are stretching the limits of 1697 * ocfs2_lock_allocators(). It greatly over-estimates 1698 * the work to be done. 1699 */ 1700 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u," 1701 " clusters_to_add = %u, extents_to_split = %u\n", 1702 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1703 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), 1704 clusters_to_alloc, extents_to_split); 1705 1706 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); 1707 ret = ocfs2_lock_allocators(inode, &et, 1708 clusters_to_alloc, extents_to_split, 1709 &data_ac, &meta_ac); 1710 if (ret) { 1711 mlog_errno(ret); 1712 goto out; 1713 } 1714 1715 credits = ocfs2_calc_extend_credits(inode->i_sb, 1716 &di->id2.i_list, 1717 clusters_to_alloc); 1718 1719 } 1720 1721 ocfs2_set_target_boundaries(osb, wc, pos, len, 1722 clusters_to_alloc + extents_to_split); 1723 1724 handle = ocfs2_start_trans(osb, credits); 1725 if (IS_ERR(handle)) { 1726 ret = PTR_ERR(handle); 1727 mlog_errno(ret); 1728 goto out; 1729 } 1730 1731 wc->w_handle = handle; 1732 1733 /* 1734 * We don't want this to fail in ocfs2_write_end(), so do it 1735 * here. 1736 */ 1737 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, 1738 OCFS2_JOURNAL_ACCESS_WRITE); 1739 if (ret) { 1740 mlog_errno(ret); 1741 goto out_commit; 1742 } 1743 1744 /* 1745 * Fill our page array first. That way we've grabbed enough so 1746 * that we can zero and flush if we error after adding the 1747 * extent. 1748 */ 1749 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, 1750 clusters_to_alloc + extents_to_split, 1751 mmap_page); 1752 if (ret) { 1753 mlog_errno(ret); 1754 goto out_commit; 1755 } 1756 1757 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, 1758 len); 1759 if (ret) { 1760 mlog_errno(ret); 1761 goto out_commit; 1762 } 1763 1764 if (data_ac) 1765 ocfs2_free_alloc_context(data_ac); 1766 if (meta_ac) 1767 ocfs2_free_alloc_context(meta_ac); 1768 1769success: 1770 *pagep = wc->w_target_page; 1771 *fsdata = wc; 1772 return 0; 1773out_commit: 1774 ocfs2_commit_trans(osb, handle); 1775 1776out: 1777 ocfs2_free_write_ctxt(wc); 1778 1779 if (data_ac) 1780 ocfs2_free_alloc_context(data_ac); 1781 if (meta_ac) 1782 ocfs2_free_alloc_context(meta_ac); 1783 return ret; 1784} 1785 1786static int ocfs2_write_begin(struct file *file, struct address_space *mapping, 1787 loff_t pos, unsigned len, unsigned flags, 1788 struct page **pagep, void **fsdata) 1789{ 1790 int ret; 1791 struct buffer_head *di_bh = NULL; 1792 struct inode *inode = mapping->host; 1793 1794 ret = ocfs2_inode_lock(inode, &di_bh, 1); 1795 if (ret) { 1796 mlog_errno(ret); 1797 return ret; 1798 } 1799 1800 /* 1801 * Take alloc sem here to prevent concurrent lookups. That way 1802 * the mapping, zeroing and tree manipulation within 1803 * ocfs2_write() will be safe against ->readpage(). This 1804 * should also serve to lock out allocation from a shared 1805 * writeable region. 1806 */ 1807 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1808 1809 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1810 fsdata, di_bh, NULL); 1811 if (ret) { 1812 mlog_errno(ret); 1813 goto out_fail; 1814 } 1815 1816 brelse(di_bh); 1817 1818 return 0; 1819 1820out_fail: 1821 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1822 1823 brelse(di_bh); 1824 ocfs2_inode_unlock(inode, 1); 1825 1826 return ret; 1827} 1828 1829static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, 1830 unsigned len, unsigned *copied, 1831 struct ocfs2_dinode *di, 1832 struct ocfs2_write_ctxt *wc) 1833{ 1834 void *kaddr; 1835 1836 if (unlikely(*copied < len)) { 1837 if (!PageUptodate(wc->w_target_page)) { 1838 *copied = 0; 1839 return; 1840 } 1841 } 1842 1843 kaddr = kmap_atomic(wc->w_target_page, KM_USER0); 1844 memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); 1845 kunmap_atomic(kaddr, KM_USER0); 1846 1847 mlog(0, "Data written to inode at offset %llu. " 1848 "id_count = %u, copied = %u, i_dyn_features = 0x%x\n", 1849 (unsigned long long)pos, *copied, 1850 le16_to_cpu(di->id2.i_data.id_count), 1851 le16_to_cpu(di->i_dyn_features)); 1852} 1853 1854int ocfs2_write_end_nolock(struct address_space *mapping, 1855 loff_t pos, unsigned len, unsigned copied, 1856 struct page *page, void *fsdata) 1857{ 1858 int i; 1859 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); 1860 struct inode *inode = mapping->host; 1861 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1862 struct ocfs2_write_ctxt *wc = fsdata; 1863 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1864 handle_t *handle = wc->w_handle; 1865 struct page *tmppage; 1866 1867 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1868 ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); 1869 goto out_write_size; 1870 } 1871 1872 if (unlikely(copied < len)) { 1873 if (!PageUptodate(wc->w_target_page)) 1874 copied = 0; 1875 1876 ocfs2_zero_new_buffers(wc->w_target_page, start+copied, 1877 start+len); 1878 } 1879 flush_dcache_page(wc->w_target_page); 1880 1881 for(i = 0; i < wc->w_num_pages; i++) { 1882 tmppage = wc->w_pages[i]; 1883 1884 if (tmppage == wc->w_target_page) { 1885 from = wc->w_target_from; 1886 to = wc->w_target_to; 1887 1888 BUG_ON(from > PAGE_CACHE_SIZE || 1889 to > PAGE_CACHE_SIZE || 1890 to < from); 1891 } else { 1892 /* 1893 * Pages adjacent to the target (if any) imply 1894 * a hole-filling write in which case we want 1895 * to flush their entire range. 1896 */ 1897 from = 0; 1898 to = PAGE_CACHE_SIZE; 1899 } 1900 1901 if (page_has_buffers(tmppage)) { 1902 if (ocfs2_should_order_data(inode)) 1903 ocfs2_jbd2_file_inode(wc->w_handle, inode); 1904 block_commit_write(tmppage, from, to); 1905 } 1906 } 1907 1908out_write_size: 1909 pos += copied; 1910 if (pos > inode->i_size) { 1911 i_size_write(inode, pos); 1912 mark_inode_dirty(inode); 1913 } 1914 inode->i_blocks = ocfs2_inode_sector_count(inode); 1915 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 1916 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1917 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1918 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1919 ocfs2_journal_dirty(handle, wc->w_di_bh); 1920 1921 ocfs2_commit_trans(osb, handle); 1922 1923 ocfs2_run_deallocs(osb, &wc->w_dealloc); 1924 1925 ocfs2_free_write_ctxt(wc); 1926 1927 return copied; 1928} 1929 1930static int ocfs2_write_end(struct file *file, struct address_space *mapping, 1931 loff_t pos, unsigned len, unsigned copied, 1932 struct page *page, void *fsdata) 1933{ 1934 int ret; 1935 struct inode *inode = mapping->host; 1936 1937 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); 1938 1939 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1940 ocfs2_inode_unlock(inode, 1); 1941 1942 return ret; 1943} 1944 1945const struct address_space_operations ocfs2_aops = { 1946 .readpage = ocfs2_readpage, 1947 .readpages = ocfs2_readpages, 1948 .writepage = ocfs2_writepage, 1949 .write_begin = ocfs2_write_begin, 1950 .write_end = ocfs2_write_end, 1951 .bmap = ocfs2_bmap, 1952 .sync_page = block_sync_page, 1953 .direct_IO = ocfs2_direct_IO, 1954 .invalidatepage = ocfs2_invalidatepage, 1955 .releasepage = ocfs2_releasepage, 1956 .migratepage = buffer_migrate_page, 1957}; 1958