/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <asm/uaccess.h>

#include <linux/fs.h>
#include <linux/pagemap.h>

#define DEBUG_SUBSYSTEM S_LLITE

#include "../include/lustre_lite.h"
#include "llite_internal.h"
#include "../include/linux/lustre_compat25.h"

static const struct vm_operations_struct ll_file_vm_ops;

void policy_from_vma(ldlm_policy_data_t *policy,
		     struct vm_area_struct *vma, unsigned long addr,
		     size_t count)
{
	policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
				 (vma->vm_pgoff << PAGE_CACHE_SHIFT);
	policy->l_extent.end = (policy->l_extent.start + count - 1) |
			       ~CFS_PAGE_MASK;
}
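
/*
 * Worked example (illustrative only, assuming 4 KiB pages): for a VMA with
 * vm_start = 0x10000 and vm_pgoff = 2, a fault at addr = 0x13800 with
 * count = 1 yields
 *
 *	l_extent.start = ((0x13800 - 0x10000) & CFS_PAGE_MASK)
 *			 + (2 << PAGE_CACHE_SHIFT)
 *		       = 0x3000 + 0x2000 = 0x5000
 *	l_extent.end   = (0x5000 + 1 - 1) | ~CFS_PAGE_MASK = 0x5fff
 *
 * i.e. the resulting DLM extent is always rounded out to whole pages.
 */
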
struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
			       size_t count)
{
	struct vm_area_struct *vma, *ret = NULL;

	/* mmap_sem must have been held by caller. */
	LASSERT(!down_write_trylock(&mm->mmap_sem));

	for (vma = find_vma(mm, addr);
	     vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
		    vma->vm_flags & VM_SHARED) {
			ret = vma;
			break;
		}
	}
	return ret;
}

/**
 * API-independent part of page fault initialization.
 * \param vma - virtual memory area addressed by the page fault
 * \param env_ret - corresponding lu_env for processing
 * \param nest - nested level
 * \param index - page index corresponding to the fault
 * \param ra_flags - vma readahead flags
 *
 * \return allocated and initialized env for the fault operation
 * \retval EINVAL if the env can't be allocated
 * \return other error codes from cl_io_init()
 */
static struct cl_io *
ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret,
		 struct cl_env_nest *nest, pgoff_t index,
		 unsigned long *ra_flags)
{
	struct file *file = vma->vm_file;
	struct inode *inode = file->f_dentry->d_inode;
	struct cl_io *io;
	struct cl_fault_io *fio;
	struct lu_env *env;
	int rc;

	*env_ret = NULL;
	if (ll_file_nolock(file))
		return ERR_PTR(-EOPNOTSUPP);

	/*
	 * A page fault can occur while Lustre I/O is already active for the
	 * current thread, e.g. when doing read/write against a user-level
	 * buffer that is itself mmapped from a Lustre file. To avoid stomping
	 * on the existing context, optionally force allocation of a new one.
	 */
	env = cl_env_nested_get(nest);
	if (IS_ERR(env))
		return ERR_PTR(-EINVAL);

	*env_ret = env;

	io = ccc_env_thread_io(env);
	io->ci_obj = ll_i2info(inode)->lli_clob;
	LASSERT(io->ci_obj != NULL);

	fio = &io->u.ci_fault;
	fio->ft_index = index;
	fio->ft_executable = vma->vm_flags & VM_EXEC;

	/*
	 * Disable VM_SEQ_READ and use VM_RAND_READ to make sure that the
	 * kernel will not read pages not covered by ldlm locks in
	 * filemap_nopage(); we do our own readahead in ll_readpage().
	 */
	if (ra_flags != NULL)
		*ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
	vma->vm_flags &= ~VM_SEQ_READ;
	vma->vm_flags |= VM_RAND_READ;

	CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
	       fio->ft_index, fio->ft_executable);

	rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj);
	if (rc == 0) {
		struct ccc_io *cio = ccc_env_io(env);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(cio->cui_cl.cis_io == io);

		/* mmap lock must be MANDATORY because it has to cache
		 * pages. */
		io->ci_lockreq = CILR_MANDATORY;
		cio->cui_fd = fd;
	} else {
		LASSERT(rc < 0);
		cl_io_fini(env, io);
		cl_env_nested_put(nest, env);
		io = ERR_PTR(rc);
	}

	return io;
}
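
/*
 * Typical caller pattern, as a sketch (ll_page_mkwrite0() and ll_fault0()
 * below both follow it):
 *
 *	io = ll_fault_io_init(vma, &env, &nest, index, &ra_flags);
 *	if (IS_ERR(io))
 *		return ...;	(env already cleaned up on this path)
 *	if (io->ci_result == 0)
 *		result = cl_io_loop(env, io);
 *	cl_io_fini(env, io);
 *	cl_env_nested_put(&nest, env);
 *
 * On error ll_fault_io_init() releases the environment itself; on success
 * the caller owns the io and must finalize it with cl_io_fini() and release
 * the nested environment with cl_env_nested_put().
 */
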
/* Shared code of the page_mkwrite method for rhel5 and rhel6 */
static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
			    bool *retry)
{
	struct lu_env *env;
	struct cl_io *io;
	struct vvp_io *vio;
	struct cl_env_nest nest;
	int result;
	sigset_t set;
	struct inode *inode;
	struct ll_inode_info *lli;

	LASSERT(vmpage != NULL);

	io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
	if (IS_ERR(io)) {
		result = PTR_ERR(io);
		goto out;
	}

	result = io->ci_result;
	if (result < 0)
		goto out_io;

	io->u.ci_fault.ft_mkwrite = 1;
	io->u.ci_fault.ft_writable = 1;

	vio = vvp_env_io(env);
	vio->u.fault.ft_vma = vma;
	vio->u.fault.ft_vmpage = vmpage;

	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

	/* Grab lli_trunc_sem to exclude a concurrent truncate; otherwise we
	 * could add dirty pages into the osc cache while truncate is
	 * ongoing. */
	inode = ccc_object_inode(io->ci_obj);
	lli = ll_i2info(inode);
	down_read(&lli->lli_trunc_sem);

	result = cl_io_loop(env, io);

	up_read(&lli->lli_trunc_sem);

	cfs_restore_sigs(set);

	if (result == 0) {
		struct inode *inode = vma->vm_file->f_dentry->d_inode;
		struct ll_inode_info *lli = ll_i2info(inode);

		lock_page(vmpage);
		if (vmpage->mapping == NULL) {
			unlock_page(vmpage);

			/* page was truncated and the lock was cancelled;
			 * return ENODATA so that VM_FAULT_NOPAGE will be
			 * returned to handle_mm_fault(). */
			if (result == 0)
				result = -ENODATA;
		} else if (!PageDirty(vmpage)) {
			/* Race: the page was cleaned by ptlrpcd after it was
			 * unlocked, so it has to be added to the dirty cache
			 * again; otherwise this soon-to-be-dirty page won't
			 * consume any grants, and worse, if the page is being
			 * transferred it will break the RPC checksum.
			 */
			unlock_page(vmpage);

			CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n",
			       vmpage, vmpage->index);

			*retry = true;
			result = -EAGAIN;
		}

		if (result == 0) {
			spin_lock(&lli->lli_lock);
			lli->lli_flags |= LLIF_DATA_MODIFIED;
			spin_unlock(&lli->lli_lock);
		}
	}

out_io:
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);
out:
	CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
	LASSERT(ergo(result == 0, PageLocked(vmpage)));

	return result;
}

static inline int to_fault_error(int result)
{
	switch (result) {
	case 0:
		result = VM_FAULT_LOCKED;
		break;
	case -EFAULT:
		result = VM_FAULT_NOPAGE;
		break;
	case -ENOMEM:
		result = VM_FAULT_OOM;
		break;
	default:
		result = VM_FAULT_SIGBUS;
		break;
	}
	return result;
}
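
/*
 * Summary of the error translation performed by the fault paths in this
 * file (to_fault_error() above, plus the extra cases handled directly in
 * ll_page_mkwrite() below):
 *
 *	0	 -> VM_FAULT_LOCKED	page is returned locked
 *	-EFAULT	 -> VM_FAULT_NOPAGE	no page could be provided
 *	-ENODATA -> VM_FAULT_NOPAGE	page truncated under us (mkwrite)
 *	-ENOMEM	 -> VM_FAULT_OOM	allocation failure
 *	-EAGAIN	 -> VM_FAULT_RETRY	clean-page race, retried (mkwrite)
 *	other	 -> VM_FAULT_SIGBUS	unrecoverable error
 */
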
286 * 287 * \param vma - is virtual area struct related to page fault 288 * \param vmf - structure which describe type and address where hit fault 289 * 290 * \return allocated and filled _locked_ page for address 291 * \retval VM_FAULT_ERROR on general error 292 * \retval NOPAGE_OOM not have memory for allocate new page 293 */ 294static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) 295{ 296 struct lu_env *env; 297 struct cl_io *io; 298 struct vvp_io *vio = NULL; 299 struct page *vmpage; 300 unsigned long ra_flags; 301 struct cl_env_nest nest; 302 int result; 303 int fault_ret = 0; 304 305 io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags); 306 if (IS_ERR(io)) 307 return to_fault_error(PTR_ERR(io)); 308 309 result = io->ci_result; 310 if (result == 0) { 311 vio = vvp_env_io(env); 312 vio->u.fault.ft_vma = vma; 313 vio->u.fault.ft_vmpage = NULL; 314 vio->u.fault.fault.ft_vmf = vmf; 315 vio->u.fault.fault.ft_flags = 0; 316 vio->u.fault.fault.ft_flags_valid = 0; 317 318 result = cl_io_loop(env, io); 319 320 /* ft_flags are only valid if we reached 321 * the call to filemap_fault */ 322 if (vio->u.fault.fault.ft_flags_valid) 323 fault_ret = vio->u.fault.fault.ft_flags; 324 325 vmpage = vio->u.fault.ft_vmpage; 326 if (result != 0 && vmpage != NULL) { 327 page_cache_release(vmpage); 328 vmf->page = NULL; 329 } 330 } 331 cl_io_fini(env, io); 332 cl_env_nested_put(&nest, env); 333 334 vma->vm_flags |= ra_flags; 335 if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) 336 fault_ret |= to_fault_error(result); 337 338 CDEBUG(D_MMAP, "%s fault %d/%d\n", 339 current->comm, fault_ret, result); 340 return fault_ret; 341} 342 343static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 344{ 345 int count = 0; 346 bool printed = false; 347 int result; 348 sigset_t set; 349 350 /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite 351 * so that it can be killed by admin but not cause segfault by 352 * other signals. 
static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int count = 0;
	bool printed = false;
	int result;
	sigset_t set;

	/* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite,
	 * so that the process can be killed by an admin but other signals
	 * do not cause a segfault. */
	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

restart:
	result = ll_fault0(vma, vmf);
	LASSERT(!(result & VM_FAULT_LOCKED));
	if (result == 0) {
		struct page *vmpage = vmf->page;

		/* check if this page has been truncated */
		lock_page(vmpage);
		if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
			unlock_page(vmpage);
			page_cache_release(vmpage);
			vmf->page = NULL;

			if (!printed && ++count > 16) {
				CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n",
				      current->comm);
				printed = true;
			}

			goto restart;
		}

		result = VM_FAULT_LOCKED;
	}
	cfs_restore_sigs(set);
	return result;
}

static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int count = 0;
	bool printed = false;
	bool retry;
	int result;

	do {
		retry = false;
		result = ll_page_mkwrite0(vma, vmf->page, &retry);

		if (!printed && ++count > 16) {
			CWARN("app(%s): the page %lu of file %lu is under heavy contention.\n",
			      current->comm, vmf->pgoff,
			      vma->vm_file->f_dentry->d_inode->i_ino);
			printed = true;
		}
	} while (retry);

	switch (result) {
	case 0:
		LASSERT(PageLocked(vmf->page));
		result = VM_FAULT_LOCKED;
		break;
	case -ENODATA:
	case -EFAULT:
		result = VM_FAULT_NOPAGE;
		break;
	case -ENOMEM:
		result = VM_FAULT_OOM;
		break;
	case -EAGAIN:
		result = VM_FAULT_RETRY;
		break;
	default:
		result = VM_FAULT_SIGBUS;
		break;
	}

	return result;
}

/**
 * To avoid cancelling the locks covering an mmapped region under lock cache
 * pressure, we track the mapped vma count in ccc_object::cob_mmap_cnt.
 */
static void ll_vm_open(struct vm_area_struct *vma)
{
	struct inode *inode = vma->vm_file->f_dentry->d_inode;
	struct ccc_object *vob = cl_inode2ccc(inode);

	LASSERT(vma->vm_file);
	LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
	atomic_inc(&vob->cob_mmap_cnt);
}

/**
 * Dual to ll_vm_open().
 */
static void ll_vm_close(struct vm_area_struct *vma)
{
	struct inode *inode = vma->vm_file->f_dentry->d_inode;
	struct ccc_object *vob = cl_inode2ccc(inode);

	LASSERT(vma->vm_file);
	atomic_dec(&vob->cob_mmap_cnt);
	LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
}
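
/*
 * The VM core calls these open/close hooks whenever a mapping of the file
 * is created, duplicated or torn down, so cob_mmap_cnt tracks the number of
 * live VMAs. For illustration:
 *
 *	mmap()   -> ll_file_mmap() -> ll_vm_open()	cnt: 0 -> 1
 *	fork()   -> dup_mmap()     -> ll_vm_open()	cnt: 1 -> 2
 *	munmap() / process exit    -> ll_vm_close()	cnt: ... -> 0
 *
 * A non-zero count tells the lock cache that pages under this mapping are
 * still in use.
 */
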
/* XXX put nice comment here. talk about __free_pte -> dirty pages and
 * nopage's reference passing to the pte */
int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
{
	int rc = -ENOENT;

	LASSERTF(last > first, "last %llu first %llu\n", last, first);
	if (mapping_mapped(mapping)) {
		rc = 0;
		unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1,
				    last - first + 1, 0);
	}

	return rc;
}

static const struct vm_operations_struct ll_file_vm_ops = {
	.fault		= ll_fault,
	.page_mkwrite	= ll_page_mkwrite,
	.open		= ll_vm_open,
	.close		= ll_vm_close,
};

int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file->f_dentry->d_inode;
	int rc;

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
	rc = generic_file_mmap(file, vma);
	if (rc == 0) {
		vma->vm_ops = &ll_file_vm_ops;
		vma->vm_ops->open(vma);
		/* update the inode's size and mtime */
		rc = ll_glimpse_size(inode);
	}

	return rc;
}
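
/*
 * For illustration, a hypothetical user-space sequence and the hooks in
 * this file that it exercises:
 *
 *	fd = open("/mnt/lustre/f", O_RDWR);
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *						-> ll_file_mmap()
 *	x = p[0];				-> ll_fault()
 *	p[0] = x;				-> ll_page_mkwrite()
 *	munmap(p, len);				-> ll_vm_close()
 */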