/*
 * Lockless get_user_pages_fast for x86
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>

#include <asm/pgtable.h>

static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
	return ACCESS_ONCE(*ptep);
#else
	/*
	 * With get_user_pages_fast, we walk down the pagetables without taking
	 * any locks.  For this we would like to load the pointers atomically,
	 * but that is not possible (without expensive cmpxchg8b) on PAE.  What
	 * we do have is the guarantee that a pte will only either go from not
	 * present to present, or present to not present or both -- it will not
	 * switch to a completely different present page without a TLB flush in
	 * between; something that we are blocking by holding interrupts off.
	 *
	 * Setting ptes from not present to present goes:
	 * ptep->pte_high = h;
	 * smp_wmb();
	 * ptep->pte_low = l;
	 *
	 * And present to not present goes:
	 * ptep->pte_low = 0;
	 * smp_wmb();
	 * ptep->pte_high = 0;
	 *
	 * We must ensure here that the load of pte_low sees l iff pte_high
	 * sees h.  We load pte_high *after* loading pte_low, which ensures we
	 * don't see an older value of pte_high.  *Then* we recheck pte_low,
	 * which ensures that we haven't picked up a changed pte high.  We might
	 * have got rubbish values from pte_low and pte_high, but we are
	 * guaranteed that pte_low will not have the present bit set *unless*
	 * it is 'l'.  And get_user_pages_fast only operates on present ptes, so
	 * we're safe.
	 *
	 * gup_get_pte should not be used or copied outside gup.c without being
	 * very careful -- it does not atomically load the pte or anything that
	 * is likely to be useful for you.
	 */
	pte_t pte;

retry:
	pte.pte_low = ptep->pte_low;
	smp_rmb();
	pte.pte_high = ptep->pte_high;
	smp_rmb();
	if (unlikely(pte.pte_low != ptep->pte_low))
		goto retry;

	return pte;
#endif
}

/*
 * The performance critical leaf functions are made noinline otherwise gcc
 * inlines everything into a single function which results in too much
 * register pressure.
 */
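/*
 * Walk the pte level of one pmd entry over [addr, end).  Every pte must be
 * present, belong to userspace, not be "special" and, for a write, be
 * writable; NUMA-hinting ptes also force the slow path.  Each page that
 * passes the checks gets an extra reference and is stored in @pages.
 */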
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	pte_t *ptep;

	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	ptep = pte_offset_map(&pmd, addr);
	do {
		pte_t pte = gup_get_pte(ptep);
		struct page *page;

		/* Similar to the PMD case, NUMA hinting must take slow path */
		if (pte_numa(pte)) {
			pte_unmap(ptep);
			return 0;
		}

		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
			pte_unmap(ptep);
			return 0;
		}
		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);
		get_page(page);
		SetPageReferenced(page);
		pages[*nr] = page;
		(*nr)++;

	} while (ptep++, addr += PAGE_SIZE, addr != end);
	pte_unmap(ptep - 1);

	return 1;
}

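/*
 * Take @nr references on a compound page with a single atomic_add on the
 * head page, instead of @nr individual get_page() calls.  The page is known
 * to be mapped, so its refcount cannot already be zero; it is also marked
 * referenced.
 */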
static inline void get_head_page_multiple(struct page *page, int nr)
{
	VM_BUG_ON_PAGE(page != compound_head(page), page);
	VM_BUG_ON_PAGE(page_count(page) == 0, page);
	atomic_add(nr, &page->_count);
	SetPageReferenced(page);
}

static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	pte_t pte = *(pte_t *)&pmd;
	struct page *head, *page;
	int refs;

	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;
	if ((pte_flags(pte) & mask) != mask)
		return 0;
	/* hugepages are never "special" */
	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);
	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON_PAGE(compound_head(page) != head, page);
		pages[*nr] = page;
		if (PageTail(page))
			get_huge_page_tail(page);
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);
	get_head_page_multiple(head, refs);

	return 1;
}

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset(&pud, addr);
	do {
		pmd_t pmd = *pmdp;

		next = pmd_addr_end(addr, end);
		/*
		 * The pmd_trans_splitting() check below explains why
		 * pmdp_splitting_flush has to flush the tlb, to stop
		 * this gup-fast code from running while we set the
		 * splitting bit in the pmd.  Returning zero will take
		 * the slow path that will call wait_split_huge_page()
		 * if the pmd is still in splitting state.  gup-fast
		 * can't because it has irqs disabled and
		 * wait_split_huge_page() would never return as the
		 * tlb flush IPI wouldn't run.
		 */
		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
			return 0;
		if (unlikely(pmd_large(pmd))) {
			/*
			 * NUMA hinting faults need to be handled in the GUP
			 * slowpath for accounting purposes and so that they
			 * can be serialised against THP migration.
			 */
			if (pmd_numa(pmd))
				return 0;
			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
				return 0;
		} else {
			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
				return 0;
		}
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

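/*
 * The pud level mirrors the pmd level one step up: a large pud (a 1GB
 * hugepage on x86-64) is filled into @pages directly by gup_huge_pud(),
 * otherwise gup_pud_range() descends to the pmd level.
 */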
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	pte_t pte = *(pte_t *)&pud;
	struct page *head, *page;
	int refs;

	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;
	if ((pte_flags(pte) & mask) != mask)
		return 0;
	/* hugepages are never "special" */
	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);
	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON_PAGE(compound_head(page) != head, page);
		pages[*nr] = page;
		if (PageTail(page))
			get_huge_page_tail(page);
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);
	get_head_page_multiple(head, refs);

	return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset(&pgd, addr);
	do {
		pud_t pud = *pudp;

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			return 0;
		if (unlikely(pud_large(pud))) {
			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
				return 0;
		} else {
			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
				return 0;
		}
	} while (pudp++, addr = next, addr != end);

	return 1;
}

/*
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
 * back to the regular GUP.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
			  struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	unsigned long flags;
	pgd_t *pgdp;
	int nr = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;
	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
					(void __user *)start, len)))
		return 0;

	/*
	 * XXX: batch / limit 'nr', to avoid large irq off latency
	 * needs some instrumenting to determine the common sizes used by
	 * important workloads (eg. DB2), and whether limiting the batch size
	 * will decrease performance.
	 *
	 * It seems like we're in the clear for the moment. Direct-IO is
	 * the main guy that batches up lots of get_user_pages, and even
	 * they are limited to 64-at-a-time which is not so many.
	 */
	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables and pages from being freed on x86.
	 *
	 * So long as we atomically load page table pointers versus teardown
	 * (which we do on x86, with the above PAE exception), we can follow the
	 * address down to the page and take a ref on it.
	 */
	local_irq_save(flags);
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			break;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			break;
	} while (pgdp++, addr = next, addr != end);
	local_irq_restore(flags);

	return nr;
}

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
			struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	pgd_t *pgdp;
	int nr = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;

	end = start + len;
	if (end < start)
		goto slow_irqon;

#ifdef CONFIG_X86_64
	if (end >> __VIRTUAL_MASK_SHIFT)
		goto slow_irqon;
#endif

	/*
	 * XXX: batch / limit 'nr', to avoid large irq off latency
	 * needs some instrumenting to determine the common sizes used by
	 * important workloads (eg. DB2), and whether limiting the batch size
	 * will decrease performance.
	 *
	 * It seems like we're in the clear for the moment. Direct-IO is
	 * the main guy that batches up lots of get_user_pages, and even
	 * they are limited to 64-at-a-time which is not so many.
	 */
	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables and pages from being freed on x86.
	 *
	 * So long as we atomically load page table pointers versus teardown
	 * (which we do on x86, with the above PAE exception), we can follow the
	 * address down to the page and take a ref on it.
	 */
	local_irq_disable();
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			goto slow;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			goto slow;
	} while (pgdp++, addr = next, addr != end);
	local_irq_enable();

	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
	return nr;

	{
		int ret;

slow:
		local_irq_enable();
slow_irqon:
		/* Try to get the remaining pages with get_user_pages */
		start += nr << PAGE_SHIFT;
		pages += nr;

		down_read(&mm->mmap_sem);
		ret = get_user_pages(current, mm, start,
				     (end - start) >> PAGE_SHIFT,
				     write, 0, pages, NULL);
		up_read(&mm->mmap_sem);

		/* Have to be a bit careful with return values */
		if (nr > 0) {
			if (ret < 0)
				ret = nr;
			else
				ret += nr;
		}

		return ret;
	}
}