highmem.c revision a19b27ce3847c3a5d4ea6b6c91b6f7154759af23
/*
 * High memory handling common code and variables.
 *
 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
 *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
 *
 *
 * Redesigned the x86 32-bit VM architecture to deal with
 * 64-bit physical space. With current x86 CPUs this
 * means up to 64 Gigabytes physical RAM.
 *
 * Rewrote high memory support to move the page cache into
 * high memory. Implemented permanent (schedulable) kmaps
 * based on Linus' idea.
 *
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/blktrace_api.h>
#include <asm/tlbflush.h>

static mempool_t *page_pool, *isa_page_pool;

static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
{
	return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
}

/*
 * Virtual_count is not a pure "count".
 *  0 means that it is not mapped, and has not been mapped
 *    since a TLB flush - it is usable.
 *  1 means that there are no users, but it has been mapped
 *    since the last TLB flush - so we can't use it.
 *  n means that there are (n-1) current users of it.
 */
#ifdef CONFIG_HIGHMEM

static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);

pte_t * pkmap_page_table;

static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);

static void flush_all_zero_pkmaps(void)
{
	int i;

	flush_cache_kmaps();

	for (i = 0; i < LAST_PKMAP; i++) {
		struct page *page;

		/*
		 * zero means we don't have anything to do,
		 * >1 means that it is still in use. Only
		 * a count of 1 means that it is free but
		 * needs to be unmapped
		 */
		if (pkmap_count[i] != 1)
			continue;
		pkmap_count[i] = 0;

		/* sanity check */
		if (pte_none(pkmap_page_table[i]))
			BUG();

		/*
		 * Don't need an atomic fetch-and-clear op here;
		 * no-one has the page mapped, and cannot get at
		 * its virtual address (and hence PTE) without first
		 * getting the kmap_lock (which is held here).
		 * So no dangers, even with speculative execution.
		 */
		page = pte_page(pkmap_page_table[i]);
		pte_clear(&init_mm, (unsigned long)page_address(page),
			  &pkmap_page_table[i]);

		set_page_address(page, NULL);
	}
	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
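
/*
 * Find a free pkmap slot for @page and install the mapping.  Called
 * with kmap_lock held; if every slot is in use, drop the lock and
 * sleep on pkmap_map_wait until another task unmaps an entry, then
 * retry.  Returns the new kernel virtual address.
 */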
static inline unsigned long map_new_virtual(struct page *page)
{
	unsigned long vaddr;
	int count;

start:
	count = LAST_PKMAP;
	/* Find an empty entry */
	for (;;) {
		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
		if (!last_pkmap_nr) {
			flush_all_zero_pkmaps();
			count = LAST_PKMAP;
		}
		if (!pkmap_count[last_pkmap_nr])
			break;	/* Found a usable entry */
		if (--count)
			continue;

		/*
		 * Sleep for somebody else to unmap their entries
		 */
		{
			DECLARE_WAITQUEUE(wait, current);

			__set_current_state(TASK_UNINTERRUPTIBLE);
			add_wait_queue(&pkmap_map_wait, &wait);
			spin_unlock(&kmap_lock);
			schedule();
			remove_wait_queue(&pkmap_map_wait, &wait);
			spin_lock(&kmap_lock);

			/* Somebody else might have mapped it while we slept */
			if (page_address(page))
				return (unsigned long)page_address(page);

			/* Re-start */
			goto start;
		}
	}
	vaddr = PKMAP_ADDR(last_pkmap_nr);
	set_pte_at(&init_mm, vaddr,
		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));

	pkmap_count[last_pkmap_nr] = 1;
	set_page_address(page, (void *)vaddr);

	return vaddr;
}
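
/*
 * kmap_high() is the slow path behind the architecture's kmap()
 * wrapper, which only comes here for highmem pages.  It may sleep,
 * so it must not be used from interrupt context (use kmap_atomic()
 * there).  A typical caller might look roughly like this
 * (illustrative sketch only):
 *
 *	void *vaddr = kmap(page);
 *	memcpy(vaddr, buf, len);
 *	kunmap(page);
 *
 * Mappings nest via pkmap_count, so every kmap() needs a matching
 * kunmap().
 */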
void fastcall *kmap_high(struct page *page)
{
	unsigned long vaddr;

	/*
	 * For highmem pages, we can't trust "virtual" until
	 * after we have the lock.
	 *
	 * We cannot call this from interrupts, as it may block
	 */
	spin_lock(&kmap_lock);
	vaddr = (unsigned long)page_address(page);
	if (!vaddr)
		vaddr = map_new_virtual(page);
	pkmap_count[PKMAP_NR(vaddr)]++;
	if (pkmap_count[PKMAP_NR(vaddr)] < 2)
		BUG();
	spin_unlock(&kmap_lock);
	return (void*) vaddr;
}

EXPORT_SYMBOL(kmap_high);

void fastcall kunmap_high(struct page *page)
{
	unsigned long vaddr;
	unsigned long nr;
	int need_wakeup;

	spin_lock(&kmap_lock);
	vaddr = (unsigned long)page_address(page);
	if (!vaddr)
		BUG();
	nr = PKMAP_NR(vaddr);

	/*
	 * A count must never go down to zero
	 * without a TLB flush!
	 */
	need_wakeup = 0;
	switch (--pkmap_count[nr]) {
	case 0:
		BUG();
	case 1:
		/*
		 * Avoid an unnecessary wake_up() function call.
		 * The common case is pkmap_count[] == 1, but
		 * no waiters.
		 * The tasks queued in the wait-queue are guarded
		 * by both the lock in the wait-queue-head and by
		 * the kmap_lock.  As the kmap_lock is held here,
		 * no need for the wait-queue-head's lock.  Simply
		 * test if the queue is empty.
		 */
		need_wakeup = waitqueue_active(&pkmap_map_wait);
	}
	spin_unlock(&kmap_lock);

	/* do wake-up, if needed, race-free outside of the spin lock */
	if (need_wakeup)
		wake_up(&pkmap_map_wait);
}

EXPORT_SYMBOL(kunmap_high);

#define POOL_SIZE	64

static __init int init_emergency_pool(void)
{
	struct sysinfo i;
	si_meminfo(&i);
	si_swapinfo(&i);

	if (!i.totalhigh)
		return 0;

	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
	if (!page_pool)
		BUG();
	printk("highmem bounce pool size: %d pages\n", POOL_SIZE);

	return 0;
}

__initcall(init_emergency_pool);

/*
 * highmem version, map in to vec
 */
static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
{
	unsigned long flags;
	unsigned char *vto;

	local_irq_save(flags);
	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
	kunmap_atomic(vto, KM_BOUNCE_READ);
	local_irq_restore(flags);
}

#else /* CONFIG_HIGHMEM */

#define bounce_copy_vec(to, vfrom)	\
	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)

#endif

#define ISA_POOL_SIZE	16

/*
 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
 * as the max address, so check if the pool has already been created.
 */
int init_emergency_isa_pool(void)
{
	if (isa_page_pool)
		return 0;

	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
				       mempool_free_pages, (void *) 0);
	if (!isa_page_pool)
		BUG();

	printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
	return 0;
}
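
/*
 * Bounce buffering, in brief: __blk_queue_bounce() below replaces any
 * bio segment that sits above q->bounce_pfn with a page taken from the
 * bounce mempool.  For WRITEs the data is copied into the bounce page
 * up front; for READs it is copied back to the original (possibly
 * highmem) page at completion time via copy_to_high_bio_irq().
 */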
/*
 * Simple bounce buffer support for highmem pages. Depending on the
 * queue gfp mask set, *to may or may not be a highmem page. kmap it
 * always, it will do the Right Thing
 */
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
	unsigned char *vfrom;
	struct bio_vec *tovec, *fromvec;
	int i;

	__bio_for_each_segment(tovec, to, i, 0) {
		fromvec = from->bi_io_vec + i;

		/*
		 * not bounced
		 */
		if (tovec->bv_page == fromvec->bv_page)
			continue;

		/*
		 * fromvec->bv_offset and fromvec->bv_len might have been
		 * modified by the block layer, so use the original copy,
		 * bounce_copy_vec already uses tovec->bv_len
		 */
		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;

		flush_dcache_page(tovec->bv_page);
		bounce_copy_vec(tovec, vfrom);
	}
}

static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
{
	struct bio *bio_orig = bio->bi_private;
	struct bio_vec *bvec, *org_vec;
	int i;

	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);

	/*
	 * free up bounce indirect pages used
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		org_vec = bio_orig->bi_io_vec + i;
		if (bvec->bv_page == org_vec->bv_page)
			continue;

		mempool_free(bvec->bv_page, pool);
		dec_page_state(nr_bounce);
	}

	bio_endio(bio_orig, bio_orig->bi_size, err);
	bio_put(bio);
}

static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	bounce_end_io(bio, page_pool, err);
	return 0;
}

static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	bounce_end_io(bio, isa_page_pool, err);
	return 0;
}

static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
{
	struct bio *bio_orig = bio->bi_private;

	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		copy_to_high_bio_irq(bio_orig, bio);

	bounce_end_io(bio, pool, err);
}

static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	__bounce_end_io_read(bio, page_pool, err);
	return 0;
}

static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	__bounce_end_io_read(bio, isa_page_pool, err);
	return 0;
}

static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
			       mempool_t *pool)
{
	struct page *page;
	struct bio *bio = NULL;
	int i, rw = bio_data_dir(*bio_orig);
	struct bio_vec *to, *from;

	bio_for_each_segment(from, *bio_orig, i) {
		page = from->bv_page;

		/*
		 * is destination page below bounce pfn?
		 */
		if (page_to_pfn(page) < q->bounce_pfn)
			continue;

		/*
		 * irk, bounce it
		 */
		if (!bio)
			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);

		to = bio->bi_io_vec + i;

		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
		to->bv_len = from->bv_len;
		to->bv_offset = from->bv_offset;
		inc_page_state(nr_bounce);

		if (rw == WRITE) {
			char *vto, *vfrom;

			flush_dcache_page(from->bv_page);
			vto = page_address(to->bv_page) + to->bv_offset;
			vfrom = kmap(from->bv_page) + from->bv_offset;
			memcpy(vto, vfrom, to->bv_len);
			kunmap(from->bv_page);
		}
	}

	/*
	 * no pages bounced
	 */
	if (!bio)
		return;

	/*
	 * at least one page was bounced, fill in possible non-highmem
	 * pages
	 */
	__bio_for_each_segment(from, *bio_orig, i, 0) {
		to = bio_iovec_idx(bio, i);
		if (!to->bv_page) {
			to->bv_page = from->bv_page;
			to->bv_len = from->bv_len;
			to->bv_offset = from->bv_offset;
		}
	}

	bio->bi_bdev = (*bio_orig)->bi_bdev;
	bio->bi_flags |= (1 << BIO_BOUNCED);
	bio->bi_sector = (*bio_orig)->bi_sector;
	bio->bi_rw = (*bio_orig)->bi_rw;

	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
	bio->bi_idx = (*bio_orig)->bi_idx;
	bio->bi_size = (*bio_orig)->bi_size;

	if (pool == page_pool) {
		bio->bi_end_io = bounce_end_io_write;
		if (rw == READ)
			bio->bi_end_io = bounce_end_io_read;
	} else {
		bio->bi_end_io = bounce_end_io_write_isa;
		if (rw == READ)
			bio->bi_end_io = bounce_end_io_read_isa;
	}

	bio->bi_private = *bio_orig;
	*bio_orig = bio;
}
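
/*
 * Entry point used by the block layer when a bio is submitted to a
 * queue.  Whether bouncing happens at all is governed by
 * q->bounce_pfn and q->bounce_gfp, which drivers typically set up
 * along the lines of (illustrative sketch only):
 *
 *	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 *
 * BLK_BOUNCE_ISA instead selects the GFP_DMA-backed isa_page_pool.
 */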
void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
{
	mempool_t *pool;

	/*
	 * for non-isa bounce case, just check if the bounce pfn is equal
	 * to or bigger than the highest pfn in the system -- in that case,
	 * don't waste time iterating over bio segments
	 */
	if (!(q->bounce_gfp & GFP_DMA)) {
		if (q->bounce_pfn >= blk_max_pfn)
			return;
		pool = page_pool;
	} else {
		BUG_ON(!isa_page_pool);
		pool = isa_page_pool;
	}

	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);

	/*
	 * slow path
	 */
	__blk_queue_bounce(q, bio_orig, pool);
}

EXPORT_SYMBOL(blk_queue_bounce);
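
/*
 * When the architecture does not store a virtual address in struct
 * page (HASHED_PAGE_VIRTUAL), the page->virtual association for
 * kmapped highmem pages is kept in the small hash table below and
 * looked up by page_address()/set_page_address().
 */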
#if defined(HASHED_PAGE_VIRTUAL)

#define PA_HASH_ORDER	7

/*
 * Describes one page->virtual association
 */
struct page_address_map {
	struct page *page;
	void *virtual;
	struct list_head list;
};

/*
 * page_address_map freelist, allocated from page_address_maps.
 */
static struct list_head page_address_pool;	/* freelist */
static spinlock_t pool_lock;			/* protects page_address_pool */

/*
 * Hash table bucket
 */
static struct page_address_slot {
	struct list_head lh;			/* List of page_address_maps */
	spinlock_t lock;			/* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];

static struct page_address_slot *page_slot(struct page *page)
{
	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}

void *page_address(struct page *page)
{
	unsigned long flags;
	void *ret;
	struct page_address_slot *pas;

	if (!PageHighMem(page))
		return lowmem_page_address(page);

	pas = page_slot(page);
	ret = NULL;
	spin_lock_irqsave(&pas->lock, flags);
	if (!list_empty(&pas->lh)) {
		struct page_address_map *pam;

		list_for_each_entry(pam, &pas->lh, list) {
			if (pam->page == page) {
				ret = pam->virtual;
				goto done;
			}
		}
	}
done:
	spin_unlock_irqrestore(&pas->lock, flags);
	return ret;
}

EXPORT_SYMBOL(page_address);

void set_page_address(struct page *page, void *virtual)
{
	unsigned long flags;
	struct page_address_slot *pas;
	struct page_address_map *pam;

	BUG_ON(!PageHighMem(page));

	pas = page_slot(page);
	if (virtual) {		/* Add */
		BUG_ON(list_empty(&page_address_pool));

		spin_lock_irqsave(&pool_lock, flags);
		pam = list_entry(page_address_pool.next,
				struct page_address_map, list);
		list_del(&pam->list);
		spin_unlock_irqrestore(&pool_lock, flags);

		pam->page = page;
		pam->virtual = virtual;

		spin_lock_irqsave(&pas->lock, flags);
		list_add_tail(&pam->list, &pas->lh);
		spin_unlock_irqrestore(&pas->lock, flags);
	} else {		/* Remove */
		spin_lock_irqsave(&pas->lock, flags);
		list_for_each_entry(pam, &pas->lh, list) {
			if (pam->page == page) {
				list_del(&pam->list);
				spin_unlock_irqrestore(&pas->lock, flags);
				spin_lock_irqsave(&pool_lock, flags);
				list_add_tail(&pam->list, &page_address_pool);
				spin_unlock_irqrestore(&pool_lock, flags);
				goto done;
			}
		}
		spin_unlock_irqrestore(&pas->lock, flags);
	}
done:
	return;
}

static struct page_address_map page_address_maps[LAST_PKMAP];

void __init page_address_init(void)
{
	int i;

	INIT_LIST_HEAD(&page_address_pool);
	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
		list_add(&page_address_maps[i].list, &page_address_pool);
	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
		INIT_LIST_HEAD(&page_address_htable[i].lh);
		spin_lock_init(&page_address_htable[i].lock);
	}
	spin_lock_init(&pool_lock);
}

#endif	/* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */