/*
 * zcache.c
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 * Copyright (c) 2010,2011, Nitin Gupta
 *
 * Zcache provides an in-kernel "host implementation" for transcendent memory
 * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
 * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
 * 1) "compression buddies" ("zbud") is used for ephemeral pages
 * 2) xvmalloc is used for persistent pages.
 * Xvmalloc (based on the TLSF allocator) has very low fragmentation
 * so maximizes space efficiency, while zbud allows pairs (and potentially,
 * in the future, more than a pair of) compressed pages to be closely linked
 * so that reclaiming can be done via the kernel's physical-page-oriented
 * "shrinker" interface.
 *
 * [1] For a definition of page-accessible memory (aka PAM), see:
 *   http://marc.info/?l=linux-mm&m=127811271605009
 * RAMSTER TODO:
 *   - handle remotifying of buddied pages (see zbud_remotify_zbpg)
 *   - kernel boot params: nocleancache/nofrontswap don't always work?!?
 */

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/lzo.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/math64.h>
#include "tmem.h"
#include "zcache.h"
#include "ramster.h"
#include "cluster/tcp.h"

#include "xvmalloc.h"	/* temporary until change to zsmalloc */

#define RAMSTER_TESTING

#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
#error "ramster is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
#endif
#ifdef CONFIG_CLEANCACHE
#include <linux/cleancache.h>
#endif
#ifdef CONFIG_FRONTSWAP
#include <linux/frontswap.h>
#endif

enum ramster_remotify_op {
	RAMSTER_REMOTIFY_EPH_PUT,
	RAMSTER_REMOTIFY_PERS_PUT,
	RAMSTER_REMOTIFY_FLUSH_PAGE,
	RAMSTER_REMOTIFY_FLUSH_OBJ,
	RAMSTER_INTRANSIT_PERS
};

struct ramster_remotify_hdr {
	enum ramster_remotify_op op;
	struct list_head list;
};

#define ZBH_SENTINEL	0x43214321
#define ZBPG_SENTINEL	0xdeadbeef

#define ZBUD_MAX_BUDS 2

struct zbud_hdr {
	struct ramster_remotify_hdr rem_op;
	uint16_t client_id;
	uint16_t pool_id;
	struct tmem_oid oid;
	uint32_t index;
	uint16_t size; /* compressed size in bytes, zero means unused */
	DECL_SENTINEL
};

#define ZVH_SENTINEL	0x43214321
static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;

struct zv_hdr {
	struct ramster_remotify_hdr rem_op;
	uint16_t client_id;
	uint16_t pool_id;
	struct tmem_oid oid;
	uint32_t index;
	DECL_SENTINEL
};

struct flushlist_node {
	struct ramster_remotify_hdr rem_op;
	struct tmem_xhandle xh;
};

union remotify_list_node {
	struct ramster_remotify_hdr rem_op;
	struct zv_hdr zv;
	struct zbud_hdr zbud;
	struct flushlist_node flist;
};

static LIST_HEAD(zcache_rem_op_list);
static DEFINE_SPINLOCK(zcache_rem_op_list_lock);
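/*
 * Illustrative sketch, not compiled into the driver: every structure that
 * can sit on zcache_rem_op_list embeds a struct ramster_remotify_hdr as
 * its first member, so a consumer can pop the list head, inspect
 * rem_op.op, and cast back to the enclosing type (the real consumer is
 * zcache_do_remotify_ops() below; handle_zv()/handle_flush() here are
 * hypothetical helpers).
 */
#if 0
static void remotify_demux_sketch(struct ramster_remotify_hdr *rem_op)
{
	union remotify_list_node *u = (union remotify_list_node *)rem_op;

	switch (rem_op->op) {
	case RAMSTER_REMOTIFY_PERS_PUT:
		/* header is the first member, so this cast is equivalent
		 * to container_of(rem_op, struct zv_hdr, rem_op) */
		handle_zv(&u->zv);		/* hypothetical */
		break;
	case RAMSTER_REMOTIFY_FLUSH_PAGE:
		handle_flush(&u->flist);	/* hypothetical */
		break;
	default:
		break;
	}
}
#endif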
#if 0
/* this is more aggressive but may cause other problems? */
#define ZCACHE_GFP_MASK	(GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
#else
#define ZCACHE_GFP_MASK \
	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
#endif

#define MAX_POOLS_PER_CLIENT 16

#define MAX_CLIENTS 16
#define LOCAL_CLIENT ((uint16_t)-1)

MODULE_LICENSE("GPL");

struct zcache_client {
	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
	struct xv_pool *xvpool;
	bool allocated;
	atomic_t refcount;
};

static struct zcache_client zcache_host;
static struct zcache_client zcache_clients[MAX_CLIENTS];

static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
{
	BUG_ON(cli == NULL);
	if (cli == &zcache_host)
		return LOCAL_CLIENT;
	return cli - &zcache_clients[0];
}

static inline bool is_local_client(struct zcache_client *cli)
{
	return cli == &zcache_host;
}

/**********
 * Compression buddies ("zbud") provides for packing two (or, possibly
 * in the future, more) compressed ephemeral pages into a single "raw"
 * (physical) page and tracking them with data structures so that
 * the raw pages can be easily reclaimed.
 *
 * A zbud page ("zbpg") is an aligned page containing a list_head,
 * a lock, and two "zbud headers".  The remainder of the physical
 * page is divided up into aligned 64-byte "chunks" which contain
 * the compressed data for zero, one, or two zbuds.  Each zbpg
 * resides on: (1) an "unused list" if it has no zbuds; (2) a
 * "buddied" list if it is fully populated with two zbuds; or
 * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
 * the one unbuddied zbud uses.  The data inside a zbpg cannot be
 * read or written unless the zbpg's lock is held.
 */
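/*
 * Worked example of the chunk accounting defined just below, assuming
 * 4 KiB pages and a struct zbud_page of roughly 80 bytes (the exact size
 * is config-dependent): NCHUNKS = ((4096 - 80) & ~63) >> 6 = 62, so
 * MAX_CHUNK = 61 and the largest single zbud is 61 * 64 = 3904 bytes.
 * A zbud compressed to 1000 bytes occupies (1000 + 63) >> 6 = 16 chunks
 * and sits on unbuddied list 16 until paired with a buddy that needs at
 * most 62 - 16 = 46 chunks.
 */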
struct zbud_page {
	struct list_head bud_list;
	spinlock_t lock;
	struct zbud_hdr buddy[ZBUD_MAX_BUDS];
	DECL_SENTINEL
	/* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
};

#define CHUNK_SHIFT	6
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define CHUNK_MASK	(~(CHUNK_SIZE-1))
#define NCHUNKS		(((PAGE_SIZE - sizeof(struct zbud_page)) & \
				CHUNK_MASK) >> CHUNK_SHIFT)
#define MAX_CHUNK	(NCHUNKS-1)

static struct {
	struct list_head list;
	unsigned count;
} zbud_unbuddied[NCHUNKS];
/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static unsigned long zbud_cumul_chunk_counts[NCHUNKS];

struct list_head zbud_buddied_list;
static unsigned long zcache_zbud_buddied_count;

/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_budlists_spinlock);

static atomic_t zcache_zbud_curr_raw_pages;
static atomic_t zcache_zbud_curr_zpages;
static unsigned long zcache_zbud_curr_zbytes;
static unsigned long zcache_zbud_cumul_zpages;
static unsigned long zcache_zbud_cumul_zbytes;
static unsigned long zcache_compress_poor;
static unsigned long zcache_policy_percent_exceeded;
static unsigned long zcache_mean_compress_poor;

/*
 * RAMster counters
 * - Remote pages are pages with a local pampd but the data is remote
 * - Foreign pages are pages stored locally but belonging to another node
 */
static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
static unsigned long ramster_pers_remotify_enable;
static unsigned long ramster_eph_remotify_enable;
static unsigned long ramster_eph_pages_remoted;
static unsigned long ramster_eph_pages_remote_failed;
static unsigned long ramster_pers_pages_remoted;
static unsigned long ramster_pers_pages_remote_failed;
static unsigned long ramster_pers_pages_remote_nomem;
static unsigned long ramster_remote_objects_flushed;
static unsigned long ramster_remote_object_flushes_failed;
static unsigned long ramster_remote_pages_flushed;
static unsigned long ramster_remote_page_flushes_failed;
static unsigned long ramster_remote_eph_pages_succ_get;
static unsigned long ramster_remote_pers_pages_succ_get;
static unsigned long ramster_remote_eph_pages_unsucc_get;
static unsigned long ramster_remote_pers_pages_unsucc_get;
static atomic_t ramster_curr_flnode_count = ATOMIC_INIT(0);
static unsigned long ramster_curr_flnode_count_max;
static atomic_t ramster_foreign_eph_pampd_count = ATOMIC_INIT(0);
static unsigned long ramster_foreign_eph_pampd_count_max;
static atomic_t ramster_foreign_pers_pampd_count = ATOMIC_INIT(0);
static unsigned long ramster_foreign_pers_pampd_count_max;

/* forward references */
static void *zcache_get_free_page(void);
static void zcache_free_page(void *p);

/*
 * zbud helper functions
 */

static inline unsigned zbud_max_buddy_size(void)
{
	return MAX_CHUNK << CHUNK_SHIFT;
}

static inline unsigned zbud_size_to_chunks(unsigned size)
{
	BUG_ON(size == 0 || size > zbud_max_buddy_size());
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
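/*
 * The DECL/SET/ASSERT/INVERT_SENTINEL macros used throughout come from
 * tmem.h.  As a rough sketch of the idea (the real definitions may
 * differ), they plant a magic word in each structure so use-after-free
 * and overwrite bugs trip immediately:
 *
 *	#define DECL_SENTINEL		 unsigned long sentinel;
 *	#define SET_SENTINEL(_x, _y)	 ((_x)->sentinel = _y##_SENTINEL)
 *	#define INVERT_SENTINEL(_x, _y) ((_x)->sentinel = ~_y##_SENTINEL)
 *	#define ASSERT_SENTINEL(_x, _y) \
 *		WARN_ON((_x)->sentinel != _y##_SENTINEL)
 *
 * So ASSERT_SENTINEL(zh, ZBH) checks zh->sentinel against ZBH_SENTINEL.
 */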
static int zbud_budnum(struct zbud_hdr *zh)
{
	unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
	struct zbud_page *zbpg = NULL;
	unsigned budnum = -1U;
	int i;

	for (i = 0; i < ZBUD_MAX_BUDS; i++)
		if (offset == offsetof(typeof(*zbpg), buddy[i])) {
			budnum = i;
			break;
		}
	BUG_ON(budnum == -1U);
	return budnum;
}

static char *zbud_data(struct zbud_hdr *zh, unsigned size)
{
	struct zbud_page *zbpg;
	char *p;
	unsigned budnum;

	ASSERT_SENTINEL(zh, ZBH);
	budnum = zbud_budnum(zh);
	BUG_ON(size == 0 || size > zbud_max_buddy_size());
	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	ASSERT_SPINLOCK(&zbpg->lock);
	p = (char *)zbpg;
	if (budnum == 0)
		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
							CHUNK_MASK);
	else if (budnum == 1)
		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
	return p;
}

static void zbud_copy_from_pampd(char *data, size_t *size, struct zbud_hdr *zh)
{
	struct zbud_page *zbpg;
	char *p;
	unsigned budnum;

	ASSERT_SENTINEL(zh, ZBH);
	budnum = zbud_budnum(zh);
	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	spin_lock(&zbpg->lock);
	BUG_ON(zh->size > *size);
	p = (char *)zbpg;
	if (budnum == 0)
		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
							CHUNK_MASK);
	else if (budnum == 1)
		p += PAGE_SIZE - ((zh->size + CHUNK_SIZE - 1) & CHUNK_MASK);
	/* client should be filled in by caller */
	memcpy(data, p, zh->size);
	*size = zh->size;
	spin_unlock(&zbpg->lock);
}
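/*
 * Layout implemented by zbud_data() above, illustrated for a 4 KiB page
 * with the header rounded up to two 64-byte chunks (an assumption for
 * this example): buddy 0's data grows upward from the first chunk
 * boundary after the header, while buddy 1's data sits flush against the
 * end of the page, rounded up to a chunk boundary:
 *
 *	+--------+----------------+..........+----------------+
 *	| header | buddy 0 data   |   free   |  buddy 1 data  |
 *	+--------+----------------+..........+----------------+
 *	0        128              128+N*64   4096-M*64     4096
 *
 * so the two buddies grow toward each other and never overlap.
 */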
/*
 * zbud raw page management
 */

static struct zbud_page *zbud_alloc_raw_page(void)
{
	struct zbud_page *zbpg = NULL;
	struct zbud_hdr *zh0, *zh1;

	zbpg = zcache_get_free_page();
	if (likely(zbpg != NULL)) {
		INIT_LIST_HEAD(&zbpg->bud_list);
		zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
		spin_lock_init(&zbpg->lock);
		atomic_inc(&zcache_zbud_curr_raw_pages);
		SET_SENTINEL(zbpg, ZBPG);
		zh0->size = 0; zh1->size = 0;
		tmem_oid_set_invalid(&zh0->oid);
		tmem_oid_set_invalid(&zh1->oid);
	}
	return zbpg;
}

static void zbud_free_raw_page(struct zbud_page *zbpg)
{
	struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];

	ASSERT_SENTINEL(zbpg, ZBPG);
	BUG_ON(!list_empty(&zbpg->bud_list));
	ASSERT_SPINLOCK(&zbpg->lock);
	BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
	BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
	INVERT_SENTINEL(zbpg, ZBPG);
	spin_unlock(&zbpg->lock);
	atomic_dec(&zcache_zbud_curr_raw_pages);
	zcache_free_page(zbpg);
}

/*
 * core zbud handling routines
 */

static unsigned zbud_free(struct zbud_hdr *zh)
{
	unsigned size;

	ASSERT_SENTINEL(zh, ZBH);
	BUG_ON(!tmem_oid_valid(&zh->oid));
	size = zh->size;
	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
	zh->size = 0;
	tmem_oid_set_invalid(&zh->oid);
	INVERT_SENTINEL(zh, ZBH);
	zcache_zbud_curr_zbytes -= size;
	atomic_dec(&zcache_zbud_curr_zpages);
	return size;
}

static void zbud_free_and_delist(struct zbud_hdr *zh)
{
	unsigned chunks;
	struct zbud_hdr *zh_other;
	unsigned budnum = zbud_budnum(zh), size;
	struct zbud_page *zbpg =
		container_of(zh, struct zbud_page, buddy[budnum]);

	/* FIXME, should be BUG_ON, pool destruction path doesn't disable
	 * interrupts tmem_destroy_pool()->tmem_pampd_destroy_all_in_obj()->
	 * tmem_objnode_node_destroy()-> zcache_pampd_free() */
	WARN_ON(!irqs_disabled());
	spin_lock(&zbpg->lock);
	if (list_empty(&zbpg->bud_list)) {
		/* ignore zombie page... see zbud_evict_pages() */
		spin_unlock(&zbpg->lock);
		return;
	}
	size = zbud_free(zh);
	ASSERT_SPINLOCK(&zbpg->lock);
	zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
	if (zh_other->size == 0) { /* was unbuddied: unlist and free */
		chunks = zbud_size_to_chunks(size);
		spin_lock(&zbud_budlists_spinlock);
		BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
		list_del_init(&zbpg->bud_list);
		zbud_unbuddied[chunks].count--;
		spin_unlock(&zbud_budlists_spinlock);
		zbud_free_raw_page(zbpg);
	} else { /* was buddied: move remaining buddy to unbuddied list */
		chunks = zbud_size_to_chunks(zh_other->size);
		spin_lock(&zbud_budlists_spinlock);
		list_del_init(&zbpg->bud_list);
		zcache_zbud_buddied_count--;
		list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
		zbud_unbuddied[chunks].count++;
		spin_unlock(&zbud_budlists_spinlock);
		spin_unlock(&zbpg->lock);
	}
}

static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
					struct tmem_oid *oid,
					uint32_t index, struct page *page,
					void *cdata, unsigned size)
{
	struct zbud_hdr *zh0, *zh1, *zh = NULL;
	struct zbud_page *zbpg = NULL, *ztmp;
	unsigned nchunks;
	char *to;
	int i, found_good_buddy = 0;

	nchunks = zbud_size_to_chunks(size);
	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
		spin_lock(&zbud_budlists_spinlock);
		if (!list_empty(&zbud_unbuddied[i].list)) {
			list_for_each_entry_safe(zbpg, ztmp,
				    &zbud_unbuddied[i].list, bud_list) {
				if (spin_trylock(&zbpg->lock)) {
					found_good_buddy = i;
					goto found_unbuddied;
				}
			}
		}
		spin_unlock(&zbud_budlists_spinlock);
	}
	/* didn't find a good buddy, try allocating a new page */
	zbpg = zbud_alloc_raw_page();
	if (unlikely(zbpg == NULL))
		goto out;
	/* ok, have a new page; add it to the right unbuddied list */
	spin_lock(&zbud_budlists_spinlock);
	spin_lock(&zbpg->lock);
	list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
	zbud_unbuddied[nchunks].count++;
	zh = &zbpg->buddy[0];
	goto init_zh;

found_unbuddied:
	ASSERT_SPINLOCK(&zbpg->lock);
	zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
	BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
	if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
		ASSERT_SENTINEL(zh0, ZBH);
		zh = zh1;
	} else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
		ASSERT_SENTINEL(zh1, ZBH);
		zh = zh0;
	} else
		BUG();
	list_del_init(&zbpg->bud_list);
	zbud_unbuddied[found_good_buddy].count--;
	list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
	zcache_zbud_buddied_count++;

init_zh:
	SET_SENTINEL(zh, ZBH);
	zh->size = size;
	zh->index = index;
	zh->oid = *oid;
	zh->pool_id = pool_id;
	zh->client_id = client_id;
	to = zbud_data(zh, size);
	memcpy(to, cdata, size);
	spin_unlock(&zbpg->lock);
	spin_unlock(&zbud_budlists_spinlock);
	zbud_cumul_chunk_counts[nchunks]++;
	atomic_inc(&zcache_zbud_curr_zpages);
	zcache_zbud_cumul_zpages++;
	zcache_zbud_curr_zbytes += size;
	zcache_zbud_cumul_zbytes += size;
out:
	return zh;
}
static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
{
	struct zbud_page *zbpg;
	unsigned budnum = zbud_budnum(zh);
	size_t out_len = PAGE_SIZE;
	char *to_va, *from_va;
	unsigned size;
	int ret = 0;

	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	spin_lock(&zbpg->lock);
	if (list_empty(&zbpg->bud_list)) {
		/* ignore zombie page... see zbud_evict_pages() */
		ret = -EINVAL;
		goto out;
	}
	ASSERT_SENTINEL(zh, ZBH);
	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
	to_va = kmap_atomic(page);
	size = zh->size;
	from_va = zbud_data(zh, size);
	ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
	BUG_ON(ret != LZO_E_OK);
	BUG_ON(out_len != PAGE_SIZE);
	kunmap_atomic(to_va);
out:
	spin_unlock(&zbpg->lock);
	return ret;
}

/*
 * The following routines handle shrinking of ephemeral pages by evicting
 * pages "least valuable" first.
 */

static unsigned long zcache_evicted_raw_pages;
static unsigned long zcache_evicted_buddied_pages;
static unsigned long zcache_evicted_unbuddied_pages;

static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
						uint16_t poolid);
static void zcache_put_pool(struct tmem_pool *pool);

/*
 * Flush and free all zbuds in a zbpg, then free the pageframe
 */
static void zbud_evict_zbpg(struct zbud_page *zbpg)
{
	struct zbud_hdr *zh;
	int i, j;
	uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
	uint32_t index[ZBUD_MAX_BUDS];
	struct tmem_oid oid[ZBUD_MAX_BUDS];
	struct tmem_pool *pool;
	unsigned long flags;

	ASSERT_SPINLOCK(&zbpg->lock);
	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
		zh = &zbpg->buddy[i];
		if (zh->size) {
			client_id[j] = zh->client_id;
			pool_id[j] = zh->pool_id;
			oid[j] = zh->oid;
			index[j] = zh->index;
			j++;
		}
	}
	spin_unlock(&zbpg->lock);
	for (i = 0; i < j; i++) {
		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
		BUG_ON(pool == NULL);
		local_irq_save(flags);
		/* these flushes should dispose of any local storage */
		tmem_flush_page(pool, &oid[i], index[i]);
		local_irq_restore(flags);
		zcache_put_pool(pool);
	}
}

/*
 * Free nr pages.  This code is funky because we want to hold the locks
 * protecting various lists for as short a time as possible, and in some
 * circumstances the list may change asynchronously when the list lock is
 * not held.  In some cases we also trylock not only to avoid waiting on a
 * page in use by another cpu, but also to avoid potential deadlock due to
 * lock inversion.
 */
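/*
 * Minimal sketch of the trylock pattern the comment above describes (not
 * compiled; mirrors the loop bodies below).  The list lock and per-page
 * lock can be taken in opposite orders on other paths, so a blocking
 * spin_lock() here could deadlock; a busy page is simply skipped instead:
 */
#if 0
	spin_lock_bh(&zbud_budlists_spinlock);
	list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
		if (!spin_trylock(&zbpg->lock))
			continue;	/* busy page: skip, don't block */
		zbud_unbuddied[i].count--;
		spin_unlock(&zbud_budlists_spinlock); /* drop before flush */
		zbud_evict_zbpg(zbpg);	/* drops zbpg->lock itself */
		break;
	}
#endif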
static void zbud_evict_pages(int nr)
{
	struct zbud_page *zbpg;
	int i, newly_unused_pages = 0;

	/* now try freeing unbuddied pages, starting with least space avail */
	for (i = 0; i < MAX_CHUNK; i++) {
retry_unbud_list_i:
		spin_lock_bh(&zbud_budlists_spinlock);
		if (list_empty(&zbud_unbuddied[i].list)) {
			spin_unlock_bh(&zbud_budlists_spinlock);
			continue;
		}
		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
			if (unlikely(!spin_trylock(&zbpg->lock)))
				continue;
			zbud_unbuddied[i].count--;
			spin_unlock(&zbud_budlists_spinlock);
			zcache_evicted_unbuddied_pages++;
			/* want budlists unlocked when doing zbpg eviction */
			zbud_evict_zbpg(zbpg);
			newly_unused_pages++;
			local_bh_enable();
			if (--nr <= 0)
				goto evict_unused;
			goto retry_unbud_list_i;
		}
		spin_unlock_bh(&zbud_budlists_spinlock);
	}

	/* as a last resort, free buddied pages */
retry_bud_list:
	spin_lock_bh(&zbud_budlists_spinlock);
	if (list_empty(&zbud_buddied_list)) {
		spin_unlock_bh(&zbud_budlists_spinlock);
		goto evict_unused;
	}
	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
		if (unlikely(!spin_trylock(&zbpg->lock)))
			continue;
		zcache_zbud_buddied_count--;
		spin_unlock(&zbud_budlists_spinlock);
		zcache_evicted_buddied_pages++;
		/* want budlists unlocked when doing zbpg eviction */
		zbud_evict_zbpg(zbpg);
		newly_unused_pages++;
		local_bh_enable();
		if (--nr <= 0)
			goto evict_unused;
		goto retry_bud_list;
	}
	spin_unlock_bh(&zbud_budlists_spinlock);

evict_unused:
	return;
}

static DEFINE_PER_CPU(unsigned char *, zcache_remoteputmem);

static int zbud_remotify_zbud(struct tmem_xhandle *xh, char *data,
				size_t size)
{
	struct tmem_pool *pool;
	int i, remotenode, ret = -1;
	unsigned char cksum, *p;
	unsigned long flags;

	for (p = data, cksum = 0, i = 0; i < size; i++)
		cksum += *p++;
	ret = ramster_remote_put(xh, data, size, true, &remotenode);
	if (ret == 0) {
		/* data was successfully remoted so change the local version
		 * to point to the remote node where it landed */
		pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh->pool_id);
		BUG_ON(pool == NULL);
		local_irq_save(flags);
		/* tmem_replace will also free up any local space */
		(void)tmem_replace(pool, &xh->oid, xh->index,
				pampd_make_remote(remotenode, size, cksum));
		local_irq_restore(flags);
		zcache_put_pool(pool);
		ramster_eph_pages_remoted++;
		ret = 0;
	} else
		ramster_eph_pages_remote_failed++;
	return ret;
}
static int zbud_remotify_zbpg(struct zbud_page *zbpg)
{
	struct zbud_hdr *zh1, *zh2 = NULL;
	struct tmem_xhandle xh1, xh2 = { 0 };
	char *data1 = NULL, *data2 = NULL;
	size_t size1 = 0, size2 = 0;
	int ret = 0;
	unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);

	ASSERT_SPINLOCK(&zbpg->lock);
	if (zbpg->buddy[0].size == 0)
		zh1 = &zbpg->buddy[1];
	else if (zbpg->buddy[1].size == 0)
		zh1 = &zbpg->buddy[0];
	else {
		zh1 = &zbpg->buddy[0];
		zh2 = &zbpg->buddy[1];
	}
	/* don't remotify pages that are already remotified */
	if (zh1->client_id != LOCAL_CLIENT)
		zh1 = NULL;
	if ((zh2 != NULL) && (zh2->client_id != LOCAL_CLIENT))
		zh2 = NULL;

	/* copy the data and metadata so can release lock */
	if (zh1 != NULL) {
		xh1.client_id = zh1->client_id;
		xh1.pool_id = zh1->pool_id;
		xh1.oid = zh1->oid;
		xh1.index = zh1->index;
		size1 = zh1->size;
		memcpy(tmpmem, zbud_data(zh1, size1), size1);
		data1 = tmpmem;
		tmpmem += size1;
	}
	if (zh2 != NULL) {
		xh2.client_id = zh2->client_id;
		xh2.pool_id = zh2->pool_id;
		xh2.oid = zh2->oid;
		xh2.index = zh2->index;
		size2 = zh2->size;
		memcpy(tmpmem, zbud_data(zh2, size2), size2);
		data2 = tmpmem;
	}
	spin_unlock(&zbpg->lock);
	preempt_enable();

	/* OK, no locks held anymore, remotify one or both zbuds */
	if (zh1 != NULL)
		ret = zbud_remotify_zbud(&xh1, data1, size1);
	if (zh2 != NULL)
		ret |= zbud_remotify_zbud(&xh2, data2, size2);
	return ret;
}

void zbud_remotify_pages(int nr)
{
	struct zbud_page *zbpg;
	int i, ret;

	/*
	 * for now just try remotifying unbuddied pages, starting with
	 * least space avail
	 */
	for (i = 0; i < MAX_CHUNK; i++) {
retry_unbud_list_i:
		preempt_disable();  /* enable in zbud_remotify_zbpg */
		spin_lock_bh(&zbud_budlists_spinlock);
		if (list_empty(&zbud_unbuddied[i].list)) {
			spin_unlock_bh(&zbud_budlists_spinlock);
			preempt_enable();
			continue; /* next i in for loop */
		}
		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
			if (unlikely(!spin_trylock(&zbpg->lock)))
				continue; /* next list_for_each_entry */
			zbud_unbuddied[i].count--;
			/* want budlists unlocked when doing zbpg remotify */
			spin_unlock_bh(&zbud_budlists_spinlock);
			ret = zbud_remotify_zbpg(zbpg);
			/* preemption is re-enabled in zbud_remotify_zbpg */
			if (ret == 0) {
				if (--nr <= 0)
					goto out;
				goto retry_unbud_list_i;
			}
			/* if fail to remotify any page, quit */
			pr_err("TESTING zbud_remotify_pages failed on page,"
				" trying to re-add\n");
			spin_lock_bh(&zbud_budlists_spinlock);
			spin_lock(&zbpg->lock);
			list_add_tail(&zbpg->bud_list,
					&zbud_unbuddied[i].list);
			zbud_unbuddied[i].count++;
			spin_unlock(&zbpg->lock);
			spin_unlock_bh(&zbud_budlists_spinlock);
			pr_err("TESTING zbud_remotify_pages failed on page,"
				" finished re-add\n");
			goto out;
		}
		spin_unlock_bh(&zbud_budlists_spinlock);
		preempt_enable();
	}

next_buddied_zbpg:
	preempt_disable();  /* enable in zbud_remotify_zbpg */
	spin_lock_bh(&zbud_budlists_spinlock);
	if (list_empty(&zbud_buddied_list))
		goto unlock_out;
	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
		if (unlikely(!spin_trylock(&zbpg->lock)))
			continue; /* next list_for_each_entry */
		zcache_zbud_buddied_count--;
		/* want budlists unlocked when doing zbpg remotify */
		spin_unlock_bh(&zbud_budlists_spinlock);
		ret = zbud_remotify_zbpg(zbpg);
		/* preemption is re-enabled in zbud_remotify_zbpg */
		if (ret == 0) {
			if (--nr <= 0)
				goto out;
			goto next_buddied_zbpg;
		}
		/* if fail to remotify any page, quit */
		pr_err("TESTING zbud_remotify_pages failed on BUDDIED page,"
			" trying to re-add\n");
		spin_lock_bh(&zbud_budlists_spinlock);
		spin_lock(&zbpg->lock);
		list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
		zcache_zbud_buddied_count++;
		spin_unlock(&zbpg->lock);
		spin_unlock_bh(&zbud_budlists_spinlock);
		pr_err("TESTING zbud_remotify_pages failed on BUDDIED page,"
			" finished re-add\n");
		goto out;
	}
unlock_out:
	spin_unlock_bh(&zbud_budlists_spinlock);
	preempt_enable();
out:
	return;
}
/* the "flush list" asynchronously collects pages to remotely flush */
#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)
static void ramster_flnode_free(struct flushlist_node *,
				struct tmem_pool *);

static void zcache_remote_flush_page(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = ramster_remote_flush(xh, remotenode);
	if (ret >= 0)
		ramster_remote_pages_flushed++;
	else
		ramster_remote_page_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}

static void zcache_remote_flush_object(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = ramster_remote_flush_object(xh, remotenode);
	if (ret >= 0)
		ramster_remote_objects_flushed++;
	else
		ramster_remote_object_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}

static void zcache_remote_eph_put(struct zbud_hdr *zbud)
{
	/* FIXME */
}
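/*
 * Sketch of the producer side of the flush list (not compiled; mirrors
 * what zcache_pampd_free() does further below).  A node records the
 * remote handle and the op, then is queued for the remotify worker:
 */
#if 0
	struct flushlist_node *flnode = ramster_flnode_alloc(pool);

	flnode->xh.client_id = pampd_remote_node(pampd);
	flnode->xh.pool_id = pool->pool_id;
	flnode->xh.oid = *oid;
	flnode->xh.index = index;
	flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
	spin_lock(&zcache_rem_op_list_lock);
	list_add(&flnode->rem_op.list, &zcache_rem_op_list);
	spin_unlock(&zcache_rem_op_list_lock);
#endif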
static void zcache_remote_pers_put(struct zv_hdr *zv)
{
	struct tmem_xhandle xh;
	uint16_t size;
	bool ephemeral;
	int remotenode, ret = -1;
	char *data;
	struct tmem_pool *pool;
	unsigned long flags;
	unsigned char cksum;
	char *p;
	int i;
	unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);

	ASSERT_SENTINEL(zv, ZVH);
	BUG_ON(zv->client_id != LOCAL_CLIENT);
	local_bh_disable();
	xh.client_id = zv->client_id;
	xh.pool_id = zv->pool_id;
	xh.oid = zv->oid;
	xh.index = zv->index;
	size = xv_get_object_size(zv) - sizeof(*zv);
	BUG_ON(size == 0 || size > zv_max_page_size);
	data = (char *)zv + sizeof(*zv);
	for (p = data, cksum = 0, i = 0; i < size; i++)
		cksum += *p++;
	memcpy(tmpmem, data, size);
	data = tmpmem;
	pool = zcache_get_pool_by_id(zv->client_id, zv->pool_id);
	ephemeral = is_ephemeral(pool);
	zcache_put_pool(pool);
	/* now OK to release lock set in caller */
	spin_unlock(&zcache_rem_op_list_lock);
	local_bh_enable();
	preempt_disable();
	ret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode);
	preempt_enable_no_resched();
	if (ret != 0) {
		/*
		 * This is some form of a memory leak... if the remote put
		 * fails, there will never be another attempt to remotify
		 * this page.  But since we've dropped the zv pointer,
		 * the page may have been freed or the data replaced
		 * so we can't just "put it back" in the remote op list.
		 * Even if we could, not sure where to put it in the list
		 * because there may be flushes that must be strictly
		 * ordered vs the put.  So leave this as a FIXME for now.
		 * But count them so we know if it becomes a problem.
		 */
		ramster_pers_pages_remote_failed++;
		goto out;
	} else
		atomic_inc(&ramster_remote_pers_pages);
	ramster_pers_pages_remoted++;
	/*
	 * data was successfully remoted so change the local version to
	 * point to the remote node where it landed
	 */
	local_bh_disable();
	pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
	local_irq_save(flags);
	(void)tmem_replace(pool, &xh.oid, xh.index,
			pampd_make_remote(remotenode, size, cksum));
	local_irq_restore(flags);
	zcache_put_pool(pool);
	local_bh_enable();
out:
	return;
}

static void zcache_do_remotify_ops(int nr)
{
	struct ramster_remotify_hdr *rem_op;
	union remotify_list_node *u;

	while (1) {
		if (!nr)
			goto out;
		spin_lock(&zcache_rem_op_list_lock);
		if (list_empty(&zcache_rem_op_list)) {
			spin_unlock(&zcache_rem_op_list_lock);
			goto out;
		}
		rem_op = list_first_entry(&zcache_rem_op_list,
				struct ramster_remotify_hdr, list);
		list_del_init(&rem_op->list);
		/* for a pers put, the lock is released in the callee */
		if (rem_op->op != RAMSTER_REMOTIFY_PERS_PUT)
			spin_unlock(&zcache_rem_op_list_lock);
		u = (union remotify_list_node *)rem_op;
		switch (rem_op->op) {
		case RAMSTER_REMOTIFY_EPH_PUT:
			BUG();	/* not yet implemented, see FIXME stub */
			zcache_remote_eph_put((struct zbud_hdr *)rem_op);
			break;
		case RAMSTER_REMOTIFY_PERS_PUT:
			zcache_remote_pers_put((struct zv_hdr *)rem_op);
			break;
		case RAMSTER_REMOTIFY_FLUSH_PAGE:
			zcache_remote_flush_page((struct flushlist_node *)u);
			break;
		case RAMSTER_REMOTIFY_FLUSH_OBJ:
			zcache_remote_flush_object((struct flushlist_node *)u);
			break;
		default:
			BUG();
		}
		nr--;	/* bound the batch to nr ops */
	}
out:
	return;
}

/*
 * Communicate interface revision with userspace
 */
#include "cluster/ramster_nodemanager.h"
static unsigned long ramster_interface_revision = R2NM_API_VERSION;

/*
 * For now, just push over a few pages every few seconds to
 * ensure that it basically works
 */
static struct workqueue_struct *ramster_remotify_workqueue;
static void ramster_remotify_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(ramster_remotify_worker,
		ramster_remotify_process);

static void ramster_remotify_queue_delayed_work(unsigned long delay)
{
	if (!queue_delayed_work(ramster_remotify_workqueue,
				&ramster_remotify_worker, delay))
		pr_err("ramster_remotify: bad workqueue\n");
}

static int use_frontswap;
static int use_cleancache;
static int ramster_remote_target_nodenum = -1;
static void ramster_remotify_process(struct work_struct *work)
{
	static bool remotify_in_progress;

	BUG_ON(irqs_disabled());
	if (remotify_in_progress)
		ramster_remotify_queue_delayed_work(HZ);
	else if (ramster_remote_target_nodenum != -1) {
		remotify_in_progress = true;
#ifdef CONFIG_CLEANCACHE
		if (use_cleancache && ramster_eph_remotify_enable)
			zbud_remotify_pages(5000); /* FIXME is this a good number? */
#endif
#ifdef CONFIG_FRONTSWAP
		if (use_frontswap && ramster_pers_remotify_enable)
			zcache_do_remotify_ops(500); /* FIXME is this a good number? */
#endif
		remotify_in_progress = false;
		ramster_remotify_queue_delayed_work(HZ);
	}
}
static void ramster_remotify_init(void)
{
	unsigned long n = 60UL;

	ramster_remotify_workqueue =
		create_singlethread_workqueue("ramster_remotify");
	ramster_remotify_queue_delayed_work(n * HZ);
}

static void zbud_init(void)
{
	int i;

	INIT_LIST_HEAD(&zbud_buddied_list);
	zcache_zbud_buddied_count = 0;
	for (i = 0; i < NCHUNKS; i++) {
		INIT_LIST_HEAD(&zbud_unbuddied[i].list);
		zbud_unbuddied[i].count = 0;
	}
}

#ifdef CONFIG_SYSFS
/*
 * These sysfs routines show a nice distribution of how many zbpg's are
 * currently (and have ever been placed) in each unbuddied list.  It's fun
 * to watch but can probably go away before final merge.
 */
static int zbud_show_unbuddied_list_counts(char *buf)
{
	int i;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++)
		p += sprintf(p, "%u ", zbud_unbuddied[i].count);
	return p - buf;
}

static int zbud_show_cumul_chunk_counts(char *buf)
{
	unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
	unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
	unsigned long total_chunks_lte_42 = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
		chunks += zbud_cumul_chunk_counts[i];
		total_chunks += zbud_cumul_chunk_counts[i];
		sum_total_chunks += i * zbud_cumul_chunk_counts[i];
		if (i == 21)
			total_chunks_lte_21 = total_chunks;
		if (i == 32)
			total_chunks_lte_32 = total_chunks;
		if (i == 42)
			total_chunks_lte_42 = total_chunks;
	}
	p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
		total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}
#endif

/**********
 * This "zv" PAM implementation combines the TLSF-based xvMalloc
 * with lzo1x compression to maximize the amount of data that can
 * be packed into a physical page.
 *
 * Zv represents a PAM page with the index and object (plus a "size" value
 * necessary for decompression) immediately preceding the compressed data.
 */
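/*
 * Worked example of zv sizing (assuming 4 KiB pages and a struct zv_hdr
 * of about 32 bytes; the exact size depends on config and sentinels): a
 * page that compresses to 1000 bytes is stored as
 * alloc_size = 1000 + sizeof(struct zv_hdr) ~= 1032 bytes inside an
 * xvmalloc block, and accounted in chunk bucket (1032 + 63) >> 6 = 17.
 * Callers of zv_create() reject anything with clen > zv_max_zsize, so
 * alloc_size always fits well under PAGE_SIZE.
 */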
/* rudimentary policy limits */
/* total number of persistent pages may not exceed this percentage */
static unsigned int zv_page_count_policy_percent = 75;
/*
 * byte count defining poor compression; pages with greater zsize will be
 * rejected
 */
static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
/*
 * byte count defining poor *mean* compression; pages with greater zsize
 * will be rejected until sufficient better-compressed pages are accepted,
 * driving the mean below this threshold
 */
static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;

static atomic_t zv_curr_dist_counts[NCHUNKS];
static atomic_t zv_cumul_dist_counts[NCHUNKS];

static struct zv_hdr *zv_create(struct zcache_client *cli, uint32_t pool_id,
				struct tmem_oid *oid, uint32_t index,
				void *cdata, unsigned clen)
{
	struct page *page;
	struct zv_hdr *zv = NULL;
	uint32_t offset;
	int alloc_size = clen + sizeof(struct zv_hdr);
	int chunks = (alloc_size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
	int ret;

	BUG_ON(!irqs_disabled());
	BUG_ON(chunks >= NCHUNKS);
	ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr),
			&page, &offset, ZCACHE_GFP_MASK);
	if (unlikely(ret))
		goto out;
	atomic_inc(&zv_curr_dist_counts[chunks]);
	atomic_inc(&zv_cumul_dist_counts[chunks]);
	zv = kmap_atomic(page) + offset;
	zv->index = index;
	zv->oid = *oid;
	zv->pool_id = pool_id;
	SET_SENTINEL(zv, ZVH);
	INIT_LIST_HEAD(&zv->rem_op.list);
	zv->client_id = get_client_id_from_client(cli);
	zv->rem_op.op = RAMSTER_REMOTIFY_PERS_PUT;
	if (zv->client_id == LOCAL_CLIENT) {
		spin_lock(&zcache_rem_op_list_lock);
		list_add_tail(&zv->rem_op.list, &zcache_rem_op_list);
		spin_unlock(&zcache_rem_op_list_lock);
	}
	memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
	kunmap_atomic(zv);
out:
	return zv;
}

/* similar to zv_create, but just reserve space, no data yet */
static struct zv_hdr *zv_alloc(struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index,
				unsigned clen)
{
	struct zcache_client *cli = pool->client;
	struct page *page;
	struct zv_hdr *zv = NULL;
	uint32_t offset;
	int ret;

	BUG_ON(!irqs_disabled());
	BUG_ON(!is_local_client(pool->client));
	ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr),
			&page, &offset, ZCACHE_GFP_MASK);
	if (unlikely(ret))
		goto out;
	zv = kmap_atomic(page) + offset;
	SET_SENTINEL(zv, ZVH);
	INIT_LIST_HEAD(&zv->rem_op.list);
	zv->client_id = LOCAL_CLIENT;
	zv->rem_op.op = RAMSTER_INTRANSIT_PERS;
	zv->index = index;
	zv->oid = *oid;
	zv->pool_id = pool->pool_id;
	kunmap_atomic(zv);
out:
	return zv;
}

static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
{
	unsigned long flags;
	struct page *page;
	uint32_t offset;
	uint16_t size = xv_get_object_size(zv);
	int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;

	ASSERT_SENTINEL(zv, ZVH);
	BUG_ON(chunks >= NCHUNKS);
	atomic_dec(&zv_curr_dist_counts[chunks]);
	spin_lock(&zcache_rem_op_list_lock);
	size = xv_get_object_size(zv) - sizeof(*zv);
	BUG_ON(size == 0);
	INVERT_SENTINEL(zv, ZVH);
	if (!list_empty(&zv->rem_op.list))
		list_del_init(&zv->rem_op.list);
	spin_unlock(&zcache_rem_op_list_lock);
	page = virt_to_page(zv);
	offset = (unsigned long)zv & ~PAGE_MASK;
	local_irq_save(flags);
	xv_free(xvpool, page, offset);
	local_irq_restore(flags);
}

static void zv_decompress(struct page *page, struct zv_hdr *zv)
{
	size_t clen = PAGE_SIZE;
	char *to_va;
	unsigned size;
	int ret;

	ASSERT_SENTINEL(zv, ZVH);
	size = xv_get_object_size(zv) - sizeof(*zv);
	BUG_ON(size == 0);
	to_va = kmap_atomic(page);
	ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
					size, to_va, &clen);
	kunmap_atomic(to_va);
	BUG_ON(ret != LZO_E_OK);
	BUG_ON(clen != PAGE_SIZE);
}

static void zv_copy_from_pampd(char *data, size_t *bufsize, struct zv_hdr *zv)
{
	unsigned size;

	ASSERT_SENTINEL(zv, ZVH);
	size = xv_get_object_size(zv) - sizeof(*zv);
	BUG_ON(size == 0 || size > zv_max_page_size);
	BUG_ON(size > *bufsize);
	memcpy(data, (char *)zv + sizeof(*zv), size);
	*bufsize = size;
}

static void zv_copy_to_pampd(struct zv_hdr *zv, char *data, size_t size)
{
	unsigned zv_size;

	ASSERT_SENTINEL(zv, ZVH);
	zv_size = xv_get_object_size(zv) - sizeof(*zv);
	BUG_ON(zv_size != size);
	BUG_ON(zv_size == 0 || zv_size > zv_max_page_size);
	memcpy((char *)zv + sizeof(*zv), data, size);
}

#ifdef CONFIG_SYSFS
/*
 * show a distribution of compression stats for zv pages.
 */

static int zv_curr_dist_counts_show(char *buf)
{
	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		n = atomic_read(&zv_curr_dist_counts[i]);
		p += sprintf(p, "%lu ", n);
		chunks += n;
		sum_total_chunks += i * n;
	}
	p += sprintf(p, "mean:%lu\n",
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}

static int zv_cumul_dist_counts_show(char *buf)
{
	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		n = atomic_read(&zv_cumul_dist_counts[i]);
		p += sprintf(p, "%lu ", n);
		chunks += n;
		sum_total_chunks += i * n;
	}
	p += sprintf(p, "mean:%lu\n",
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}

/*
 * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
 * pages that don't compress to less than this value (including metadata
 * overhead) to be rejected.  We don't allow the value to get too close
 * to PAGE_SIZE.
 */
static ssize_t zv_max_zsize_show(struct kobject *kobj,
				struct kobj_attribute *attr,
				char *buf)
{
	return sprintf(buf, "%u\n", zv_max_zsize);
}

static ssize_t zv_max_zsize_store(struct kobject *kobj,
				struct kobj_attribute *attr,
				const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
		return -EINVAL;
	zv_max_zsize = val;
	return count;
}
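/*
 * Concretely, with 4 KiB pages the ceiling enforced above is
 * (4096 / 8) * 7 = 3584 bytes, i.e. a page must compress to at most 7/8
 * of its original size to be kept; the default zv_max_zsize is the same
 * 3584, and the default zv_max_mean_zsize is (4096 / 8) * 5 = 2560,
 * i.e. a 5/8 bound on the running mean.
 */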
/*
 * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
 * pages that don't compress to less than this value (including metadata
 * overhead) to be rejected UNLESS the mean compression is also smaller
 * than this value.  In other words, we are load-balancing-by-zsize the
 * accepted pages.  Again, we don't allow the value to get too close
 * to PAGE_SIZE.
 */
static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
				struct kobj_attribute *attr,
				char *buf)
{
	return sprintf(buf, "%u\n", zv_max_mean_zsize);
}

static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
				struct kobj_attribute *attr,
				const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
		return -EINVAL;
	zv_max_mean_zsize = val;
	return count;
}

/*
 * setting zv_page_count_policy_percent via sysfs sets an upper bound of
 * persistent (e.g. swap) pages that will be retained according to:
 *     (zv_page_count_policy_percent * totalram_pages) / 100
 * when that limit is reached, further puts will be rejected (until
 * some pages have been flushed).  Note that, due to compression,
 * this number may exceed 100; it defaults to 75 and we set an
 * arbitrary limit of 150.  A poor choice will almost certainly result
 * in OOM's, so this value should only be changed prudently.
 */
static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
						struct kobj_attribute *attr,
						char *buf)
{
	return sprintf(buf, "%u\n", zv_page_count_policy_percent);
}

static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
						struct kobj_attribute *attr,
						const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > 150))
		return -EINVAL;
	zv_page_count_policy_percent = val;
	return count;
}

static struct kobj_attribute zcache_zv_max_zsize_attr = {
	.attr = { .name = "zv_max_zsize", .mode = 0644 },
	.show = zv_max_zsize_show,
	.store = zv_max_zsize_store,
};

static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
	.attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
	.show = zv_max_mean_zsize_show,
	.store = zv_max_mean_zsize_store,
};

static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
	.attr = { .name = "zv_page_count_policy_percent",
		  .mode = 0644 },
	.show = zv_page_count_policy_percent_show,
	.store = zv_page_count_policy_percent_store,
};
#endif

/*
 * zcache core code starts here
 */

/* useful stats not collected by cleancache or frontswap */
static unsigned long zcache_flush_total;
static unsigned long zcache_flush_found;
static unsigned long zcache_flobj_total;
static unsigned long zcache_flobj_found;
static unsigned long zcache_failed_eph_puts;
static unsigned long zcache_nonactive_puts;
static unsigned long zcache_failed_pers_puts;

/*
 * Tmem operations assume the poolid implies the invoking client.
 * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
 * RAMster has each client numbered by cluster node, and a KVM version
 * of zcache would have one client per guest and each client might
 * have a poolid==N.
 */
static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli = NULL;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else {
		if (cli_id >= MAX_CLIENTS)
			goto out;
		cli = &zcache_clients[cli_id];
		if (cli == NULL)
			goto out;
		atomic_inc(&cli->refcount);
	}
	if (poolid < MAX_POOLS_PER_CLIENT) {
		pool = cli->tmem_pools[poolid];
		if (pool != NULL)
			atomic_inc(&pool->refcount);
	}
out:
	return pool;
}

static void zcache_put_pool(struct tmem_pool *pool)
{
	struct zcache_client *cli = NULL;

	if (pool == NULL)
		BUG();
	cli = pool->client;
	atomic_dec(&pool->refcount);
	atomic_dec(&cli->refcount);
}

int zcache_new_client(uint16_t cli_id)
{
	struct zcache_client *cli = NULL;
	int ret = -1;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	if (cli->allocated)
		goto out;
	cli->allocated = 1;
#ifdef CONFIG_FRONTSWAP
	cli->xvpool = xv_create_pool();
	if (cli->xvpool == NULL)
		goto out;
#endif
	ret = 0;
out:
	return ret;
}

/* counters for debugging */
static unsigned long zcache_failed_get_free_pages;
static unsigned long zcache_failed_alloc;
static unsigned long zcache_put_to_flush;

/*
 * for now, use named slabs so we can easily track usage; later we can
 * either just use kmalloc, or perhaps add a slab-like allocator
 * to more carefully manage total memory utilization
 */
static struct kmem_cache *zcache_objnode_cache;
static struct kmem_cache *zcache_obj_cache;
static struct kmem_cache *ramster_flnode_cache;
static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_obj_count_max;
static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_objnode_count_max;

/*
 * to avoid memory allocation recursion (e.g. due to direct reclaim), we
 * preload all necessary data structures so the hostops callbacks never
 * actually do a malloc
 */
struct zcache_preload {
	void *page;
	struct tmem_obj *obj;
	int nr;
	struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
	struct flushlist_node *flnode;
};
static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };

static int zcache_do_preload(struct tmem_pool *pool)
{
	struct zcache_preload *kp;
	struct tmem_objnode *objnode;
	struct tmem_obj *obj;
	struct flushlist_node *flnode;
	void *page;
	int ret = -ENOMEM;

	if (unlikely(zcache_objnode_cache == NULL))
		goto out;
	if (unlikely(zcache_obj_cache == NULL))
		goto out;
	preempt_disable();
	kp = &__get_cpu_var(zcache_preloads);
	while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
		preempt_enable_no_resched();
		objnode = kmem_cache_alloc(zcache_objnode_cache,
				ZCACHE_GFP_MASK);
		if (unlikely(objnode == NULL)) {
			zcache_failed_alloc++;
			goto out;
		}
		preempt_disable();
		kp = &__get_cpu_var(zcache_preloads);
		if (kp->nr < ARRAY_SIZE(kp->objnodes))
			kp->objnodes[kp->nr++] = objnode;
		else
			kmem_cache_free(zcache_objnode_cache, objnode);
	}
	preempt_enable_no_resched();
	obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
	if (unlikely(obj == NULL)) {
		zcache_failed_alloc++;
		goto out;
	}
	flnode = kmem_cache_alloc(ramster_flnode_cache, ZCACHE_GFP_MASK);
	if (unlikely(flnode == NULL)) {
		zcache_failed_alloc++;
		goto out;
	}
	if (is_ephemeral(pool)) {
		page = (void *)__get_free_page(ZCACHE_GFP_MASK);
		if (unlikely(page == NULL)) {
			zcache_failed_get_free_pages++;
			kmem_cache_free(zcache_obj_cache, obj);
			kmem_cache_free(ramster_flnode_cache, flnode);
			goto out;
		}
	}
	preempt_disable();
	kp = &__get_cpu_var(zcache_preloads);
	if (kp->obj == NULL)
		kp->obj = obj;
	else
		kmem_cache_free(zcache_obj_cache, obj);
	if (kp->flnode == NULL)
		kp->flnode = flnode;
	else
		kmem_cache_free(ramster_flnode_cache, flnode);
	if (is_ephemeral(pool)) {
		if (kp->page == NULL)
			kp->page = page;
		else
			free_page((unsigned long)page);
	}
	ret = 0;
out:
	return ret;
}
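/*
 * Sketch of how the preloads are consumed (not compiled): a put path
 * first fills the per-cpu stash above with preemption enabled, then
 * enters tmem with interrupts off; the hostops callbacks
 * (zcache_obj_alloc() etc. below) only ever pop from the stash, so they
 * never allocate and never recurse into reclaim.  "do_tmem_put" here
 * stands in for the real tmem entry point:
 */
#if 0
	if (zcache_do_preload(pool) == 0) {	/* allocates, preemptible */
		local_irq_save(flags);
		ret = do_tmem_put(pool, oidp, index, data);	/* hypothetical */
		local_irq_restore(flags);
	}
#endif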
static int ramster_do_preload_flnode_only(struct tmem_pool *pool)
{
	struct zcache_preload *kp;
	struct flushlist_node *flnode;
	int ret = -ENOMEM;

	BUG_ON(!irqs_disabled());
	if (unlikely(ramster_flnode_cache == NULL))
		BUG();
	kp = &__get_cpu_var(zcache_preloads);
	flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
	if (unlikely(flnode == NULL) && kp->flnode == NULL)
		BUG(); /* FIXME handle more gracefully, but how??? */
	else if (kp->flnode == NULL)
		kp->flnode = flnode;
	else
		kmem_cache_free(ramster_flnode_cache, flnode);
	return ret;
}

static void *zcache_get_free_page(void)
{
	struct zcache_preload *kp;
	void *page;

	kp = &__get_cpu_var(zcache_preloads);
	page = kp->page;
	BUG_ON(page == NULL);
	kp->page = NULL;
	return page;
}

static void zcache_free_page(void *p)
{
	free_page((unsigned long)p);
}

/*
 * zcache implementation for tmem host ops
 */

static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
{
	struct tmem_objnode *objnode = NULL;
	unsigned long count;
	struct zcache_preload *kp;

	kp = &__get_cpu_var(zcache_preloads);
	if (kp->nr <= 0)
		goto out;
	objnode = kp->objnodes[kp->nr - 1];
	BUG_ON(objnode == NULL);
	kp->objnodes[kp->nr - 1] = NULL;
	kp->nr--;
	count = atomic_inc_return(&zcache_curr_objnode_count);
	if (count > zcache_curr_objnode_count_max)
		zcache_curr_objnode_count_max = count;
out:
	return objnode;
}

static void zcache_objnode_free(struct tmem_objnode *objnode,
				struct tmem_pool *pool)
{
	atomic_dec(&zcache_curr_objnode_count);
	BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
	kmem_cache_free(zcache_objnode_cache, objnode);
}

static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
{
	struct tmem_obj *obj = NULL;
	unsigned long count;
	struct zcache_preload *kp;

	kp = &__get_cpu_var(zcache_preloads);
	obj = kp->obj;
	BUG_ON(obj == NULL);
	kp->obj = NULL;
	count = atomic_inc_return(&zcache_curr_obj_count);
	if (count > zcache_curr_obj_count_max)
		zcache_curr_obj_count_max = count;
	return obj;
}

static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
{
	atomic_dec(&zcache_curr_obj_count);
	BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
	kmem_cache_free(zcache_obj_cache, obj);
}

static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
{
	struct flushlist_node *flnode = NULL;
	struct zcache_preload *kp;
	int count;

	kp = &__get_cpu_var(zcache_preloads);
	flnode = kp->flnode;
	BUG_ON(flnode == NULL);
	kp->flnode = NULL;
	count = atomic_inc_return(&ramster_curr_flnode_count);
	if (count > ramster_curr_flnode_count_max)
		ramster_curr_flnode_count_max = count;
	return flnode;
}

static void ramster_flnode_free(struct flushlist_node *flnode,
				struct tmem_pool *pool)
{
	atomic_dec(&ramster_curr_flnode_count);
	BUG_ON(atomic_read(&ramster_curr_flnode_count) < 0);
	kmem_cache_free(ramster_flnode_cache, flnode);
}

static struct tmem_hostops zcache_hostops = {
	.obj_alloc = zcache_obj_alloc,
	.obj_free = zcache_obj_free,
	.objnode_alloc = zcache_objnode_alloc,
	.objnode_free = zcache_objnode_free,
};

/*
 * zcache implementations for PAM page descriptor ops
 */

static inline void dec_and_check(atomic_t *pvar)
{
	atomic_dec(pvar);
	/* later when all accounting is fixed, make this a BUG */
	WARN_ON_ONCE(atomic_read(pvar) < 0);
}

static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_eph_pampd_count_max;
static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_pers_pampd_count_max;
/* forward reference */
static int zcache_compress(struct page *from, void **out_va, size_t *out_len);

static int zcache_pampd_eph_create(char *data, size_t size, bool raw,
				struct tmem_pool *pool, struct tmem_oid *oid,
				uint32_t index, void **pampd)
{
	int ret = -1;
	void *cdata = data;
	size_t clen = size;
	struct zcache_client *cli = pool->client;
	uint16_t client_id = get_client_id_from_client(cli);
	struct page *page = NULL;
	unsigned long count;

	if (!raw) {
		page = virt_to_page(data);
		ret = zcache_compress(page, &cdata, &clen);
		if (ret == 0)
			goto out;
		if (clen == 0 || clen > zbud_max_buddy_size()) {
			zcache_compress_poor++;
			goto out;
		}
	}
	*pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
					index, page, cdata, clen);
	if (*pampd == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	ret = 0;
	count = atomic_inc_return(&zcache_curr_eph_pampd_count);
	if (count > zcache_curr_eph_pampd_count_max)
		zcache_curr_eph_pampd_count_max = count;
	if (client_id != LOCAL_CLIENT) {
		count = atomic_inc_return(&ramster_foreign_eph_pampd_count);
		if (count > ramster_foreign_eph_pampd_count_max)
			ramster_foreign_eph_pampd_count_max = count;
	}
out:
	return ret;
}
static int zcache_pampd_pers_create(char *data, size_t size, bool raw,
				struct tmem_pool *pool, struct tmem_oid *oid,
				uint32_t index, void **pampd)
{
	int ret = -1;
	void *cdata = data;
	size_t clen = size;
	struct zcache_client *cli = pool->client;
	struct page *page;
	unsigned long count;
	unsigned long zv_mean_zsize;
	struct zv_hdr *zv;
	long curr_pers_pampd_count;
	u64 total_zsize;
#ifdef RAMSTER_TESTING
	static bool pampd_neg_warned;
#endif

	curr_pers_pampd_count = atomic_read(&zcache_curr_pers_pampd_count) -
			atomic_read(&ramster_remote_pers_pages);
#ifdef RAMSTER_TESTING
	/* should always be positive, but warn if accounting is off */
	if (curr_pers_pampd_count < 0 && !pampd_neg_warned) {
		pr_warn("ramster: bad accounting for curr_pers_pampd_count\n");
		pampd_neg_warned = true;
	}
#endif
	if (curr_pers_pampd_count >
		    (zv_page_count_policy_percent * totalram_pages) / 100) {
		zcache_policy_percent_exceeded++;
		goto out;
	}
	if (raw)
		goto ok_to_create;
	page = virt_to_page(data);
	if (zcache_compress(page, &cdata, &clen) == 0)
		goto out;
	/* reject if compression is too poor */
	if (clen > zv_max_zsize) {
		zcache_compress_poor++;
		goto out;
	}
	/* reject if mean compression is too poor */
	if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
		total_zsize = xv_get_total_size_bytes(cli->xvpool);
		zv_mean_zsize = div_u64(total_zsize, curr_pers_pampd_count);
		if (zv_mean_zsize > zv_max_mean_zsize) {
			zcache_mean_compress_poor++;
			goto out;
		}
	}
ok_to_create:
	*pampd = (void *)zv_create(cli, pool->pool_id, oid, index, cdata, clen);
	if (*pampd == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	ret = 0;
	count = atomic_inc_return(&zcache_curr_pers_pampd_count);
	if (count > zcache_curr_pers_pampd_count_max)
		zcache_curr_pers_pampd_count_max = count;
	if (is_local_client(cli))
		goto out;
	zv = *(struct zv_hdr **)pampd;
	count = atomic_inc_return(&ramster_foreign_pers_pampd_count);
	if (count > ramster_foreign_pers_pampd_count_max)
		ramster_foreign_pers_pampd_count_max = count;
out:
	return ret;
}

static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
				struct tmem_pool *pool, struct tmem_oid *oid,
				uint32_t index)
{
	void *pampd = NULL;
	int ret;
	bool ephemeral;

	BUG_ON(preemptible());
	ephemeral = (eph == 1) || ((eph == 0) && is_ephemeral(pool));
	if (ephemeral)
		ret = zcache_pampd_eph_create(data, size, raw, pool,
						oid, index, &pampd);
	else
		ret = zcache_pampd_pers_create(data, size, raw, pool,
						oid, index, &pampd);
	/* FIXME add some counters here for failed creates? */
	return pampd;
}

/*
 * fill the pageframe corresponding to the struct page with the data
 * from the passed pampd
 */
static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
				void *pampd, struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index)
{
	int ret = 0;

	BUG_ON(preemptible());
	BUG_ON(is_ephemeral(pool)); /* Fix later for shared pools? */
	BUG_ON(pampd_is_remote(pampd));
	if (raw)
		zv_copy_from_pampd(data, bufsize, pampd);
	else
		zv_decompress(virt_to_page(data), pampd);
	return ret;
}

static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
				void *pampd, struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index)
{
	int ret = 0;
	unsigned long flags;
	struct zcache_client *cli = pool->client;

	BUG_ON(preemptible());
	BUG_ON(pampd_is_remote(pampd));
	if (is_ephemeral(pool)) {
		local_irq_save(flags);
		if (raw)
			zbud_copy_from_pampd(data, bufsize, pampd);
		else
			ret = zbud_decompress(virt_to_page(data), pampd);
		zbud_free_and_delist((struct zbud_hdr *)pampd);
		local_irq_restore(flags);
		if (!is_local_client(cli))
			dec_and_check(&ramster_foreign_eph_pampd_count);
		dec_and_check(&zcache_curr_eph_pampd_count);
	} else {
		if (is_local_client(cli))
			BUG();
		if (raw)
			zv_copy_from_pampd(data, bufsize, pampd);
		else
			zv_decompress(virt_to_page(data), pampd);
		zv_free(cli->xvpool, pampd);
		if (!is_local_client(cli))
			dec_and_check(&ramster_foreign_pers_pampd_count);
		dec_and_check(&zcache_curr_pers_pampd_count);
		ret = 0;
	}
	return ret;
}

static bool zcache_pampd_is_remote(void *pampd)
{
	return pampd_is_remote(pampd);
}
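/*
 * A pampd ("PAM page descriptor") here is an opaque void * that is either
 * a pointer to a local zbud_hdr/zv_hdr, or a descriptor of a remote page
 * built by pampd_make_remote(remotenode, size, cksum).  The predicates
 * pampd_is_remote()/pampd_is_intransit() and the accessors
 * pampd_remote_node()/pampd_mask_intransit_and_remote() are presumably
 * defined in ramster.h; the exact bit-packing lives there, and the code
 * below assumes only that remote/in-transit status and the node number
 * can be recovered from the descriptor itself.
 */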
 */
static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index, bool acct)
{
	struct zcache_client *cli = pool->client;
	bool eph = is_ephemeral(pool);
	struct zv_hdr *zv;

	BUG_ON(preemptible());
	if (pampd_is_remote(pampd)) {
		WARN_ON(acct == false);
		if (oid == NULL) {
			/*
			 * a NULL oid means to ignore this pampd free
			 * as the remote freeing will be handled elsewhere
			 */
		} else if (eph) {
			/* FIXME remote flush optional but probably good idea */
			/* FIXME get these working properly again */
			dec_and_check(&zcache_curr_eph_pampd_count);
		} else if (pampd_is_intransit(pampd)) {
			/* did a pers remote get_and_free, so just free local */
			pampd = pampd_mask_intransit_and_remote(pampd);
			goto local_pers;
		} else {
			struct flushlist_node *flnode =
				ramster_flnode_alloc(pool);

			flnode->xh.client_id = pampd_remote_node(pampd);
			flnode->xh.pool_id = pool->pool_id;
			flnode->xh.oid = *oid;
			flnode->xh.index = index;
			flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
			spin_lock(&zcache_rem_op_list_lock);
			list_add(&flnode->rem_op.list, &zcache_rem_op_list);
			spin_unlock(&zcache_rem_op_list_lock);
			dec_and_check(&zcache_curr_pers_pampd_count);
			dec_and_check(&ramster_remote_pers_pages);
		}
	} else if (eph) {
		zbud_free_and_delist((struct zbud_hdr *)pampd);
		if (!is_local_client(pool->client))
			dec_and_check(&ramster_foreign_eph_pampd_count);
		if (acct)
			/* FIXME get these working properly again */
			dec_and_check(&zcache_curr_eph_pampd_count);
	} else {
local_pers:
		zv = (struct zv_hdr *)pampd;
		if (!is_local_client(pool->client))
			dec_and_check(&ramster_foreign_pers_pampd_count);
		zv_free(cli->xvpool, zv);
		if (acct)
			/* FIXME get these working properly again */
			dec_and_check(&zcache_curr_pers_pampd_count);
	}
}

static void zcache_pampd_free_obj(struct tmem_pool *pool,
				struct tmem_obj *obj)
{
	struct flushlist_node *flnode;

	BUG_ON(preemptible());
	if (obj->extra == NULL)
		return;
	BUG_ON(!pampd_is_remote(obj->extra));
	flnode = ramster_flnode_alloc(pool);
	flnode->xh.client_id = pampd_remote_node(obj->extra);
	flnode->xh.pool_id = pool->pool_id;
	flnode->xh.oid = obj->oid;
	flnode->xh.index = FLUSH_ENTIRE_OBJECT;
	flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
	spin_lock(&zcache_rem_op_list_lock);
	list_add(&flnode->rem_op.list, &zcache_rem_op_list);
	spin_unlock(&zcache_rem_op_list_lock);
}

void zcache_pampd_new_obj(struct tmem_obj *obj)
{
	obj->extra = NULL;
}

int zcache_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)
{
	int ret = -1;

	if (new_pampd != NULL) {
		if (obj->extra == NULL)
			obj->extra = new_pampd;
		/* enforce that all remote pages in an object reside
		 * in the same node! */
		else if (pampd_remote_node(new_pampd) !=
				pampd_remote_node((void *)(obj->extra)))
			BUG();
		ret = 0;
	}
	return ret;
}

/*
 * Called by the message handler after a (still compressed) page has been
 * fetched from the remote machine in response to an "is_remote" tmem_get
 * or persistent tmem_localify.  For a tmem_get, "extra" is the address of
 * the page that is to be filled to successfully resolve the tmem_get; for
 * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only
 * in the local zcache).  "data" points to "size" bytes of (compressed) data
 * passed in the message.  In the case of a persistent remote get, if
 * pre-allocation was successful (see zcache_repatriate_preload), the page
 * is placed into both local zcache and at "extra".
 */
int zcache_localify(int pool_id, struct tmem_oid *oidp,
			uint32_t index, char *data, size_t size,
			void *extra)
{
	int ret = -ENOENT;
	unsigned long flags;
	struct tmem_pool *pool;
	bool ephemeral, delete = false;
	size_t clen = PAGE_SIZE;
	void *pampd, *saved_hb;
	struct tmem_obj *obj;

	pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
	if (unlikely(pool == NULL))
		/* pool doesn't exist anymore */
		goto out;
	ephemeral = is_ephemeral(pool);
	local_irq_save(flags);	/* FIXME: maybe only disable softirqs? */
	pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
	if (pampd == NULL) {
		/* hmmm... must have been a flush while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED pampd==NULL in zcache_localify\n");
#endif
		if (ephemeral)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		goto finish;
	} else if (unlikely(!pampd_is_remote(pampd))) {
		/* hmmm... must have been a dup put while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED dup while waiting in zcache_localify\n");
#endif
		if (ephemeral)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		pampd = NULL;
		ret = -EEXIST;
		goto finish;
	} else if (size == 0) {
		/* no remote data, delete the local is_remote pampd */
		pampd = NULL;
		if (ephemeral)
			ramster_remote_eph_pages_unsucc_get++;
		else
			BUG();
		delete = true;
		goto finish;
	}
	if (!ephemeral && pampd_is_intransit(pampd)) {
		/* localify to zcache */
		pampd = pampd_mask_intransit_and_remote(pampd);
		zv_copy_to_pampd(pampd, data, size);
	} else {
		pampd = NULL;
		obj = NULL;
	}
	if (extra != NULL) {
		/* decompress direct-to-memory to complete remotify */
		ret = lzo1x_decompress_safe((char *)data, size,
						(char *)extra, &clen);
		BUG_ON(ret != LZO_E_OK);
		BUG_ON(clen != PAGE_SIZE);
	}
	if (ephemeral)
		ramster_remote_eph_pages_succ_get++;
	else
		ramster_remote_pers_pages_succ_get++;
	ret = 0;
finish:
	tmem_localify_finish(obj, index, pampd, saved_hb, delete);
	zcache_put_pool(pool);
	local_irq_restore(flags);
out:
	return ret;
}

/*
 * Called on a remote persistent tmem_get to attempt to preallocate
 * local storage for the data contained in the remote persistent page.
 * If successfully preallocated, returns the pampd, marked as remote and
 * in_transit.  Else returns NULL.  Note that the appropriate tmem data
 * structure must be locked.
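 *
 * As a sketch of how the pieces in this file fit together: a tmem_get
 * of a remote persistent page first invokes this preload hook (with the
 * tmem data structure locked), then zcache_pampd_repatriate() below
 * (locks dropped, as it may sleep) to send the fetch message; the
 * response is eventually delivered to zcache_localify() above.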
 */
static void *zcache_pampd_repatriate_preload(void *pampd,
						struct tmem_pool *pool,
						struct tmem_oid *oid,
						uint32_t index,
						bool *intransit)
{
	int clen = pampd_remote_size(pampd);
	void *ret_pampd = NULL;
	unsigned long flags;

	if (!pampd_is_remote(pampd))
		BUG();
	if (is_ephemeral(pool))
		BUG();
	if (pampd_is_intransit(pampd)) {
		/*
		 * to avoid multiple allocations (and maybe a memory leak)
		 * don't preallocate if already in the process of being
		 * repatriated
		 */
		*intransit = true;
		goto out;
	}
	*intransit = false;
	local_irq_save(flags);
	ret_pampd = (void *)zv_alloc(pool, oid, index, clen);
	if (ret_pampd != NULL) {
		/*
		 * a pampd is marked intransit if it is remote and space has
		 * been allocated for it locally (note, only happens for
		 * persistent pages, in which case the remote copy is freed)
		 */
		ret_pampd = pampd_mark_intransit(ret_pampd);
		dec_and_check(&ramster_remote_pers_pages);
	} else
		ramster_pers_pages_remote_nomem++;
	local_irq_restore(flags);
out:
	return ret_pampd;
}

/*
 * Called on a remote tmem_get to invoke a message to fetch the page.
 * Might sleep so no tmem locks can be held.  "extra" is passed
 * all the way through the round-trip messaging to zcache_localify.
 */
static int zcache_pampd_repatriate(void *fake_pampd, void *real_pampd,
				struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index,
				bool free, void *extra)
{
	struct tmem_xhandle xh;
	int ret;

	if (pampd_is_intransit(real_pampd))
		/* have local space pre-reserved, so free remote copy */
		free = true;
	xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);
	/* unreliable request/response for now */
	ret = ramster_remote_async_get(&xh, free,
					pampd_remote_node(fake_pampd),
					pampd_remote_size(fake_pampd),
					pampd_remote_cksum(fake_pampd),
					extra);
#ifdef RAMSTER_TESTING
	if (ret != 0 && ret != -ENOENT)
		pr_err("TESTING zcache_pampd_repatriate returns, ret=%d\n",
			ret);
#endif
	return ret;
}

static struct tmem_pamops zcache_pamops = {
	.create = zcache_pampd_create,
	.get_data = zcache_pampd_get_data,
	.free = zcache_pampd_free,
	.get_data_and_free = zcache_pampd_get_data_and_free,
	.free_obj = zcache_pampd_free_obj,
	.is_remote = zcache_pampd_is_remote,
	.repatriate_preload = zcache_pampd_repatriate_preload,
	.repatriate = zcache_pampd_repatriate,
	.new_obj = zcache_pampd_new_obj,
	.replace_in_obj = zcache_pampd_replace_in_obj,
};

/*
 * zcache compression/decompression and related per-cpu stuff
 */

#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
#define LZO_DSTMEM_PAGE_ORDER 1
static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);

static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
{
	int ret = 0;
	unsigned char *dmem = __get_cpu_var(zcache_dstmem);
	unsigned char *wmem = __get_cpu_var(zcache_workmem);
	char *from_va;

	BUG_ON(!irqs_disabled());
	if (unlikely(dmem == NULL || wmem == NULL))
		goto out;	/* no buffer, so can't compress */
	from_va = kmap_atomic(from);
	mb();
	ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
	BUG_ON(ret != LZO_E_OK);
	*out_va = dmem;
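	/*
	 * Note: *out_va points into this cpu's zcache_dstmem buffer, so
	 * it remains valid only while interrupts stay disabled on this
	 * cpu (enforced by the BUG_ON above); callers must copy the
	 * compressed data out before re-enabling interrupts.
	 */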
	kunmap_atomic(from_va);
	ret = 1;	/* success is 1 here, not 0; failure returns 0 */
out:
	return ret;
}

static int zcache_cpu_notifier(struct notifier_block *nb,
				unsigned long action, void *pcpu)
{
	int cpu = (long)pcpu;
	struct zcache_preload *kp;

	switch (action) {
	case CPU_UP_PREPARE:
		per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
			GFP_KERNEL | __GFP_REPEAT,
			LZO_DSTMEM_PAGE_ORDER);
		per_cpu(zcache_workmem, cpu) =
			kzalloc(LZO_WORKMEM_BYTES,
				GFP_KERNEL | __GFP_REPEAT);
		per_cpu(zcache_remoteputmem, cpu) =
			kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		kfree(per_cpu(zcache_remoteputmem, cpu));
		per_cpu(zcache_remoteputmem, cpu) = NULL;
		free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
			LZO_DSTMEM_PAGE_ORDER);
		per_cpu(zcache_dstmem, cpu) = NULL;
		kfree(per_cpu(zcache_workmem, cpu));
		per_cpu(zcache_workmem, cpu) = NULL;
		kp = &per_cpu(zcache_preloads, cpu);
		while (kp->nr) {
			kmem_cache_free(zcache_objnode_cache,
					kp->objnodes[kp->nr - 1]);
			kp->objnodes[kp->nr - 1] = NULL;
			kp->nr--;
		}
		if (kp->obj) {
			kmem_cache_free(zcache_obj_cache, kp->obj);
			kp->obj = NULL;
		}
		if (kp->flnode) {
			kmem_cache_free(ramster_flnode_cache, kp->flnode);
			kp->flnode = NULL;
		}
		if (kp->page) {
			free_page((unsigned long)kp->page);
			kp->page = NULL;
		}
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block zcache_cpu_notifier_block = {
	.notifier_call = zcache_cpu_notifier
};

#ifdef CONFIG_SYSFS
#define ZCACHE_SYSFS_RO(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", zcache_##_name); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return _func(buf); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}
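/*
 * For illustration: ZCACHE_SYSFS_RO(flush_total) expands to a
 * zcache_flush_total_show() function plus a zcache_flush_total_attr
 * kobj_attribute; once zcache_attr_group (named "zcache", registered
 * on mm_kobj in zcache_init()) is in place, the counter is readable
 * from /sys/kernel/mm/zcache/flush_total.
 */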
ZCACHE_SYSFS_RO(curr_obj_count_max);
ZCACHE_SYSFS_RO(curr_objnode_count_max);
ZCACHE_SYSFS_RO(flush_total);
ZCACHE_SYSFS_RO(flush_found);
ZCACHE_SYSFS_RO(flobj_total);
ZCACHE_SYSFS_RO(flobj_found);
ZCACHE_SYSFS_RO(failed_eph_puts);
ZCACHE_SYSFS_RO(nonactive_puts);
ZCACHE_SYSFS_RO(failed_pers_puts);
ZCACHE_SYSFS_RO(zbud_curr_zbytes);
ZCACHE_SYSFS_RO(zbud_cumul_zpages);
ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
ZCACHE_SYSFS_RO(zbud_buddied_count);
ZCACHE_SYSFS_RO(evicted_raw_pages);
ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
ZCACHE_SYSFS_RO(evicted_buddied_pages);
ZCACHE_SYSFS_RO(failed_get_free_pages);
ZCACHE_SYSFS_RO(failed_alloc);
ZCACHE_SYSFS_RO(put_to_flush);
ZCACHE_SYSFS_RO(compress_poor);
ZCACHE_SYSFS_RO(mean_compress_poor);
ZCACHE_SYSFS_RO(policy_percent_exceeded);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
			zbud_show_unbuddied_list_counts);
ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
			zbud_show_cumul_chunk_counts);
ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
			zv_curr_dist_counts_show);
ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
			zv_cumul_dist_counts_show);

static struct attribute *zcache_attrs[] = {
	&zcache_curr_obj_count_attr.attr,
	&zcache_curr_obj_count_max_attr.attr,
	&zcache_curr_objnode_count_attr.attr,
	&zcache_curr_objnode_count_max_attr.attr,
	&zcache_flush_total_attr.attr,
	&zcache_flobj_total_attr.attr,
	&zcache_flush_found_attr.attr,
	&zcache_flobj_found_attr.attr,
	&zcache_failed_eph_puts_attr.attr,
	&zcache_nonactive_puts_attr.attr,
	&zcache_failed_pers_puts_attr.attr,
	&zcache_policy_percent_exceeded_attr.attr,
	&zcache_compress_poor_attr.attr,
	&zcache_mean_compress_poor_attr.attr,
	&zcache_zbud_curr_raw_pages_attr.attr,
	&zcache_zbud_curr_zpages_attr.attr,
	&zcache_zbud_curr_zbytes_attr.attr,
	&zcache_zbud_cumul_zpages_attr.attr,
	&zcache_zbud_cumul_zbytes_attr.attr,
	&zcache_zbud_buddied_count_attr.attr,
	&zcache_evicted_raw_pages_attr.attr,
	&zcache_evicted_unbuddied_pages_attr.attr,
	&zcache_evicted_buddied_pages_attr.attr,
	&zcache_failed_get_free_pages_attr.attr,
	&zcache_failed_alloc_attr.attr,
	&zcache_put_to_flush_attr.attr,
	&zcache_zbud_unbuddied_list_counts_attr.attr,
	&zcache_zbud_cumul_chunk_counts_attr.attr,
	&zcache_zv_curr_dist_counts_attr.attr,
	&zcache_zv_cumul_dist_counts_attr.attr,
	&zcache_zv_max_zsize_attr.attr,
	&zcache_zv_max_mean_zsize_attr.attr,
	&zcache_zv_page_count_policy_percent_attr.attr,
	NULL,
};

static struct attribute_group zcache_attr_group = {
	.attrs = zcache_attrs,
	.name = "zcache",
};

#define RAMSTER_SYSFS_RO(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}

#define RAMSTER_SYSFS_RW(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static ssize_t ramster_##_name##_store(struct kobject *kobj, \
		struct kobj_attribute *attr, const char *buf, size_t count) \
	{ \
		int err; \
		unsigned long enable; \
		err = kstrtoul(buf, 10, &enable); \
		if (err) \
			return -EINVAL; \
		ramster_##_name = enable; \
		return count; \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0644 }, \
		.show = ramster_##_name##_show, \
		.store = ramster_##_name##_store, \
	}

#define RAMSTER_SYSFS_RO_ATOMIC(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}

RAMSTER_SYSFS_RO(interface_revision);
RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);
RAMSTER_SYSFS_RW(pers_remotify_enable);
RAMSTER_SYSFS_RW(eph_remotify_enable);
RAMSTER_SYSFS_RO(eph_pages_remoted);
RAMSTER_SYSFS_RO(eph_pages_remote_failed);
RAMSTER_SYSFS_RO(pers_pages_remoted);
RAMSTER_SYSFS_RO(pers_pages_remote_failed);
RAMSTER_SYSFS_RO(pers_pages_remote_nomem);
RAMSTER_SYSFS_RO(remote_pages_flushed);
RAMSTER_SYSFS_RO(remote_page_flushes_failed);
RAMSTER_SYSFS_RO(remote_objects_flushed);
RAMSTER_SYSFS_RO(remote_object_flushes_failed);
RAMSTER_SYSFS_RO(remote_eph_pages_succ_get);
RAMSTER_SYSFS_RO(remote_eph_pages_unsucc_get);
RAMSTER_SYSFS_RO(remote_pers_pages_succ_get);
RAMSTER_SYSFS_RO(remote_pers_pages_unsucc_get);
RAMSTER_SYSFS_RO_ATOMIC(foreign_eph_pampd_count);
RAMSTER_SYSFS_RO(foreign_eph_pampd_count_max);
RAMSTER_SYSFS_RO_ATOMIC(foreign_pers_pampd_count);
RAMSTER_SYSFS_RO(foreign_pers_pampd_count_max);
RAMSTER_SYSFS_RO_ATOMIC(curr_flnode_count);
RAMSTER_SYSFS_RO(curr_flnode_count_max);

#define MANUAL_NODES 8
static bool ramster_nodes_manual_up[MANUAL_NODES];
static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	int i;
	char *p = buf;

	for (i = 0; i < MANUAL_NODES; i++)
		if (ramster_nodes_manual_up[i])
			p += sprintf(p, "%d ", i);
	p += sprintf(p, "\n");
	return p - buf;
}

static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	}
	if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	}
	if (ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d already up, ignoring\n",
			(int)node_num);
	} else {
		ramster_nodes_manual_up[node_num] = true;
		r2net_hb_node_up_manual((int)node_num);
	}
	return count;
}

static struct kobj_attribute ramster_manual_node_up_attr = {
	.attr = { .name = "manual_node_up", .mode = 0644 },
	.show = ramster_manual_node_up_show,
	.store = ramster_manual_node_up_store,
};

static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	if (ramster_remote_target_nodenum == -1UL)
		return sprintf(buf, "unset\n");
	else
		return sprintf(buf, "%d\n", ramster_remote_target_nodenum);
}

static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	} else if (node_num == -1UL) {
		pr_err("ramster: disabling all remotification, "
			"data may still reside on remote nodes however\n");
		return -EINVAL;
	} else if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	} else if (!ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d not up, ignoring setting "
			"of remotification target\n", (int)node_num);
	} else if (r2net_remote_target_node_set((int)node_num) >= 0) {
		pr_info("ramster: node %d set as remotification target\n",
			(int)node_num);
		ramster_remote_target_nodenum = (int)node_num;
	} else {
		pr_err("ramster: bad num to node node_num=%d?\n",
			(int)node_num);
		return -EINVAL;
	}
	return count;
}

static struct kobj_attribute ramster_remote_target_nodenum_attr = {
	.attr = { .name = "remote_target_nodenum", .mode = 0644 },
	.show = ramster_remote_target_nodenum_show,
	.store = ramster_remote_target_nodenum_store,
};

static struct attribute *ramster_attrs[] = {
	&ramster_interface_revision_attr.attr,
	&ramster_pers_remotify_enable_attr.attr,
	&ramster_eph_remotify_enable_attr.attr,
	&ramster_remote_pers_pages_attr.attr,
	&ramster_eph_pages_remoted_attr.attr,
	&ramster_eph_pages_remote_failed_attr.attr,
	&ramster_pers_pages_remoted_attr.attr,
	&ramster_pers_pages_remote_failed_attr.attr,
	&ramster_pers_pages_remote_nomem_attr.attr,
	&ramster_remote_pages_flushed_attr.attr,
	&ramster_remote_page_flushes_failed_attr.attr,
	&ramster_remote_objects_flushed_attr.attr,
	&ramster_remote_object_flushes_failed_attr.attr,
	&ramster_remote_eph_pages_succ_get_attr.attr,
	&ramster_remote_eph_pages_unsucc_get_attr.attr,
	&ramster_remote_pers_pages_succ_get_attr.attr,
	&ramster_remote_pers_pages_unsucc_get_attr.attr,
	&ramster_foreign_eph_pampd_count_attr.attr,
	&ramster_foreign_eph_pampd_count_max_attr.attr,
	&ramster_foreign_pers_pampd_count_attr.attr,
	&ramster_foreign_pers_pampd_count_max_attr.attr,
	&ramster_curr_flnode_count_attr.attr,
	&ramster_curr_flnode_count_max_attr.attr,
	&ramster_manual_node_up_attr.attr,
	&ramster_remote_target_nodenum_attr.attr,
	NULL,
};

static struct attribute_group ramster_attr_group = {
	.attrs = ramster_attrs,
	.name = "ramster",
};

#endif /* CONFIG_SYSFS */

/*
 * When zcache is disabled ("frozen"), pools can be created and destroyed,
 * but all puts (and thus all other operations that require memory allocation)
 * must fail.  If zcache is unfrozen, accepts puts, and is then frozen again,
 * data consistency requires that every put attempted while frozen be
 * converted into a flush.
 */
static bool zcache_freeze;

/*
 * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
 */
static int shrink_zcache_memory(struct shrinker *shrink,
				struct shrink_control *sc)
{
	int ret = -1;
	int nr = sc->nr_to_scan;
	gfp_t gfp_mask = sc->gfp_mask;

	if (nr >= 0) {
		if (!(gfp_mask & __GFP_FS))
			/* does this case really need to be skipped?
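			 * Without __GFP_FS the caller may be reclaiming
			 * from a path that cannot tolerate filesystem
			 * re-entry; zbud eviction only frees compressed
			 * copies of clean pagecache data, so the skip may
			 * well be overly conservative.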
			 */
			goto out;
		zbud_evict_pages(nr);
	}
	ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
out:
	return ret;
}

static struct shrinker zcache_shrinker = {
	.shrink = shrink_zcache_memory,
	.seeks = DEFAULT_SEEKS,
};

/*
 * zcache shims between cleancache/frontswap ops and tmem
 */

int zcache_put(int cli_id, int pool_id, struct tmem_oid *oidp,
		uint32_t index, char *data, size_t size,
		bool raw, int ephemeral)
{
	struct tmem_pool *pool;
	int ret = -1;

	BUG_ON(!irqs_disabled());
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (unlikely(pool == NULL))
		goto out;
	if (!zcache_freeze && zcache_do_preload(pool) == 0) {
		/* preload does preempt_disable on success */
		ret = tmem_put(pool, oidp, index, data, size, raw, ephemeral);
		if (ret < 0) {
			if (is_ephemeral(pool))
				zcache_failed_eph_puts++;
			else
				zcache_failed_pers_puts++;
		}
		zcache_put_pool(pool);
		preempt_enable_no_resched();
	} else {
		zcache_put_to_flush++;
		if (atomic_read(&pool->obj_count) > 0)
			/* the put fails whether the flush succeeds or not */
			(void)tmem_flush_page(pool, oidp, index);
		zcache_put_pool(pool);
	}
out:
	return ret;
}

int zcache_get(int cli_id, int pool_id, struct tmem_oid *oidp,
		uint32_t index, char *data, size_t *sizep,
		bool raw, int get_and_free)
{
	struct tmem_pool *pool;
	int ret = -1;
	bool eph = false;

	if (!raw) {
		BUG_ON(irqs_disabled());
		BUG_ON(in_softirq());
	}
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (likely(pool != NULL)) {
		/* only safe to check persistence once pool is known valid */
		eph = is_ephemeral(pool);
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_get(pool, oidp, index, data, sizep,
					raw, get_and_free);
		zcache_put_pool(pool);
	}
	WARN_ONCE((!eph && (ret != 0)), "zcache_get fails on persistent pool, "
			"bad things are very likely to happen soon\n");
#ifdef RAMSTER_TESTING
	if (ret != 0 && ret != -1 && !(ret == -EINVAL && eph))
		pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret);
#endif
	if (ret == -EAGAIN)
		BUG();	/* FIXME... don't need this anymore???  let's ensure */
	return ret;
}

int zcache_flush(int cli_id, int pool_id,
			struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	zcache_flush_total++;
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	ramster_do_preload_flnode_only(pool);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_page(pool, oidp, index);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		zcache_flush_found++;
	local_irq_restore(flags);
	return ret;
}

int zcache_flush_object(int cli_id, int pool_id, struct tmem_oid *oidp)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	zcache_flobj_total++;
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	ramster_do_preload_flnode_only(pool);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_object(pool, oidp);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		zcache_flobj_found++;
	local_irq_restore(flags);
	return ret;
}

int zcache_client_destroy_pool(int cli_id, int pool_id)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli = NULL;
	int ret = -1;

	if (pool_id < 0)
		goto out;
	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	atomic_inc(&cli->refcount);
	pool = cli->tmem_pools[pool_id];
	if (pool == NULL) {
		/* drop the reference taken above before bailing out */
		atomic_dec(&cli->refcount);
		goto out;
	}
	cli->tmem_pools[pool_id] = NULL;
	/* wait for pool activity on other cpus to quiesce */
	while (atomic_read(&pool->refcount) != 0)
		;
	atomic_dec(&cli->refcount);
	local_bh_disable();
	ret = tmem_destroy_pool(pool);
	local_bh_enable();
	kfree(pool);
	pr_info("ramster: destroyed pool id=%d cli_id=%d\n", pool_id, cli_id);
out:
	return ret;
}

static int zcache_destroy_pool(int pool_id)
{
	return zcache_client_destroy_pool(LOCAL_CLIENT, pool_id);
}

int zcache_new_pool(uint16_t cli_id, uint32_t flags)
{
	int poolid = -1;
	struct tmem_pool *pool;
	struct zcache_client *cli = NULL;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	atomic_inc(&cli->refcount);
	pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
	if (pool == NULL) {
		pr_info("ramster: pool creation failed: out of memory\n");
		goto out;
	}

	for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
		if (cli->tmem_pools[poolid] == NULL)
			break;
	if (poolid >= MAX_POOLS_PER_CLIENT) {
		pr_info("ramster: pool creation failed: max exceeded\n");
		kfree(pool);
		poolid = -1;
		goto out;
	}
	atomic_set(&pool->refcount, 0);
	pool->client = cli;
	pool->pool_id = poolid;
	tmem_new_pool(pool, flags);
	cli->tmem_pools[poolid] = pool;
	if (cli_id == LOCAL_CLIENT)
		pr_info("ramster: created %s tmem pool, id=%d, local client\n",
			flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
			poolid);
	else
		pr_info("ramster: created %s tmem pool, id=%d, client=%d\n",
			flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
			poolid, cli_id);
"persistent" : "ephemeral", 2805 poolid, cli_id); 2806out: 2807 if (cli != NULL) 2808 atomic_dec(&cli->refcount); 2809 return poolid; 2810} 2811 2812static int zcache_local_new_pool(uint32_t flags) 2813{ 2814 return zcache_new_pool(LOCAL_CLIENT, flags); 2815} 2816 2817int zcache_autocreate_pool(int cli_id, int pool_id, bool ephemeral) 2818{ 2819 struct tmem_pool *pool; 2820 struct zcache_client *cli = NULL; 2821 uint32_t flags = ephemeral ? 0 : TMEM_POOL_PERSIST; 2822 int ret = -1; 2823 2824 if (cli_id == LOCAL_CLIENT) 2825 goto out; 2826 if (pool_id >= MAX_POOLS_PER_CLIENT) 2827 goto out; 2828 else if ((unsigned int)cli_id < MAX_CLIENTS) 2829 cli = &zcache_clients[cli_id]; 2830 if ((ephemeral && !use_cleancache) || (!ephemeral && !use_frontswap)) 2831 BUG(); /* FIXME, handle more gracefully later */ 2832 if (!cli->allocated) { 2833 if (zcache_new_client(cli_id)) 2834 BUG(); /* FIXME, handle more gracefully later */ 2835 cli = &zcache_clients[cli_id]; 2836 } 2837 atomic_inc(&cli->refcount); 2838 pool = cli->tmem_pools[pool_id]; 2839 if (pool != NULL) { 2840 if (pool->persistent && ephemeral) { 2841 pr_err("zcache_autocreate_pool: type mismatch\n"); 2842 goto out; 2843 } 2844 ret = 0; 2845 goto out; 2846 } 2847 pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); 2848 if (pool == NULL) { 2849 pr_info("ramster: pool creation failed: out of memory\n"); 2850 goto out; 2851 } 2852 atomic_set(&pool->refcount, 0); 2853 pool->client = cli; 2854 pool->pool_id = pool_id; 2855 tmem_new_pool(pool, flags); 2856 cli->tmem_pools[pool_id] = pool; 2857 pr_info("ramster: AUTOcreated %s tmem poolid=%d, for remote client=%d\n", 2858 flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", 2859 pool_id, cli_id); 2860 ret = 0; 2861out: 2862 if (cli == NULL) 2863 BUG(); /* FIXME, handle more gracefully later */ 2864 /* pr_err("zcache_autocreate_pool: failed\n"); */ 2865 if (cli != NULL) 2866 atomic_dec(&cli->refcount); 2867 return ret; 2868} 2869 2870/********** 2871 * Two kernel functionalities currently can be layered on top of tmem. 2872 * These are "cleancache" which is used as a second-chance cache for clean 2873 * page cache pages; and "frontswap" which is used for swap pages 2874 * to avoid writes to disk. A generic "shim" is provided here for each 2875 * to translate in-kernel semantics to zcache semantics. 
 */

#ifdef CONFIG_CLEANCACHE
static void zcache_cleancache_put_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index, struct page *page)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;

#ifdef __PG_WAS_ACTIVE
	if (!PageWasActive(page)) {
		zcache_nonactive_puts++;
		return;
	}
#endif
	if (likely(ind == index)) {
		char *kva = page_address(page);

		(void)zcache_put(LOCAL_CLIENT, pool_id, &oid, index,
				kva, PAGE_SIZE, 0, 1);
	}
}

static int zcache_cleancache_get_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index, struct page *page)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;
	int ret = -1;

	preempt_disable();
	if (likely(ind == index)) {
		char *kva = page_address(page);
		size_t size = PAGE_SIZE;

		ret = zcache_get(LOCAL_CLIENT, pool_id, &oid, index,
				kva, &size, 0, 0);
#ifdef __PG_WAS_ACTIVE
		if (ret == 0)
			SetPageWasActive(page);
#endif
	}
	preempt_enable();
	return ret;
}

static void zcache_cleancache_flush_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	if (likely(ind == index))
		(void)zcache_flush(LOCAL_CLIENT, pool_id, &oid, ind);
}

static void zcache_cleancache_flush_inode(int pool_id,
					struct cleancache_filekey key)
{
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	(void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
}

static void zcache_cleancache_flush_fs(int pool_id)
{
	if (pool_id >= 0)
		(void)zcache_destroy_pool(pool_id);
}

static int zcache_cleancache_init_fs(size_t pagesize)
{
	BUG_ON(sizeof(struct cleancache_filekey) !=
			sizeof(struct tmem_oid));
	BUG_ON(pagesize != PAGE_SIZE);
	return zcache_local_new_pool(0);
}

static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
{
	/* shared pools are unsupported and map to private */
	BUG_ON(sizeof(struct cleancache_filekey) !=
			sizeof(struct tmem_oid));
	BUG_ON(pagesize != PAGE_SIZE);
	return zcache_local_new_pool(0);
}

static struct cleancache_ops zcache_cleancache_ops = {
	.put_page = zcache_cleancache_put_page,
	.get_page = zcache_cleancache_get_page,
	.invalidate_page = zcache_cleancache_flush_page,
	.invalidate_inode = zcache_cleancache_flush_inode,
	.invalidate_fs = zcache_cleancache_flush_fs,
	.init_shared_fs = zcache_cleancache_init_shared_fs,
	.init_fs = zcache_cleancache_init_fs
};

struct cleancache_ops zcache_cleancache_register_ops(void)
{
	struct cleancache_ops old_ops =
		cleancache_register_ops(&zcache_cleancache_ops);

	return old_ops;
}
#endif

#ifdef CONFIG_FRONTSWAP
/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
static int zcache_frontswap_poolid = -1;

/*
 * Swizzling increases objects per swaptype, increasing tmem concurrency
 * for heavy swaploads.
 * Later, larger nr_cpus -> larger SWIZ_BITS
 */
#define SWIZ_BITS		8
#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
#define iswiz(_ind)		(_ind >> SWIZ_BITS)

static inline struct tmem_oid oswiz(unsigned type, u32 ind)
{
	struct tmem_oid oid = { .oid = { 0 } };
	oid.oid[0] = _oswiz(type, ind);
	return oid;
}

static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
					struct page *page)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);
	int ret = -1;
	unsigned long flags;
	char *kva;

	BUG_ON(!PageLocked(page));
	if (likely(ind64 == ind)) {
		local_irq_save(flags);
		kva = page_address(page);
		ret = zcache_put(LOCAL_CLIENT, zcache_frontswap_poolid,
				&oid, iswiz(ind), kva, PAGE_SIZE, 0, 0);
		local_irq_restore(flags);
	}
	return ret;
}

/* returns 0 if the page was successfully gotten from frontswap, -1 if
 * was not present (should never happen!) */
static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
					struct page *page)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);
	int ret = -1;

	preempt_disable();	/* FIXME, remove this? */
	BUG_ON(!PageLocked(page));
	if (likely(ind64 == ind)) {
		char *kva = page_address(page);
		size_t size = PAGE_SIZE;

		ret = zcache_get(LOCAL_CLIENT, zcache_frontswap_poolid,
				&oid, iswiz(ind), kva, &size, 0, -1);
	}
	preempt_enable();	/* FIXME, remove this? */
	return ret;
}

/* flush a single page from frontswap */
static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);

	if (likely(ind64 == ind))
		(void)zcache_flush(LOCAL_CLIENT, zcache_frontswap_poolid,
					&oid, iswiz(ind));
}

/* flush all pages from the passed swaptype */
static void zcache_frontswap_flush_area(unsigned type)
{
	struct tmem_oid oid;
	int ind;

	for (ind = SWIZ_MASK; ind >= 0; ind--) {
		oid = oswiz(type, ind);
		(void)zcache_flush_object(LOCAL_CLIENT,
						zcache_frontswap_poolid, &oid);
	}
}

static void zcache_frontswap_init(unsigned ignored)
{
	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
	if (zcache_frontswap_poolid < 0)
		zcache_frontswap_poolid =
			zcache_local_new_pool(TMEM_POOL_PERSIST);
}

static struct frontswap_ops zcache_frontswap_ops = {
	.put_page = zcache_frontswap_put_page,
	.get_page = zcache_frontswap_get_page,
	.invalidate_page = zcache_frontswap_flush_page,
	.invalidate_area = zcache_frontswap_flush_area,
	.init = zcache_frontswap_init
};

struct frontswap_ops zcache_frontswap_register_ops(void)
{
	struct frontswap_ops old_ops =
		frontswap_register_ops(&zcache_frontswap_ops);

	return old_ops;
}
#endif

/*
 * frontswap selfshrinking
 */

/* set by the "ramster" kernel boot parameter (see enable_ramster below) */
static int ramster_enabled;

#ifdef CONFIG_FRONTSWAP
/* In HZ, controls frequency of worker invocation. */
static unsigned int selfshrink_interval __read_mostly = 5;

static void selfshrink_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);

/* Enable/disable with sysfs. */
static bool frontswap_selfshrinking __read_mostly;

/* Enable/disable with kernel boot option. */
static bool use_frontswap_selfshrink __initdata = true;

/*
 * The default values for the following parameters were deemed reasonable
 * by experimentation, may be workload-dependent, and can all be
 * adjusted via sysfs.
 */

/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
static unsigned int frontswap_hysteresis __read_mostly = 20;

/*
 * Number of selfshrink worker invocations to wait before observing that
 * frontswap selfshrinking should commence.  Note that selfshrinking does
 * not use a separate worker thread.
 */
static unsigned int frontswap_inertia __read_mostly = 3;

/* Countdown to next invocation of frontswap_shrink() */
static unsigned long frontswap_inertia_counter;

/*
 * Invoked by the selfshrink worker thread, uses current number of pages
 * in frontswap (frontswap_curr_pages()), previous status, and control
 * values (hysteresis and inertia) to determine if frontswap should be
 * shrunk and what the new frontswap size should be.  Note that
 * frontswap_shrink is essentially a partial swapoff that immediately
 * transfers pages from the "swap device" (frontswap) back into kernel
 * RAM; despite the name, frontswap "shrinking" is very different from
 * the "shrinker" interface used by the kernel MM subsystem to reclaim
 * memory.
 */
static void frontswap_selfshrink(void)
{
	static unsigned long cur_frontswap_pages;
	static unsigned long last_frontswap_pages;
	static unsigned long tgt_frontswap_pages;

	last_frontswap_pages = cur_frontswap_pages;
	cur_frontswap_pages = frontswap_curr_pages();
	if (!cur_frontswap_pages ||
			(cur_frontswap_pages > last_frontswap_pages)) {
		frontswap_inertia_counter = frontswap_inertia;
		return;
	}
	if (frontswap_inertia_counter && --frontswap_inertia_counter)
		return;
	if (cur_frontswap_pages <= frontswap_hysteresis)
		tgt_frontswap_pages = 0;
	else
		tgt_frontswap_pages = cur_frontswap_pages -
			(cur_frontswap_pages / frontswap_hysteresis);
	frontswap_shrink(tgt_frontswap_pages);
}

static int __init ramster_nofrontswap_selfshrink_setup(char *s)
{
	use_frontswap_selfshrink = false;
	return 1;
}

__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup);

static void selfshrink_process(struct work_struct *work)
{
	if (frontswap_selfshrinking && frontswap_enabled) {
		frontswap_selfshrink();
		schedule_delayed_work(&selfshrink_worker,
			selfshrink_interval * HZ);
	}
}

static int __init ramster_selfshrink_init(void)
{
	frontswap_selfshrinking = ramster_enabled && use_frontswap_selfshrink;
	if (frontswap_selfshrinking)
		pr_info("ramster: Initializing frontswap "
			"selfshrinking driver.\n");
	else
		return -ENODEV;

	schedule_delayed_work(&selfshrink_worker, selfshrink_interval * HZ);

	return 0;
}

subsys_initcall(ramster_selfshrink_init);
#endif

/*
 * zcache initialization
 * NOTE FOR NOW ramster MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
 * NOTHING HAPPENS!
 */

static int __init enable_ramster(char *s)
{
	ramster_enabled = 1;
	return 1;
}
__setup("ramster", enable_ramster);

/* allow independent dynamic disabling of cleancache and frontswap */

static int use_cleancache = 1;

static int __init no_cleancache(char *s)
{
	pr_info("INIT no_cleancache called\n");
	use_cleancache = 0;
	return 1;
}

/*
 * FIXME: need to guarantee this gets checked before zcache_init is called
 * What is the correct way to achieve this?
 */
early_param("nocleancache", no_cleancache);

static int use_frontswap = 1;

static int __init no_frontswap(char *s)
{
	pr_info("INIT no_frontswap called\n");
	use_frontswap = 0;
	return 1;
}

__setup("nofrontswap", no_frontswap);

static int __init zcache_init(void)
{
	int ret = 0;

#ifdef CONFIG_SYSFS
	ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
	if (ret) {
		pr_err("zcache: can't create sysfs\n");
		goto out;
	}
	ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
	if (ret) {
		pr_err("ramster: can't create sysfs\n");
		goto out;
	}
#endif /* CONFIG_SYSFS */
#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
	if (ramster_enabled) {
		unsigned int cpu;

		(void)r2net_register_handlers();
		tmem_register_hostops(&zcache_hostops);
		tmem_register_pamops(&zcache_pamops);
		ret = register_cpu_notifier(&zcache_cpu_notifier_block);
		if (ret) {
			pr_err("ramster: can't register cpu notifier\n");
			goto out;
		}
		for_each_online_cpu(cpu) {
			void *pcpu = (void *)(long)cpu;
			zcache_cpu_notifier(&zcache_cpu_notifier_block,
				CPU_UP_PREPARE, pcpu);
		}
	}
	zcache_objnode_cache = kmem_cache_create("zcache_objnode",
				sizeof(struct tmem_objnode), 0, 0, NULL);
	zcache_obj_cache = kmem_cache_create("zcache_obj",
				sizeof(struct tmem_obj), 0, 0, NULL);
	ramster_flnode_cache = kmem_cache_create("ramster_flnode",
				sizeof(struct flushlist_node), 0, 0, NULL);
#endif
#ifdef CONFIG_CLEANCACHE
	pr_info("INIT ramster_enabled=%d use_cleancache=%d\n",
		ramster_enabled, use_cleancache);
	if (ramster_enabled && use_cleancache) {
		struct cleancache_ops old_ops;

		zbud_init();
		register_shrinker(&zcache_shrinker);
		old_ops = zcache_cleancache_register_ops();
		pr_info("ramster: cleancache enabled using kernel "
			"transcendent memory and compression buddies\n");
		if (old_ops.init_fs != NULL)
			pr_warning("ramster: cleancache_ops overridden");
	}
#endif
#ifdef CONFIG_FRONTSWAP
	pr_info("INIT ramster_enabled=%d use_frontswap=%d\n",
		ramster_enabled, use_frontswap);
	if (ramster_enabled && use_frontswap) {
		struct frontswap_ops old_ops;

		zcache_new_client(LOCAL_CLIENT);
		old_ops = zcache_frontswap_register_ops();
		pr_info("ramster: frontswap enabled using kernel "
			"transcendent memory and xvmalloc\n");
		if (old_ops.init != NULL)
			pr_warning("ramster: frontswap_ops overridden");
	}
	if (ramster_enabled && (use_frontswap || use_cleancache))
		ramster_remotify_init();
#endif
out:
	return ret;
}

module_init(zcache_init)
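
/*
 * For reference, the kernel boot parameters consumed above (matching
 * the __setup()/early_param() registrations in this file):
 *
 *	ramster		- required; activates this driver at boot
 *	nocleancache	- dynamically disable the cleancache shim
 *	nofrontswap	- dynamically disable the frontswap shim
 *	noselfshrink	- disable frontswap selfshrinking
 */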