/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-policy.h"
#include "dm.h"

#include <linux/hash.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache-policy-mq"

static struct kmem_cache *mq_entry_cache;

/*----------------------------------------------------------------*/

static unsigned next_power(unsigned n, unsigned min)
{
	return roundup_pow_of_two(max(n, min));
}

/*----------------------------------------------------------------*/

/*
 * Large, sequential ios are probably better left on the origin device since
 * spindles tend to have good bandwidth.
 *
 * The io_tracker tries to spot when the io is in one of these sequential
 * modes.
 *
 * The two thresholds that switch between random and sequential io mode
 * default as follows and can be adjusted via the constructor and message
 * interfaces.
 */
#define RANDOM_THRESHOLD_DEFAULT 4
#define SEQUENTIAL_THRESHOLD_DEFAULT 512

enum io_pattern {
	PATTERN_SEQUENTIAL,
	PATTERN_RANDOM
};

struct io_tracker {
	enum io_pattern pattern;

	unsigned nr_seq_samples;
	unsigned nr_rand_samples;
	unsigned thresholds[2];

	dm_oblock_t last_end_oblock;
};

static void iot_init(struct io_tracker *t,
		     int sequential_threshold, int random_threshold)
{
	t->pattern = PATTERN_RANDOM;
	t->nr_seq_samples = 0;
	t->nr_rand_samples = 0;
	t->last_end_oblock = 0;
	t->thresholds[PATTERN_RANDOM] = random_threshold;
	t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
}

static enum io_pattern iot_pattern(struct io_tracker *t)
{
	return t->pattern;
}

static void iot_update_stats(struct io_tracker *t, struct bio *bio)
{
	if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
		t->nr_seq_samples++;
	else {
		/*
		 * Just one non-sequential IO is enough to reset the
		 * counters.
		 */
		if (t->nr_seq_samples) {
			t->nr_seq_samples = 0;
			t->nr_rand_samples = 0;
		}

		t->nr_rand_samples++;
	}

	t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
}

static void iot_check_for_pattern_switch(struct io_tracker *t)
{
	switch (t->pattern) {
	case PATTERN_SEQUENTIAL:
		if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
			t->pattern = PATTERN_RANDOM;
			t->nr_seq_samples = t->nr_rand_samples = 0;
		}
		break;

	case PATTERN_RANDOM:
		if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
			t->pattern = PATTERN_SEQUENTIAL;
			t->nr_seq_samples = t->nr_rand_samples = 0;
		}
		break;
	}
}

static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
{
	iot_update_stats(t, bio);
	iot_check_for_pattern_switch(t);
}
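
/*
 * With the default thresholds this means a run of 512 bios, each starting
 * at the sector where the previous one ended, switches the tracker into
 * sequential mode, and roughly 4 non-contiguous bios switch it back to
 * random mode.  A single non-contiguous bio is enough to reset a
 * sequential run.
 */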

/*----------------------------------------------------------------*/

/*
 * This queue is divided up into different levels, allowing us to push
 * entries to the back of any of the levels.  Think of it as a partially
 * sorted queue.
 */
#define NR_QUEUE_LEVELS 16u

struct queue {
	struct list_head qs[NR_QUEUE_LEVELS];
};

static void queue_init(struct queue *q)
{
	unsigned i;

	for (i = 0; i < NR_QUEUE_LEVELS; i++)
		INIT_LIST_HEAD(q->qs + i);
}

/*
 * Checks to see if the queue is empty.
 * FIXME: reduce cpu usage.
 */
static bool queue_empty(struct queue *q)
{
	unsigned i;

	for (i = 0; i < NR_QUEUE_LEVELS; i++)
		if (!list_empty(q->qs + i))
			return false;

	return true;
}

/*
 * Insert an entry to the back of the given level.
 */
static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
{
	list_add_tail(elt, q->qs + level);
}

static void queue_remove(struct list_head *elt)
{
	list_del(elt);
}

/*
 * Shifts all regions down one level.  This has no effect on the order of
 * the queue.
 */
static void queue_shift_down(struct queue *q)
{
	unsigned level;

	for (level = 1; level < NR_QUEUE_LEVELS; level++)
		list_splice_init(q->qs + level, q->qs + level - 1);
}

/*
 * Gives us the oldest entry of the lowest populated level.  If the first
 * level is emptied then we shift down one level.
 */
static struct list_head *queue_pop(struct queue *q)
{
	unsigned level;
	struct list_head *r;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		if (!list_empty(q->qs + level)) {
			r = q->qs[level].next;
			list_del(r);

			/* have we just emptied the bottom level? */
			if (level == 0 && list_empty(q->qs))
				queue_shift_down(q);

			return r;
		}

	return NULL;
}

static struct list_head *list_pop(struct list_head *lh)
{
	struct list_head *r = lh->next;

	BUG_ON(!r);
	list_del_init(r);

	return r;
}

/*----------------------------------------------------------------*/

/*
 * Describes a cache entry.  Used in both the cache and the pre_cache.
 */
struct entry {
	struct hlist_node hlist;
	struct list_head list;
	dm_oblock_t oblock;

	/*
	 * FIXME: pack these better
	 */
	bool dirty:1;
	unsigned hit_count;
	unsigned generation;
	unsigned tick;
};

/*
 * Rather than storing the cblock in an entry, we allocate all entries in
 * an array, and infer the cblock from the entry position.
 *
 * Free entries are linked together into a list.
 */
struct entry_pool {
	struct entry *entries, *entries_end;
	struct list_head free;
	unsigned nr_allocated;
};

static int epool_init(struct entry_pool *ep, unsigned nr_entries)
{
	unsigned i;

	ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
	if (!ep->entries)
		return -ENOMEM;

	ep->entries_end = ep->entries + nr_entries;

	INIT_LIST_HEAD(&ep->free);
	for (i = 0; i < nr_entries; i++)
		list_add(&ep->entries[i].list, &ep->free);

	ep->nr_allocated = 0;

	return 0;
}

static void epool_exit(struct entry_pool *ep)
{
	vfree(ep->entries);
}

static struct entry *alloc_entry(struct entry_pool *ep)
{
	struct entry *e;

	if (list_empty(&ep->free))
		return NULL;

	e = list_entry(list_pop(&ep->free), struct entry, list);
	INIT_LIST_HEAD(&e->list);
	INIT_HLIST_NODE(&e->hlist);
	ep->nr_allocated++;

	return e;
}
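
/*
 * Note that alloc_entry() hands back an arbitrary free entry; which cblock
 * it represents is simply its index in ep->entries, recovered later with
 * infer_cblock().  alloc_particular_entry() below does the reverse: it
 * claims the entry at a specific index so that a known cblock can be
 * repopulated, e.g. when loading mappings back from the metadata.
 */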

/*
 * This assumes the cblock hasn't already been allocated.
 */
static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
{
	struct entry *e = ep->entries + from_cblock(cblock);

	list_del_init(&e->list);
	INIT_HLIST_NODE(&e->hlist);
	ep->nr_allocated++;

	return e;
}

static void free_entry(struct entry_pool *ep, struct entry *e)
{
	BUG_ON(!ep->nr_allocated);
	ep->nr_allocated--;
	INIT_HLIST_NODE(&e->hlist);
	list_add(&e->list, &ep->free);
}

/*
 * Returns NULL if the entry is free.
 */
static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
{
	struct entry *e = ep->entries + from_cblock(cblock);
	return !hlist_unhashed(&e->hlist) ? e : NULL;
}

static bool epool_empty(struct entry_pool *ep)
{
	return list_empty(&ep->free);
}

static bool in_pool(struct entry_pool *ep, struct entry *e)
{
	return e >= ep->entries && e < ep->entries_end;
}

static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
{
	return to_cblock(e - ep->entries);
}

/*----------------------------------------------------------------*/

struct mq_policy {
	struct dm_cache_policy policy;

	/* protects everything */
	struct mutex lock;
	dm_cblock_t cache_size;
	struct io_tracker tracker;

	/*
	 * Entries come from two pools, one of pre-cache entries, and one
	 * for the cache proper.
	 */
	struct entry_pool pre_cache_pool;
	struct entry_pool cache_pool;

	/*
	 * We maintain three queues of entries.  The cache proper,
	 * consisting of a clean and dirty queue, contains the currently
	 * active mappings.  The pre_cache tracks blocks that are being hit
	 * frequently and are potential candidates for promotion to the
	 * cache.
	 */
	struct queue pre_cache;
	struct queue cache_clean;
	struct queue cache_dirty;

	/*
	 * Keeps track of time, incremented by the core.  We use this to
	 * avoid attributing multiple hits within the same tick.
	 *
	 * Access to tick_protected should be done with the spin lock held.
	 * It's copied to tick at the start of the map function (within the
	 * mutex).
	 */
	spinlock_t tick_lock;
	unsigned tick_protected;
	unsigned tick;

	/*
	 * A count of the number of times the map function has been called
	 * and found an entry in the pre_cache or cache.  Currently used to
	 * calculate the generation.
	 */
	unsigned hit_count;

	/*
	 * A generation is a longish period that is used to trigger some
	 * bookkeeping effects, e.g. decrementing hit counts on entries.
	 * This is needed to allow the cache to evolve as io patterns
	 * change.
	 */
	unsigned generation;
	unsigned generation_period; /* in lookups (will probably change) */

	/*
	 * Entries in the pre_cache whose hit count passes the promotion
	 * threshold move to the cache proper.  Working out the correct
	 * value for the promote_threshold is crucial to this policy.
	 */
	unsigned promote_threshold;

	unsigned discard_promote_adjustment;
	unsigned read_promote_adjustment;
	unsigned write_promote_adjustment;

	/*
	 * The hash table allows us to quickly find an entry by origin
	 * block.  Both pre_cache and cache entries are in here.
	 */
	unsigned nr_buckets;
	dm_block_t hash_bits;
	struct hlist_head *table;
};

#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8

/*----------------------------------------------------------------*/

/*
 * Simple hash table implementation.  Should replace with the standard hash
 * table that's making its way upstream.
 */
static void hash_insert(struct mq_policy *mq, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);

	hlist_add_head(&e->hlist, mq->table + h);
}

static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
{
	unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
	struct hlist_head *bucket = mq->table + h;
	struct entry *e;

	hlist_for_each_entry(e, bucket, hlist)
		if (e->oblock == oblock) {
			hlist_del(&e->hlist);
			hlist_add_head(&e->hlist, bucket);
			return e;
		}

	return NULL;
}

static void hash_remove(struct entry *e)
{
	hlist_del(&e->hlist);
}

/*----------------------------------------------------------------*/

static bool any_free_cblocks(struct mq_policy *mq)
{
	return !epool_empty(&mq->cache_pool);
}

static bool any_clean_cblocks(struct mq_policy *mq)
{
	return !queue_empty(&mq->cache_clean);
}

/*----------------------------------------------------------------*/

/*
 * Now we get to the meat of the policy.  This section deals with deciding
 * when to add entries to the pre_cache and cache, and move between
 * them.
 */

/*
 * The queue level is based on the log2 of the hit count.
 */
static unsigned queue_level(struct entry *e)
{
	return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
}

static bool in_cache(struct mq_policy *mq, struct entry *e)
{
	return in_pool(&mq->cache_pool, e);
}

/*
 * Inserts the entry into the pre_cache or the cache.  Ensures the cache
 * block is marked as allocated if necessary.  Inserts into the hash table.
 * Sets the tick which records when the entry was last moved about.
 */
static void push(struct mq_policy *mq, struct entry *e)
{
	e->tick = mq->tick;
	hash_insert(mq, e);

	if (in_cache(mq, e))
		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
			   queue_level(e), &e->list);
	else
		queue_push(&mq->pre_cache, queue_level(e), &e->list);
}

/*
 * Removes an entry from pre_cache or cache.  Removes from the hash table.
 */
static void del(struct mq_policy *mq, struct entry *e)
{
	queue_remove(&e->list);
	hash_remove(e);
}

/*
 * Like del, except it removes the first entry in the queue (ie. the least
 * recently used).
 */
static struct entry *pop(struct mq_policy *mq, struct queue *q)
{
	struct entry *e;
	struct list_head *h = queue_pop(q);

	if (!h)
		return NULL;

	e = container_of(h, struct entry, list);
	hash_remove(e);

	return e;
}

/*
 * Has this entry already been updated?
 */
static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
{
	return mq->tick == e->tick;
}

/*
 * The promotion threshold is adjusted every generation, as are the hit
 * counts of the entries.
 *
 * At the moment the threshold is taken by averaging the hit counts of some
 * of the entries in the cache (the first 20 entries across all levels in
 * ascending order, giving preference to the clean entries at each level).
 *
 * We can be much cleverer than this though.  For example, each promotion
 * could bump up the threshold helping to prevent churn.  Much more to do
 * here.
 */
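
/*
 * For example, if the sampled entries (at most MAX_TO_AVERAGE of them) in
 * check_generation() below have hit counts summing to 95 across 20
 * entries, the new promote_threshold is 95 / 20 rounded up, i.e. 5.
 */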

#define MAX_TO_AVERAGE 20

static void check_generation(struct mq_policy *mq)
{
	unsigned total = 0, nr = 0, count = 0, level;
	struct list_head *head;
	struct entry *e;

	if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
		mq->hit_count = 0;
		mq->generation++;

		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
			head = mq->cache_clean.qs + level;
			list_for_each_entry(e, head, list) {
				nr++;
				total += e->hit_count;

				if (++count >= MAX_TO_AVERAGE)
					break;
			}

			head = mq->cache_dirty.qs + level;
			list_for_each_entry(e, head, list) {
				nr++;
				total += e->hit_count;

				if (++count >= MAX_TO_AVERAGE)
					break;
			}
		}

		mq->promote_threshold = nr ? total / nr : 1;
		if (mq->promote_threshold * nr < total)
			mq->promote_threshold++;
	}
}

/*
 * Whenever we use an entry we bump up its hit counter, and push it to the
 * back of its current level.
 */
static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
{
	if (updated_this_tick(mq, e))
		return;

	e->hit_count++;
	mq->hit_count++;
	check_generation(mq);

	/* generation adjustment, to stop the counts increasing forever. */
	/* FIXME: divide? */
	/* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
	e->generation = mq->generation;

	del(mq, e);
	push(mq, e);
}

/*
 * Demote the least recently used entry from the cache to the pre_cache.
 * Returns the new cache entry to use, and the old origin block it was
 * mapped to.
 *
 * We drop the hit count on the demoted entry back to 1 to stop it bouncing
 * straight back into the cache if it's subsequently hit.  There are
 * various options here, and more experimentation would be good:
 *
 * - just forget about the demoted entry completely (ie. don't insert it
 *   into the pre_cache).
 * - divide the hit count rather than setting it to some hard coded value.
 * - set the hit count to a hard coded value other than 1, eg, is it better
 *   if it goes in at level 2?
 */
static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
{
	struct entry *demoted = pop(mq, &mq->cache_clean);

	if (!demoted)
		/*
		 * We could get a block from mq->cache_dirty, but that
		 * would add extra latency to the triggering bio as it
		 * waits for the writeback.  Better to not promote this
		 * time and hope there's a clean block next time this block
		 * is hit.
		 */
		return -ENOSPC;

	*oblock = demoted->oblock;
	free_entry(&mq->cache_pool, demoted);

	/*
	 * We used to put the demoted block into the pre-cache, but I think
	 * it's simpler to just let it work its way up from zero again.
	 * Stops blocks flickering in and out of the cache.
	 */

	return 0;
}

/*
 * We modify the basic promote_threshold depending on the specific io.
 *
 * If the origin block has been discarded then there's no cost to copy it
 * to the cache.
 *
 * We bias towards reads, since they can be demoted at no cost if they
 * haven't been dirtied.
 */
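
/*
 * For instance, with the default adjustments a read must reach
 * promote_threshold + 4 hits before it is promoted, a write needs
 * promote_threshold + 8, and a write to a discarded block (when a free or
 * clean cache block is available) needs only 1 hit.
 */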

static unsigned adjusted_promote_threshold(struct mq_policy *mq,
					   bool discarded_oblock, int data_dir)
{
	if (data_dir == READ)
		return mq->promote_threshold + mq->read_promote_adjustment;

	if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
		/*
		 * We don't need to do any copying at all, so give this a
		 * very low threshold.
		 */
		return mq->discard_promote_adjustment;
	}

	return mq->promote_threshold + mq->write_promote_adjustment;
}

static bool should_promote(struct mq_policy *mq, struct entry *e,
			   bool discarded_oblock, int data_dir)
{
	return e->hit_count >=
		adjusted_promote_threshold(mq, discarded_oblock, data_dir);
}

static int cache_entry_found(struct mq_policy *mq,
			     struct entry *e,
			     struct policy_result *result)
{
	requeue_and_update_tick(mq, e);

	if (in_cache(mq, e)) {
		result->op = POLICY_HIT;
		result->cblock = infer_cblock(&mq->cache_pool, e);
	}

	return 0;
}

/*
 * Moves an entry from the pre_cache to the cache.  The main work is
 * finding which cache block to use.
 */
static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
			      struct policy_result *result)
{
	int r;
	struct entry *new_e;

	/* Ensure there's a free cblock in the cache */
	if (epool_empty(&mq->cache_pool)) {
		result->op = POLICY_REPLACE;
		r = demote_cblock(mq, &result->old_oblock);
		if (r) {
			result->op = POLICY_MISS;
			return 0;
		}
	} else
		result->op = POLICY_NEW;

	new_e = alloc_entry(&mq->cache_pool);
	BUG_ON(!new_e);

	new_e->oblock = e->oblock;
	new_e->dirty = false;
	new_e->hit_count = e->hit_count;
	new_e->generation = e->generation;
	new_e->tick = e->tick;

	del(mq, e);
	free_entry(&mq->pre_cache_pool, e);
	push(mq, new_e);

	result->cblock = infer_cblock(&mq->cache_pool, new_e);

	return 0;
}

static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
				 bool can_migrate, bool discarded_oblock,
				 int data_dir, struct policy_result *result)
{
	int r = 0;
	bool updated = updated_this_tick(mq, e);

	if ((!discarded_oblock && updated) ||
	    !should_promote(mq, e, discarded_oblock, data_dir)) {
		requeue_and_update_tick(mq, e);
		result->op = POLICY_MISS;

	} else if (!can_migrate)
		r = -EWOULDBLOCK;

	else {
		requeue_and_update_tick(mq, e);
		r = pre_cache_to_cache(mq, e, result);
	}

	return r;
}

static void insert_in_pre_cache(struct mq_policy *mq,
				dm_oblock_t oblock)
{
	struct entry *e = alloc_entry(&mq->pre_cache_pool);

	if (!e)
		/*
		 * There's no spare entry structure, so we grab the least
		 * used one from the pre_cache.
		 */
		e = pop(mq, &mq->pre_cache);

	if (unlikely(!e)) {
		DMWARN("couldn't pop from pre cache");
		return;
	}

	e->dirty = false;
	e->oblock = oblock;
	e->hit_count = 1;
	e->generation = mq->generation;
	push(mq, e);
}

static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
			    struct policy_result *result)
{
	int r;
	struct entry *e;

	if (epool_empty(&mq->cache_pool)) {
		result->op = POLICY_REPLACE;
		r = demote_cblock(mq, &result->old_oblock);
		if (unlikely(r)) {
			result->op = POLICY_MISS;
			insert_in_pre_cache(mq, oblock);
			return;
		}

		/*
		 * This will always succeed, since we've just demoted.
		 */
		e = alloc_entry(&mq->cache_pool);
		BUG_ON(!e);

	} else {
		e = alloc_entry(&mq->cache_pool);
		result->op = POLICY_NEW;
	}

	e->oblock = oblock;
	e->dirty = false;
	e->hit_count = 1;
	e->generation = mq->generation;
	push(mq, e);

	result->cblock = infer_cblock(&mq->cache_pool, e);
}

static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
			  bool can_migrate, bool discarded_oblock,
			  int data_dir, struct policy_result *result)
{
	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
		if (can_migrate)
			insert_in_cache(mq, oblock, result);
		else
			return -EWOULDBLOCK;
	} else {
		insert_in_pre_cache(mq, oblock);
		result->op = POLICY_MISS;
	}

	return 0;
}

/*
 * Looks the oblock up in the hash table, then decides whether to put it
 * in the pre_cache, the cache, etc.
 */
static int map(struct mq_policy *mq, dm_oblock_t oblock,
	       bool can_migrate, bool discarded_oblock,
	       int data_dir, struct policy_result *result)
{
	int r = 0;
	struct entry *e = hash_lookup(mq, oblock);

	if (e && in_cache(mq, e))
		r = cache_entry_found(mq, e, result);

	else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
		result->op = POLICY_MISS;

	else if (e)
		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
					  data_dir, result);

	else
		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
				   data_dir, result);

	if (r == -EWOULDBLOCK)
		result->op = POLICY_MISS;

	return r;
}
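
/*
 * Roughly, the decision above works out as: an entry already in the cache
 * proper is reported as POLICY_HIT; sequential io, a cold block, or a call
 * that isn't allowed to migrate results in POLICY_MISS (the bio stays on
 * the origin); a sufficiently hot pre_cache block is promoted via
 * pre_cache_to_cache(), yielding POLICY_NEW or POLICY_REPLACE (or
 * POLICY_MISS if no clean block can be demoted).
 */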

/*----------------------------------------------------------------*/

/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */

static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct mq_policy, policy);
}

static void mq_destroy(struct dm_cache_policy *p)
{
	struct mq_policy *mq = to_mq_policy(p);

	vfree(mq->table);
	epool_exit(&mq->cache_pool);
	epool_exit(&mq->pre_cache_pool);
	kfree(mq);
}

static void copy_tick(struct mq_policy *mq)
{
	unsigned long flags;

	spin_lock_irqsave(&mq->tick_lock, flags);
	mq->tick = mq->tick_protected;
	spin_unlock_irqrestore(&mq->tick_lock, flags);
}

static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
		  bool can_block, bool can_migrate, bool discarded_oblock,
		  struct bio *bio, struct policy_result *result)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	result->op = POLICY_MISS;

	if (can_block)
		mutex_lock(&mq->lock);
	else if (!mutex_trylock(&mq->lock))
		return -EWOULDBLOCK;

	copy_tick(mq);

	iot_examine_bio(&mq->tracker, bio);
	r = map(mq, oblock, can_migrate, discarded_oblock,
		bio_data_dir(bio), result);

	mutex_unlock(&mq->lock);

	return r;
}

static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	if (!mutex_trylock(&mq->lock))
		return -EWOULDBLOCK;

	e = hash_lookup(mq, oblock);
	if (e && in_cache(mq, e)) {
		*cblock = infer_cblock(&mq->cache_pool, e);
		r = 0;
	} else
		r = -ENOENT;

	mutex_unlock(&mq->lock);

	return r;
}

static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
{
	struct entry *e;

	e = hash_lookup(mq, oblock);
	BUG_ON(!e || !in_cache(mq, e));

	del(mq, e);
	e->dirty = set;
	push(mq, e);
}

static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__mq_set_clear_dirty(mq, oblock, true);
	mutex_unlock(&mq->lock);
}

static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__mq_set_clear_dirty(mq, oblock, false);
	mutex_unlock(&mq->lock);
}

static int mq_load_mapping(struct dm_cache_policy *p,
			   dm_oblock_t oblock, dm_cblock_t cblock,
			   uint32_t hint, bool hint_valid)
{
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	e = alloc_particular_entry(&mq->cache_pool, cblock);
	e->oblock = oblock;
	e->dirty = false;	/* this gets corrected in a minute */
	e->hit_count = hint_valid ? hint : 1;
	e->generation = mq->generation;
	push(mq, e);

	return 0;
}

static int mq_save_hints(struct mq_policy *mq, struct queue *q,
			 policy_walk_fn fn, void *context)
{
	int r;
	unsigned level;
	struct entry *e;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_for_each_entry(e, q->qs + level, list) {
			r = fn(context, infer_cblock(&mq->cache_pool, e),
			       e->oblock, e->hit_count);
			if (r)
				return r;
		}

	return 0;
}

static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
			    void *context)
{
	struct mq_policy *mq = to_mq_policy(p);
	int r = 0;

	mutex_lock(&mq->lock);

	r = mq_save_hints(mq, &mq->cache_clean, fn, context);
	if (!r)
		r = mq_save_hints(mq, &mq->cache_dirty, fn, context);

	mutex_unlock(&mq->lock);

	return r;
}

static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
{
	struct entry *e;

	e = hash_lookup(mq, oblock);
	BUG_ON(!e || !in_cache(mq, e));

	del(mq, e);
	free_entry(&mq->cache_pool, e);
}

static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__remove_mapping(mq, oblock);
	mutex_unlock(&mq->lock);
}

static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
{
	struct entry *e = epool_find(&mq->cache_pool, cblock);

	if (!e)
		return -ENODATA;

	del(mq, e);
	free_entry(&mq->cache_pool, e);

	return 0;
}

static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = __remove_cblock(mq, cblock);
	mutex_unlock(&mq->lock);

	return r;
}

static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
			       dm_cblock_t *cblock)
{
	struct entry *e = pop(mq, &mq->cache_dirty);

	if (!e)
		return -ENODATA;

	*oblock = e->oblock;
	*cblock = infer_cblock(&mq->cache_pool, e);
	e->dirty = false;
	push(mq, e);

	return 0;
}

static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
			     dm_cblock_t *cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = __mq_writeback_work(mq, oblock, cblock);
	mutex_unlock(&mq->lock);

	return r;
}

static void __force_mapping(struct mq_policy *mq,
			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct entry *e = hash_lookup(mq, current_oblock);

	if (e && in_cache(mq, e)) {
		del(mq, e);
		e->oblock = new_oblock;
		e->dirty = true;
		push(mq, e);
	}
}

static void mq_force_mapping(struct dm_cache_policy *p,
			     dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__force_mapping(mq, current_oblock, new_oblock);
	mutex_unlock(&mq->lock);
}

static dm_cblock_t mq_residency(struct dm_cache_policy *p)
{
	dm_cblock_t r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = to_cblock(mq->cache_pool.nr_allocated);
	mutex_unlock(&mq->lock);

	return r;
}

static void mq_tick(struct dm_cache_policy *p)
{
	struct mq_policy *mq = to_mq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->tick_lock, flags);
	mq->tick_protected++;
	spin_unlock_irqrestore(&mq->tick_lock, flags);
}

static int mq_set_config_value(struct dm_cache_policy *p,
			       const char *key, const char *value)
{
	struct mq_policy *mq = to_mq_policy(p);
	unsigned long tmp;

	if (kstrtoul(value, 10, &tmp))
		return -EINVAL;

	if (!strcasecmp(key, "random_threshold")) {
		mq->tracker.thresholds[PATTERN_RANDOM] = tmp;

	} else if (!strcasecmp(key, "sequential_threshold")) {
		mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;

	} else if (!strcasecmp(key, "discard_promote_adjustment"))
		mq->discard_promote_adjustment = tmp;

	else if (!strcasecmp(key, "read_promote_adjustment"))
		mq->read_promote_adjustment = tmp;

	else if (!strcasecmp(key, "write_promote_adjustment"))
		mq->write_promote_adjustment = tmp;

	else
		return -EINVAL;

	return 0;
}

static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
{
	ssize_t sz = 0;
	struct mq_policy *mq = to_mq_policy(p);

	DMEMIT("10 random_threshold %u "
	       "sequential_threshold %u "
	       "discard_promote_adjustment %u "
	       "read_promote_adjustment %u "
	       "write_promote_adjustment %u",
	       mq->tracker.thresholds[PATTERN_RANDOM],
	       mq->tracker.thresholds[PATTERN_SEQUENTIAL],
	       mq->discard_promote_adjustment,
	       mq->read_promote_adjustment,
	       mq->write_promote_adjustment);

	return 0;
}

/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct mq_policy *mq)
{
	mq->policy.destroy = mq_destroy;
	mq->policy.map = mq_map;
	mq->policy.lookup = mq_lookup;
	mq->policy.set_dirty = mq_set_dirty;
	mq->policy.clear_dirty = mq_clear_dirty;
	mq->policy.load_mapping = mq_load_mapping;
	mq->policy.walk_mappings = mq_walk_mappings;
	mq->policy.remove_mapping = mq_remove_mapping;
	mq->policy.remove_cblock = mq_remove_cblock;
	mq->policy.writeback_work = mq_writeback_work;
	mq->policy.force_mapping = mq_force_mapping;
	mq->policy.residency = mq_residency;
	mq->policy.tick = mq_tick;
	mq->policy.emit_config_values = mq_emit_config_values;
	mq->policy.set_config_value = mq_set_config_value;
}

static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
					 sector_t origin_size,
					 sector_t cache_block_size)
{
	struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq);
	iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
	mq->cache_size = cache_size;

	if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
		DMERR("couldn't initialize pool of pre-cache entries");
		goto bad_pre_cache_init;
	}

	if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
		DMERR("couldn't initialize pool of cache entries");
		goto bad_cache_init;
	}

	mq->tick_protected = 0;
	mq->tick = 0;
	mq->hit_count = 0;
	mq->generation = 0;
	mq->promote_threshold = 0;
	mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
	mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
	mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
	mutex_init(&mq->lock);
	spin_lock_init(&mq->tick_lock);

	queue_init(&mq->pre_cache);
	queue_init(&mq->cache_clean);
	queue_init(&mq->cache_dirty);

	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);

	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
	mq->hash_bits = ffs(mq->nr_buckets) - 1;
	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
	if (!mq->table)
		goto bad_alloc_table;

	return &mq->policy;

bad_alloc_table:
	epool_exit(&mq->cache_pool);
bad_cache_init:
	epool_exit(&mq->pre_cache_pool);
bad_pre_cache_init:
	kfree(mq);

	return NULL;
}

/*----------------------------------------------------------------*/

static struct dm_cache_policy_type mq_policy_type = {
	.name = "mq",
	.version = {1, 2, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create
};

static struct dm_cache_policy_type default_policy_type = {
	.name = "default",
	.version = {1, 2, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create,
	.real = &mq_policy_type
};

static int __init mq_init(void)
{
	int r;

	mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
					   sizeof(struct entry),
					   __alignof__(struct entry),
					   0, NULL);
	if (!mq_entry_cache)
		goto bad;

	r = dm_cache_policy_register(&mq_policy_type);
	if (r) {
		DMERR("register failed %d", r);
		goto bad_register_mq;
	}

	r = dm_cache_policy_register(&default_policy_type);
	if (!r) {
		DMINFO("version %u.%u.%u loaded",
		       mq_policy_type.version[0],
		       mq_policy_type.version[1],
		       mq_policy_type.version[2]);
		return 0;
	}

	DMERR("register failed (as default) %d", r);

	dm_cache_policy_unregister(&mq_policy_type);
bad_register_mq:
	kmem_cache_destroy(mq_entry_cache);
bad:
	return -ENOMEM;
}

static void __exit mq_exit(void)
{
	dm_cache_policy_unregister(&mq_policy_type);
	dm_cache_policy_unregister(&default_policy_type);

	kmem_cache_destroy(mq_entry_cache);
}

module_init(mq_init);
module_exit(mq_exit);

MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("mq cache policy");

MODULE_ALIAS("dm-cache-default");