1/* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8#include "dm.h" 9#include "dm-uevent.h" 10 11#include <linux/init.h> 12#include <linux/module.h> 13#include <linux/mutex.h> 14#include <linux/moduleparam.h> 15#include <linux/blkpg.h> 16#include <linux/bio.h> 17#include <linux/mempool.h> 18#include <linux/slab.h> 19#include <linux/idr.h> 20#include <linux/hdreg.h> 21#include <linux/delay.h> 22 23#include <trace/events/block.h> 24 25#define DM_MSG_PREFIX "core" 26 27#ifdef CONFIG_PRINTK 28/* 29 * ratelimit state to be used in DMXXX_LIMIT(). 30 */ 31DEFINE_RATELIMIT_STATE(dm_ratelimit_state, 32 DEFAULT_RATELIMIT_INTERVAL, 33 DEFAULT_RATELIMIT_BURST); 34EXPORT_SYMBOL(dm_ratelimit_state); 35#endif 36 37/* 38 * Cookies are numeric values sent with CHANGE and REMOVE 39 * uevents while resuming, removing or renaming the device. 40 */ 41#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 42#define DM_COOKIE_LENGTH 24 43 44static const char *_name = DM_NAME; 45 46static unsigned int major = 0; 47static unsigned int _major = 0; 48 49static DEFINE_IDR(_minor_idr); 50 51static DEFINE_SPINLOCK(_minor_lock); 52/* 53 * For bio-based dm. 54 * One of these is allocated per bio. 55 */ 56struct dm_io { 57 struct mapped_device *md; 58 int error; 59 atomic_t io_count; 60 struct bio *bio; 61 unsigned long start_time; 62 spinlock_t endio_lock; 63}; 64 65/* 66 * For bio-based dm. 67 * One of these is allocated per target within a bio. Hopefully 68 * this will be simplified out one day. 69 */ 70struct dm_target_io { 71 struct dm_io *io; 72 struct dm_target *ti; 73 union map_info info; 74}; 75 76/* 77 * For request-based dm. 78 * One of these is allocated per request. 79 */ 80struct dm_rq_target_io { 81 struct mapped_device *md; 82 struct dm_target *ti; 83 struct request *orig, clone; 84 int error; 85 union map_info info; 86}; 87 88/* 89 * For request-based dm. 90 * One of these is allocated per bio. 91 */ 92struct dm_rq_clone_bio_info { 93 struct bio *orig; 94 struct dm_rq_target_io *tio; 95}; 96 97union map_info *dm_get_mapinfo(struct bio *bio) 98{ 99 if (bio && bio->bi_private) 100 return &((struct dm_target_io *)bio->bi_private)->info; 101 return NULL; 102} 103 104union map_info *dm_get_rq_mapinfo(struct request *rq) 105{ 106 if (rq && rq->end_io_data) 107 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 108 return NULL; 109} 110EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 111 112#define MINOR_ALLOCED ((void *)-1) 113 114/* 115 * Bits for the md->flags field. 116 */ 117#define DMF_BLOCK_IO_FOR_SUSPEND 0 118#define DMF_SUSPENDED 1 119#define DMF_FROZEN 2 120#define DMF_FREEING 3 121#define DMF_DELETING 4 122#define DMF_NOFLUSH_SUSPENDING 5 123#define DMF_MERGE_IS_OPTIONAL 6 124 125/* 126 * Work processed by per-device workqueue. 127 */ 128struct mapped_device { 129 struct rw_semaphore io_lock; 130 struct mutex suspend_lock; 131 rwlock_t map_lock; 132 atomic_t holders; 133 atomic_t open_count; 134 135 unsigned long flags; 136 137 struct request_queue *queue; 138 unsigned type; 139 /* Protect queue and type against concurrent access. */ 140 struct mutex type_lock; 141 142 struct target_type *immutable_target_type; 143 144 struct gendisk *disk; 145 char name[16]; 146 147 void *interface_ptr; 148 149 /* 150 * A list of ios that arrived while we were suspended. 
151 */ 152 atomic_t pending[2]; 153 wait_queue_head_t wait; 154 struct work_struct work; 155 struct bio_list deferred; 156 spinlock_t deferred_lock; 157 158 /* 159 * Processing queue (flush) 160 */ 161 struct workqueue_struct *wq; 162 163 /* 164 * The current mapping. 165 */ 166 struct dm_table *map; 167 168 /* 169 * io objects are allocated from here. 170 */ 171 mempool_t *io_pool; 172 mempool_t *tio_pool; 173 174 struct bio_set *bs; 175 176 /* 177 * Event handling. 178 */ 179 atomic_t event_nr; 180 wait_queue_head_t eventq; 181 atomic_t uevent_seq; 182 struct list_head uevent_list; 183 spinlock_t uevent_lock; /* Protect access to uevent_list */ 184 185 /* 186 * freeze/thaw support require holding onto a super block 187 */ 188 struct super_block *frozen_sb; 189 struct block_device *bdev; 190 191 /* forced geometry settings */ 192 struct hd_geometry geometry; 193 194 /* sysfs handle */ 195 struct kobject kobj; 196 197 /* zero-length flush that will be cloned and submitted to targets */ 198 struct bio flush_bio; 199}; 200 201/* 202 * For mempools pre-allocation at the table loading time. 203 */ 204struct dm_md_mempools { 205 mempool_t *io_pool; 206 mempool_t *tio_pool; 207 struct bio_set *bs; 208}; 209 210#define MIN_IOS 256 211static struct kmem_cache *_io_cache; 212static struct kmem_cache *_tio_cache; 213static struct kmem_cache *_rq_tio_cache; 214static struct kmem_cache *_rq_bio_info_cache; 215 216static int __init local_init(void) 217{ 218 int r = -ENOMEM; 219 220 /* allocate a slab for the dm_ios */ 221 _io_cache = KMEM_CACHE(dm_io, 0); 222 if (!_io_cache) 223 return r; 224 225 /* allocate a slab for the target ios */ 226 _tio_cache = KMEM_CACHE(dm_target_io, 0); 227 if (!_tio_cache) 228 goto out_free_io_cache; 229 230 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 231 if (!_rq_tio_cache) 232 goto out_free_tio_cache; 233 234 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 235 if (!_rq_bio_info_cache) 236 goto out_free_rq_tio_cache; 237 238 r = dm_uevent_init(); 239 if (r) 240 goto out_free_rq_bio_info_cache; 241 242 _major = major; 243 r = register_blkdev(_major, _name); 244 if (r < 0) 245 goto out_uevent_exit; 246 247 if (!_major) 248 _major = r; 249 250 return 0; 251 252out_uevent_exit: 253 dm_uevent_exit(); 254out_free_rq_bio_info_cache: 255 kmem_cache_destroy(_rq_bio_info_cache); 256out_free_rq_tio_cache: 257 kmem_cache_destroy(_rq_tio_cache); 258out_free_tio_cache: 259 kmem_cache_destroy(_tio_cache); 260out_free_io_cache: 261 kmem_cache_destroy(_io_cache); 262 263 return r; 264} 265 266static void local_exit(void) 267{ 268 kmem_cache_destroy(_rq_bio_info_cache); 269 kmem_cache_destroy(_rq_tio_cache); 270 kmem_cache_destroy(_tio_cache); 271 kmem_cache_destroy(_io_cache); 272 unregister_blkdev(_major, _name); 273 dm_uevent_exit(); 274 275 _major = 0; 276 277 DMINFO("cleaned up"); 278} 279 280static int (*_inits[])(void) __initdata = { 281 local_init, 282 dm_target_init, 283 dm_linear_init, 284 dm_stripe_init, 285 dm_io_init, 286 dm_kcopyd_init, 287 dm_interface_init, 288}; 289 290static void (*_exits[])(void) = { 291 local_exit, 292 dm_target_exit, 293 dm_linear_exit, 294 dm_stripe_exit, 295 dm_io_exit, 296 dm_kcopyd_exit, 297 dm_interface_exit, 298}; 299 300static int __init dm_init(void) 301{ 302 const int count = ARRAY_SIZE(_inits); 303 304 int r, i; 305 306 for (i = 0; i < count; i++) { 307 r = _inits[i](); 308 if (r) 309 goto bad; 310 } 311 312 return 0; 313 314 bad: 315 while (i--) 316 _exits[i](); 317 318 return r; 319} 320 321static void __exit dm_exit(void) 
322{ 323 int i = ARRAY_SIZE(_exits); 324 325 while (i--) 326 _exits[i](); 327 328 /* 329 * Should be empty by this point. 330 */ 331 idr_remove_all(&_minor_idr); 332 idr_destroy(&_minor_idr); 333} 334 335/* 336 * Block device functions 337 */ 338int dm_deleting_md(struct mapped_device *md) 339{ 340 return test_bit(DMF_DELETING, &md->flags); 341} 342 343static int dm_blk_open(struct block_device *bdev, fmode_t mode) 344{ 345 struct mapped_device *md; 346 347 spin_lock(&_minor_lock); 348 349 md = bdev->bd_disk->private_data; 350 if (!md) 351 goto out; 352 353 if (test_bit(DMF_FREEING, &md->flags) || 354 dm_deleting_md(md)) { 355 md = NULL; 356 goto out; 357 } 358 359 dm_get(md); 360 atomic_inc(&md->open_count); 361 362out: 363 spin_unlock(&_minor_lock); 364 365 return md ? 0 : -ENXIO; 366} 367 368static int dm_blk_close(struct gendisk *disk, fmode_t mode) 369{ 370 struct mapped_device *md = disk->private_data; 371 372 spin_lock(&_minor_lock); 373 374 atomic_dec(&md->open_count); 375 dm_put(md); 376 377 spin_unlock(&_minor_lock); 378 379 return 0; 380} 381 382int dm_open_count(struct mapped_device *md) 383{ 384 return atomic_read(&md->open_count); 385} 386 387/* 388 * Guarantees nothing is using the device before it's deleted. 389 */ 390int dm_lock_for_deletion(struct mapped_device *md) 391{ 392 int r = 0; 393 394 spin_lock(&_minor_lock); 395 396 if (dm_open_count(md)) 397 r = -EBUSY; 398 else 399 set_bit(DMF_DELETING, &md->flags); 400 401 spin_unlock(&_minor_lock); 402 403 return r; 404} 405 406static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 407{ 408 struct mapped_device *md = bdev->bd_disk->private_data; 409 410 return dm_get_geometry(md, geo); 411} 412 413static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 414 unsigned int cmd, unsigned long arg) 415{ 416 struct mapped_device *md = bdev->bd_disk->private_data; 417 struct dm_table *map = dm_get_live_table(md); 418 struct dm_target *tgt; 419 int r = -ENOTTY; 420 421 if (!map || !dm_table_get_size(map)) 422 goto out; 423 424 /* We only support devices that have a single target */ 425 if (dm_table_get_num_targets(map) != 1) 426 goto out; 427 428 tgt = dm_table_get_target(map, 0); 429 430 if (dm_suspended_md(md)) { 431 r = -EAGAIN; 432 goto out; 433 } 434 435 if (tgt->type->ioctl) 436 r = tgt->type->ioctl(tgt, cmd, arg); 437 438out: 439 dm_table_put(map); 440 441 return r; 442} 443 444static struct dm_io *alloc_io(struct mapped_device *md) 445{ 446 return mempool_alloc(md->io_pool, GFP_NOIO); 447} 448 449static void free_io(struct mapped_device *md, struct dm_io *io) 450{ 451 mempool_free(io, md->io_pool); 452} 453 454static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 455{ 456 mempool_free(tio, md->tio_pool); 457} 458 459static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 460 gfp_t gfp_mask) 461{ 462 return mempool_alloc(md->tio_pool, gfp_mask); 463} 464 465static void free_rq_tio(struct dm_rq_target_io *tio) 466{ 467 mempool_free(tio, tio->md->tio_pool); 468} 469 470static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) 471{ 472 return mempool_alloc(md->io_pool, GFP_ATOMIC); 473} 474 475static void free_bio_info(struct dm_rq_clone_bio_info *info) 476{ 477 mempool_free(info, info->tio->md->io_pool); 478} 479 480static int md_in_flight(struct mapped_device *md) 481{ 482 return atomic_read(&md->pending[READ]) + 483 atomic_read(&md->pending[WRITE]); 484} 485 486static void start_io_acct(struct dm_io *io) 487{ 488 struct mapped_device *md = 
io->md; 489 int cpu; 490 int rw = bio_data_dir(io->bio); 491 492 io->start_time = jiffies; 493 494 cpu = part_stat_lock(); 495 part_round_stats(cpu, &dm_disk(md)->part0); 496 part_stat_unlock(); 497 atomic_set(&dm_disk(md)->part0.in_flight[rw], 498 atomic_inc_return(&md->pending[rw])); 499} 500 501static void end_io_acct(struct dm_io *io) 502{ 503 struct mapped_device *md = io->md; 504 struct bio *bio = io->bio; 505 unsigned long duration = jiffies - io->start_time; 506 int pending, cpu; 507 int rw = bio_data_dir(bio); 508 509 cpu = part_stat_lock(); 510 part_round_stats(cpu, &dm_disk(md)->part0); 511 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 512 part_stat_unlock(); 513 514 /* 515 * After this is decremented the bio must not be touched if it is 516 * a flush. 517 */ 518 pending = atomic_dec_return(&md->pending[rw]); 519 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 520 pending += atomic_read(&md->pending[rw^0x1]); 521 522 /* nudge anyone waiting on suspend queue */ 523 if (!pending) 524 wake_up(&md->wait); 525} 526 527/* 528 * Add the bio to the list of deferred io. 529 */ 530static void queue_io(struct mapped_device *md, struct bio *bio) 531{ 532 unsigned long flags; 533 534 spin_lock_irqsave(&md->deferred_lock, flags); 535 bio_list_add(&md->deferred, bio); 536 spin_unlock_irqrestore(&md->deferred_lock, flags); 537 queue_work(md->wq, &md->work); 538} 539 540/* 541 * Everyone (including functions in this file), should use this 542 * function to access the md->map field, and make sure they call 543 * dm_table_put() when finished. 544 */ 545struct dm_table *dm_get_live_table(struct mapped_device *md) 546{ 547 struct dm_table *t; 548 unsigned long flags; 549 550 read_lock_irqsave(&md->map_lock, flags); 551 t = md->map; 552 if (t) 553 dm_table_get(t); 554 read_unlock_irqrestore(&md->map_lock, flags); 555 556 return t; 557} 558 559/* 560 * Get the geometry associated with a dm device 561 */ 562int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 563{ 564 *geo = md->geometry; 565 566 return 0; 567} 568 569/* 570 * Set the geometry of a device. 571 */ 572int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 573{ 574 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 575 576 if (geo->start > sz) { 577 DMWARN("Start sector is beyond the geometry limits."); 578 return -EINVAL; 579 } 580 581 md->geometry = *geo; 582 583 return 0; 584} 585 586/*----------------------------------------------------------------- 587 * CRUD START: 588 * A more elegant soln is in the works that uses the queue 589 * merge fn, unfortunately there are a couple of changes to 590 * the block layer that I want to make for this. So in the 591 * interests of getting something for people to use I give 592 * you this clearly demarcated crap. 593 *---------------------------------------------------------------*/ 594 595static int __noflush_suspending(struct mapped_device *md) 596{ 597 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 598} 599 600/* 601 * Decrements the number of outstanding ios that a bio has been 602 * cloned into, completing the original io if necc. 
603 */ 604static void dec_pending(struct dm_io *io, int error) 605{ 606 unsigned long flags; 607 int io_error; 608 struct bio *bio; 609 struct mapped_device *md = io->md; 610 611 /* Push-back supersedes any I/O errors */ 612 if (unlikely(error)) { 613 spin_lock_irqsave(&io->endio_lock, flags); 614 if (!(io->error > 0 && __noflush_suspending(md))) 615 io->error = error; 616 spin_unlock_irqrestore(&io->endio_lock, flags); 617 } 618 619 if (atomic_dec_and_test(&io->io_count)) { 620 if (io->error == DM_ENDIO_REQUEUE) { 621 /* 622 * Target requested pushing back the I/O. 623 */ 624 spin_lock_irqsave(&md->deferred_lock, flags); 625 if (__noflush_suspending(md)) 626 bio_list_add_head(&md->deferred, io->bio); 627 else 628 /* noflush suspend was interrupted. */ 629 io->error = -EIO; 630 spin_unlock_irqrestore(&md->deferred_lock, flags); 631 } 632 633 io_error = io->error; 634 bio = io->bio; 635 end_io_acct(io); 636 free_io(md, io); 637 638 if (io_error == DM_ENDIO_REQUEUE) 639 return; 640 641 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { 642 /* 643 * Preflush done for flush with data, reissue 644 * without REQ_FLUSH. 645 */ 646 bio->bi_rw &= ~REQ_FLUSH; 647 queue_io(md, bio); 648 } else { 649 /* done with normal IO or empty flush */ 650 trace_block_bio_complete(md->queue, bio, io_error); 651 bio_endio(bio, io_error); 652 } 653 } 654} 655 656static void clone_endio(struct bio *bio, int error) 657{ 658 int r = 0; 659 struct dm_target_io *tio = bio->bi_private; 660 struct dm_io *io = tio->io; 661 struct mapped_device *md = tio->io->md; 662 dm_endio_fn endio = tio->ti->type->end_io; 663 664 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 665 error = -EIO; 666 667 if (endio) { 668 r = endio(tio->ti, bio, error, &tio->info); 669 if (r < 0 || r == DM_ENDIO_REQUEUE) 670 /* 671 * error and requeue request are handled 672 * in dec_pending(). 673 */ 674 error = r; 675 else if (r == DM_ENDIO_INCOMPLETE) 676 /* The target will handle the io */ 677 return; 678 else if (r) { 679 DMWARN("unimplemented target endio return value: %d", r); 680 BUG(); 681 } 682 } 683 684 /* 685 * Store md for cleanup instead of tio which is about to get freed. 686 */ 687 bio->bi_private = md->bs; 688 689 free_tio(md, tio); 690 bio_put(bio); 691 dec_pending(io, error); 692} 693 694/* 695 * Partial completion handling for request-based dm 696 */ 697static void end_clone_bio(struct bio *clone, int error) 698{ 699 struct dm_rq_clone_bio_info *info = clone->bi_private; 700 struct dm_rq_target_io *tio = info->tio; 701 struct bio *bio = info->orig; 702 unsigned int nr_bytes = info->orig->bi_size; 703 704 bio_put(clone); 705 706 if (tio->error) 707 /* 708 * An error has already been detected on the request. 709 * Once error occurred, just let clone->end_io() handle 710 * the remainder. 711 */ 712 return; 713 else if (error) { 714 /* 715 * Don't notice the error to the upper layer yet. 716 * The error handling decision is made by the target driver, 717 * when the request is completed. 718 */ 719 tio->error = error; 720 return; 721 } 722 723 /* 724 * I/O for the bio successfully completed. 725 * Notice the data completion to the upper layer. 726 */ 727 728 /* 729 * bios are processed from the head of the list. 730 * So the completing bio should always be rq->bio. 731 * If it's not, something wrong is happening. 732 */ 733 if (tio->orig->bio != bio) 734 DMERR("bio completion is going in the middle of the request"); 735 736 /* 737 * Update the original request. 
738 * Do not use blk_end_request() here, because it may complete 739 * the original request before the clone, and break the ordering. 740 */ 741 blk_update_request(tio->orig, 0, nr_bytes); 742} 743 744/* 745 * Don't touch any member of the md after calling this function because 746 * the md may be freed in dm_put() at the end of this function. 747 * Or do dm_get() before calling this function and dm_put() later. 748 */ 749static void rq_completed(struct mapped_device *md, int rw, int run_queue) 750{ 751 atomic_dec(&md->pending[rw]); 752 753 /* nudge anyone waiting on suspend queue */ 754 if (!md_in_flight(md)) 755 wake_up(&md->wait); 756 757 if (run_queue) 758 blk_run_queue(md->queue); 759 760 /* 761 * dm_put() must be at the end of this function. See the comment above 762 */ 763 dm_put(md); 764} 765 766static void free_rq_clone(struct request *clone) 767{ 768 struct dm_rq_target_io *tio = clone->end_io_data; 769 770 blk_rq_unprep_clone(clone); 771 free_rq_tio(tio); 772} 773 774/* 775 * Complete the clone and the original request. 776 * Must be called without queue lock. 777 */ 778static void dm_end_request(struct request *clone, int error) 779{ 780 int rw = rq_data_dir(clone); 781 struct dm_rq_target_io *tio = clone->end_io_data; 782 struct mapped_device *md = tio->md; 783 struct request *rq = tio->orig; 784 785 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 786 rq->errors = clone->errors; 787 rq->resid_len = clone->resid_len; 788 789 if (rq->sense) 790 /* 791 * We are using the sense buffer of the original 792 * request. 793 * So setting the length of the sense data is enough. 794 */ 795 rq->sense_len = clone->sense_len; 796 } 797 798 free_rq_clone(clone); 799 blk_end_request_all(rq, error); 800 rq_completed(md, rw, true); 801} 802 803static void dm_unprep_request(struct request *rq) 804{ 805 struct request *clone = rq->special; 806 807 rq->special = NULL; 808 rq->cmd_flags &= ~REQ_DONTPREP; 809 810 free_rq_clone(clone); 811} 812 813/* 814 * Requeue the original request of a clone. 
815 */ 816void dm_requeue_unmapped_request(struct request *clone) 817{ 818 int rw = rq_data_dir(clone); 819 struct dm_rq_target_io *tio = clone->end_io_data; 820 struct mapped_device *md = tio->md; 821 struct request *rq = tio->orig; 822 struct request_queue *q = rq->q; 823 unsigned long flags; 824 825 dm_unprep_request(rq); 826 827 spin_lock_irqsave(q->queue_lock, flags); 828 blk_requeue_request(q, rq); 829 spin_unlock_irqrestore(q->queue_lock, flags); 830 831 rq_completed(md, rw, 0); 832} 833EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 834 835static void __stop_queue(struct request_queue *q) 836{ 837 blk_stop_queue(q); 838} 839 840static void stop_queue(struct request_queue *q) 841{ 842 unsigned long flags; 843 844 spin_lock_irqsave(q->queue_lock, flags); 845 __stop_queue(q); 846 spin_unlock_irqrestore(q->queue_lock, flags); 847} 848 849static void __start_queue(struct request_queue *q) 850{ 851 if (blk_queue_stopped(q)) 852 blk_start_queue(q); 853} 854 855static void start_queue(struct request_queue *q) 856{ 857 unsigned long flags; 858 859 spin_lock_irqsave(q->queue_lock, flags); 860 __start_queue(q); 861 spin_unlock_irqrestore(q->queue_lock, flags); 862} 863 864static void dm_done(struct request *clone, int error, bool mapped) 865{ 866 int r = error; 867 struct dm_rq_target_io *tio = clone->end_io_data; 868 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 869 870 if (mapped && rq_end_io) 871 r = rq_end_io(tio->ti, clone, error, &tio->info); 872 873 if (r <= 0) 874 /* The target wants to complete the I/O */ 875 dm_end_request(clone, r); 876 else if (r == DM_ENDIO_INCOMPLETE) 877 /* The target will handle the I/O */ 878 return; 879 else if (r == DM_ENDIO_REQUEUE) 880 /* The target wants to requeue the I/O */ 881 dm_requeue_unmapped_request(clone); 882 else { 883 DMWARN("unimplemented target endio return value: %d", r); 884 BUG(); 885 } 886} 887 888/* 889 * Request completion handler for request-based dm 890 */ 891static void dm_softirq_done(struct request *rq) 892{ 893 bool mapped = true; 894 struct request *clone = rq->completion_data; 895 struct dm_rq_target_io *tio = clone->end_io_data; 896 897 if (rq->cmd_flags & REQ_FAILED) 898 mapped = false; 899 900 dm_done(clone, tio->error, mapped); 901} 902 903/* 904 * Complete the clone and the original request with the error status 905 * through softirq context. 906 */ 907static void dm_complete_request(struct request *clone, int error) 908{ 909 struct dm_rq_target_io *tio = clone->end_io_data; 910 struct request *rq = tio->orig; 911 912 tio->error = error; 913 rq->completion_data = clone; 914 blk_complete_request(rq); 915} 916 917/* 918 * Complete the not-mapped clone and the original request with the error status 919 * through softirq context. 920 * Target's rq_end_io() function isn't called. 921 * This may be used when the target's map_rq() function fails. 922 */ 923void dm_kill_unmapped_request(struct request *clone, int error) 924{ 925 struct dm_rq_target_io *tio = clone->end_io_data; 926 struct request *rq = tio->orig; 927 928 rq->cmd_flags |= REQ_FAILED; 929 dm_complete_request(clone, error); 930} 931EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 932 933/* 934 * Called with the queue lock held 935 */ 936static void end_clone_request(struct request *clone, int error) 937{ 938 /* 939 * For just cleaning up the information of the queue in which 940 * the clone was dispatched. 941 * The clone is *NOT* freed actually here because it is alloced from 942 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 
943 */ 944 __blk_put_request(clone->q, clone); 945 946 /* 947 * Actual request completion is done in a softirq context which doesn't 948 * hold the queue lock. Otherwise, deadlock could occur because: 949 * - another request may be submitted by the upper level driver 950 * of the stacking during the completion 951 * - the submission which requires queue lock may be done 952 * against this queue 953 */ 954 dm_complete_request(clone, error); 955} 956 957/* 958 * Return maximum size of I/O possible at the supplied sector up to the current 959 * target boundary. 960 */ 961static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 962{ 963 sector_t target_offset = dm_target_offset(ti, sector); 964 965 return ti->len - target_offset; 966} 967 968static sector_t max_io_len(sector_t sector, struct dm_target *ti) 969{ 970 sector_t len = max_io_len_target_boundary(sector, ti); 971 972 /* 973 * Does the target need to split even further ? 974 */ 975 if (ti->split_io) { 976 sector_t boundary; 977 sector_t offset = dm_target_offset(ti, sector); 978 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 979 - offset; 980 if (len > boundary) 981 len = boundary; 982 } 983 984 return len; 985} 986 987static void __map_bio(struct dm_target *ti, struct bio *clone, 988 struct dm_target_io *tio) 989{ 990 int r; 991 sector_t sector; 992 struct mapped_device *md; 993 994 clone->bi_end_io = clone_endio; 995 clone->bi_private = tio; 996 997 /* 998 * Map the clone. If r == 0 we don't need to do 999 * anything, the target has assumed ownership of 1000 * this io. 1001 */ 1002 atomic_inc(&tio->io->io_count); 1003 sector = clone->bi_sector; 1004 r = ti->type->map(ti, clone, &tio->info); 1005 if (r == DM_MAPIO_REMAPPED) { 1006 /* the bio has been remapped so dispatch it */ 1007 1008 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1009 tio->io->bio->bi_bdev->bd_dev, sector); 1010 1011 generic_make_request(clone); 1012 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1013 /* error the io and bail out, or requeue it if needed */ 1014 md = tio->io->md; 1015 dec_pending(tio->io, r); 1016 /* 1017 * Store bio_set for cleanup. 1018 */ 1019 clone->bi_private = md->bs; 1020 bio_put(clone); 1021 free_tio(md, tio); 1022 } else if (r) { 1023 DMWARN("unimplemented target map return value: %d", r); 1024 BUG(); 1025 } 1026} 1027 1028struct clone_info { 1029 struct mapped_device *md; 1030 struct dm_table *map; 1031 struct bio *bio; 1032 struct dm_io *io; 1033 sector_t sector; 1034 sector_t sector_count; 1035 unsigned short idx; 1036}; 1037 1038static void dm_bio_destructor(struct bio *bio) 1039{ 1040 struct bio_set *bs = bio->bi_private; 1041 1042 bio_free(bio, bs); 1043} 1044 1045/* 1046 * Creates a little bio that just does part of a bvec. 
1047 */ 1048static struct bio *split_bvec(struct bio *bio, sector_t sector, 1049 unsigned short idx, unsigned int offset, 1050 unsigned int len, struct bio_set *bs) 1051{ 1052 struct bio *clone; 1053 struct bio_vec *bv = bio->bi_io_vec + idx; 1054 1055 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1056 clone->bi_destructor = dm_bio_destructor; 1057 *clone->bi_io_vec = *bv; 1058 1059 clone->bi_sector = sector; 1060 clone->bi_bdev = bio->bi_bdev; 1061 clone->bi_rw = bio->bi_rw; 1062 clone->bi_vcnt = 1; 1063 clone->bi_size = to_bytes(len); 1064 clone->bi_io_vec->bv_offset = offset; 1065 clone->bi_io_vec->bv_len = clone->bi_size; 1066 clone->bi_flags |= 1 << BIO_CLONED; 1067 1068 if (bio_integrity(bio)) { 1069 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1070 bio_integrity_trim(clone, 1071 bio_sector_offset(bio, idx, offset), len); 1072 } 1073 1074 return clone; 1075} 1076 1077/* 1078 * Creates a bio that consists of range of complete bvecs. 1079 */ 1080static struct bio *clone_bio(struct bio *bio, sector_t sector, 1081 unsigned short idx, unsigned short bv_count, 1082 unsigned int len, struct bio_set *bs) 1083{ 1084 struct bio *clone; 1085 1086 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1087 __bio_clone(clone, bio); 1088 clone->bi_destructor = dm_bio_destructor; 1089 clone->bi_sector = sector; 1090 clone->bi_idx = idx; 1091 clone->bi_vcnt = idx + bv_count; 1092 clone->bi_size = to_bytes(len); 1093 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1094 1095 if (bio_integrity(bio)) { 1096 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1097 1098 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1099 bio_integrity_trim(clone, 1100 bio_sector_offset(bio, idx, 0), len); 1101 } 1102 1103 return clone; 1104} 1105 1106static struct dm_target_io *alloc_tio(struct clone_info *ci, 1107 struct dm_target *ti) 1108{ 1109 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1110 1111 tio->io = ci->io; 1112 tio->ti = ti; 1113 memset(&tio->info, 0, sizeof(tio->info)); 1114 1115 return tio; 1116} 1117 1118static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1119 unsigned request_nr, sector_t len) 1120{ 1121 struct dm_target_io *tio = alloc_tio(ci, ti); 1122 struct bio *clone; 1123 1124 tio->info.target_request_nr = request_nr; 1125 1126 /* 1127 * Discard requests require the bio's inline iovecs be initialized. 1128 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1129 * and discard, so no need for concern about wasted bvec allocations. 1130 */ 1131 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); 1132 __bio_clone(clone, ci->bio); 1133 clone->bi_destructor = dm_bio_destructor; 1134 if (len) { 1135 clone->bi_sector = ci->sector; 1136 clone->bi_size = to_bytes(len); 1137 } 1138 1139 __map_bio(ti, clone, tio); 1140} 1141 1142static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1143 unsigned num_requests, sector_t len) 1144{ 1145 unsigned request_nr; 1146 1147 for (request_nr = 0; request_nr < num_requests; request_nr++) 1148 __issue_target_request(ci, ti, request_nr, len); 1149} 1150 1151static int __clone_and_map_empty_flush(struct clone_info *ci) 1152{ 1153 unsigned target_nr = 0; 1154 struct dm_target *ti; 1155 1156 BUG_ON(bio_has_data(ci->bio)); 1157 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1158 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1159 1160 return 0; 1161} 1162 1163/* 1164 * Perform all io with a single clone. 
1165 */ 1166static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1167{ 1168 struct bio *clone, *bio = ci->bio; 1169 struct dm_target_io *tio; 1170 1171 tio = alloc_tio(ci, ti); 1172 clone = clone_bio(bio, ci->sector, ci->idx, 1173 bio->bi_vcnt - ci->idx, ci->sector_count, 1174 ci->md->bs); 1175 __map_bio(ti, clone, tio); 1176 ci->sector_count = 0; 1177} 1178 1179static int __clone_and_map_discard(struct clone_info *ci) 1180{ 1181 struct dm_target *ti; 1182 sector_t len; 1183 1184 do { 1185 ti = dm_table_find_target(ci->map, ci->sector); 1186 if (!dm_target_is_valid(ti)) 1187 return -EIO; 1188 1189 /* 1190 * Even though the device advertised discard support, 1191 * that does not mean every target supports it, and 1192 * reconfiguration might also have changed that since the 1193 * check was performed. 1194 */ 1195 if (!ti->num_discard_requests) 1196 return -EOPNOTSUPP; 1197 1198 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1199 1200 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1201 1202 ci->sector += len; 1203 } while (ci->sector_count -= len); 1204 1205 return 0; 1206} 1207 1208static int __clone_and_map(struct clone_info *ci) 1209{ 1210 struct bio *clone, *bio = ci->bio; 1211 struct dm_target *ti; 1212 sector_t len = 0, max; 1213 struct dm_target_io *tio; 1214 1215 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1216 return __clone_and_map_discard(ci); 1217 1218 ti = dm_table_find_target(ci->map, ci->sector); 1219 if (!dm_target_is_valid(ti)) 1220 return -EIO; 1221 1222 max = max_io_len(ci->sector, ti); 1223 1224 if (ci->sector_count <= max) { 1225 /* 1226 * Optimise for the simple case where we can do all of 1227 * the remaining io with a single clone. 1228 */ 1229 __clone_and_map_simple(ci, ti); 1230 1231 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1232 /* 1233 * There are some bvecs that don't span targets. 1234 * Do as many of these as possible. 1235 */ 1236 int i; 1237 sector_t remaining = max; 1238 sector_t bv_len; 1239 1240 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1241 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1242 1243 if (bv_len > remaining) 1244 break; 1245 1246 remaining -= bv_len; 1247 len += bv_len; 1248 } 1249 1250 tio = alloc_tio(ci, ti); 1251 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1252 ci->md->bs); 1253 __map_bio(ti, clone, tio); 1254 1255 ci->sector += len; 1256 ci->sector_count -= len; 1257 ci->idx = i; 1258 1259 } else { 1260 /* 1261 * Handle a bvec that must be split between two or more targets. 1262 */ 1263 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1264 sector_t remaining = to_sector(bv->bv_len); 1265 unsigned int offset = 0; 1266 1267 do { 1268 if (offset) { 1269 ti = dm_table_find_target(ci->map, ci->sector); 1270 if (!dm_target_is_valid(ti)) 1271 return -EIO; 1272 1273 max = max_io_len(ci->sector, ti); 1274 } 1275 1276 len = min(remaining, max); 1277 1278 tio = alloc_tio(ci, ti); 1279 clone = split_bvec(bio, ci->sector, ci->idx, 1280 bv->bv_offset + offset, len, 1281 ci->md->bs); 1282 1283 __map_bio(ti, clone, tio); 1284 1285 ci->sector += len; 1286 ci->sector_count -= len; 1287 offset += to_bytes(len); 1288 } while (remaining -= len); 1289 1290 ci->idx++; 1291 } 1292 1293 return 0; 1294} 1295 1296/* 1297 * Split the bio into several clones and submit it to targets. 
1298 */ 1299static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1300{ 1301 struct clone_info ci; 1302 int error = 0; 1303 1304 ci.map = dm_get_live_table(md); 1305 if (unlikely(!ci.map)) { 1306 bio_io_error(bio); 1307 return; 1308 } 1309 1310 ci.md = md; 1311 ci.io = alloc_io(md); 1312 ci.io->error = 0; 1313 atomic_set(&ci.io->io_count, 1); 1314 ci.io->bio = bio; 1315 ci.io->md = md; 1316 spin_lock_init(&ci.io->endio_lock); 1317 ci.sector = bio->bi_sector; 1318 ci.idx = bio->bi_idx; 1319 1320 start_io_acct(ci.io); 1321 if (bio->bi_rw & REQ_FLUSH) { 1322 ci.bio = &ci.md->flush_bio; 1323 ci.sector_count = 0; 1324 error = __clone_and_map_empty_flush(&ci); 1325 /* dec_pending submits any data associated with flush */ 1326 } else { 1327 ci.bio = bio; 1328 ci.sector_count = bio_sectors(bio); 1329 while (ci.sector_count && !error) 1330 error = __clone_and_map(&ci); 1331 } 1332 1333 /* drop the extra reference count */ 1334 dec_pending(ci.io, error); 1335 dm_table_put(ci.map); 1336} 1337/*----------------------------------------------------------------- 1338 * CRUD END 1339 *---------------------------------------------------------------*/ 1340 1341static int dm_merge_bvec(struct request_queue *q, 1342 struct bvec_merge_data *bvm, 1343 struct bio_vec *biovec) 1344{ 1345 struct mapped_device *md = q->queuedata; 1346 struct dm_table *map = dm_get_live_table(md); 1347 struct dm_target *ti; 1348 sector_t max_sectors; 1349 int max_size = 0; 1350 1351 if (unlikely(!map)) 1352 goto out; 1353 1354 ti = dm_table_find_target(map, bvm->bi_sector); 1355 if (!dm_target_is_valid(ti)) 1356 goto out_table; 1357 1358 /* 1359 * Find maximum amount of I/O that won't need splitting 1360 */ 1361 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1362 (sector_t) BIO_MAX_SECTORS); 1363 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1364 if (max_size < 0) 1365 max_size = 0; 1366 1367 /* 1368 * merge_bvec_fn() returns number of bytes 1369 * it can accept at this offset 1370 * max is precomputed maximal io size 1371 */ 1372 if (max_size && ti->type->merge) 1373 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1374 /* 1375 * If the target doesn't support merge method and some of the devices 1376 * provided their merge_bvec method (we know this by looking at 1377 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1378 * entries. So always set max_size to 0, and the code below allows 1379 * just one page. 1380 */ 1381 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1382 1383 max_size = 0; 1384 1385out_table: 1386 dm_table_put(map); 1387 1388out: 1389 /* 1390 * Always allow an entire first page 1391 */ 1392 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1393 max_size = biovec->bv_len; 1394 1395 return max_size; 1396} 1397 1398/* 1399 * The request function that just remaps the bio built up by 1400 * dm_merge_bvec. 
1401 */ 1402static void _dm_request(struct request_queue *q, struct bio *bio) 1403{ 1404 int rw = bio_data_dir(bio); 1405 struct mapped_device *md = q->queuedata; 1406 int cpu; 1407 1408 down_read(&md->io_lock); 1409 1410 cpu = part_stat_lock(); 1411 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1412 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1413 part_stat_unlock(); 1414 1415 /* if we're suspended, we have to queue this io for later */ 1416 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1417 up_read(&md->io_lock); 1418 1419 if (bio_rw(bio) != READA) 1420 queue_io(md, bio); 1421 else 1422 bio_io_error(bio); 1423 return; 1424 } 1425 1426 __split_and_process_bio(md, bio); 1427 up_read(&md->io_lock); 1428 return; 1429} 1430 1431static int dm_request_based(struct mapped_device *md) 1432{ 1433 return blk_queue_stackable(md->queue); 1434} 1435 1436static void dm_request(struct request_queue *q, struct bio *bio) 1437{ 1438 struct mapped_device *md = q->queuedata; 1439 1440 if (dm_request_based(md)) 1441 blk_queue_bio(q, bio); 1442 else 1443 _dm_request(q, bio); 1444} 1445 1446void dm_dispatch_request(struct request *rq) 1447{ 1448 int r; 1449 1450 if (blk_queue_io_stat(rq->q)) 1451 rq->cmd_flags |= REQ_IO_STAT; 1452 1453 rq->start_time = jiffies; 1454 r = blk_insert_cloned_request(rq->q, rq); 1455 if (r) 1456 dm_complete_request(rq, r); 1457} 1458EXPORT_SYMBOL_GPL(dm_dispatch_request); 1459 1460static void dm_rq_bio_destructor(struct bio *bio) 1461{ 1462 struct dm_rq_clone_bio_info *info = bio->bi_private; 1463 struct mapped_device *md = info->tio->md; 1464 1465 free_bio_info(info); 1466 bio_free(bio, md->bs); 1467} 1468 1469static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1470 void *data) 1471{ 1472 struct dm_rq_target_io *tio = data; 1473 struct mapped_device *md = tio->md; 1474 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1475 1476 if (!info) 1477 return -ENOMEM; 1478 1479 info->orig = bio_orig; 1480 info->tio = tio; 1481 bio->bi_end_io = end_clone_bio; 1482 bio->bi_private = info; 1483 bio->bi_destructor = dm_rq_bio_destructor; 1484 1485 return 0; 1486} 1487 1488static int setup_clone(struct request *clone, struct request *rq, 1489 struct dm_rq_target_io *tio) 1490{ 1491 int r; 1492 1493 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1494 dm_rq_bio_constructor, tio); 1495 if (r) 1496 return r; 1497 1498 clone->cmd = rq->cmd; 1499 clone->cmd_len = rq->cmd_len; 1500 clone->sense = rq->sense; 1501 clone->buffer = rq->buffer; 1502 clone->end_io = end_clone_request; 1503 clone->end_io_data = tio; 1504 1505 return 0; 1506} 1507 1508static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1509 gfp_t gfp_mask) 1510{ 1511 struct request *clone; 1512 struct dm_rq_target_io *tio; 1513 1514 tio = alloc_rq_tio(md, gfp_mask); 1515 if (!tio) 1516 return NULL; 1517 1518 tio->md = md; 1519 tio->ti = NULL; 1520 tio->orig = rq; 1521 tio->error = 0; 1522 memset(&tio->info, 0, sizeof(tio->info)); 1523 1524 clone = &tio->clone; 1525 if (setup_clone(clone, rq, tio)) { 1526 /* -ENOMEM */ 1527 free_rq_tio(tio); 1528 return NULL; 1529 } 1530 1531 return clone; 1532} 1533 1534/* 1535 * Called with the queue lock held. 
1536 */ 1537static int dm_prep_fn(struct request_queue *q, struct request *rq) 1538{ 1539 struct mapped_device *md = q->queuedata; 1540 struct request *clone; 1541 1542 if (unlikely(rq->special)) { 1543 DMWARN("Already has something in rq->special."); 1544 return BLKPREP_KILL; 1545 } 1546 1547 clone = clone_rq(rq, md, GFP_ATOMIC); 1548 if (!clone) 1549 return BLKPREP_DEFER; 1550 1551 rq->special = clone; 1552 rq->cmd_flags |= REQ_DONTPREP; 1553 1554 return BLKPREP_OK; 1555} 1556 1557/* 1558 * Returns: 1559 * 0 : the request has been processed (not requeued) 1560 * !0 : the request has been requeued 1561 */ 1562static int map_request(struct dm_target *ti, struct request *clone, 1563 struct mapped_device *md) 1564{ 1565 int r, requeued = 0; 1566 struct dm_rq_target_io *tio = clone->end_io_data; 1567 1568 /* 1569 * Hold the md reference here for the in-flight I/O. 1570 * We can't rely on the reference count by device opener, 1571 * because the device may be closed during the request completion 1572 * when all bios are completed. 1573 * See the comment in rq_completed() too. 1574 */ 1575 dm_get(md); 1576 1577 tio->ti = ti; 1578 r = ti->type->map_rq(ti, clone, &tio->info); 1579 switch (r) { 1580 case DM_MAPIO_SUBMITTED: 1581 /* The target has taken the I/O to submit by itself later */ 1582 break; 1583 case DM_MAPIO_REMAPPED: 1584 /* The target has remapped the I/O so dispatch it */ 1585 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1586 blk_rq_pos(tio->orig)); 1587 dm_dispatch_request(clone); 1588 break; 1589 case DM_MAPIO_REQUEUE: 1590 /* The target wants to requeue the I/O */ 1591 dm_requeue_unmapped_request(clone); 1592 requeued = 1; 1593 break; 1594 default: 1595 if (r > 0) { 1596 DMWARN("unimplemented target map return value: %d", r); 1597 BUG(); 1598 } 1599 1600 /* The target wants to complete the I/O */ 1601 dm_kill_unmapped_request(clone, r); 1602 break; 1603 } 1604 1605 return requeued; 1606} 1607 1608/* 1609 * q->request_fn for request-based dm. 1610 * Called with the queue lock held. 1611 */ 1612static void dm_request_fn(struct request_queue *q) 1613{ 1614 struct mapped_device *md = q->queuedata; 1615 struct dm_table *map = dm_get_live_table(md); 1616 struct dm_target *ti; 1617 struct request *rq, *clone; 1618 sector_t pos; 1619 1620 /* 1621 * For suspend, check blk_queue_stopped() and increment 1622 * ->pending within a single queue_lock not to increment the 1623 * number of in-flight I/Os after the queue is stopped in 1624 * dm_suspend(). 
1625 */ 1626 while (!blk_queue_stopped(q)) { 1627 rq = blk_peek_request(q); 1628 if (!rq) 1629 goto delay_and_out; 1630 1631 /* always use block 0 to find the target for flushes for now */ 1632 pos = 0; 1633 if (!(rq->cmd_flags & REQ_FLUSH)) 1634 pos = blk_rq_pos(rq); 1635 1636 ti = dm_table_find_target(map, pos); 1637 BUG_ON(!dm_target_is_valid(ti)); 1638 1639 if (ti->type->busy && ti->type->busy(ti)) 1640 goto delay_and_out; 1641 1642 blk_start_request(rq); 1643 clone = rq->special; 1644 atomic_inc(&md->pending[rq_data_dir(clone)]); 1645 1646 spin_unlock(q->queue_lock); 1647 if (map_request(ti, clone, md)) 1648 goto requeued; 1649 1650 BUG_ON(!irqs_disabled()); 1651 spin_lock(q->queue_lock); 1652 } 1653 1654 goto out; 1655 1656requeued: 1657 BUG_ON(!irqs_disabled()); 1658 spin_lock(q->queue_lock); 1659 1660delay_and_out: 1661 blk_delay_queue(q, HZ / 10); 1662out: 1663 dm_table_put(map); 1664 1665 return; 1666} 1667 1668int dm_underlying_device_busy(struct request_queue *q) 1669{ 1670 return blk_lld_busy(q); 1671} 1672EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1673 1674static int dm_lld_busy(struct request_queue *q) 1675{ 1676 int r; 1677 struct mapped_device *md = q->queuedata; 1678 struct dm_table *map = dm_get_live_table(md); 1679 1680 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1681 r = 1; 1682 else 1683 r = dm_table_any_busy_target(map); 1684 1685 dm_table_put(map); 1686 1687 return r; 1688} 1689 1690static int dm_any_congested(void *congested_data, int bdi_bits) 1691{ 1692 int r = bdi_bits; 1693 struct mapped_device *md = congested_data; 1694 struct dm_table *map; 1695 1696 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1697 map = dm_get_live_table(md); 1698 if (map) { 1699 /* 1700 * Request-based dm cares about only own queue for 1701 * the query about congestion status of request_queue 1702 */ 1703 if (dm_request_based(md)) 1704 r = md->queue->backing_dev_info.state & 1705 bdi_bits; 1706 else 1707 r = dm_table_any_congested(map, bdi_bits); 1708 1709 dm_table_put(map); 1710 } 1711 } 1712 1713 return r; 1714} 1715 1716/*----------------------------------------------------------------- 1717 * An IDR is used to keep track of allocated minor numbers. 1718 *---------------------------------------------------------------*/ 1719static void free_minor(int minor) 1720{ 1721 spin_lock(&_minor_lock); 1722 idr_remove(&_minor_idr, minor); 1723 spin_unlock(&_minor_lock); 1724} 1725 1726/* 1727 * See if the device with a specific minor # is free. 
1728 */ 1729static int specific_minor(int minor) 1730{ 1731 int r, m; 1732 1733 if (minor >= (1 << MINORBITS)) 1734 return -EINVAL; 1735 1736 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1737 if (!r) 1738 return -ENOMEM; 1739 1740 spin_lock(&_minor_lock); 1741 1742 if (idr_find(&_minor_idr, minor)) { 1743 r = -EBUSY; 1744 goto out; 1745 } 1746 1747 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1748 if (r) 1749 goto out; 1750 1751 if (m != minor) { 1752 idr_remove(&_minor_idr, m); 1753 r = -EBUSY; 1754 goto out; 1755 } 1756 1757out: 1758 spin_unlock(&_minor_lock); 1759 return r; 1760} 1761 1762static int next_free_minor(int *minor) 1763{ 1764 int r, m; 1765 1766 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1767 if (!r) 1768 return -ENOMEM; 1769 1770 spin_lock(&_minor_lock); 1771 1772 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1773 if (r) 1774 goto out; 1775 1776 if (m >= (1 << MINORBITS)) { 1777 idr_remove(&_minor_idr, m); 1778 r = -ENOSPC; 1779 goto out; 1780 } 1781 1782 *minor = m; 1783 1784out: 1785 spin_unlock(&_minor_lock); 1786 return r; 1787} 1788 1789static const struct block_device_operations dm_blk_dops; 1790 1791static void dm_wq_work(struct work_struct *work); 1792 1793static void dm_init_md_queue(struct mapped_device *md) 1794{ 1795 /* 1796 * Request-based dm devices cannot be stacked on top of bio-based dm 1797 * devices. The type of this dm device has not been decided yet. 1798 * The type is decided at the first table loading time. 1799 * To prevent problematic device stacking, clear the queue flag 1800 * for request stacking support until then. 1801 * 1802 * This queue is new, so no concurrency on the queue_flags. 1803 */ 1804 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1805 1806 md->queue->queuedata = md; 1807 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1808 md->queue->backing_dev_info.congested_data = md; 1809 blk_queue_make_request(md->queue, dm_request); 1810 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1811 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1812} 1813 1814/* 1815 * Allocate and initialise a blank device with a given minor. 
1816 */ 1817static struct mapped_device *alloc_dev(int minor) 1818{ 1819 int r; 1820 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1821 void *old_md; 1822 1823 if (!md) { 1824 DMWARN("unable to allocate device, out of memory."); 1825 return NULL; 1826 } 1827 1828 if (!try_module_get(THIS_MODULE)) 1829 goto bad_module_get; 1830 1831 /* get a minor number for the dev */ 1832 if (minor == DM_ANY_MINOR) 1833 r = next_free_minor(&minor); 1834 else 1835 r = specific_minor(minor); 1836 if (r < 0) 1837 goto bad_minor; 1838 1839 md->type = DM_TYPE_NONE; 1840 init_rwsem(&md->io_lock); 1841 mutex_init(&md->suspend_lock); 1842 mutex_init(&md->type_lock); 1843 spin_lock_init(&md->deferred_lock); 1844 rwlock_init(&md->map_lock); 1845 atomic_set(&md->holders, 1); 1846 atomic_set(&md->open_count, 0); 1847 atomic_set(&md->event_nr, 0); 1848 atomic_set(&md->uevent_seq, 0); 1849 INIT_LIST_HEAD(&md->uevent_list); 1850 spin_lock_init(&md->uevent_lock); 1851 1852 md->queue = blk_alloc_queue(GFP_KERNEL); 1853 if (!md->queue) 1854 goto bad_queue; 1855 1856 dm_init_md_queue(md); 1857 1858 md->disk = alloc_disk(1); 1859 if (!md->disk) 1860 goto bad_disk; 1861 1862 atomic_set(&md->pending[0], 0); 1863 atomic_set(&md->pending[1], 0); 1864 init_waitqueue_head(&md->wait); 1865 INIT_WORK(&md->work, dm_wq_work); 1866 init_waitqueue_head(&md->eventq); 1867 1868 md->disk->major = _major; 1869 md->disk->first_minor = minor; 1870 md->disk->fops = &dm_blk_dops; 1871 md->disk->queue = md->queue; 1872 md->disk->private_data = md; 1873 sprintf(md->disk->disk_name, "dm-%d", minor); 1874 add_disk(md->disk); 1875 format_dev_t(md->name, MKDEV(_major, minor)); 1876 1877 md->wq = alloc_workqueue("kdmflush", 1878 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1879 if (!md->wq) 1880 goto bad_thread; 1881 1882 md->bdev = bdget_disk(md->disk, 0); 1883 if (!md->bdev) 1884 goto bad_bdev; 1885 1886 bio_init(&md->flush_bio); 1887 md->flush_bio.bi_bdev = md->bdev; 1888 md->flush_bio.bi_rw = WRITE_FLUSH; 1889 1890 /* Populate the mapping, nobody knows we exist yet */ 1891 spin_lock(&_minor_lock); 1892 old_md = idr_replace(&_minor_idr, md, minor); 1893 spin_unlock(&_minor_lock); 1894 1895 BUG_ON(old_md != MINOR_ALLOCED); 1896 1897 return md; 1898 1899bad_bdev: 1900 destroy_workqueue(md->wq); 1901bad_thread: 1902 del_gendisk(md->disk); 1903 put_disk(md->disk); 1904bad_disk: 1905 blk_cleanup_queue(md->queue); 1906bad_queue: 1907 free_minor(minor); 1908bad_minor: 1909 module_put(THIS_MODULE); 1910bad_module_get: 1911 kfree(md); 1912 return NULL; 1913} 1914 1915static void unlock_fs(struct mapped_device *md); 1916 1917static void free_dev(struct mapped_device *md) 1918{ 1919 int minor = MINOR(disk_devt(md->disk)); 1920 1921 unlock_fs(md); 1922 bdput(md->bdev); 1923 destroy_workqueue(md->wq); 1924 if (md->tio_pool) 1925 mempool_destroy(md->tio_pool); 1926 if (md->io_pool) 1927 mempool_destroy(md->io_pool); 1928 if (md->bs) 1929 bioset_free(md->bs); 1930 blk_integrity_unregister(md->disk); 1931 del_gendisk(md->disk); 1932 free_minor(minor); 1933 1934 spin_lock(&_minor_lock); 1935 md->disk->private_data = NULL; 1936 spin_unlock(&_minor_lock); 1937 1938 put_disk(md->disk); 1939 blk_cleanup_queue(md->queue); 1940 module_put(THIS_MODULE); 1941 kfree(md); 1942} 1943 1944static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1945{ 1946 struct dm_md_mempools *p; 1947 1948 if (md->io_pool && md->tio_pool && md->bs) 1949 /* the md already has necessary mempools */ 1950 goto out; 1951 1952 p = dm_table_get_md_mempools(t); 1953 
BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 1954 1955 md->io_pool = p->io_pool; 1956 p->io_pool = NULL; 1957 md->tio_pool = p->tio_pool; 1958 p->tio_pool = NULL; 1959 md->bs = p->bs; 1960 p->bs = NULL; 1961 1962out: 1963 /* mempool bind completed, now no need any mempools in the table */ 1964 dm_table_free_md_mempools(t); 1965} 1966 1967/* 1968 * Bind a table to the device. 1969 */ 1970static void event_callback(void *context) 1971{ 1972 unsigned long flags; 1973 LIST_HEAD(uevents); 1974 struct mapped_device *md = (struct mapped_device *) context; 1975 1976 spin_lock_irqsave(&md->uevent_lock, flags); 1977 list_splice_init(&md->uevent_list, &uevents); 1978 spin_unlock_irqrestore(&md->uevent_lock, flags); 1979 1980 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 1981 1982 atomic_inc(&md->event_nr); 1983 wake_up(&md->eventq); 1984} 1985 1986/* 1987 * Protected by md->suspend_lock obtained by dm_swap_table(). 1988 */ 1989static void __set_size(struct mapped_device *md, sector_t size) 1990{ 1991 set_capacity(md->disk, size); 1992 1993 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1994} 1995 1996/* 1997 * Return 1 if the queue has a compulsory merge_bvec_fn function. 1998 * 1999 * If this function returns 0, then the device is either a non-dm 2000 * device without a merge_bvec_fn, or it is a dm device that is 2001 * able to split any bios it receives that are too big. 2002 */ 2003int dm_queue_merge_is_compulsory(struct request_queue *q) 2004{ 2005 struct mapped_device *dev_md; 2006 2007 if (!q->merge_bvec_fn) 2008 return 0; 2009 2010 if (q->make_request_fn == dm_request) { 2011 dev_md = q->queuedata; 2012 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2013 return 0; 2014 } 2015 2016 return 1; 2017} 2018 2019static int dm_device_merge_is_compulsory(struct dm_target *ti, 2020 struct dm_dev *dev, sector_t start, 2021 sector_t len, void *data) 2022{ 2023 struct block_device *bdev = dev->bdev; 2024 struct request_queue *q = bdev_get_queue(bdev); 2025 2026 return dm_queue_merge_is_compulsory(q); 2027} 2028 2029/* 2030 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2031 * on the properties of the underlying devices. 2032 */ 2033static int dm_table_merge_is_optional(struct dm_table *table) 2034{ 2035 unsigned i = 0; 2036 struct dm_target *ti; 2037 2038 while (i < dm_table_get_num_targets(table)) { 2039 ti = dm_table_get_target(table, i++); 2040 2041 if (ti->type->iterate_devices && 2042 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2043 return 0; 2044 } 2045 2046 return 1; 2047} 2048 2049/* 2050 * Returns old map, which caller must destroy. 2051 */ 2052static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2053 struct queue_limits *limits) 2054{ 2055 struct dm_table *old_map; 2056 struct request_queue *q = md->queue; 2057 sector_t size; 2058 unsigned long flags; 2059 int merge_is_optional; 2060 2061 size = dm_table_get_size(t); 2062 2063 /* 2064 * Wipe any geometry if the size of the table changed. 2065 */ 2066 if (size != get_capacity(md->disk)) 2067 memset(&md->geometry, 0, sizeof(md->geometry)); 2068 2069 __set_size(md, size); 2070 2071 dm_table_event_callback(t, event_callback, md); 2072 2073 /* 2074 * The queue hasn't been stopped yet, if the old table type wasn't 2075 * for request-based during suspension. So stop it to prevent 2076 * I/O mapping before resume. 2077 * This must be done before setting the queue restrictions, 2078 * because request-based dm may be run just after the setting. 
2079 */ 2080 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2081 stop_queue(q); 2082 2083 __bind_mempools(md, t); 2084 2085 merge_is_optional = dm_table_merge_is_optional(t); 2086 2087 write_lock_irqsave(&md->map_lock, flags); 2088 old_map = md->map; 2089 md->map = t; 2090 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2091 2092 dm_table_set_restrictions(t, q, limits); 2093 if (merge_is_optional) 2094 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2095 else 2096 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2097 write_unlock_irqrestore(&md->map_lock, flags); 2098 2099 return old_map; 2100} 2101 2102/* 2103 * Returns unbound table for the caller to free. 2104 */ 2105static struct dm_table *__unbind(struct mapped_device *md) 2106{ 2107 struct dm_table *map = md->map; 2108 unsigned long flags; 2109 2110 if (!map) 2111 return NULL; 2112 2113 dm_table_event_callback(map, NULL, NULL); 2114 write_lock_irqsave(&md->map_lock, flags); 2115 md->map = NULL; 2116 write_unlock_irqrestore(&md->map_lock, flags); 2117 2118 return map; 2119} 2120 2121/* 2122 * Constructor for a new device. 2123 */ 2124int dm_create(int minor, struct mapped_device **result) 2125{ 2126 struct mapped_device *md; 2127 2128 md = alloc_dev(minor); 2129 if (!md) 2130 return -ENXIO; 2131 2132 dm_sysfs_init(md); 2133 2134 *result = md; 2135 return 0; 2136} 2137 2138/* 2139 * Functions to manage md->type. 2140 * All are required to hold md->type_lock. 2141 */ 2142void dm_lock_md_type(struct mapped_device *md) 2143{ 2144 mutex_lock(&md->type_lock); 2145} 2146 2147void dm_unlock_md_type(struct mapped_device *md) 2148{ 2149 mutex_unlock(&md->type_lock); 2150} 2151 2152void dm_set_md_type(struct mapped_device *md, unsigned type) 2153{ 2154 md->type = type; 2155} 2156 2157unsigned dm_get_md_type(struct mapped_device *md) 2158{ 2159 return md->type; 2160} 2161 2162struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2163{ 2164 return md->immutable_target_type; 2165} 2166 2167/* 2168 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
2169 */ 2170static int dm_init_request_based_queue(struct mapped_device *md) 2171{ 2172 struct request_queue *q = NULL; 2173 2174 if (md->queue->elevator) 2175 return 1; 2176 2177 /* Fully initialize the queue */ 2178 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2179 if (!q) 2180 return 0; 2181 2182 md->queue = q; 2183 dm_init_md_queue(md); 2184 blk_queue_softirq_done(md->queue, dm_softirq_done); 2185 blk_queue_prep_rq(md->queue, dm_prep_fn); 2186 blk_queue_lld_busy(md->queue, dm_lld_busy); 2187 2188 elv_register_queue(md->queue); 2189 2190 return 1; 2191} 2192 2193/* 2194 * Setup the DM device's queue based on md's type 2195 */ 2196int dm_setup_md_queue(struct mapped_device *md) 2197{ 2198 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2199 !dm_init_request_based_queue(md)) { 2200 DMWARN("Cannot initialize queue for request-based mapped device"); 2201 return -EINVAL; 2202 } 2203 2204 return 0; 2205} 2206 2207static struct mapped_device *dm_find_md(dev_t dev) 2208{ 2209 struct mapped_device *md; 2210 unsigned minor = MINOR(dev); 2211 2212 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2213 return NULL; 2214 2215 spin_lock(&_minor_lock); 2216 2217 md = idr_find(&_minor_idr, minor); 2218 if (md && (md == MINOR_ALLOCED || 2219 (MINOR(disk_devt(dm_disk(md))) != minor) || 2220 dm_deleting_md(md) || 2221 test_bit(DMF_FREEING, &md->flags))) { 2222 md = NULL; 2223 goto out; 2224 } 2225 2226out: 2227 spin_unlock(&_minor_lock); 2228 2229 return md; 2230} 2231 2232struct mapped_device *dm_get_md(dev_t dev) 2233{ 2234 struct mapped_device *md = dm_find_md(dev); 2235 2236 if (md) 2237 dm_get(md); 2238 2239 return md; 2240} 2241EXPORT_SYMBOL_GPL(dm_get_md); 2242 2243void *dm_get_mdptr(struct mapped_device *md) 2244{ 2245 return md->interface_ptr; 2246} 2247 2248void dm_set_mdptr(struct mapped_device *md, void *ptr) 2249{ 2250 md->interface_ptr = ptr; 2251} 2252 2253void dm_get(struct mapped_device *md) 2254{ 2255 atomic_inc(&md->holders); 2256 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2257} 2258 2259const char *dm_device_name(struct mapped_device *md) 2260{ 2261 return md->name; 2262} 2263EXPORT_SYMBOL_GPL(dm_device_name); 2264 2265static void __dm_destroy(struct mapped_device *md, bool wait) 2266{ 2267 struct dm_table *map; 2268 2269 might_sleep(); 2270 2271 spin_lock(&_minor_lock); 2272 map = dm_get_live_table(md); 2273 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2274 set_bit(DMF_FREEING, &md->flags); 2275 spin_unlock(&_minor_lock); 2276 2277 if (!dm_suspended_md(md)) { 2278 dm_table_presuspend_targets(map); 2279 dm_table_postsuspend_targets(map); 2280 } 2281 2282 /* 2283 * Rare, but there may be I/O requests still going to complete, 2284 * for example. Wait for all references to disappear. 2285 * No one should increment the reference count of the mapped_device, 2286 * after the mapped_device state becomes DMF_FREEING. 2287 */ 2288 if (wait) 2289 while (atomic_read(&md->holders)) 2290 msleep(1); 2291 else if (atomic_read(&md->holders)) 2292 DMWARN("%s: Forcibly removing mapped_device still in use! 
(%d users)", 2293 dm_device_name(md), atomic_read(&md->holders)); 2294 2295 dm_sysfs_exit(md); 2296 dm_table_put(map); 2297 dm_table_destroy(__unbind(md)); 2298 free_dev(md); 2299} 2300 2301void dm_destroy(struct mapped_device *md) 2302{ 2303 __dm_destroy(md, true); 2304} 2305 2306void dm_destroy_immediate(struct mapped_device *md) 2307{ 2308 __dm_destroy(md, false); 2309} 2310 2311void dm_put(struct mapped_device *md) 2312{ 2313 atomic_dec(&md->holders); 2314} 2315EXPORT_SYMBOL_GPL(dm_put); 2316 2317static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2318{ 2319 int r = 0; 2320 DECLARE_WAITQUEUE(wait, current); 2321 2322 add_wait_queue(&md->wait, &wait); 2323 2324 while (1) { 2325 set_current_state(interruptible); 2326 2327 if (!md_in_flight(md)) 2328 break; 2329 2330 if (interruptible == TASK_INTERRUPTIBLE && 2331 signal_pending(current)) { 2332 r = -EINTR; 2333 break; 2334 } 2335 2336 io_schedule(); 2337 } 2338 set_current_state(TASK_RUNNING); 2339 2340 remove_wait_queue(&md->wait, &wait); 2341 2342 return r; 2343} 2344 2345/* 2346 * Process the deferred bios 2347 */ 2348static void dm_wq_work(struct work_struct *work) 2349{ 2350 struct mapped_device *md = container_of(work, struct mapped_device, 2351 work); 2352 struct bio *c; 2353 2354 down_read(&md->io_lock); 2355 2356 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2357 spin_lock_irq(&md->deferred_lock); 2358 c = bio_list_pop(&md->deferred); 2359 spin_unlock_irq(&md->deferred_lock); 2360 2361 if (!c) 2362 break; 2363 2364 up_read(&md->io_lock); 2365 2366 if (dm_request_based(md)) 2367 generic_make_request(c); 2368 else 2369 __split_and_process_bio(md, c); 2370 2371 down_read(&md->io_lock); 2372 } 2373 2374 up_read(&md->io_lock); 2375} 2376 2377static void dm_queue_flush(struct mapped_device *md) 2378{ 2379 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2380 smp_mb__after_clear_bit(); 2381 queue_work(md->wq, &md->work); 2382} 2383 2384/* 2385 * Swap in a new table, returning the old one for the caller to destroy. 2386 */ 2387struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2388{ 2389 struct dm_table *map = ERR_PTR(-EINVAL); 2390 struct queue_limits limits; 2391 int r; 2392 2393 mutex_lock(&md->suspend_lock); 2394 2395 /* device must be suspended */ 2396 if (!dm_suspended_md(md)) 2397 goto out; 2398 2399 r = dm_calculate_queue_limits(table, &limits); 2400 if (r) { 2401 map = ERR_PTR(r); 2402 goto out; 2403 } 2404 2405 map = __bind(md, table, &limits); 2406 2407out: 2408 mutex_unlock(&md->suspend_lock); 2409 return map; 2410} 2411 2412/* 2413 * Functions to lock and unlock any filesystem running on the 2414 * device. 2415 */ 2416static int lock_fs(struct mapped_device *md) 2417{ 2418 int r; 2419 2420 WARN_ON(md->frozen_sb); 2421 2422 md->frozen_sb = freeze_bdev(md->bdev); 2423 if (IS_ERR(md->frozen_sb)) { 2424 r = PTR_ERR(md->frozen_sb); 2425 md->frozen_sb = NULL; 2426 return r; 2427 } 2428 2429 set_bit(DMF_FROZEN, &md->flags); 2430 2431 return 0; 2432} 2433 2434static void unlock_fs(struct mapped_device *md) 2435{ 2436 if (!test_bit(DMF_FROZEN, &md->flags)) 2437 return; 2438 2439 thaw_bdev(md->bdev, md->frozen_sb); 2440 md->frozen_sb = NULL; 2441 clear_bit(DMF_FROZEN, &md->flags); 2442} 2443 2444/* 2445 * We need to be able to change a mapping table under a mounted 2446 * filesystem. For example we might want to move some data in 2447 * the background. 
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
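/*
 * Illustrative sketch (assumed caller): the usual bracket around a table
 * swap combines dm_suspend() above with dm_resume() below.  The
 * suspend_flags are the caller's choice: DM_SUSPEND_LOCKFS_FLAG freezes
 * a mounted filesystem via lock_fs(), while DM_SUSPEND_NOFLUSH_FLAG is
 * meant for targets that can queue (push back) I/O instead of flushing
 * it.  Destruction of the old table is omitted here:
 *
 *	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	if (r)
 *		return r;
 *
 *	old_map = dm_swap_table(md, new_map);
 *	if (!IS_ERR(old_map))
 *		r = dm_resume(md);
 */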
int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm queues its deferred I/Os in the request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
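/*
 * Illustrative sketch (assumed caller, with 'int r' and 'md' in scope):
 * the event counter above is meant to be sampled with dm_get_event_nr()
 * before the operation of interest and then handed to dm_wait_event(),
 * which returns wait_event_interruptible()'s result (0, or -ERESTARTSYS
 * if a signal arrived):
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *
 *	r = dm_wait_event(md, ev);
 *	if (r)
 *		return r;
 */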
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c,
 * so use this check to verify that the kobj is embedded in an md structure.
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = bioset_create(pool_size, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto free_bioset_and_out;

	return pools;

free_bioset_and_out:
	bioset_free(pools->bs);

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
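/*
 * Illustrative sketch (assumed caller): dm_alloc_md_mempools() and
 * dm_free_md_mempools() above are meant to be paired by the table-loading
 * path, with the type and integrity arguments derived from the table
 * being loaded; the constants below are placeholders, not the real
 * call site:
 *
 *	struct dm_md_mempools *pools;
 *
 *	pools = dm_alloc_md_mempools(DM_TYPE_BIO_BASED, 0);
 *	if (!pools)
 *		return -ENOMEM;
 *
 *	dm_free_md_mempools(pools);
 */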