nvme-core.c revision 53562be74bd06bbe74d2acf3caca5398f8eeb160
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/percpu.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#include <trace/events/block.h>

#define NVME_Q_DEPTH		1024
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define ADMIN_TIMEOUT		(60 * HZ)
#define IOD_TIMEOUT		(4 * NVME_IO_TIMEOUT)

unsigned char io_timeout = 30;
module_param(io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;

static void nvme_reset_failed_dev(struct work_struct *ws);
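
/*
 * Completion state for admin commands issued asynchronously:
 * async_completion() records the CQE result and status here and then
 * queues @work on the kthread @worker instead of waking a sleeping task.
 */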
struct async_cmd_info {
	struct kthread_work work;
	struct kthread_worker *worker;
	u32 result;
	int status;
	void *ctx;
};

/*
 * An NVM Express queue. Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct rcu_head r_head;
	struct device *q_dmadev;
	struct nvme_dev *dev;
	char irqname[24];	/* nvme4294967295-65535\0 */
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	struct list_head iod_bio;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
	u8 q_suspended;
	cpumask_var_t cpu_mask;
	struct async_cmd_info cmdinfo;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	unsigned long timeout;
	int aborted;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

static unsigned nvme_queue_extra(int depth)
{
	return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The function to call on completion
 *
 * Allocate a Command ID for a queue. The data passed in will
 * be passed to the completion handler. This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;
	info[cmdid].aborted = 0;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}
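
/*
 * The CMD_CTX_* values below are stored in info[cmdid].ctx in place of a
 * real context pointer so that special_completion() can tell cancelled,
 * already-completed, invalid and abort completions apart from normal I/O.
 */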
/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
#define CMD_CTX_ABORT		(0x318 + CMD_CTX_BASE)

static void special_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_ABORT) {
		++nvmeq->dev->abort_limit;
		return;
	}
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(nvmeq->q_dmadev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(nvmeq->q_dmadev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}

	dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
}

static void async_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct async_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
}

/*
 * Called with local interrupts disabled and the q_lock held. May not sleep.
 */
static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

	if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) {
		if (fn)
			*fn = special_completion;
		return CMD_CTX_INVALID;
	}
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_COMPLETED;
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return ctx;
}

static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_CANCELLED;
	return ctx;
}

static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
{
	return rcu_dereference_raw(dev->queues[qid]);
}

static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
{
	unsigned queue_id = get_cpu_var(*dev->io_queue);
	rcu_read_lock();
	return rcu_dereference(dev->queues[queue_id]);
}

static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
{
	rcu_read_unlock();
	put_cpu_var(nvmeq->dev->io_queue);
}

static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
							__acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(dev->queues[q_idx]);
}

static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
{
	rcu_read_unlock();
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	if (nvmeq->q_suspended) {
		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
		return -EBUSY;
	}
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}
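
/*
 * An nvme_iod is a single allocation holding the descriptor itself, then
 * nvme_npages() worth of PRP-list pointers, then the scatterlist;
 * iod->offset records where the PRP-list pointer array begins, and
 * iod_list() returns a pointer to it.
 */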

static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}

/*
 * Will slightly overestimate the number of pages needed. This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size)
{
	unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static struct nvme_iod *
nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(nbytes) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod) {
		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
		iod->npages = -1;
		iod->length = nbytes;
		iod->nents = 0;
		iod->first_dma = 0ULL;
		iod->start_time = jiffies;
	}

	return iod;
}

void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = PAGE_SIZE / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}
	kfree(iod);
}

static void nvme_start_io_acct(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	int cpu = part_stat_lock();
	part_round_stats(cpu, &disk->part0);
	part_stat_inc(cpu, &disk->part0, ios[rw]);
	part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
	part_inc_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	unsigned long duration = jiffies - start_time;
	int cpu = part_stat_lock();
	part_stat_add(cpu, &disk->part0, ticks[rw], duration);
	part_round_stats(cpu, &disk->part0);
	part_dec_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct bio *bio = iod->private;
	u16 status = le16_to_cpup(&cqe->status) >> 1;
	int error = 0;

	if (unlikely(status)) {
		if (!(status & NVME_SC_DNR ||
				bio->bi_rw & REQ_FAILFAST_MASK) &&
				(jiffies - iod->start_time) < IOD_TIMEOUT) {
			if (!waitqueue_active(&nvmeq->sq_full))
				add_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
			list_add_tail(&iod->node, &nvmeq->iod_bio);
			wake_up(&nvmeq->sq_full);
			return;
		}
		error = -EIO;
	}
	if (iod->nents) {
		dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
			bio_data_dir(bio) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE); 431 nvme_end_io_acct(bio, iod->start_time); 432 } 433 nvme_free_iod(nvmeq->dev, iod); 434 435 trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); 436 bio_endio(bio, error); 437} 438 439/* length is in bytes. gfp flags indicates whether we may sleep. */ 440int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, 441 gfp_t gfp) 442{ 443 struct dma_pool *pool; 444 int length = total_len; 445 struct scatterlist *sg = iod->sg; 446 int dma_len = sg_dma_len(sg); 447 u64 dma_addr = sg_dma_address(sg); 448 int offset = offset_in_page(dma_addr); 449 __le64 *prp_list; 450 __le64 **list = iod_list(iod); 451 dma_addr_t prp_dma; 452 int nprps, i; 453 454 length -= (PAGE_SIZE - offset); 455 if (length <= 0) 456 return total_len; 457 458 dma_len -= (PAGE_SIZE - offset); 459 if (dma_len) { 460 dma_addr += (PAGE_SIZE - offset); 461 } else { 462 sg = sg_next(sg); 463 dma_addr = sg_dma_address(sg); 464 dma_len = sg_dma_len(sg); 465 } 466 467 if (length <= PAGE_SIZE) { 468 iod->first_dma = dma_addr; 469 return total_len; 470 } 471 472 nprps = DIV_ROUND_UP(length, PAGE_SIZE); 473 if (nprps <= (256 / 8)) { 474 pool = dev->prp_small_pool; 475 iod->npages = 0; 476 } else { 477 pool = dev->prp_page_pool; 478 iod->npages = 1; 479 } 480 481 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 482 if (!prp_list) { 483 iod->first_dma = dma_addr; 484 iod->npages = -1; 485 return (total_len - length) + PAGE_SIZE; 486 } 487 list[0] = prp_list; 488 iod->first_dma = prp_dma; 489 i = 0; 490 for (;;) { 491 if (i == PAGE_SIZE / 8) { 492 __le64 *old_prp_list = prp_list; 493 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 494 if (!prp_list) 495 return total_len - length; 496 list[iod->npages++] = prp_list; 497 prp_list[0] = old_prp_list[i - 1]; 498 old_prp_list[i - 1] = cpu_to_le64(prp_dma); 499 i = 1; 500 } 501 prp_list[i++] = cpu_to_le64(dma_addr); 502 dma_len -= PAGE_SIZE; 503 dma_addr += PAGE_SIZE; 504 length -= PAGE_SIZE; 505 if (length <= 0) 506 break; 507 if (dma_len > 0) 508 continue; 509 BUG_ON(dma_len < 0); 510 sg = sg_next(sg); 511 dma_addr = sg_dma_address(sg); 512 dma_len = sg_dma_len(sg); 513 } 514 515 return total_len; 516} 517 518static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, 519 int len) 520{ 521 struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL); 522 if (!split) 523 return -ENOMEM; 524 525 trace_block_split(bdev_get_queue(bio->bi_bdev), bio, 526 split->bi_iter.bi_sector); 527 bio_chain(split, bio); 528 529 if (!waitqueue_active(&nvmeq->sq_full)) 530 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 531 bio_list_add(&nvmeq->sq_cong, split); 532 bio_list_add(&nvmeq->sq_cong, bio); 533 wake_up(&nvmeq->sq_full); 534 535 return 0; 536} 537 538/* NVMe scatterlists require no holes in the virtual address */ 539#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ 540 (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) 541 542static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, 543 struct bio *bio, enum dma_data_direction dma_dir, int psegs) 544{ 545 struct bio_vec bvec, bvprv; 546 struct bvec_iter iter; 547 struct scatterlist *sg = NULL; 548 int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size; 549 int first = 1; 550 551 if (nvmeq->dev->stripe_size) 552 split_len = nvmeq->dev->stripe_size - 553 ((bio->bi_iter.bi_sector << 9) & 554 (nvmeq->dev->stripe_size - 1)); 555 556 sg_init_table(iod->sg, psegs); 557 bio_for_each_segment(bvec, bio, iter) { 558 if (!first && 
BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) { 559 sg->length += bvec.bv_len; 560 } else { 561 if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec)) 562 return nvme_split_and_submit(bio, nvmeq, 563 length); 564 565 sg = sg ? sg + 1 : iod->sg; 566 sg_set_page(sg, bvec.bv_page, 567 bvec.bv_len, bvec.bv_offset); 568 nsegs++; 569 } 570 571 if (split_len - length < bvec.bv_len) 572 return nvme_split_and_submit(bio, nvmeq, split_len); 573 length += bvec.bv_len; 574 bvprv = bvec; 575 first = 0; 576 } 577 iod->nents = nsegs; 578 sg_mark_end(sg); 579 if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) 580 return -ENOMEM; 581 582 BUG_ON(length != bio->bi_iter.bi_size); 583 return length; 584} 585 586static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, 587 struct bio *bio, struct nvme_iod *iod, int cmdid) 588{ 589 struct nvme_dsm_range *range = 590 (struct nvme_dsm_range *)iod_list(iod)[0]; 591 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 592 593 range->cattr = cpu_to_le32(0); 594 range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); 595 range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); 596 597 memset(cmnd, 0, sizeof(*cmnd)); 598 cmnd->dsm.opcode = nvme_cmd_dsm; 599 cmnd->dsm.command_id = cmdid; 600 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); 601 cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); 602 cmnd->dsm.nr = 0; 603 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 604 605 if (++nvmeq->sq_tail == nvmeq->q_depth) 606 nvmeq->sq_tail = 0; 607 writel(nvmeq->sq_tail, nvmeq->q_db); 608 609 return 0; 610} 611 612static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, 613 int cmdid) 614{ 615 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 616 617 memset(cmnd, 0, sizeof(*cmnd)); 618 cmnd->common.opcode = nvme_cmd_flush; 619 cmnd->common.command_id = cmdid; 620 cmnd->common.nsid = cpu_to_le32(ns->ns_id); 621 622 if (++nvmeq->sq_tail == nvmeq->q_depth) 623 nvmeq->sq_tail = 0; 624 writel(nvmeq->sq_tail, nvmeq->q_db); 625 626 return 0; 627} 628 629static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) 630{ 631 struct bio *bio = iod->private; 632 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; 633 struct nvme_command *cmnd; 634 int cmdid; 635 u16 control; 636 u32 dsmgmt; 637 638 cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); 639 if (unlikely(cmdid < 0)) 640 return cmdid; 641 642 if (bio->bi_rw & REQ_DISCARD) 643 return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); 644 if (bio->bi_rw & REQ_FLUSH) 645 return nvme_submit_flush(nvmeq, ns, cmdid); 646 647 control = 0; 648 if (bio->bi_rw & REQ_FUA) 649 control |= NVME_RW_FUA; 650 if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 651 control |= NVME_RW_LR; 652 653 dsmgmt = 0; 654 if (bio->bi_rw & REQ_RAHEAD) 655 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 656 657 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 658 memset(cmnd, 0, sizeof(*cmnd)); 659 660 cmnd->rw.opcode = bio_data_dir(bio) ? 
nvme_cmd_write : nvme_cmd_read; 661 cmnd->rw.command_id = cmdid; 662 cmnd->rw.nsid = cpu_to_le32(ns->ns_id); 663 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 664 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); 665 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); 666 cmnd->rw.length = 667 cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1); 668 cmnd->rw.control = cpu_to_le16(control); 669 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 670 671 if (++nvmeq->sq_tail == nvmeq->q_depth) 672 nvmeq->sq_tail = 0; 673 writel(nvmeq->sq_tail, nvmeq->q_db); 674 675 return 0; 676} 677 678static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) 679{ 680 struct bio *split = bio_clone(bio, GFP_ATOMIC); 681 if (!split) 682 return -ENOMEM; 683 684 split->bi_iter.bi_size = 0; 685 split->bi_phys_segments = 0; 686 bio->bi_rw &= ~REQ_FLUSH; 687 bio_chain(split, bio); 688 689 if (!waitqueue_active(&nvmeq->sq_full)) 690 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 691 bio_list_add(&nvmeq->sq_cong, split); 692 bio_list_add(&nvmeq->sq_cong, bio); 693 wake_up_process(nvme_thread); 694 695 return 0; 696} 697 698/* 699 * Called with local interrupts disabled and the q_lock held. May not sleep. 700 */ 701static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, 702 struct bio *bio) 703{ 704 struct nvme_iod *iod; 705 int psegs = bio_phys_segments(ns->queue, bio); 706 int result; 707 708 if ((bio->bi_rw & REQ_FLUSH) && psegs) 709 return nvme_split_flush_data(nvmeq, bio); 710 711 iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); 712 if (!iod) 713 return -ENOMEM; 714 715 iod->private = bio; 716 if (bio->bi_rw & REQ_DISCARD) { 717 void *range; 718 /* 719 * We reuse the small pool to allocate the 16-byte range here 720 * as it is not worth having a special pool for these or 721 * additional cases to handle freeing the iod. 722 */ 723 range = dma_pool_alloc(nvmeq->dev->prp_small_pool, 724 GFP_ATOMIC, 725 &iod->first_dma); 726 if (!range) { 727 result = -ENOMEM; 728 goto free_iod; 729 } 730 iod_list(iod)[0] = (__le64 *)range; 731 iod->npages = 0; 732 } else if (psegs) { 733 result = nvme_map_bio(nvmeq, iod, bio, 734 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, 735 psegs); 736 if (result <= 0) 737 goto free_iod; 738 if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != 739 result) { 740 result = -ENOMEM; 741 goto free_iod; 742 } 743 nvme_start_io_acct(bio); 744 } 745 if (unlikely(nvme_submit_iod(nvmeq, iod))) { 746 if (!waitqueue_active(&nvmeq->sq_full)) 747 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 748 list_add_tail(&iod->node, &nvmeq->iod_bio); 749 } 750 return 0; 751 752 free_iod: 753 nvme_free_iod(nvmeq->dev, iod); 754 return result; 755} 756 757static int nvme_process_cq(struct nvme_queue *nvmeq) 758{ 759 u16 head, phase; 760 761 head = nvmeq->cq_head; 762 phase = nvmeq->cq_phase; 763 764 for (;;) { 765 void *ctx; 766 nvme_completion_fn fn; 767 struct nvme_completion cqe = nvmeq->cqes[head]; 768 if ((le16_to_cpu(cqe.status) & 1) != phase) 769 break; 770 nvmeq->sq_head = le16_to_cpu(cqe.sq_head); 771 if (++head == nvmeq->q_depth) { 772 head = 0; 773 phase = !phase; 774 } 775 776 ctx = free_cmdid(nvmeq, cqe.command_id, &fn); 777 fn(nvmeq, ctx, &cqe); 778 } 779 780 /* If the controller ignores the cq head doorbell and continuously 781 * writes to the queue, it is theoretically possible to wrap around 782 * the queue twice and mistakenly return IRQ_NONE. 
Linux only 783 * requires that 0.1% of your interrupts are handled, so this isn't 784 * a big problem. 785 */ 786 if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) 787 return 0; 788 789 writel(head, nvmeq->q_db + nvmeq->dev->db_stride); 790 nvmeq->cq_head = head; 791 nvmeq->cq_phase = phase; 792 793 nvmeq->cqe_seen = 1; 794 return 1; 795} 796 797static void nvme_make_request(struct request_queue *q, struct bio *bio) 798{ 799 struct nvme_ns *ns = q->queuedata; 800 struct nvme_queue *nvmeq = get_nvmeq(ns->dev); 801 int result = -EBUSY; 802 803 if (!nvmeq) { 804 put_nvmeq(NULL); 805 bio_endio(bio, -EIO); 806 return; 807 } 808 809 spin_lock_irq(&nvmeq->q_lock); 810 if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) 811 result = nvme_submit_bio_queue(nvmeq, ns, bio); 812 if (unlikely(result)) { 813 if (!waitqueue_active(&nvmeq->sq_full)) 814 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 815 bio_list_add(&nvmeq->sq_cong, bio); 816 } 817 818 nvme_process_cq(nvmeq); 819 spin_unlock_irq(&nvmeq->q_lock); 820 put_nvmeq(nvmeq); 821} 822 823static irqreturn_t nvme_irq(int irq, void *data) 824{ 825 irqreturn_t result; 826 struct nvme_queue *nvmeq = data; 827 spin_lock(&nvmeq->q_lock); 828 nvme_process_cq(nvmeq); 829 result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; 830 nvmeq->cqe_seen = 0; 831 spin_unlock(&nvmeq->q_lock); 832 return result; 833} 834 835static irqreturn_t nvme_irq_check(int irq, void *data) 836{ 837 struct nvme_queue *nvmeq = data; 838 struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; 839 if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) 840 return IRQ_NONE; 841 return IRQ_WAKE_THREAD; 842} 843 844static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) 845{ 846 spin_lock_irq(&nvmeq->q_lock); 847 cancel_cmdid(nvmeq, cmdid, NULL); 848 spin_unlock_irq(&nvmeq->q_lock); 849} 850 851struct sync_cmd_info { 852 struct task_struct *task; 853 u32 result; 854 int status; 855}; 856 857static void sync_completion(struct nvme_queue *nvmeq, void *ctx, 858 struct nvme_completion *cqe) 859{ 860 struct sync_cmd_info *cmdinfo = ctx; 861 cmdinfo->result = le32_to_cpup(&cqe->result); 862 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; 863 wake_up_process(cmdinfo->task); 864} 865 866/* 867 * Returns 0 on success. 
If the result is negative, it's a Linux error code; 868 * if the result is positive, it's an NVM Express status code 869 */ 870static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, 871 struct nvme_command *cmd, 872 u32 *result, unsigned timeout) 873{ 874 int cmdid, ret; 875 struct sync_cmd_info cmdinfo; 876 struct nvme_queue *nvmeq; 877 878 nvmeq = lock_nvmeq(dev, q_idx); 879 if (!nvmeq) { 880 unlock_nvmeq(nvmeq); 881 return -ENODEV; 882 } 883 884 cmdinfo.task = current; 885 cmdinfo.status = -EINTR; 886 887 cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); 888 if (cmdid < 0) { 889 unlock_nvmeq(nvmeq); 890 return cmdid; 891 } 892 cmd->common.command_id = cmdid; 893 894 set_current_state(TASK_KILLABLE); 895 ret = nvme_submit_cmd(nvmeq, cmd); 896 if (ret) { 897 free_cmdid(nvmeq, cmdid, NULL); 898 unlock_nvmeq(nvmeq); 899 set_current_state(TASK_RUNNING); 900 return ret; 901 } 902 unlock_nvmeq(nvmeq); 903 schedule_timeout(timeout); 904 905 if (cmdinfo.status == -EINTR) { 906 nvmeq = lock_nvmeq(dev, q_idx); 907 if (nvmeq) 908 nvme_abort_command(nvmeq, cmdid); 909 unlock_nvmeq(nvmeq); 910 return -EINTR; 911 } 912 913 if (result) 914 *result = cmdinfo.result; 915 916 return cmdinfo.status; 917} 918 919static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, 920 struct nvme_command *cmd, 921 struct async_cmd_info *cmdinfo, unsigned timeout) 922{ 923 int cmdid; 924 925 cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout); 926 if (cmdid < 0) 927 return cmdid; 928 cmdinfo->status = -EINTR; 929 cmd->common.command_id = cmdid; 930 return nvme_submit_cmd(nvmeq, cmd); 931} 932 933int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, 934 u32 *result) 935{ 936 return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); 937} 938 939int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, 940 u32 *result) 941{ 942 return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result, 943 NVME_IO_TIMEOUT); 944} 945 946static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, 947 struct nvme_command *cmd, struct async_cmd_info *cmdinfo) 948{ 949 return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, 950 ADMIN_TIMEOUT); 951} 952 953static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 954{ 955 int status; 956 struct nvme_command c; 957 958 memset(&c, 0, sizeof(c)); 959 c.delete_queue.opcode = opcode; 960 c.delete_queue.qid = cpu_to_le16(id); 961 962 status = nvme_submit_admin_cmd(dev, &c, NULL); 963 if (status) 964 return -EIO; 965 return 0; 966} 967 968static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 969 struct nvme_queue *nvmeq) 970{ 971 int status; 972 struct nvme_command c; 973 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 974 975 memset(&c, 0, sizeof(c)); 976 c.create_cq.opcode = nvme_admin_create_cq; 977 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); 978 c.create_cq.cqid = cpu_to_le16(qid); 979 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 980 c.create_cq.cq_flags = cpu_to_le16(flags); 981 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); 982 983 status = nvme_submit_admin_cmd(dev, &c, NULL); 984 if (status) 985 return -EIO; 986 return 0; 987} 988 989static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 990 struct nvme_queue *nvmeq) 991{ 992 int status; 993 struct nvme_command c; 994 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; 995 996 memset(&c, 0, sizeof(c)); 997 c.create_sq.opcode = nvme_admin_create_sq; 998 c.create_sq.prp1 = 
cpu_to_le64(nvmeq->sq_dma_addr); 999 c.create_sq.sqid = cpu_to_le16(qid); 1000 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 1001 c.create_sq.sq_flags = cpu_to_le16(flags); 1002 c.create_sq.cqid = cpu_to_le16(qid); 1003 1004 status = nvme_submit_admin_cmd(dev, &c, NULL); 1005 if (status) 1006 return -EIO; 1007 return 0; 1008} 1009 1010static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 1011{ 1012 return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); 1013} 1014 1015static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) 1016{ 1017 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); 1018} 1019 1020int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, 1021 dma_addr_t dma_addr) 1022{ 1023 struct nvme_command c; 1024 1025 memset(&c, 0, sizeof(c)); 1026 c.identify.opcode = nvme_admin_identify; 1027 c.identify.nsid = cpu_to_le32(nsid); 1028 c.identify.prp1 = cpu_to_le64(dma_addr); 1029 c.identify.cns = cpu_to_le32(cns); 1030 1031 return nvme_submit_admin_cmd(dev, &c, NULL); 1032} 1033 1034int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, 1035 dma_addr_t dma_addr, u32 *result) 1036{ 1037 struct nvme_command c; 1038 1039 memset(&c, 0, sizeof(c)); 1040 c.features.opcode = nvme_admin_get_features; 1041 c.features.nsid = cpu_to_le32(nsid); 1042 c.features.prp1 = cpu_to_le64(dma_addr); 1043 c.features.fid = cpu_to_le32(fid); 1044 1045 return nvme_submit_admin_cmd(dev, &c, result); 1046} 1047 1048int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, 1049 dma_addr_t dma_addr, u32 *result) 1050{ 1051 struct nvme_command c; 1052 1053 memset(&c, 0, sizeof(c)); 1054 c.features.opcode = nvme_admin_set_features; 1055 c.features.prp1 = cpu_to_le64(dma_addr); 1056 c.features.fid = cpu_to_le32(fid); 1057 c.features.dword11 = cpu_to_le32(dword11); 1058 1059 return nvme_submit_admin_cmd(dev, &c, result); 1060} 1061 1062/** 1063 * nvme_abort_cmd - Attempt aborting a command 1064 * @cmdid: Command id of a timed out IO 1065 * @queue: The queue with timed out IO 1066 * 1067 * Schedule controller reset if the command was already aborted once before and 1068 * still hasn't been returned to the driver, or if this is the admin queue. 
 */
static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
{
	int a_cmdid;
	struct nvme_command cmd;
	struct nvme_dev *dev = nvmeq->dev;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	struct nvme_queue *adminq;

	if (!nvmeq->qid || info[cmdid].aborted) {
		if (work_busy(&dev->reset_work))
			return;
		list_del_init(&dev->node);
		dev_warn(&dev->pci_dev->dev,
			"I/O %d QID %d timeout, reset controller\n", cmdid,
								nvmeq->qid);
		dev->reset_workfn = nvme_reset_failed_dev;
		queue_work(nvme_workq, &dev->reset_work);
		return;
	}

	if (!dev->abort_limit)
		return;

	adminq = rcu_dereference(dev->queues[0]);
	a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
							ADMIN_TIMEOUT);
	if (a_cmdid < 0)
		return;

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = cmdid;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
	cmd.abort.command_id = a_cmdid;

	--dev->abort_limit;
	info[cmdid].aborted = 1;
	info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;

	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
							nvmeq->qid);
	nvme_submit_cmd(adminq, &cmd);
}

/**
 * nvme_cancel_ios - Cancel outstanding I/Os
 * @nvmeq: The queue to cancel I/Os on
 * @timeout: True to only cancel I/Os which have timed out
 */
static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	unsigned long now = jiffies;
	int cmdid;

	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
		void *ctx;
		nvme_completion_fn fn;
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
		};

		if (timeout && !time_after(now, info[cmdid].timeout))
			continue;
		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
			continue;
		if (timeout && nvmeq->dev->initialized) {
			nvme_abort_cmd(cmdid, nvmeq);
			continue;
		}
		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
								nvmeq->qid);
		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
		fn(nvmeq, ctx, &cqe);
	}
}

static void nvme_free_queue(struct rcu_head *r)
{
	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);

	spin_lock_irq(&nvmeq->q_lock);
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		bio_endio(bio, -EIO);
	}
	while (!list_empty(&nvmeq->iod_bio)) {
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(
				(NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1),
		};
		struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio,
							struct nvme_iod,
							node);
		list_del(&iod->node);
		bio_completion(nvmeq, iod, &cqe);
	}
	spin_unlock_irq(&nvmeq->q_lock);

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	if (nvmeq->qid)
		free_cpumask_var(nvmeq->cpu_mask);
	kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->queue_count - 1; i >= lowest; i--) {
		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
		rcu_assign_pointer(dev->queues[i], NULL);
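		/*
		 * Clear the published pointer first so new lookups via
		 * rcu_dereference() can no longer find this queue, then let
		 * call_rcu() defer the actual free until readers already
		 * inside rcu_read_lock() sections have finished.
		 */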
1186 call_rcu(&nvmeq->r_head, nvme_free_queue); 1187 dev->queue_count--; 1188 } 1189} 1190 1191/** 1192 * nvme_suspend_queue - put queue into suspended state 1193 * @nvmeq - queue to suspend 1194 * 1195 * Returns 1 if already suspended, 0 otherwise. 1196 */ 1197static int nvme_suspend_queue(struct nvme_queue *nvmeq) 1198{ 1199 int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; 1200 1201 spin_lock_irq(&nvmeq->q_lock); 1202 if (nvmeq->q_suspended) { 1203 spin_unlock_irq(&nvmeq->q_lock); 1204 return 1; 1205 } 1206 nvmeq->q_suspended = 1; 1207 nvmeq->dev->online_queues--; 1208 spin_unlock_irq(&nvmeq->q_lock); 1209 1210 irq_set_affinity_hint(vector, NULL); 1211 free_irq(vector, nvmeq); 1212 1213 return 0; 1214} 1215 1216static void nvme_clear_queue(struct nvme_queue *nvmeq) 1217{ 1218 spin_lock_irq(&nvmeq->q_lock); 1219 nvme_process_cq(nvmeq); 1220 nvme_cancel_ios(nvmeq, false); 1221 spin_unlock_irq(&nvmeq->q_lock); 1222} 1223 1224static void nvme_disable_queue(struct nvme_dev *dev, int qid) 1225{ 1226 struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); 1227 1228 if (!nvmeq) 1229 return; 1230 if (nvme_suspend_queue(nvmeq)) 1231 return; 1232 1233 /* Don't tell the adapter to delete the admin queue. 1234 * Don't tell a removed adapter to delete IO queues. */ 1235 if (qid && readl(&dev->bar->csts) != -1) { 1236 adapter_delete_sq(dev, qid); 1237 adapter_delete_cq(dev, qid); 1238 } 1239 nvme_clear_queue(nvmeq); 1240} 1241 1242static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 1243 int depth, int vector) 1244{ 1245 struct device *dmadev = &dev->pci_dev->dev; 1246 unsigned extra = nvme_queue_extra(depth); 1247 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); 1248 if (!nvmeq) 1249 return NULL; 1250 1251 nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), 1252 &nvmeq->cq_dma_addr, GFP_KERNEL); 1253 if (!nvmeq->cqes) 1254 goto free_nvmeq; 1255 memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); 1256 1257 nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), 1258 &nvmeq->sq_dma_addr, GFP_KERNEL); 1259 if (!nvmeq->sq_cmds) 1260 goto free_cqdma; 1261 1262 if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL)) 1263 goto free_sqdma; 1264 1265 nvmeq->q_dmadev = dmadev; 1266 nvmeq->dev = dev; 1267 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", 1268 dev->instance, qid); 1269 spin_lock_init(&nvmeq->q_lock); 1270 nvmeq->cq_head = 0; 1271 nvmeq->cq_phase = 1; 1272 init_waitqueue_head(&nvmeq->sq_full); 1273 init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); 1274 bio_list_init(&nvmeq->sq_cong); 1275 INIT_LIST_HEAD(&nvmeq->iod_bio); 1276 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1277 nvmeq->q_depth = depth; 1278 nvmeq->cq_vector = vector; 1279 nvmeq->qid = qid; 1280 nvmeq->q_suspended = 1; 1281 dev->queue_count++; 1282 rcu_assign_pointer(dev->queues[qid], nvmeq); 1283 1284 return nvmeq; 1285 1286 free_sqdma: 1287 dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds, 1288 nvmeq->sq_dma_addr); 1289 free_cqdma: 1290 dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1291 nvmeq->cq_dma_addr); 1292 free_nvmeq: 1293 kfree(nvmeq); 1294 return NULL; 1295} 1296 1297static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1298 const char *name) 1299{ 1300 if (use_threaded_interrupts) 1301 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, 1302 nvme_irq_check, nvme_irq, IRQF_SHARED, 1303 name, nvmeq); 1304 return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, 1305 
IRQF_SHARED, name, nvmeq); 1306} 1307 1308static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) 1309{ 1310 struct nvme_dev *dev = nvmeq->dev; 1311 unsigned extra = nvme_queue_extra(nvmeq->q_depth); 1312 1313 nvmeq->sq_tail = 0; 1314 nvmeq->cq_head = 0; 1315 nvmeq->cq_phase = 1; 1316 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1317 memset(nvmeq->cmdid_data, 0, extra); 1318 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); 1319 nvme_cancel_ios(nvmeq, false); 1320 nvmeq->q_suspended = 0; 1321 dev->online_queues++; 1322} 1323 1324static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) 1325{ 1326 struct nvme_dev *dev = nvmeq->dev; 1327 int result; 1328 1329 result = adapter_alloc_cq(dev, qid, nvmeq); 1330 if (result < 0) 1331 return result; 1332 1333 result = adapter_alloc_sq(dev, qid, nvmeq); 1334 if (result < 0) 1335 goto release_cq; 1336 1337 result = queue_request_irq(dev, nvmeq, nvmeq->irqname); 1338 if (result < 0) 1339 goto release_sq; 1340 1341 spin_lock_irq(&nvmeq->q_lock); 1342 nvme_init_queue(nvmeq, qid); 1343 spin_unlock_irq(&nvmeq->q_lock); 1344 1345 return result; 1346 1347 release_sq: 1348 adapter_delete_sq(dev, qid); 1349 release_cq: 1350 adapter_delete_cq(dev, qid); 1351 return result; 1352} 1353 1354static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) 1355{ 1356 unsigned long timeout; 1357 u32 bit = enabled ? NVME_CSTS_RDY : 0; 1358 1359 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 1360 1361 while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { 1362 msleep(100); 1363 if (fatal_signal_pending(current)) 1364 return -EINTR; 1365 if (time_after(jiffies, timeout)) { 1366 dev_err(&dev->pci_dev->dev, 1367 "Device not ready; aborting %s\n", enabled ? 1368 "initialisation" : "reset"); 1369 return -ENODEV; 1370 } 1371 } 1372 1373 return 0; 1374} 1375 1376/* 1377 * If the device has been passed off to us in an enabled state, just clear 1378 * the enabled bit. The spec says we should set the 'shutdown notification 1379 * bits', but doing so may cause the device to complete commands to the 1380 * admin queue ... and we don't know what memory that might be pointing at! 
1381 */ 1382static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) 1383{ 1384 u32 cc = readl(&dev->bar->cc); 1385 1386 if (cc & NVME_CC_ENABLE) 1387 writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc); 1388 return nvme_wait_ready(dev, cap, false); 1389} 1390 1391static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) 1392{ 1393 return nvme_wait_ready(dev, cap, true); 1394} 1395 1396static int nvme_shutdown_ctrl(struct nvme_dev *dev) 1397{ 1398 unsigned long timeout; 1399 u32 cc; 1400 1401 cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; 1402 writel(cc, &dev->bar->cc); 1403 1404 timeout = 2 * HZ + jiffies; 1405 while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != 1406 NVME_CSTS_SHST_CMPLT) { 1407 msleep(100); 1408 if (fatal_signal_pending(current)) 1409 return -EINTR; 1410 if (time_after(jiffies, timeout)) { 1411 dev_err(&dev->pci_dev->dev, 1412 "Device shutdown incomplete; abort shutdown\n"); 1413 return -ENODEV; 1414 } 1415 } 1416 1417 return 0; 1418} 1419 1420static int nvme_configure_admin_queue(struct nvme_dev *dev) 1421{ 1422 int result; 1423 u32 aqa; 1424 u64 cap = readq(&dev->bar->cap); 1425 struct nvme_queue *nvmeq; 1426 1427 result = nvme_disable_ctrl(dev, cap); 1428 if (result < 0) 1429 return result; 1430 1431 nvmeq = raw_nvmeq(dev, 0); 1432 if (!nvmeq) { 1433 nvmeq = nvme_alloc_queue(dev, 0, 64, 0); 1434 if (!nvmeq) 1435 return -ENOMEM; 1436 } 1437 1438 aqa = nvmeq->q_depth - 1; 1439 aqa |= aqa << 16; 1440 1441 dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; 1442 dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; 1443 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; 1444 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 1445 1446 writel(aqa, &dev->bar->aqa); 1447 writeq(nvmeq->sq_dma_addr, &dev->bar->asq); 1448 writeq(nvmeq->cq_dma_addr, &dev->bar->acq); 1449 writel(dev->ctrl_config, &dev->bar->cc); 1450 1451 result = nvme_enable_ctrl(dev, cap); 1452 if (result) 1453 return result; 1454 1455 result = queue_request_irq(dev, nvmeq, nvmeq->irqname); 1456 if (result) 1457 return result; 1458 1459 spin_lock_irq(&nvmeq->q_lock); 1460 nvme_init_queue(nvmeq, 0); 1461 spin_unlock_irq(&nvmeq->q_lock); 1462 return result; 1463} 1464 1465struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, 1466 unsigned long addr, unsigned length) 1467{ 1468 int i, err, count, nents, offset; 1469 struct scatterlist *sg; 1470 struct page **pages; 1471 struct nvme_iod *iod; 1472 1473 if (addr & 3) 1474 return ERR_PTR(-EINVAL); 1475 if (!length || length > INT_MAX - PAGE_SIZE) 1476 return ERR_PTR(-EINVAL); 1477 1478 offset = offset_in_page(addr); 1479 count = DIV_ROUND_UP(offset + length, PAGE_SIZE); 1480 pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); 1481 if (!pages) 1482 return ERR_PTR(-ENOMEM); 1483 1484 err = get_user_pages_fast(addr, count, 1, pages); 1485 if (err < count) { 1486 count = err; 1487 err = -EFAULT; 1488 goto put_pages; 1489 } 1490 1491 iod = nvme_alloc_iod(count, length, GFP_KERNEL); 1492 sg = iod->sg; 1493 sg_init_table(sg, count); 1494 for (i = 0; i < count; i++) { 1495 sg_set_page(&sg[i], pages[i], 1496 min_t(unsigned, length, PAGE_SIZE - offset), 1497 offset); 1498 length -= (PAGE_SIZE - offset); 1499 offset = 0; 1500 } 1501 sg_mark_end(&sg[i - 1]); 1502 iod->nents = count; 1503 1504 err = -ENOMEM; 1505 nents = dma_map_sg(&dev->pci_dev->dev, sg, count, 1506 write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); 1507 if (!nents) 1508 goto free_iod; 1509 1510 kfree(pages); 1511 return iod; 1512 1513 free_iod: 1514 kfree(iod); 1515 put_pages: 1516 for (i = 0; i < count; i++) 1517 put_page(pages[i]); 1518 kfree(pages); 1519 return ERR_PTR(err); 1520} 1521 1522void nvme_unmap_user_pages(struct nvme_dev *dev, int write, 1523 struct nvme_iod *iod) 1524{ 1525 int i; 1526 1527 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, 1528 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 1529 1530 for (i = 0; i < iod->nents; i++) 1531 put_page(sg_page(&iod->sg[i])); 1532} 1533 1534static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 1535{ 1536 struct nvme_dev *dev = ns->dev; 1537 struct nvme_user_io io; 1538 struct nvme_command c; 1539 unsigned length, meta_len; 1540 int status, i; 1541 struct nvme_iod *iod, *meta_iod = NULL; 1542 dma_addr_t meta_dma_addr; 1543 void *meta, *uninitialized_var(meta_mem); 1544 1545 if (copy_from_user(&io, uio, sizeof(io))) 1546 return -EFAULT; 1547 length = (io.nblocks + 1) << ns->lba_shift; 1548 meta_len = (io.nblocks + 1) * ns->ms; 1549 1550 if (meta_len && ((io.metadata & 3) || !io.metadata)) 1551 return -EINVAL; 1552 1553 switch (io.opcode) { 1554 case nvme_cmd_write: 1555 case nvme_cmd_read: 1556 case nvme_cmd_compare: 1557 iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); 1558 break; 1559 default: 1560 return -EINVAL; 1561 } 1562 1563 if (IS_ERR(iod)) 1564 return PTR_ERR(iod); 1565 1566 memset(&c, 0, sizeof(c)); 1567 c.rw.opcode = io.opcode; 1568 c.rw.flags = io.flags; 1569 c.rw.nsid = cpu_to_le32(ns->ns_id); 1570 c.rw.slba = cpu_to_le64(io.slba); 1571 c.rw.length = cpu_to_le16(io.nblocks); 1572 c.rw.control = cpu_to_le16(io.control); 1573 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 1574 c.rw.reftag = cpu_to_le32(io.reftag); 1575 c.rw.apptag = cpu_to_le16(io.apptag); 1576 c.rw.appmask = cpu_to_le16(io.appmask); 1577 1578 if (meta_len) { 1579 meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, 1580 meta_len); 1581 if (IS_ERR(meta_iod)) { 1582 status = PTR_ERR(meta_iod); 1583 meta_iod = NULL; 1584 goto unmap; 1585 } 1586 1587 meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len, 1588 &meta_dma_addr, GFP_KERNEL); 1589 if (!meta_mem) { 1590 status = -ENOMEM; 1591 goto unmap; 1592 } 1593 1594 if (io.opcode & 1) { 1595 int meta_offset = 0; 1596 1597 for (i = 0; i < meta_iod->nents; i++) { 1598 meta = kmap_atomic(sg_page(&meta_iod->sg[i])) + 1599 meta_iod->sg[i].offset; 1600 memcpy(meta_mem + meta_offset, meta, 1601 meta_iod->sg[i].length); 1602 kunmap_atomic(meta); 1603 meta_offset += meta_iod->sg[i].length; 1604 } 1605 } 1606 1607 c.rw.metadata = cpu_to_le64(meta_dma_addr); 1608 } 1609 1610 length = nvme_setup_prps(dev, iod, length, GFP_KERNEL); 1611 c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 1612 c.rw.prp2 = cpu_to_le64(iod->first_dma); 1613 1614 if (length != (io.nblocks + 1) << ns->lba_shift) 1615 status = -ENOMEM; 1616 else 1617 status = nvme_submit_io_cmd(dev, &c, NULL); 1618 1619 if (meta_len) { 1620 if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { 1621 int meta_offset = 0; 1622 1623 for (i = 0; i < meta_iod->nents; i++) { 1624 meta = kmap_atomic(sg_page(&meta_iod->sg[i])) + 1625 meta_iod->sg[i].offset; 1626 memcpy(meta, meta_mem + meta_offset, 1627 meta_iod->sg[i].length); 1628 kunmap_atomic(meta); 1629 meta_offset += meta_iod->sg[i].length; 1630 } 1631 } 1632 1633 dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem, 1634 meta_dma_addr); 1635 } 1636 1637 unmap: 1638 
nvme_unmap_user_pages(dev, io.opcode & 1, iod); 1639 nvme_free_iod(dev, iod); 1640 1641 if (meta_iod) { 1642 nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod); 1643 nvme_free_iod(dev, meta_iod); 1644 } 1645 1646 return status; 1647} 1648 1649static int nvme_user_admin_cmd(struct nvme_dev *dev, 1650 struct nvme_admin_cmd __user *ucmd) 1651{ 1652 struct nvme_admin_cmd cmd; 1653 struct nvme_command c; 1654 int status, length; 1655 struct nvme_iod *uninitialized_var(iod); 1656 unsigned timeout; 1657 1658 if (!capable(CAP_SYS_ADMIN)) 1659 return -EACCES; 1660 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1661 return -EFAULT; 1662 1663 memset(&c, 0, sizeof(c)); 1664 c.common.opcode = cmd.opcode; 1665 c.common.flags = cmd.flags; 1666 c.common.nsid = cpu_to_le32(cmd.nsid); 1667 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1668 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1669 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); 1670 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); 1671 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); 1672 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); 1673 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); 1674 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); 1675 1676 length = cmd.data_len; 1677 if (cmd.data_len) { 1678 iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, 1679 length); 1680 if (IS_ERR(iod)) 1681 return PTR_ERR(iod); 1682 length = nvme_setup_prps(dev, iod, length, GFP_KERNEL); 1683 c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 1684 c.common.prp2 = cpu_to_le64(iod->first_dma); 1685 } 1686 1687 timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : 1688 ADMIN_TIMEOUT; 1689 if (length != cmd.data_len) 1690 status = -ENOMEM; 1691 else 1692 status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); 1693 1694 if (cmd.data_len) { 1695 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); 1696 nvme_free_iod(dev, iod); 1697 } 1698 1699 if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result, 1700 sizeof(cmd.result))) 1701 status = -EFAULT; 1702 1703 return status; 1704} 1705 1706static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, 1707 unsigned long arg) 1708{ 1709 struct nvme_ns *ns = bdev->bd_disk->private_data; 1710 1711 switch (cmd) { 1712 case NVME_IOCTL_ID: 1713 force_successful_syscall_return(); 1714 return ns->ns_id; 1715 case NVME_IOCTL_ADMIN_CMD: 1716 return nvme_user_admin_cmd(ns->dev, (void __user *)arg); 1717 case NVME_IOCTL_SUBMIT_IO: 1718 return nvme_submit_io(ns, (void __user *)arg); 1719 case SG_GET_VERSION_NUM: 1720 return nvme_sg_get_version_num((void __user *)arg); 1721 case SG_IO: 1722 return nvme_sg_io(ns, (void __user *)arg); 1723 default: 1724 return -ENOTTY; 1725 } 1726} 1727 1728#ifdef CONFIG_COMPAT 1729static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, 1730 unsigned int cmd, unsigned long arg) 1731{ 1732 struct nvme_ns *ns = bdev->bd_disk->private_data; 1733 1734 switch (cmd) { 1735 case SG_IO: 1736 return nvme_sg_io32(ns, arg); 1737 } 1738 return nvme_ioctl(bdev, mode, cmd, arg); 1739} 1740#else 1741#define nvme_compat_ioctl NULL 1742#endif 1743 1744static int nvme_open(struct block_device *bdev, fmode_t mode) 1745{ 1746 struct nvme_ns *ns = bdev->bd_disk->private_data; 1747 struct nvme_dev *dev = ns->dev; 1748 1749 kref_get(&dev->kref); 1750 return 0; 1751} 1752 1753static void nvme_free_dev(struct kref *kref); 1754 1755static void nvme_release(struct gendisk *disk, fmode_t mode) 1756{ 1757 struct nvme_ns *ns = disk->private_data; 1758 struct nvme_dev *dev = ns->dev; 1759 1760 
kref_put(&dev->kref, nvme_free_dev); 1761} 1762 1763static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) 1764{ 1765 /* some standard values */ 1766 geo->heads = 1 << 6; 1767 geo->sectors = 1 << 5; 1768 geo->cylinders = get_capacity(bd->bd_disk) >> 11; 1769 return 0; 1770} 1771 1772static const struct block_device_operations nvme_fops = { 1773 .owner = THIS_MODULE, 1774 .ioctl = nvme_ioctl, 1775 .compat_ioctl = nvme_compat_ioctl, 1776 .open = nvme_open, 1777 .release = nvme_release, 1778 .getgeo = nvme_getgeo, 1779}; 1780 1781static void nvme_resubmit_iods(struct nvme_queue *nvmeq) 1782{ 1783 struct nvme_iod *iod, *next; 1784 1785 list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) { 1786 if (unlikely(nvme_submit_iod(nvmeq, iod))) 1787 break; 1788 list_del(&iod->node); 1789 if (bio_list_empty(&nvmeq->sq_cong) && 1790 list_empty(&nvmeq->iod_bio)) 1791 remove_wait_queue(&nvmeq->sq_full, 1792 &nvmeq->sq_cong_wait); 1793 } 1794} 1795 1796static void nvme_resubmit_bios(struct nvme_queue *nvmeq) 1797{ 1798 while (bio_list_peek(&nvmeq->sq_cong)) { 1799 struct bio *bio = bio_list_pop(&nvmeq->sq_cong); 1800 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; 1801 1802 if (bio_list_empty(&nvmeq->sq_cong) && 1803 list_empty(&nvmeq->iod_bio)) 1804 remove_wait_queue(&nvmeq->sq_full, 1805 &nvmeq->sq_cong_wait); 1806 if (nvme_submit_bio_queue(nvmeq, ns, bio)) { 1807 if (!waitqueue_active(&nvmeq->sq_full)) 1808 add_wait_queue(&nvmeq->sq_full, 1809 &nvmeq->sq_cong_wait); 1810 bio_list_add_head(&nvmeq->sq_cong, bio); 1811 break; 1812 } 1813 } 1814} 1815 1816static int nvme_kthread(void *data) 1817{ 1818 struct nvme_dev *dev, *next; 1819 1820 while (!kthread_should_stop()) { 1821 set_current_state(TASK_INTERRUPTIBLE); 1822 spin_lock(&dev_list_lock); 1823 list_for_each_entry_safe(dev, next, &dev_list, node) { 1824 int i; 1825 if (readl(&dev->bar->csts) & NVME_CSTS_CFS && 1826 dev->initialized) { 1827 if (work_busy(&dev->reset_work)) 1828 continue; 1829 list_del_init(&dev->node); 1830 dev_warn(&dev->pci_dev->dev, 1831 "Failed status, reset controller\n"); 1832 dev->reset_workfn = nvme_reset_failed_dev; 1833 queue_work(nvme_workq, &dev->reset_work); 1834 continue; 1835 } 1836 rcu_read_lock(); 1837 for (i = 0; i < dev->queue_count; i++) { 1838 struct nvme_queue *nvmeq = 1839 rcu_dereference(dev->queues[i]); 1840 if (!nvmeq) 1841 continue; 1842 spin_lock_irq(&nvmeq->q_lock); 1843 if (nvmeq->q_suspended) 1844 goto unlock; 1845 nvme_process_cq(nvmeq); 1846 nvme_cancel_ios(nvmeq, true); 1847 nvme_resubmit_bios(nvmeq); 1848 nvme_resubmit_iods(nvmeq); 1849 unlock: 1850 spin_unlock_irq(&nvmeq->q_lock); 1851 } 1852 rcu_read_unlock(); 1853 } 1854 spin_unlock(&dev_list_lock); 1855 schedule_timeout(round_jiffies_relative(HZ)); 1856 } 1857 return 0; 1858} 1859 1860static void nvme_config_discard(struct nvme_ns *ns) 1861{ 1862 u32 logical_block_size = queue_logical_block_size(ns->queue); 1863 ns->queue->limits.discard_zeroes_data = 0; 1864 ns->queue->limits.discard_alignment = logical_block_size; 1865 ns->queue->limits.discard_granularity = logical_block_size; 1866 ns->queue->limits.max_discard_sectors = 0xffffffff; 1867 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); 1868} 1869 1870static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, 1871 struct nvme_id_ns *id, struct nvme_lba_range_type *rt) 1872{ 1873 struct nvme_ns *ns; 1874 struct gendisk *disk; 1875 int lbaf; 1876 1877 if (rt->attributes & NVME_LBART_ATTRIB_HIDE) 1878 return NULL; 1879 1880 ns = 
kzalloc(sizeof(*ns), GFP_KERNEL); 1881 if (!ns) 1882 return NULL; 1883 ns->queue = blk_alloc_queue(GFP_KERNEL); 1884 if (!ns->queue) 1885 goto out_free_ns; 1886 ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; 1887 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); 1888 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 1889 blk_queue_make_request(ns->queue, nvme_make_request); 1890 ns->dev = dev; 1891 ns->queue->queuedata = ns; 1892 1893 disk = alloc_disk(0); 1894 if (!disk) 1895 goto out_free_queue; 1896 ns->ns_id = nsid; 1897 ns->disk = disk; 1898 lbaf = id->flbas & 0xf; 1899 ns->lba_shift = id->lbaf[lbaf].ds; 1900 ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); 1901 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1902 if (dev->max_hw_sectors) 1903 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); 1904 if (dev->vwc & NVME_CTRL_VWC_PRESENT) 1905 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); 1906 1907 disk->major = nvme_major; 1908 disk->first_minor = 0; 1909 disk->fops = &nvme_fops; 1910 disk->private_data = ns; 1911 disk->queue = ns->queue; 1912 disk->driverfs_dev = &dev->pci_dev->dev; 1913 disk->flags = GENHD_FL_EXT_DEVT; 1914 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); 1915 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); 1916 1917 if (dev->oncs & NVME_CTRL_ONCS_DSM) 1918 nvme_config_discard(ns); 1919 1920 return ns; 1921 1922 out_free_queue: 1923 blk_cleanup_queue(ns->queue); 1924 out_free_ns: 1925 kfree(ns); 1926 return NULL; 1927} 1928 1929static int nvme_find_closest_node(int node) 1930{ 1931 int n, val, min_val = INT_MAX, best_node = node; 1932 1933 for_each_online_node(n) { 1934 if (n == node) 1935 continue; 1936 val = node_distance(node, n); 1937 if (val < min_val) { 1938 min_val = val; 1939 best_node = n; 1940 } 1941 } 1942 return best_node; 1943} 1944 1945static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq, 1946 int count) 1947{ 1948 int cpu; 1949 for_each_cpu(cpu, qmask) { 1950 if (cpumask_weight(nvmeq->cpu_mask) >= count) 1951 break; 1952 if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask)) 1953 *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid; 1954 } 1955} 1956 1957static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus, 1958 const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue) 1959{ 1960 int next_cpu; 1961 for_each_cpu(next_cpu, new_mask) { 1962 cpumask_or(mask, mask, get_cpu_mask(next_cpu)); 1963 cpumask_or(mask, mask, topology_thread_cpumask(next_cpu)); 1964 cpumask_and(mask, mask, unassigned_cpus); 1965 nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue); 1966 } 1967} 1968 1969static void nvme_create_io_queues(struct nvme_dev *dev) 1970{ 1971 unsigned i, max; 1972 1973 max = min(dev->max_qid, num_online_cpus()); 1974 for (i = dev->queue_count; i <= max; i++) 1975 if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) 1976 break; 1977 1978 max = min(dev->queue_count - 1, num_online_cpus()); 1979 for (i = dev->online_queues; i <= max; i++) 1980 if (nvme_create_queue(raw_nvmeq(dev, i), i)) 1981 break; 1982} 1983 1984/* 1985 * If there are fewer queues than online cpus, this will try to optimally 1986 * assign a queue to multiple cpus by grouping cpus that are "close" together: 1987 * thread siblings, core, socket, closest node, then whatever else is 1988 * available. 
1989 */ 1990static void nvme_assign_io_queues(struct nvme_dev *dev) 1991{ 1992 unsigned cpu, cpus_per_queue, queues, remainder, i; 1993 cpumask_var_t unassigned_cpus; 1994 1995 nvme_create_io_queues(dev); 1996 1997 queues = min(dev->online_queues - 1, num_online_cpus()); 1998 if (!queues) 1999 return; 2000 2001 cpus_per_queue = num_online_cpus() / queues; 2002 remainder = queues - (num_online_cpus() - queues * cpus_per_queue); 2003 2004 if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL)) 2005 return; 2006 2007 cpumask_copy(unassigned_cpus, cpu_online_mask); 2008 cpu = cpumask_first(unassigned_cpus); 2009 for (i = 1; i <= queues; i++) { 2010 struct nvme_queue *nvmeq = lock_nvmeq(dev, i); 2011 cpumask_t mask; 2012 2013 cpumask_clear(nvmeq->cpu_mask); 2014 if (!cpumask_weight(unassigned_cpus)) { 2015 unlock_nvmeq(nvmeq); 2016 break; 2017 } 2018 2019 mask = *get_cpu_mask(cpu); 2020 nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue); 2021 if (cpus_weight(mask) < cpus_per_queue) 2022 nvme_add_cpus(&mask, unassigned_cpus, 2023 topology_thread_cpumask(cpu), 2024 nvmeq, cpus_per_queue); 2025 if (cpus_weight(mask) < cpus_per_queue) 2026 nvme_add_cpus(&mask, unassigned_cpus, 2027 topology_core_cpumask(cpu), 2028 nvmeq, cpus_per_queue); 2029 if (cpus_weight(mask) < cpus_per_queue) 2030 nvme_add_cpus(&mask, unassigned_cpus, 2031 cpumask_of_node(cpu_to_node(cpu)), 2032 nvmeq, cpus_per_queue); 2033 if (cpus_weight(mask) < cpus_per_queue) 2034 nvme_add_cpus(&mask, unassigned_cpus, 2035 cpumask_of_node( 2036 nvme_find_closest_node( 2037 cpu_to_node(cpu))), 2038 nvmeq, cpus_per_queue); 2039 if (cpus_weight(mask) < cpus_per_queue) 2040 nvme_add_cpus(&mask, unassigned_cpus, 2041 unassigned_cpus, 2042 nvmeq, cpus_per_queue); 2043 2044 WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue, 2045 "nvme%d qid:%d mis-matched queue-to-cpu assignment\n", 2046 dev->instance, i); 2047 2048 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, 2049 nvmeq->cpu_mask); 2050 cpumask_andnot(unassigned_cpus, unassigned_cpus, 2051 nvmeq->cpu_mask); 2052 cpu = cpumask_next(cpu, unassigned_cpus); 2053 if (remainder && !--remainder) 2054 cpus_per_queue++; 2055 unlock_nvmeq(nvmeq); 2056 } 2057 WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n", 2058 dev->instance); 2059 i = 0; 2060 cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask); 2061 for_each_cpu(cpu, unassigned_cpus) 2062 *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1; 2063 free_cpumask_var(unassigned_cpus); 2064} 2065 2066static int set_queue_count(struct nvme_dev *dev, int count) 2067{ 2068 int status; 2069 u32 result; 2070 u32 q_count = (count - 1) | ((count - 1) << 16); 2071 2072 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, 2073 &result); 2074 if (status < 0) 2075 return status; 2076 if (status > 0) { 2077 dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", 2078 status); 2079 return -EBUSY; 2080 } 2081 return min(result & 0xffff, result >> 16) + 1; 2082} 2083 2084static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) 2085{ 2086 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); 2087} 2088 2089static int nvme_cpu_notify(struct notifier_block *self, 2090 unsigned long action, void *hcpu) 2091{ 2092 struct nvme_dev *dev = container_of(self, struct nvme_dev, nb); 2093 switch (action) { 2094 case CPU_ONLINE: 2095 case CPU_DEAD: 2096 nvme_assign_io_queues(dev); 2097 break; 2098 } 2099 return NOTIFY_OK; 2100} 2101 2102static int nvme_setup_io_queues(struct nvme_dev *dev) 
2103{ 2104 struct nvme_queue *adminq = raw_nvmeq(dev, 0); 2105 struct pci_dev *pdev = dev->pci_dev; 2106 int result, i, vecs, nr_io_queues, size; 2107 2108 nr_io_queues = num_possible_cpus(); 2109 result = set_queue_count(dev, nr_io_queues); 2110 if (result < 0) 2111 return result; 2112 if (result < nr_io_queues) 2113 nr_io_queues = result; 2114 2115 size = db_bar_size(dev, nr_io_queues); 2116 if (size > 8192) { 2117 iounmap(dev->bar); 2118 do { 2119 dev->bar = ioremap(pci_resource_start(pdev, 0), size); 2120 if (dev->bar) 2121 break; 2122 if (!--nr_io_queues) 2123 return -ENOMEM; 2124 size = db_bar_size(dev, nr_io_queues); 2125 } while (1); 2126 dev->dbs = ((void __iomem *)dev->bar) + 4096; 2127 adminq->q_db = dev->dbs; 2128 } 2129 2130 /* Deregister the admin queue's interrupt */ 2131 free_irq(dev->entry[0].vector, adminq); 2132 2133 for (i = 0; i < nr_io_queues; i++) 2134 dev->entry[i].entry = i; 2135 vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); 2136 if (vecs < 0) { 2137 vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32)); 2138 if (vecs < 0) { 2139 vecs = 1; 2140 } else { 2141 for (i = 0; i < vecs; i++) 2142 dev->entry[i].vector = i + pdev->irq; 2143 } 2144 } 2145 2146 /* 2147 * Should investigate if there's a performance win from allocating 2148 * more queues than interrupt vectors; it might allow the submission 2149 * path to scale better, even if the receive path is limited by the 2150 * number of interrupts. 2151 */ 2152 nr_io_queues = vecs; 2153 dev->max_qid = nr_io_queues; 2154 2155 result = queue_request_irq(dev, adminq, adminq->irqname); 2156 if (result) { 2157 adminq->q_suspended = 1; 2158 goto free_queues; 2159 } 2160 2161 /* Free previously allocated queues that are no longer usable */ 2162 nvme_free_queues(dev, nr_io_queues + 1); 2163 nvme_assign_io_queues(dev); 2164 2165 dev->nb.notifier_call = &nvme_cpu_notify; 2166 result = register_hotcpu_notifier(&dev->nb); 2167 if (result) 2168 goto free_queues; 2169 2170 return 0; 2171 2172 free_queues: 2173 nvme_free_queues(dev, 1); 2174 return result; 2175} 2176 2177/* 2178 * Return: error value if an error occurred setting up the queues or calling 2179 * Identify Device. 0 if these succeeded, even if adding some of the 2180 * namespaces failed. At the moment, these failures are silent. TBD which 2181 * failures should be reported. 
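 * In short: Identify Controller is issued first to learn the namespace
 * count and controller limits, then every namespace id up to that count
 * gets an Identify Namespace and an LBA Range Type feature read, and a
 * gendisk is added for each namespace that is neither zero-capacity nor
 * marked hidden.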
2182 */ 2183static int nvme_dev_add(struct nvme_dev *dev) 2184{ 2185 struct pci_dev *pdev = dev->pci_dev; 2186 int res; 2187 unsigned nn, i; 2188 struct nvme_ns *ns; 2189 struct nvme_id_ctrl *ctrl; 2190 struct nvme_id_ns *id_ns; 2191 void *mem; 2192 dma_addr_t dma_addr; 2193 int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; 2194 2195 mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL); 2196 if (!mem) 2197 return -ENOMEM; 2198 2199 res = nvme_identify(dev, 0, 1, dma_addr); 2200 if (res) { 2201 dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); 2202 res = -EIO; 2203 goto out; 2204 } 2205 2206 ctrl = mem; 2207 nn = le32_to_cpup(&ctrl->nn); 2208 dev->oncs = le16_to_cpup(&ctrl->oncs); 2209 dev->abort_limit = ctrl->acl + 1; 2210 dev->vwc = ctrl->vwc; 2211 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); 2212 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); 2213 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); 2214 if (ctrl->mdts) 2215 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); 2216 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && 2217 (pdev->device == 0x0953) && ctrl->vs[3]) 2218 dev->stripe_size = 1 << (ctrl->vs[3] + shift); 2219 2220 id_ns = mem; 2221 for (i = 1; i <= nn; i++) { 2222 res = nvme_identify(dev, i, 0, dma_addr); 2223 if (res) 2224 continue; 2225 2226 if (id_ns->ncap == 0) 2227 continue; 2228 2229 res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, 2230 dma_addr + 4096, NULL); 2231 if (res) 2232 memset(mem + 4096, 0, 4096); 2233 2234 ns = nvme_alloc_ns(dev, i, mem, mem + 4096); 2235 if (ns) 2236 list_add_tail(&ns->list, &dev->namespaces); 2237 } 2238 list_for_each_entry(ns, &dev->namespaces, list) 2239 add_disk(ns->disk); 2240 res = 0; 2241 2242 out: 2243 dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); 2244 return res; 2245} 2246 2247static int nvme_dev_map(struct nvme_dev *dev) 2248{ 2249 u64 cap; 2250 int bars, result = -ENOMEM; 2251 struct pci_dev *pdev = dev->pci_dev; 2252 2253 if (pci_enable_device_mem(pdev)) 2254 return result; 2255 2256 dev->entry[0].vector = pdev->irq; 2257 pci_set_master(pdev); 2258 bars = pci_select_bars(pdev, IORESOURCE_MEM); 2259 if (pci_request_selected_regions(pdev, bars, "nvme")) 2260 goto disable_pci; 2261 2262 if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) && 2263 dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) 2264 goto disable; 2265 2266 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 2267 if (!dev->bar) 2268 goto disable; 2269 if (readl(&dev->bar->csts) == -1) { 2270 result = -ENODEV; 2271 goto unmap; 2272 } 2273 cap = readq(&dev->bar->cap); 2274 dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); 2275 dev->db_stride = 1 << NVME_CAP_STRIDE(cap); 2276 dev->dbs = ((void __iomem *)dev->bar) + 4096; 2277 2278 return 0; 2279 2280 unmap: 2281 iounmap(dev->bar); 2282 dev->bar = NULL; 2283 disable: 2284 pci_release_regions(pdev); 2285 disable_pci: 2286 pci_disable_device(pdev); 2287 return result; 2288} 2289 2290static void nvme_dev_unmap(struct nvme_dev *dev) 2291{ 2292 if (dev->pci_dev->msi_enabled) 2293 pci_disable_msi(dev->pci_dev); 2294 else if (dev->pci_dev->msix_enabled) 2295 pci_disable_msix(dev->pci_dev); 2296 2297 if (dev->bar) { 2298 iounmap(dev->bar); 2299 dev->bar = NULL; 2300 pci_release_regions(dev->pci_dev); 2301 } 2302 2303 if (pci_is_enabled(dev->pci_dev)) 2304 pci_disable_device(dev->pci_dev); 2305} 2306 2307struct nvme_delq_ctx { 2308 struct task_struct *waiter; 2309 struct kthread_worker *worker; 2310 atomic_t refcount; 2311}; 2312 
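/*
 * IO queue deletion runs asynchronously on a per-device kthread worker
 * (see nvme_disable_io_queues() below): each queue is sent a Delete SQ
 * command, and its completion handler then issues the matching Delete
 * CQ.  The refcount in nvme_delq_ctx counts queues still being torn
 * down; nvme_wait_dq() sleeps until it reaches zero and, on an admin
 * timeout or fatal signal, falls back to disabling the controller and
 * signalling the delete worker to give up.
 */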
2313static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) 2314{ 2315 dq->waiter = current; 2316 mb(); 2317 2318 for (;;) { 2319 set_current_state(TASK_KILLABLE); 2320 if (!atomic_read(&dq->refcount)) 2321 break; 2322 if (!schedule_timeout(ADMIN_TIMEOUT) || 2323 fatal_signal_pending(current)) { 2324 set_current_state(TASK_RUNNING); 2325 2326 nvme_disable_ctrl(dev, readq(&dev->bar->cap)); 2327 nvme_disable_queue(dev, 0); 2328 2329 send_sig(SIGKILL, dq->worker->task, 1); 2330 flush_kthread_worker(dq->worker); 2331 return; 2332 } 2333 } 2334 set_current_state(TASK_RUNNING); 2335} 2336 2337static void nvme_put_dq(struct nvme_delq_ctx *dq) 2338{ 2339 atomic_dec(&dq->refcount); 2340 if (dq->waiter) 2341 wake_up_process(dq->waiter); 2342} 2343 2344static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) 2345{ 2346 atomic_inc(&dq->refcount); 2347 return dq; 2348} 2349 2350static void nvme_del_queue_end(struct nvme_queue *nvmeq) 2351{ 2352 struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; 2353 2354 nvme_clear_queue(nvmeq); 2355 nvme_put_dq(dq); 2356} 2357 2358static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, 2359 kthread_work_func_t fn) 2360{ 2361 struct nvme_command c; 2362 2363 memset(&c, 0, sizeof(c)); 2364 c.delete_queue.opcode = opcode; 2365 c.delete_queue.qid = cpu_to_le16(nvmeq->qid); 2366 2367 init_kthread_work(&nvmeq->cmdinfo.work, fn); 2368 return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo); 2369} 2370 2371static void nvme_del_cq_work_handler(struct kthread_work *work) 2372{ 2373 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2374 cmdinfo.work); 2375 nvme_del_queue_end(nvmeq); 2376} 2377 2378static int nvme_delete_cq(struct nvme_queue *nvmeq) 2379{ 2380 return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, 2381 nvme_del_cq_work_handler); 2382} 2383 2384static void nvme_del_sq_work_handler(struct kthread_work *work) 2385{ 2386 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2387 cmdinfo.work); 2388 int status = nvmeq->cmdinfo.status; 2389 2390 if (!status) 2391 status = nvme_delete_cq(nvmeq); 2392 if (status) 2393 nvme_del_queue_end(nvmeq); 2394} 2395 2396static int nvme_delete_sq(struct nvme_queue *nvmeq) 2397{ 2398 return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, 2399 nvme_del_sq_work_handler); 2400} 2401 2402static void nvme_del_queue_start(struct kthread_work *work) 2403{ 2404 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2405 cmdinfo.work); 2406 allow_signal(SIGKILL); 2407 if (nvme_delete_sq(nvmeq)) 2408 nvme_del_queue_end(nvmeq); 2409} 2410 2411static void nvme_disable_io_queues(struct nvme_dev *dev) 2412{ 2413 int i; 2414 DEFINE_KTHREAD_WORKER_ONSTACK(worker); 2415 struct nvme_delq_ctx dq; 2416 struct task_struct *kworker_task = kthread_run(kthread_worker_fn, 2417 &worker, "nvme%d", dev->instance); 2418 2419 if (IS_ERR(kworker_task)) { 2420 dev_err(&dev->pci_dev->dev, 2421 "Failed to create queue del task\n"); 2422 for (i = dev->queue_count - 1; i > 0; i--) 2423 nvme_disable_queue(dev, i); 2424 return; 2425 } 2426 2427 dq.waiter = NULL; 2428 atomic_set(&dq.refcount, 0); 2429 dq.worker = &worker; 2430 for (i = dev->queue_count - 1; i > 0; i--) { 2431 struct nvme_queue *nvmeq = raw_nvmeq(dev, i); 2432 2433 if (nvme_suspend_queue(nvmeq)) 2434 continue; 2435 nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); 2436 nvmeq->cmdinfo.worker = dq.worker; 2437 init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); 2438 queue_kthread_work(dq.worker, 
&nvmeq->cmdinfo.work); 2439 } 2440 nvme_wait_dq(&dq, dev); 2441 kthread_stop(kworker_task); 2442} 2443 2444/* 2445* Remove the node from the device list and check 2446* for whether or not we need to stop the nvme_thread. 2447*/ 2448static void nvme_dev_list_remove(struct nvme_dev *dev) 2449{ 2450 struct task_struct *tmp = NULL; 2451 2452 spin_lock(&dev_list_lock); 2453 list_del_init(&dev->node); 2454 if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { 2455 tmp = nvme_thread; 2456 nvme_thread = NULL; 2457 } 2458 spin_unlock(&dev_list_lock); 2459 2460 if (tmp) 2461 kthread_stop(tmp); 2462} 2463 2464static void nvme_dev_shutdown(struct nvme_dev *dev) 2465{ 2466 int i; 2467 2468 dev->initialized = 0; 2469 unregister_hotcpu_notifier(&dev->nb); 2470 2471 nvme_dev_list_remove(dev); 2472 2473 if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { 2474 for (i = dev->queue_count - 1; i >= 0; i--) { 2475 struct nvme_queue *nvmeq = raw_nvmeq(dev, i); 2476 nvme_suspend_queue(nvmeq); 2477 nvme_clear_queue(nvmeq); 2478 } 2479 } else { 2480 nvme_disable_io_queues(dev); 2481 nvme_shutdown_ctrl(dev); 2482 nvme_disable_queue(dev, 0); 2483 } 2484 nvme_dev_unmap(dev); 2485} 2486 2487static void nvme_dev_remove(struct nvme_dev *dev) 2488{ 2489 struct nvme_ns *ns; 2490 2491 list_for_each_entry(ns, &dev->namespaces, list) { 2492 if (ns->disk->flags & GENHD_FL_UP) 2493 del_gendisk(ns->disk); 2494 if (!blk_queue_dying(ns->queue)) 2495 blk_cleanup_queue(ns->queue); 2496 } 2497} 2498 2499static int nvme_setup_prp_pools(struct nvme_dev *dev) 2500{ 2501 struct device *dmadev = &dev->pci_dev->dev; 2502 dev->prp_page_pool = dma_pool_create("prp list page", dmadev, 2503 PAGE_SIZE, PAGE_SIZE, 0); 2504 if (!dev->prp_page_pool) 2505 return -ENOMEM; 2506 2507 /* Optimisation for I/Os between 4k and 128k */ 2508 dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, 2509 256, 256, 0); 2510 if (!dev->prp_small_pool) { 2511 dma_pool_destroy(dev->prp_page_pool); 2512 return -ENOMEM; 2513 } 2514 return 0; 2515} 2516 2517static void nvme_release_prp_pools(struct nvme_dev *dev) 2518{ 2519 dma_pool_destroy(dev->prp_page_pool); 2520 dma_pool_destroy(dev->prp_small_pool); 2521} 2522 2523static DEFINE_IDA(nvme_instance_ida); 2524 2525static int nvme_set_instance(struct nvme_dev *dev) 2526{ 2527 int instance, error; 2528 2529 do { 2530 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) 2531 return -ENODEV; 2532 2533 spin_lock(&dev_list_lock); 2534 error = ida_get_new(&nvme_instance_ida, &instance); 2535 spin_unlock(&dev_list_lock); 2536 } while (error == -EAGAIN); 2537 2538 if (error) 2539 return -ENODEV; 2540 2541 dev->instance = instance; 2542 return 0; 2543} 2544 2545static void nvme_release_instance(struct nvme_dev *dev) 2546{ 2547 spin_lock(&dev_list_lock); 2548 ida_remove(&nvme_instance_ida, dev->instance); 2549 spin_unlock(&dev_list_lock); 2550} 2551 2552static void nvme_free_namespaces(struct nvme_dev *dev) 2553{ 2554 struct nvme_ns *ns, *next; 2555 2556 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 2557 list_del(&ns->list); 2558 put_disk(ns->disk); 2559 kfree(ns); 2560 } 2561} 2562 2563static void nvme_free_dev(struct kref *kref) 2564{ 2565 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); 2566 2567 nvme_free_namespaces(dev); 2568 free_percpu(dev->io_queue); 2569 kfree(dev->queues); 2570 kfree(dev->entry); 2571 kfree(dev); 2572} 2573 2574static int nvme_dev_open(struct inode *inode, struct file *f) 2575{ 2576 struct nvme_dev *dev = container_of(f->private_data, struct 
nvme_dev, 2577 miscdev); 2578 kref_get(&dev->kref); 2579 f->private_data = dev; 2580 return 0; 2581} 2582 2583static int nvme_dev_release(struct inode *inode, struct file *f) 2584{ 2585 struct nvme_dev *dev = f->private_data; 2586 kref_put(&dev->kref, nvme_free_dev); 2587 return 0; 2588} 2589 2590static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 2591{ 2592 struct nvme_dev *dev = f->private_data; 2593 switch (cmd) { 2594 case NVME_IOCTL_ADMIN_CMD: 2595 return nvme_user_admin_cmd(dev, (void __user *)arg); 2596 default: 2597 return -ENOTTY; 2598 } 2599} 2600 2601static const struct file_operations nvme_dev_fops = { 2602 .owner = THIS_MODULE, 2603 .open = nvme_dev_open, 2604 .release = nvme_dev_release, 2605 .unlocked_ioctl = nvme_dev_ioctl, 2606 .compat_ioctl = nvme_dev_ioctl, 2607}; 2608 2609static int nvme_dev_start(struct nvme_dev *dev) 2610{ 2611 int result; 2612 bool start_thread = false; 2613 2614 result = nvme_dev_map(dev); 2615 if (result) 2616 return result; 2617 2618 result = nvme_configure_admin_queue(dev); 2619 if (result) 2620 goto unmap; 2621 2622 spin_lock(&dev_list_lock); 2623 if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { 2624 start_thread = true; 2625 nvme_thread = NULL; 2626 } 2627 list_add(&dev->node, &dev_list); 2628 spin_unlock(&dev_list_lock); 2629 2630 if (start_thread) { 2631 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); 2632 wake_up(&nvme_kthread_wait); 2633 } else 2634 wait_event_killable(nvme_kthread_wait, nvme_thread); 2635 2636 if (IS_ERR_OR_NULL(nvme_thread)) { 2637 result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; 2638 goto disable; 2639 } 2640 2641 result = nvme_setup_io_queues(dev); 2642 if (result && result != -EBUSY) 2643 goto disable; 2644 2645 return result; 2646 2647 disable: 2648 nvme_disable_queue(dev, 0); 2649 nvme_dev_list_remove(dev); 2650 unmap: 2651 nvme_dev_unmap(dev); 2652 return result; 2653} 2654 2655static int nvme_remove_dead_ctrl(void *arg) 2656{ 2657 struct nvme_dev *dev = (struct nvme_dev *)arg; 2658 struct pci_dev *pdev = dev->pci_dev; 2659 2660 if (pci_get_drvdata(pdev)) 2661 pci_stop_and_remove_bus_device(pdev); 2662 kref_put(&dev->kref, nvme_free_dev); 2663 return 0; 2664} 2665 2666static void nvme_remove_disks(struct work_struct *ws) 2667{ 2668 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); 2669 2670 nvme_dev_remove(dev); 2671 nvme_free_queues(dev, 1); 2672} 2673 2674static int nvme_dev_resume(struct nvme_dev *dev) 2675{ 2676 int ret; 2677 2678 ret = nvme_dev_start(dev); 2679 if (ret && ret != -EBUSY) 2680 return ret; 2681 if (ret == -EBUSY) { 2682 spin_lock(&dev_list_lock); 2683 dev->reset_workfn = nvme_remove_disks; 2684 queue_work(nvme_workq, &dev->reset_work); 2685 spin_unlock(&dev_list_lock); 2686 } 2687 dev->initialized = 1; 2688 return 0; 2689} 2690 2691static void nvme_dev_reset(struct nvme_dev *dev) 2692{ 2693 nvme_dev_shutdown(dev); 2694 if (nvme_dev_resume(dev)) { 2695 dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); 2696 kref_get(&dev->kref); 2697 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", 2698 dev->instance))) { 2699 dev_err(&dev->pci_dev->dev, 2700 "Failed to start controller remove task\n"); 2701 kref_put(&dev->kref, nvme_free_dev); 2702 } 2703 } 2704} 2705 2706static void nvme_reset_failed_dev(struct work_struct *ws) 2707{ 2708 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); 2709 nvme_dev_reset(dev); 2710} 2711 2712static void nvme_reset_workfn(struct work_struct *work) 2713{ 2714 struct 
nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); 2715 dev->reset_workfn(work); 2716} 2717 2718static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 2719{ 2720 int result = -ENOMEM; 2721 struct nvme_dev *dev; 2722 2723 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2724 if (!dev) 2725 return -ENOMEM; 2726 dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), 2727 GFP_KERNEL); 2728 if (!dev->entry) 2729 goto free; 2730 dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), 2731 GFP_KERNEL); 2732 if (!dev->queues) 2733 goto free; 2734 dev->io_queue = alloc_percpu(unsigned short); 2735 if (!dev->io_queue) 2736 goto free; 2737 2738 INIT_LIST_HEAD(&dev->namespaces); 2739 dev->reset_workfn = nvme_reset_failed_dev; 2740 INIT_WORK(&dev->reset_work, nvme_reset_workfn); 2741 dev->pci_dev = pdev; 2742 pci_set_drvdata(pdev, dev); 2743 result = nvme_set_instance(dev); 2744 if (result) 2745 goto free; 2746 2747 result = nvme_setup_prp_pools(dev); 2748 if (result) 2749 goto release; 2750 2751 kref_init(&dev->kref); 2752 result = nvme_dev_start(dev); 2753 if (result) { 2754 if (result == -EBUSY) 2755 goto create_cdev; 2756 goto release_pools; 2757 } 2758 2759 result = nvme_dev_add(dev); 2760 if (result) 2761 goto shutdown; 2762 2763 create_cdev: 2764 scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); 2765 dev->miscdev.minor = MISC_DYNAMIC_MINOR; 2766 dev->miscdev.parent = &pdev->dev; 2767 dev->miscdev.name = dev->name; 2768 dev->miscdev.fops = &nvme_dev_fops; 2769 result = misc_register(&dev->miscdev); 2770 if (result) 2771 goto remove; 2772 2773 dev->initialized = 1; 2774 return 0; 2775 2776 remove: 2777 nvme_dev_remove(dev); 2778 nvme_free_namespaces(dev); 2779 shutdown: 2780 nvme_dev_shutdown(dev); 2781 release_pools: 2782 nvme_free_queues(dev, 0); 2783 nvme_release_prp_pools(dev); 2784 release: 2785 nvme_release_instance(dev); 2786 free: 2787 free_percpu(dev->io_queue); 2788 kfree(dev->queues); 2789 kfree(dev->entry); 2790 kfree(dev); 2791 return result; 2792} 2793 2794static void nvme_shutdown(struct pci_dev *pdev) 2795{ 2796 struct nvme_dev *dev = pci_get_drvdata(pdev); 2797 nvme_dev_shutdown(dev); 2798} 2799 2800static void nvme_remove(struct pci_dev *pdev) 2801{ 2802 struct nvme_dev *dev = pci_get_drvdata(pdev); 2803 2804 spin_lock(&dev_list_lock); 2805 list_del_init(&dev->node); 2806 spin_unlock(&dev_list_lock); 2807 2808 pci_set_drvdata(pdev, NULL); 2809 flush_work(&dev->reset_work); 2810 misc_deregister(&dev->miscdev); 2811 nvme_dev_remove(dev); 2812 nvme_dev_shutdown(dev); 2813 nvme_free_queues(dev, 0); 2814 rcu_barrier(); 2815 nvme_release_instance(dev); 2816 nvme_release_prp_pools(dev); 2817 kref_put(&dev->kref, nvme_free_dev); 2818} 2819 2820/* These functions are yet to be implemented */ 2821#define nvme_error_detected NULL 2822#define nvme_dump_registers NULL 2823#define nvme_link_reset NULL 2824#define nvme_slot_reset NULL 2825#define nvme_error_resume NULL 2826 2827#ifdef CONFIG_PM_SLEEP 2828static int nvme_suspend(struct device *dev) 2829{ 2830 struct pci_dev *pdev = to_pci_dev(dev); 2831 struct nvme_dev *ndev = pci_get_drvdata(pdev); 2832 2833 nvme_dev_shutdown(ndev); 2834 return 0; 2835} 2836 2837static int nvme_resume(struct device *dev) 2838{ 2839 struct pci_dev *pdev = to_pci_dev(dev); 2840 struct nvme_dev *ndev = pci_get_drvdata(pdev); 2841 2842 if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) { 2843 ndev->reset_workfn = nvme_reset_failed_dev; 2844 queue_work(nvme_workq, &ndev->reset_work); 2845 } 2846 
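	/*
	 * Resume reports success even when the controller could not be
	 * restarted: in that case the reset work queued above retries the
	 * bring-up (or removes the dead controller) asynchronously.
	 */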
return 0; 2847} 2848#endif 2849 2850static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); 2851 2852static const struct pci_error_handlers nvme_err_handler = { 2853 .error_detected = nvme_error_detected, 2854 .mmio_enabled = nvme_dump_registers, 2855 .link_reset = nvme_link_reset, 2856 .slot_reset = nvme_slot_reset, 2857 .resume = nvme_error_resume, 2858}; 2859 2860/* Move to pci_ids.h later */ 2861#define PCI_CLASS_STORAGE_EXPRESS 0x010802 2862 2863static const struct pci_device_id nvme_id_table[] = { 2864 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 2865 { 0, } 2866}; 2867MODULE_DEVICE_TABLE(pci, nvme_id_table); 2868 2869static struct pci_driver nvme_driver = { 2870 .name = "nvme", 2871 .id_table = nvme_id_table, 2872 .probe = nvme_probe, 2873 .remove = nvme_remove, 2874 .shutdown = nvme_shutdown, 2875 .driver = { 2876 .pm = &nvme_dev_pm_ops, 2877 }, 2878 .err_handler = &nvme_err_handler, 2879}; 2880 2881static int __init nvme_init(void) 2882{ 2883 int result; 2884 2885 init_waitqueue_head(&nvme_kthread_wait); 2886 2887 nvme_workq = create_singlethread_workqueue("nvme"); 2888 if (!nvme_workq) 2889 return -ENOMEM; 2890 2891 result = register_blkdev(nvme_major, "nvme"); 2892 if (result < 0) 2893 goto kill_workq; 2894 else if (result > 0) 2895 nvme_major = result; 2896 2897 result = pci_register_driver(&nvme_driver); 2898 if (result) 2899 goto unregister_blkdev; 2900 return 0; 2901 2902 unregister_blkdev: 2903 unregister_blkdev(nvme_major, "nvme"); 2904 kill_workq: 2905 destroy_workqueue(nvme_workq); 2906 return result; 2907} 2908 2909static void __exit nvme_exit(void) 2910{ 2911 pci_unregister_driver(&nvme_driver); 2912 unregister_blkdev(nvme_major, "nvme"); 2913 destroy_workqueue(nvme_workq); 2914 BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); 2915} 2916 2917MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); 2918MODULE_LICENSE("GPL"); 2919MODULE_VERSION("0.9"); 2920module_init(nvme_init); 2921module_exit(nvme_exit); 2922
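/*
 * nvme_id_table above matches on PCI class code 0x010802 (mass storage,
 * non-volatile memory subclass, NVM Express programming interface) with
 * a full 24-bit mask, so the driver binds to any spec-compliant NVMe
 * controller regardless of vendor and device ID.
 */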