nvme-core.c revision b355084a891985d4cd0ca23b1a83366af2c4232d
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/percpu.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define ADMIN_TIMEOUT	(60 * HZ)

unsigned char io_timeout = 30;
module_param(io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;

static void nvme_reset_failed_dev(struct work_struct *ws);

struct async_cmd_info {
	struct kthread_work work;
	struct kthread_worker *worker;
	u32 result;
	int status;
	void *ctx;
};

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct rcu_head r_head;
	struct device *q_dmadev;
	struct nvme_dev *dev;
	char irqname[24];	/* nvme4294967295-65535\0 */
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
	u8 q_suspended;
	cpumask_var_t cpu_mask;
	struct async_cmd_info cmdinfo;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	unsigned long timeout;
	int aborted;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

static unsigned nvme_queue_extra(int depth)
{
	return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The function to call on completion
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;
	info[cmdid].aborted = 0;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}
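/*
 * Illustrative note (added; not in the original source): the command-ID
 * state lives in the flexible cmdid_data[] array at the end of struct
 * nvme_queue: a bitmap of q_depth bits followed by one struct
 * nvme_cmd_info per slot.  nvme_queue_extra() sizes that region and
 * nvme_cmd_info() skips past the bitmap.  For the default NVME_Q_DEPTH
 * of 1024 that is 128 bytes of bitmap plus 1024 info slots.  A typical
 * pairing, as used by the I/O path below:
 *
 *	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
 *	if (cmdid < 0)
 *		return cmdid;			// -EBUSY: all slots in use
 *	...
 *	ctx = free_cmdid(nvmeq, cmdid, &fn);	// from the completion path
 */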
/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)
#define CMD_CTX_ABORT		(0x31C + CMD_CTX_BASE)

static void special_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_FLUSH)
		return;
	if (ctx == CMD_CTX_ABORT) {
		++dev->abort_limit;
		return;
	}
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(&dev->pci_dev->dev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(&dev->pci_dev->dev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}

	dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
}

static void async_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct async_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

	if (cmdid >= nvmeq->q_depth) {
		*fn = special_completion;
		return CMD_CTX_INVALID;
	}
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_COMPLETED;
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return ctx;
}

static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_CANCELLED;
	return ctx;
}

static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
{
	return rcu_dereference_raw(dev->queues[qid]);
}

static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
{
	unsigned queue_id = get_cpu_var(*dev->io_queue);
	rcu_read_lock();
	return rcu_dereference(dev->queues[queue_id]);
}

static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
{
	rcu_read_unlock();
	put_cpu_var(nvmeq->dev->io_queue);
}

static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
							__acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(dev->queues[q_idx]);
}

static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
{
	rcu_read_unlock();
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	if (nvmeq->q_suspended) {
		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
		return -EBUSY;
	}
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}
static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size)
{
	unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static struct nvme_iod *
nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(nbytes) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod) {
		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
		iod->npages = -1;
		iod->length = nbytes;
		iod->nents = 0;
		iod->start_time = jiffies;
	}

	return iod;
}

void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = PAGE_SIZE / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}
	kfree(iod);
}

static void nvme_start_io_acct(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	int cpu = part_stat_lock();
	part_round_stats(cpu, &disk->part0);
	part_stat_inc(cpu, &disk->part0, ios[rw]);
	part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
	part_inc_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	unsigned long duration = jiffies - start_time;
	int cpu = part_stat_lock();
	part_stat_add(cpu, &disk->part0, ticks[rw], duration);
	part_round_stats(cpu, &disk->part0);
	part_dec_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void bio_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct bio *bio = iod->private;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	if (iod->nents) {
		dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		nvme_end_io_acct(bio, iod->start_time);
	}
	nvme_free_iod(dev, iod);
	if (status)
		bio_endio(bio, -EIO);
	else
		bio_endio(bio, 0);
}
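/*
 * Worked example (added for illustration, assuming 4KB pages): for a
 * 256KB transfer, nvme_npages() above computes DIV_ROUND_UP(256K + 4K, 4K)
 * = 65 PRP entries and DIV_ROUND_UP(8 * 65, 4096 - 8) = 1 PRP page.
 * Each PRP page holds PAGE_SIZE / 8 - 1 = 511 data entries; when more
 * pages are needed the last slot chains to the next page, which is the
 * link nvme_free_iod() follows when tearing the list down.
 */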
/* length is in bytes.  gfp flags indicate whether we may sleep. */
int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
			struct nvme_iod *iod, int total_len, gfp_t gfp)
{
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = offset_in_page(dma_addr);
	__le64 *prp_list;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma;
	int nprps, i;

	cmd->prp1 = cpu_to_le64(dma_addr);
	length -= (PAGE_SIZE - offset);
	if (length <= 0)
		return total_len;

	dma_len -= (PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= PAGE_SIZE) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		return total_len;
	}

	nprps = DIV_ROUND_UP(length, PAGE_SIZE);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
	if (!prp_list) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		iod->npages = -1;
		return (total_len - length) + PAGE_SIZE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	cmd->prp2 = cpu_to_le64(prp_dma);
	i = 0;
	for (;;) {
		if (i == PAGE_SIZE / 8) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
			if (!prp_list)
				return total_len - length;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= PAGE_SIZE;
		dma_addr += PAGE_SIZE;
		length -= PAGE_SIZE;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return total_len;
}
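/*
 * Worked example (added for illustration, assuming 4KB pages): a 12KB
 * transfer starting 512 bytes into a page sends the first 3584 bytes via
 * prp1.  The 8704 bytes left exceed one page, so nvme_setup_prps()
 * allocates a PRP list (3 entries, within the 256 / 8 small-pool limit),
 * points prp2 at it, and fills one page-sized entry per iteration of the
 * loop above.
 */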
static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
				 int len)
{
	struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL);
	if (!split)
		return -ENOMEM;

	bio_chain(split, bio);

	if (bio_list_empty(&nvmeq->sq_cong))
		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
	bio_list_add(&nvmeq->sq_cong, split);
	bio_list_add(&nvmeq->sq_cong, bio);

	return 0;
}

/* NVMe scatterlists require no holes in the virtual address */
#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))

static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec bvec, bvprv;
	struct bvec_iter iter;
	struct scatterlist *sg = NULL;
	int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
	int first = 1;

	if (nvmeq->dev->stripe_size)
		split_len = nvmeq->dev->stripe_size -
			((bio->bi_iter.bi_sector << 9) &
			(nvmeq->dev->stripe_size - 1));

	sg_init_table(iod->sg, psegs);
	bio_for_each_segment(bvec, bio, iter) {
		if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
			sg->length += bvec.bv_len;
		} else {
			if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
				return nvme_split_and_submit(bio, nvmeq,
							     length);

			sg = sg ? sg + 1 : iod->sg;
			sg_set_page(sg, bvec.bv_page,
				    bvec.bv_len, bvec.bv_offset);
			nsegs++;
		}

		if (split_len - length < bvec.bv_len)
			return nvme_split_and_submit(bio, nvmeq, split_len);
		length += bvec.bv_len;
		bvprv = bvec;
		first = 0;
	}
	iod->nents = nsegs;
	sg_mark_end(sg);
	if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
		return -ENOMEM;

	BUG_ON(length != bio->bi_iter.bi_size);
	return length;
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct bio *bio, struct nvme_iod *iod, int cmdid)
{
	struct nvme_dsm_range *range;
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
							&iod->first_dma);
	if (!range)
		return -ENOMEM;

	iod_list(iod)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.command_id = cmdid;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								int cmdid)
{
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.command_id = cmdid;
	cmnd->common.nsid = cpu_to_le32(ns->ns_id);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
{
	int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
					special_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		return cmdid;

	return nvme_submit_flush(nvmeq, ns, cmdid);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_command *cmnd;
	struct nvme_iod *iod;
	enum dma_data_direction dma_dir;
	int cmdid, length, result;
	u16 control;
	u32 dsmgmt;
	int psegs = bio_phys_segments(ns->queue, bio);

	if ((bio->bi_rw & REQ_FLUSH) && psegs) {
		result = nvme_submit_flush_data(nvmeq, ns);
		if (result)
			return result;
	}

	result = -ENOMEM;
	iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
	if (!iod)
		goto nomem;
	iod->private = bio;

	result = -EBUSY;
	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		goto free_iod;

	if (bio->bi_rw & REQ_DISCARD) {
		result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
		if (result)
			goto free_cmdid;
		return result;
	}
	if ((bio->bi_rw & REQ_FLUSH) && !psegs)
		return nvme_submit_flush(nvmeq, ns, cmdid);

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
		cmnd->rw.opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->rw.opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
	if (result <= 0)
		goto free_cmdid;
	length = result;

	cmnd->rw.command_id = cmdid;
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
								GFP_ATOMIC);
	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
	cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);

	nvme_start_io_acct(bio);
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;

 free_cmdid:
	free_cmdid(nvmeq, cmdid, NULL);
 free_iod:
	nvme_free_iod(nvmeq->dev, iod);
 nomem:
	return result;
}

static int nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return 0;

	writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
	return 1;
}
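/*
 * Illustrative note (added; not in the original source): the phase bit is
 * how new completions are recognised without a head pointer from the
 * device.  The CQ is zeroed and cq_phase starts at 1; the controller tags
 * first-pass entries with phase 1, and once the head wraps the driver
 * flips cq_phase to 0, so stale first-pass entries fail the
 * "(status & 1) != phase" test in nvme_process_cq() above.
 */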
static void nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
	int result = -EBUSY;

	if (!nvmeq) {
		put_nvmeq(NULL);
		bio_endio(bio, -EIO);
		return;
	}

	spin_lock_irq(&nvmeq->q_lock);
	if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
		result = nvme_submit_bio_queue(nvmeq, ns, bio);
	if (unlikely(result)) {
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add(&nvmeq->sq_cong, bio);
	}

	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	put_nvmeq(nvmeq);
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
	nvmeq->cqe_seen = 0;
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
	spin_lock_irq(&nvmeq->q_lock);
	cancel_cmdid(nvmeq, cmdid, NULL);
	spin_unlock_irq(&nvmeq->q_lock);
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}
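/*
 * Illustrative note (added; not in the original source): synchronous
 * commands park the submitting task rather than using a completion.
 * nvme_submit_sync_cmd() below sets TASK_KILLABLE, submits, and calls
 * schedule_timeout(); sync_completion() above runs from the CQ path and
 * wakes the task.  If cmdinfo.status is still -EINTR afterwards, the
 * command never completed (signal or timeout) and is cancelled.
 */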
/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
						struct nvme_command *cmd,
						u32 *result, unsigned timeout)
{
	int cmdid, ret;
	struct sync_cmd_info cmdinfo;
	struct nvme_queue *nvmeq;

	nvmeq = lock_nvmeq(dev, q_idx);
	if (!nvmeq) {
		unlock_nvmeq(nvmeq);
		return -ENODEV;
	}

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout);
	if (cmdid < 0) {
		unlock_nvmeq(nvmeq);
		return cmdid;
	}
	cmd->common.command_id = cmdid;

	set_current_state(TASK_KILLABLE);
	ret = nvme_submit_cmd(nvmeq, cmd);
	if (ret) {
		free_cmdid(nvmeq, cmdid, NULL);
		unlock_nvmeq(nvmeq);
		set_current_state(TASK_RUNNING);
		return ret;
	}
	unlock_nvmeq(nvmeq);
	schedule_timeout(timeout);

	if (cmdinfo.status == -EINTR) {
		nvmeq = lock_nvmeq(dev, q_idx);
		if (nvmeq)
			nvme_abort_command(nvmeq, cmdid);
		unlock_nvmeq(nvmeq);
		return -EINTR;
	}

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}

static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
			struct nvme_command *cmd,
			struct async_cmd_info *cmdinfo, unsigned timeout)
{
	int cmdid;

	cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout);
	if (cmdid < 0)
		return cmdid;
	cmdinfo->status = -EINTR;
	cmd->common.command_id = cmdid;
	return nvme_submit_cmd(nvmeq, cmd);
}

int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT);
}

int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result,
							NVME_IO_TIMEOUT);
}

static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
		struct nvme_command *cmd, struct async_cmd_info *cmdinfo)
{
	return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo,
								ADMIN_TIMEOUT);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
							dma_addr_t dma_addr)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);
	c.identify.prp1 = cpu_to_le64(dma_addr);
	c.identify.cns = cpu_to_le32(cns);

	return nvme_submit_admin_cmd(dev, &c, NULL);
}

int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(nsid);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);

	return nvme_submit_admin_cmd(dev, &c, result);
}

int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return nvme_submit_admin_cmd(dev, &c, result);
}

/**
 * nvme_abort_cmd - Attempt aborting a command
 * @cmdid: Command id of a timed out IO
 * @queue: The queue with timed out IO
 *
 * Schedule controller reset if the command was already aborted once before and
 * still hasn't been returned to the driver, or if this is the admin queue.
 */
static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
{
	int a_cmdid;
	struct nvme_command cmd;
	struct nvme_dev *dev = nvmeq->dev;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	struct nvme_queue *adminq;

	if (!nvmeq->qid || info[cmdid].aborted) {
		if (work_busy(&dev->reset_work))
			return;
		list_del_init(&dev->node);
		dev_warn(&dev->pci_dev->dev,
			"I/O %d QID %d timeout, reset controller\n", cmdid,
								nvmeq->qid);
		PREPARE_WORK(&dev->reset_work, nvme_reset_failed_dev);
		queue_work(nvme_workq, &dev->reset_work);
		return;
	}

	if (!dev->abort_limit)
		return;

	adminq = rcu_dereference(dev->queues[0]);
	a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
								ADMIN_TIMEOUT);
	if (a_cmdid < 0)
		return;

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = cmdid;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
	cmd.abort.command_id = a_cmdid;

	--dev->abort_limit;
	info[cmdid].aborted = 1;
	info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;

	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
							nvmeq->qid);
	nvme_submit_cmd(adminq, &cmd);
}

/**
 * nvme_cancel_ios - Cancel outstanding I/Os
 * @queue: The queue to cancel I/Os on
 * @timeout: True to only cancel I/Os which have timed out
 */
static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	unsigned long now = jiffies;
	int cmdid;

	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
		void *ctx;
		nvme_completion_fn fn;
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
		};

		if (timeout && !time_after(now, info[cmdid].timeout))
			continue;
		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
			continue;
		if (timeout && nvmeq->dev->initialized) {
			nvme_abort_cmd(cmdid, nvmeq);
			continue;
		}
		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
								nvmeq->qid);
		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}
}

static void nvme_free_queue(struct rcu_head *r)
{
	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);

	spin_lock_irq(&nvmeq->q_lock);
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		bio_endio(bio, -EIO);
	}
	spin_unlock_irq(&nvmeq->q_lock);

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	if (nvmeq->qid)
		free_cpumask_var(nvmeq->cpu_mask);
	kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->queue_count - 1; i >= lowest; i--) {
		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
		rcu_assign_pointer(dev->queues[i], NULL);
		call_rcu(&nvmeq->r_head, nvme_free_queue);
		dev->queue_count--;
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq - queue to suspend
 *
 * Returns 1 if already suspended, 0 otherwise.
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;

	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->q_suspended) {
		spin_unlock_irq(&nvmeq->q_lock);
		return 1;
	}
	nvmeq->q_suspended = 1;
	nvmeq->dev->online_queues--;
	spin_unlock_irq(&nvmeq->q_lock);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	return 0;
}

static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	nvme_cancel_ios(nvmeq, false);
	spin_unlock_irq(&nvmeq->q_lock);
}

static void nvme_disable_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = raw_nvmeq(dev, qid);

	if (!nvmeq)
		return;
	if (nvme_suspend_queue(nvmeq))
		return;

	/* Don't tell the adapter to delete the admin queue.
	 * Don't tell a removed adapter to delete IO queues. */
	if (qid && readl(&dev->bar->csts) != -1) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}
	nvme_clear_queue(nvmeq);
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = nvme_queue_extra(depth);
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL))
		goto free_sqdma;

	nvmeq->q_dmadev = dmadev;
	nvmeq->dev = dev;
	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
			dev->instance, qid);
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;
	nvmeq->qid = qid;
	nvmeq->q_suspended = 1;
	dev->queue_count++;
	rcu_assign_pointer(dev->queues[qid], nvmeq);

	return nvmeq;

 free_sqdma:
	dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds,
							nvmeq->sq_dma_addr);
 free_cqdma:
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq, IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
						IRQF_SHARED, name, nvmeq);
}
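/*
 * Illustrative note (added; not in the original source): doorbells start
 * at BAR offset 4096 (dev->dbs) as an array of 32-bit registers.  For
 * queue qid the submission-queue tail doorbell is dbs[qid * 2 * db_stride]
 * and the completion-queue head doorbell sits db_stride entries after it,
 * where db_stride = 1 << CAP.DSTRD; that is the indexing used by
 * nvme_init_queue() below and nvme_process_cq() above.
 */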
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	unsigned extra = nvme_queue_extra(nvmeq->q_depth);

	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset(nvmeq->cmdid_data, 0, extra);
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	nvme_cancel_ios(nvmeq, false);
	nvmeq->q_suspended = 0;
	dev->online_queues++;
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result < 0)
		goto release_sq;

	spin_lock_irq(&nvmeq->q_lock);
	nvme_init_queue(nvmeq, qid);
	spin_unlock_irq(&nvmeq->q_lock);

	return result;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
	unsigned long timeout;
	u32 bit = enabled ? NVME_CSTS_RDY : 0;

	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

	while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device not ready; aborting initialisation\n");
			return -ENODEV;
		}
	}

	return 0;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
{
	u32 cc = readl(&dev->bar->cc);

	if (cc & NVME_CC_ENABLE)
		writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc);
	return nvme_wait_ready(dev, cap, false);
}

static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
	return nvme_wait_ready(dev, cap, true);
}

static int nvme_shutdown_ctrl(struct nvme_dev *dev)
{
	unsigned long timeout;
	u32 cc;

	cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL;
	writel(cc, &dev->bar->cc);

	timeout = 2 * HZ + jiffies;
	while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
							NVME_CSTS_SHST_CMPLT) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device shutdown incomplete; abort shutdown\n");
			return -ENODEV;
		}
	}

	return 0;
}

static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = readq(&dev->bar->cap);
	struct nvme_queue *nvmeq;

	result = nvme_disable_ctrl(dev, cap);
	if (result < 0)
		return result;

	nvmeq = raw_nvmeq(dev, 0);
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
		if (!nvmeq)
			return -ENOMEM;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	result = nvme_enable_ctrl(dev, cap);
	if (result)
		return result;

	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result)
		return result;

	spin_lock_irq(&nvmeq->q_lock);
	nvme_init_queue(nvmeq, 0);
	spin_unlock_irq(&nvmeq->q_lock);
	return result;
}
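/*
 * Worked example (added for illustration): with 4KB kernel pages the MPS
 * field written above is PAGE_SHIFT - 12 = 0, selecting a 4KB controller
 * page size, and for the 64-entry admin queue AQA ends up as 0x003f003f,
 * since NVMe queue sizes are zero-based (63 in each 16-bit half).
 */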
struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, unsigned length)
{
	int i, err, count, nents, offset;
	struct scatterlist *sg;
	struct page **pages;
	struct nvme_iod *iod;

	if (addr & 3)
		return ERR_PTR(-EINVAL);
	if (!length || length > INT_MAX - PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	offset = offset_in_page(addr);
	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	err = get_user_pages_fast(addr, count, 1, pages);
	if (err < count) {
		count = err;
		err = -EFAULT;
		goto put_pages;
	}

	iod = nvme_alloc_iod(count, length, GFP_KERNEL);
	sg = iod->sg;
	sg_init_table(sg, count);
	for (i = 0; i < count; i++) {
		sg_set_page(&sg[i], pages[i],
			    min_t(unsigned, length, PAGE_SIZE - offset),
			    offset);
		length -= (PAGE_SIZE - offset);
		offset = 0;
	}
	sg_mark_end(&sg[i - 1]);
	iod->nents = count;

	err = -ENOMEM;
	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (!nents)
		goto free_iod;

	kfree(pages);
	return iod;

 free_iod:
	kfree(iod);
 put_pages:
	for (i = 0; i < count; i++)
		put_page(pages[i]);
	kfree(pages);
	return ERR_PTR(err);
}

void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
			struct nvme_iod *iod)
{
	int i;

	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);

	for (i = 0; i < iod->nents; i++)
		put_page(sg_page(&iod->sg[i]));
}

static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	int status, i;
	struct nvme_iod *iod, *meta_iod = NULL;
	dma_addr_t meta_dma_addr;
	void *meta, *uninitialized_var(meta_mem);

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;

	if (meta_len && ((io.metadata & 3) || !io.metadata))
		return -EINVAL;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
		break;
	default:
		return -EINVAL;
	}

	if (IS_ERR(iod))
		return PTR_ERR(iod);

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	if (meta_len) {
		meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata,
								meta_len);
		if (IS_ERR(meta_iod)) {
			status = PTR_ERR(meta_iod);
			meta_iod = NULL;
			goto unmap;
		}

		meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
						&meta_dma_addr, GFP_KERNEL);
		if (!meta_mem) {
			status = -ENOMEM;
			goto unmap;
		}

		if (io.opcode & 1) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta_mem + meta_offset, meta,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		c.rw.metadata = cpu_to_le64(meta_dma_addr);
	}

	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);

	if (length != (io.nblocks + 1) << ns->lba_shift)
		status = -ENOMEM;
	else
		status = nvme_submit_io_cmd(dev, &c, NULL);

	if (meta_len) {
		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta, meta_mem + meta_offset,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem,
								meta_dma_addr);
	}

 unmap:
	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
	nvme_free_iod(dev, iod);

	if (meta_iod) {
		nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod);
		nvme_free_iod(dev, meta_iod);
	}

	return status;
}

static int nvme_user_admin_cmd(struct nvme_dev *dev,
					struct nvme_admin_cmd __user *ucmd)
{
	struct nvme_admin_cmd cmd;
	struct nvme_command c;
	int status, length;
	struct nvme_iod *uninitialized_var(iod);
	unsigned timeout;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	length = cmd.data_len;
	if (cmd.data_len) {
		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
								length);
		if (IS_ERR(iod))
			return PTR_ERR(iod);
		length = nvme_setup_prps(dev, &c.common, iod, length,
								GFP_KERNEL);
	}

	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
								ADMIN_TIMEOUT;
	if (length != cmd.data_len)
		status = -ENOMEM;
	else
		status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout);

	if (cmd.data_len) {
		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
		nvme_free_iod(dev, iod);
	}

	if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
							sizeof(cmd.result)))
		status = -EFAULT;

	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num((void __user *)arg);
	case SG_IO:
		return nvme_sg_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}
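/*
 * Hypothetical user-space usage (added for illustration; not part of the
 * driver): the ioctls above are issued against the namespace block node,
 * e.g.:
 *
 *	int fd = open("/dev/nvme0n1", O_RDONLY);
 *	struct nvme_admin_cmd cmd = { .opcode = 0x06 };	// Identify
 *	// fill in nsid/addr/data_len as required, then:
 *	int err = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 *
 * NVME_IOCTL_ADMIN_CMD requires CAP_SYS_ADMIN, as enforced by
 * nvme_user_admin_cmd() above.
 */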
#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
					unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case SG_IO:
		return nvme_sg_io32(ns, arg);
	}
	return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl	NULL
#endif

static int nvme_open(struct block_device *bdev, fmode_t mode)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	struct nvme_dev *dev = ns->dev;

	kref_get(&dev->kref);
	return 0;
}

static void nvme_free_dev(struct kref *kref);

static void nvme_release(struct gendisk *disk, fmode_t mode)
{
	struct nvme_ns *ns = disk->private_data;
	struct nvme_dev *dev = ns->dev;

	kref_put(&dev->kref, nvme_free_dev);
}

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_compat_ioctl,
	.open		= nvme_open,
	.release	= nvme_release,
};

static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
{
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;

		if (bio_list_empty(&nvmeq->sq_cong))
			remove_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
			if (bio_list_empty(&nvmeq->sq_cong))
				add_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
			bio_list_add_head(&nvmeq->sq_cong, bio);
			break;
		}
	}
}

static int nvme_kthread(void *data)
{
	struct nvme_dev *dev, *next;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry_safe(dev, next, &dev_list, node) {
			int i;
			if (readl(&dev->bar->csts) & NVME_CSTS_CFS &&
							dev->initialized) {
				if (work_busy(&dev->reset_work))
					continue;
				list_del_init(&dev->node);
				dev_warn(&dev->pci_dev->dev,
					"Failed status, reset controller\n");
				PREPARE_WORK(&dev->reset_work,
							nvme_reset_failed_dev);
				queue_work(nvme_workq, &dev->reset_work);
				continue;
			}
			rcu_read_lock();
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq =
						rcu_dereference(dev->queues[i]);
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				if (nvmeq->q_suspended)
					goto unlock;
				nvme_process_cq(nvmeq);
				nvme_cancel_ios(nvmeq, true);
				nvme_resubmit_bios(nvmeq);
 unlock:
				spin_unlock_irq(&nvmeq->q_lock);
			}
			rcu_read_unlock();
		}
		spin_unlock(&dev_list_lock);
		schedule_timeout(round_jiffies_relative(HZ));
	}
	return 0;
}

static void nvme_config_discard(struct nvme_ns *ns)
{
	u32 logical_block_size = queue_logical_block_size(ns->queue);
	ns->queue->limits.discard_zeroes_data = 0;
	ns->queue->limits.discard_alignment = logical_block_size;
	ns->queue->limits.discard_granularity = logical_block_size;
	ns->queue->limits.max_discard_sectors = 0xffffffff;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(0);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = nsid;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;
	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	if (dev->max_hw_sectors)
		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);

	disk->major = nvme_major;
	disk->first_minor = 0;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = &dev->pci_dev->dev;
	disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	if (dev->oncs & NVME_CTRL_ONCS_DSM)
		nvme_config_discard(ns);

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}

static int nvme_find_closest_node(int node)
{
	int n, val, min_val = INT_MAX, best_node = node;

	for_each_online_node(n) {
		if (n == node)
			continue;
		val = node_distance(node, n);
		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}
	return best_node;
}

static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
								int count)
{
	int cpu;
	for_each_cpu(cpu, qmask) {
		if (cpumask_weight(nvmeq->cpu_mask) >= count)
			break;
		if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask))
			*per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
	}
}

static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
	const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
{
	int next_cpu;
	for_each_cpu(next_cpu, new_mask) {
		cpumask_or(mask, mask, get_cpu_mask(next_cpu));
		cpumask_or(mask, mask, topology_thread_cpumask(next_cpu));
		cpumask_and(mask, mask, unassigned_cpus);
		nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
	}
}

static void nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max;

	max = min(dev->max_qid, num_online_cpus());
	for (i = dev->queue_count; i <= max; i++)
		if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
			break;

	max = min(dev->queue_count - 1, num_online_cpus());
	for (i = dev->online_queues; i <= max; i++)
		if (nvme_create_queue(raw_nvmeq(dev, i), i))
			break;
}

/*
 * If there are fewer queues than online cpus, this will try to optimally
 * assign a queue to multiple cpus by grouping cpus that are "close" together:
 * thread siblings, core, socket, closest node, then whatever else is
 * available.
 */
static void nvme_assign_io_queues(struct nvme_dev *dev)
{
	unsigned cpu, cpus_per_queue, queues, remainder, i;
	cpumask_var_t unassigned_cpus;

	nvme_create_io_queues(dev);

	queues = min(dev->online_queues - 1, num_online_cpus());
	if (!queues)
		return;

	cpus_per_queue = num_online_cpus() / queues;
	remainder = queues - (num_online_cpus() - queues * cpus_per_queue);

	if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL))
		return;

	cpumask_copy(unassigned_cpus, cpu_online_mask);
	cpu = cpumask_first(unassigned_cpus);
	for (i = 1; i <= queues; i++) {
		struct nvme_queue *nvmeq = lock_nvmeq(dev, i);
		cpumask_t mask;

		cpumask_clear(nvmeq->cpu_mask);
		if (!cpumask_weight(unassigned_cpus)) {
			unlock_nvmeq(nvmeq);
			break;
		}

		mask = *get_cpu_mask(cpu);
		nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
		if (cpus_weight(mask) < cpus_per_queue)
			nvme_add_cpus(&mask, unassigned_cpus,
				topology_thread_cpumask(cpu),
				nvmeq, cpus_per_queue);
		if (cpus_weight(mask) < cpus_per_queue)
			nvme_add_cpus(&mask, unassigned_cpus,
				topology_core_cpumask(cpu),
				nvmeq, cpus_per_queue);
		if (cpus_weight(mask) < cpus_per_queue)
			nvme_add_cpus(&mask, unassigned_cpus,
				cpumask_of_node(cpu_to_node(cpu)),
				nvmeq, cpus_per_queue);
		if (cpus_weight(mask) < cpus_per_queue)
			nvme_add_cpus(&mask, unassigned_cpus,
				cpumask_of_node(
					nvme_find_closest_node(
						cpu_to_node(cpu))),
				nvmeq, cpus_per_queue);
		if (cpus_weight(mask) < cpus_per_queue)
			nvme_add_cpus(&mask, unassigned_cpus,
				unassigned_cpus,
				nvmeq, cpus_per_queue);

		WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue,
			"nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
			dev->instance, i);

		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
							nvmeq->cpu_mask);
		cpumask_andnot(unassigned_cpus, unassigned_cpus,
						nvmeq->cpu_mask);
		cpu = cpumask_next(cpu, unassigned_cpus);
		if (remainder && !--remainder)
			cpus_per_queue++;
		unlock_nvmeq(nvmeq);
	}
	WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
								dev->instance);
	i = 0;
	cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
	for_each_cpu(cpu, unassigned_cpus)
		*per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
	free_cpumask_var(unassigned_cpus);
}
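/*
 * Illustrative note (added; not in the original source): the Number of
 * Queues feature packs zero-based counts into one dword, submission
 * queues in the low 16 bits and completion queues in the high 16 bits;
 * requesting 16 of each sends q_count = 0x000f000f.  The controller
 * returns the counts it actually allocated in the same encoding, hence
 * the min(result & 0xffff, result >> 16) + 1 in set_queue_count() below.
 */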
-EIO : -EBUSY; 2001 return min(result & 0xffff, result >> 16) + 1; 2002} 2003 2004static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) 2005{ 2006 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); 2007} 2008 2009static int nvme_cpu_notify(struct notifier_block *self, 2010 unsigned long action, void *hcpu) 2011{ 2012 struct nvme_dev *dev = container_of(self, struct nvme_dev, nb); 2013 switch (action) { 2014 case CPU_ONLINE: 2015 case CPU_DEAD: 2016 nvme_assign_io_queues(dev); 2017 break; 2018 } 2019 return NOTIFY_OK; 2020} 2021 2022static int nvme_setup_io_queues(struct nvme_dev *dev) 2023{ 2024 struct nvme_queue *adminq = raw_nvmeq(dev, 0); 2025 struct pci_dev *pdev = dev->pci_dev; 2026 int result, i, vecs, nr_io_queues, size; 2027 2028 nr_io_queues = num_possible_cpus(); 2029 result = set_queue_count(dev, nr_io_queues); 2030 if (result < 0) 2031 return result; 2032 if (result < nr_io_queues) 2033 nr_io_queues = result; 2034 2035 size = db_bar_size(dev, nr_io_queues); 2036 if (size > 8192) { 2037 iounmap(dev->bar); 2038 do { 2039 dev->bar = ioremap(pci_resource_start(pdev, 0), size); 2040 if (dev->bar) 2041 break; 2042 if (!--nr_io_queues) 2043 return -ENOMEM; 2044 size = db_bar_size(dev, nr_io_queues); 2045 } while (1); 2046 dev->dbs = ((void __iomem *)dev->bar) + 4096; 2047 adminq->q_db = dev->dbs; 2048 } 2049 2050 /* Deregister the admin queue's interrupt */ 2051 free_irq(dev->entry[0].vector, adminq); 2052 2053 vecs = nr_io_queues; 2054 for (i = 0; i < vecs; i++) 2055 dev->entry[i].entry = i; 2056 for (;;) { 2057 result = pci_enable_msix(pdev, dev->entry, vecs); 2058 if (result <= 0) 2059 break; 2060 vecs = result; 2061 } 2062 2063 if (result < 0) { 2064 vecs = nr_io_queues; 2065 if (vecs > 32) 2066 vecs = 32; 2067 for (;;) { 2068 result = pci_enable_msi_block(pdev, vecs); 2069 if (result == 0) { 2070 for (i = 0; i < vecs; i++) 2071 dev->entry[i].vector = i + pdev->irq; 2072 break; 2073 } else if (result < 0) { 2074 vecs = 1; 2075 break; 2076 } 2077 vecs = result; 2078 } 2079 } 2080 2081 /* 2082 * Should investigate if there's a performance win from allocating 2083 * more queues than interrupt vectors; it might allow the submission 2084 * path to scale better, even if the receive path is limited by the 2085 * number of interrupts. 2086 */ 2087 nr_io_queues = vecs; 2088 dev->max_qid = nr_io_queues; 2089 2090 result = queue_request_irq(dev, adminq, adminq->irqname); 2091 if (result) { 2092 adminq->q_suspended = 1; 2093 goto free_queues; 2094 } 2095 2096 /* Free previously allocated queues that are no longer usable */ 2097 nvme_free_queues(dev, nr_io_queues + 1); 2098 nvme_assign_io_queues(dev); 2099 2100 dev->nb.notifier_call = &nvme_cpu_notify; 2101 result = register_hotcpu_notifier(&dev->nb); 2102 if (result) 2103 goto free_queues; 2104 2105 return 0; 2106 2107 free_queues: 2108 nvme_free_queues(dev, 1); 2109 return result; 2110} 2111 2112/* 2113 * Return: error value if an error occurred setting up the queues or calling 2114 * Identify Device. 0 if these succeeded, even if adding some of the 2115 * namespaces failed. At the moment, these failures are silent. TBD which 2116 * failures should be reported. 
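 *
 * Note that the single 8k DMA buffer below is reused for every command:
 * the first 4k receives the Identify Controller data and then each
 * Identify Namespace result in turn, while the second 4k receives the
 * LBA Range Type feature data (and is zeroed whenever that Get Features
 * call fails).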
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	struct pci_dev *pdev = dev->pci_dev;
	int res;
	unsigned nn, i;
	struct nvme_ns *ns;
	struct nvme_id_ctrl *ctrl;
	struct nvme_id_ns *id_ns;
	void *mem;
	dma_addr_t dma_addr;
	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;

	mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	res = nvme_identify(dev, 0, 1, dma_addr);
	if (res) {
		res = -EIO;
		goto out;
	}

	ctrl = mem;
	nn = le32_to_cpup(&ctrl->nn);
	dev->oncs = le16_to_cpup(&ctrl->oncs);
	dev->abort_limit = ctrl->acl + 1;
	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
	if (ctrl->mdts)
		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
	if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
			(pdev->device == 0x0953) && ctrl->vs[3])
		dev->stripe_size = 1 << (ctrl->vs[3] + shift);

	id_ns = mem;
	for (i = 1; i <= nn; i++) {
		res = nvme_identify(dev, i, 0, dma_addr);
		if (res)
			continue;

		if (id_ns->ncap == 0)
			continue;

		res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
							dma_addr + 4096, NULL);
		if (res)
			memset(mem + 4096, 0, 4096);

		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);
	res = 0;

 out:
	dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
	return res;
}

static int nvme_dev_map(struct nvme_dev *dev)
{
	u64 cap;
	int bars, result = -ENOMEM;
	struct pci_dev *pdev = dev->pci_dev;

	if (pci_enable_device_mem(pdev))
		return result;

	dev->entry[0].vector = pdev->irq;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable_pci;

	if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
		goto disable;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar)
		goto disable;
	if (readl(&dev->bar->csts) == -1) {
		result = -ENODEV;
		goto unmap;
	}
	cap = readq(&dev->bar->cap);
	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
	dev->dbs = ((void __iomem *)dev->bar) + 4096;

	return 0;

 unmap:
	iounmap(dev->bar);
	dev->bar = NULL;
 disable:
	pci_release_regions(pdev);
 disable_pci:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->pci_dev->msi_enabled)
		pci_disable_msi(dev->pci_dev);
	else if (dev->pci_dev->msix_enabled)
		pci_disable_msix(dev->pci_dev);

	if (dev->bar) {
		iounmap(dev->bar);
		dev->bar = NULL;
		pci_release_regions(dev->pci_dev);
	}

	if (pci_is_enabled(dev->pci_dev))
		pci_disable_device(dev->pci_dev);
}

struct nvme_delq_ctx {
	struct task_struct *waiter;
	struct kthread_worker *worker;
	atomic_t refcount;
};

static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
{
	dq->waiter = current;
	mb();

	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!atomic_read(&dq->refcount))
			break;
		if (!schedule_timeout(ADMIN_TIMEOUT) ||
					fatal_signal_pending(current)) {
			set_current_state(TASK_RUNNING);

			nvme_disable_ctrl(dev, readq(&dev->bar->cap));
			nvme_disable_queue(dev, 0);

			send_sig(SIGKILL, dq->worker->task, 1);
			flush_kthread_worker(dq->worker);
			return;
		}
	}
	set_current_state(TASK_RUNNING);
}

static void nvme_put_dq(struct nvme_delq_ctx *dq)
{
	atomic_dec(&dq->refcount);
	if (dq->waiter)
		wake_up_process(dq->waiter);
}

static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
{
	atomic_inc(&dq->refcount);
	return dq;
}

static void nvme_del_queue_end(struct nvme_queue *nvmeq)
{
	struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;

	nvme_clear_queue(nvmeq);
	nvme_put_dq(dq);
}

static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
						kthread_work_func_t fn)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	init_kthread_work(&nvmeq->cmdinfo.work, fn);
	return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo);
}

static void nvme_del_cq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	nvme_del_queue_end(nvmeq);
}

static int nvme_delete_cq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
						nvme_del_cq_work_handler);
}

static void nvme_del_sq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	int status = nvmeq->cmdinfo.status;

	if (!status)
		status = nvme_delete_cq(nvmeq);
	if (status)
		nvme_del_queue_end(nvmeq);
}

static int nvme_delete_sq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
						nvme_del_sq_work_handler);
}

static void nvme_del_queue_start(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	allow_signal(SIGKILL);
	if (nvme_delete_sq(nvmeq))
		nvme_del_queue_end(nvmeq);
}

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	int i;
	DEFINE_KTHREAD_WORKER_ONSTACK(worker);
	struct nvme_delq_ctx dq;
	struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
					&worker, "nvme%d", dev->instance);

	if (IS_ERR(kworker_task)) {
		dev_err(&dev->pci_dev->dev,
			"Failed to create queue del task\n");
		for (i = dev->queue_count - 1; i > 0; i--)
			nvme_disable_queue(dev, i);
		return;
	}

	dq.waiter = NULL;
	atomic_set(&dq.refcount, 0);
	dq.worker = &worker;
	for (i = dev->queue_count - 1; i > 0; i--) {
		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);

		if (nvme_suspend_queue(nvmeq))
			continue;
		nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
		nvmeq->cmdinfo.worker = dq.worker;
		init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
		queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
	}
	nvme_wait_dq(&dq, dev);
	kthread_stop(kworker_task);
}

static void nvme_dev_shutdown(struct nvme_dev *dev)
{
	int i;

	dev->initialized = 0;
	unregister_hotcpu_notifier(&dev->nb);

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);

	if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
		for (i = dev->queue_count - 1; i >= 0; i--) {
			struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
			nvme_suspend_queue(nvmeq);
			nvme_clear_queue(nvmeq);
		}
	} else {
		nvme_disable_io_queues(dev);
		nvme_shutdown_ctrl(dev);
		nvme_disable_queue(dev, 0);
	}
	nvme_dev_unmap(dev);
}

static void nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->namespaces, list) {
		if (ns->disk->flags & GENHD_FL_UP)
			del_gendisk(ns->disk);
		if (!blk_queue_dying(ns->queue))
			blk_cleanup_queue(ns->queue);
	}
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	struct device *dmadev = &dev->pci_dev->dev;
	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static DEFINE_IDA(nvme_instance_ida);

static int nvme_set_instance(struct nvme_dev *dev)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	dev->instance = instance;
	return 0;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, dev->instance);
	spin_unlock(&dev_list_lock);
}

static void nvme_free_namespaces(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		put_disk(ns->disk);
		kfree(ns);
	}
}

static void nvme_free_dev(struct kref *kref)
{
	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);

	nvme_free_namespaces(dev);
	free_percpu(dev->io_queue);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

static int nvme_dev_open(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev,
								miscdev);
	kref_get(&dev->kref);
	f->private_data = dev;
	return 0;
}

static int nvme_dev_release(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = f->private_data;
	kref_put(&dev->kref, nvme_free_dev);
	return 0;
}

static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	struct nvme_dev *dev = f->private_data;
	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(dev, (void __user *)arg);
	default:
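		/* Anything else is unsupported; -ENOTTY is the ioctl convention */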
		return -ENOTTY;
	}
}

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};

static int nvme_dev_start(struct nvme_dev *dev)
{
	int result;

	result = nvme_dev_map(dev);
	if (result)
		return result;

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;

	spin_lock(&dev_list_lock);
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	result = nvme_setup_io_queues(dev);
	if (result && result != -EBUSY)
		goto disable;

	return result;

 disable:
	nvme_disable_queue(dev, 0);
	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);
 unmap:
	nvme_dev_unmap(dev);
	return result;
}

static int nvme_remove_dead_ctrl(void *arg)
{
	struct nvme_dev *dev = (struct nvme_dev *)arg;
	struct pci_dev *pdev = dev->pci_dev;

	if (pci_get_drvdata(pdev))
		pci_stop_and_remove_bus_device(pdev);
	kref_put(&dev->kref, nvme_free_dev);
	return 0;
}

static void nvme_remove_disks(struct work_struct *ws)
{
	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);

	nvme_dev_remove(dev);
	nvme_free_queues(dev, 1);
}

static int nvme_dev_resume(struct nvme_dev *dev)
{
	int ret;

	ret = nvme_dev_start(dev);
	if (ret && ret != -EBUSY)
		return ret;
	if (ret == -EBUSY) {
		spin_lock(&dev_list_lock);
		PREPARE_WORK(&dev->reset_work, nvme_remove_disks);
		queue_work(nvme_workq, &dev->reset_work);
		spin_unlock(&dev_list_lock);
	}
	dev->initialized = 1;
	return 0;
}

static void nvme_dev_reset(struct nvme_dev *dev)
{
	nvme_dev_shutdown(dev);
	if (nvme_dev_resume(dev)) {
		dev_err(&dev->pci_dev->dev, "Device failed to resume\n");
		kref_get(&dev->kref);
		if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
							dev->instance))) {
			dev_err(&dev->pci_dev->dev,
				"Failed to start controller remove task\n");
			kref_put(&dev->kref, nvme_free_dev);
		}
	}
}

static void nvme_reset_failed_dev(struct work_struct *ws)
{
	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
	nvme_dev_reset(dev);
}

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
								GFP_KERNEL);
	if (!dev->queues)
		goto free;
	dev->io_queue = alloc_percpu(unsigned short);
	if (!dev->io_queue)
		goto free;

	INIT_LIST_HEAD(&dev->namespaces);
	INIT_WORK(&dev->reset_work, nvme_reset_failed_dev);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	result = nvme_set_instance(dev);
	if (result)
		goto free;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto release;

	kref_init(&dev->kref);
	result = nvme_dev_start(dev);
	if (result) {
		if (result == -EBUSY)
			goto create_cdev;
		goto release_pools;
	}

	result = nvme_dev_add(dev);
	if (result)
		goto shutdown;

 create_cdev:
	scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
	dev->miscdev.parent = &pdev->dev;
	dev->miscdev.name = dev->name;
	dev->miscdev.fops = &nvme_dev_fops;
	result = misc_register(&dev->miscdev);
	if (result)
		goto remove;

	dev->initialized = 1;
	return 0;

 remove:
	nvme_dev_remove(dev);
	nvme_free_namespaces(dev);
 shutdown:
	nvme_dev_shutdown(dev);
 release_pools:
	nvme_free_queues(dev, 0);
	nvme_release_prp_pools(dev);
 release:
	nvme_release_instance(dev);
 free:
	free_percpu(dev->io_queue);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_shutdown(dev);
}

static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);

	pci_set_drvdata(pdev, NULL);
	flush_work(&dev->reset_work);
	misc_deregister(&dev->miscdev);
	nvme_dev_remove(dev);
	nvme_dev_shutdown(dev);
	nvme_free_queues(dev, 0);
	rcu_barrier();
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
	kref_put(&dev->kref, nvme_free_dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL

#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_shutdown(ndev);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
		PREPARE_WORK(&ndev->reset_work, nvme_reset_failed_dev);
		queue_work(nvme_workq, &ndev->reset_work);
	}
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static const struct pci_device_id nvme_id_table[] = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
	if (IS_ERR(nvme_thread))
		return PTR_ERR(nvme_thread);

	result = -ENOMEM;
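	/*
	 * All reset_work items funnel through this single-threaded
	 * workqueue, so at most one controller reset runs at a time.
	 */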
	nvme_workq = create_singlethread_workqueue("nvme");
	if (!nvme_workq)
		goto kill_kthread;

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		goto kill_workq;
	else if (result > 0)
		nvme_major = result;

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto unregister_blkdev;
	return 0;

 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_workq:
	destroy_workqueue(nvme_workq);
 kill_kthread:
	kthread_stop(nvme_thread);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	destroy_workqueue(nvme_workq);
	kthread_stop(nvme_thread);
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.9");
module_init(nvme_init);
module_exit(nvme_exit);
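
As an illustration of the character-device path above (nvme_dev_open() and nvme_dev_ioctl() on the misc device registered in nvme_probe()), the following stand-alone user-space sketch issues an Identify Controller command through NVME_IOCTL_ADMIN_CMD. It is an illustration only, not part of the driver: it assumes the uapi struct nvme_admin_cmd exported by <linux/nvme.h> of this kernel generation and a controller node named /dev/nvme0, and it keeps error handling minimal.

/* identify.c - hedged user-space sketch, not part of nvme-core.c */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme.h>		/* struct nvme_admin_cmd, NVME_IOCTL_ADMIN_CMD */

int main(void)
{
	struct nvme_admin_cmd cmd;
	unsigned char *data;
	int fd, err;

	fd = open("/dev/nvme0", O_RDWR);	/* node created by misc_register() */
	if (fd < 0) {
		perror("open /dev/nvme0");
		return 1;
	}

	data = calloc(1, 4096);		/* Identify data is one 4k page */
	if (!data)
		return 1;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = 0x06;		/* Identify */
	cmd.addr = (uintptr_t)data;
	cmd.data_len = 4096;
	cmd.cdw10 = 1;			/* CNS 1: identify the controller */

	err = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
	if (err < 0)
		perror("NVME_IOCTL_ADMIN_CMD");	/* kernel rejected the ioctl */
	else if (err)
		fprintf(stderr, "NVMe status 0x%x\n", err);	/* NVMe-level error */
	else
		printf("sn: %.20s\n", data + 4);	/* serial number, bytes 4..23 */

	free(data);
	close(fd);
	return err ? 1 : 0;
}

The positive return value handled above mirrors nvme_user_admin_cmd(), which passes the NVMe completion status back as the ioctl return code, distinct from a negative errno.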