nvme-core.c revision 427e97080196548557b288517537ab7eb48c309f
/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define NVME_MINORS 64
#define ADMIN_TIMEOUT	(60 * HZ)

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;

/*
 * An NVM Express queue. Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 cq_phase;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	unsigned long timeout;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The function to call on completion
 * @timeout: Timeout in jiffies, measured from now, after which the command
 *	     may be cancelled
 *
 * Allocate a Command ID for a queue. The data passed in will
 * be passed to the completion handler. This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ?
-EINTR : cmdid; 159} 160 161/* Special values must be less than 0x1000 */ 162#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) 163#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) 164#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) 165#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) 166#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) 167 168static void special_completion(struct nvme_dev *dev, void *ctx, 169 struct nvme_completion *cqe) 170{ 171 if (ctx == CMD_CTX_CANCELLED) 172 return; 173 if (ctx == CMD_CTX_FLUSH) 174 return; 175 if (ctx == CMD_CTX_COMPLETED) { 176 dev_warn(&dev->pci_dev->dev, 177 "completed id %d twice on queue %d\n", 178 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 179 return; 180 } 181 if (ctx == CMD_CTX_INVALID) { 182 dev_warn(&dev->pci_dev->dev, 183 "invalid id %d completed on queue %d\n", 184 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 185 return; 186 } 187 188 dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); 189} 190 191/* 192 * Called with local interrupts disabled and the q_lock held. May not sleep. 193 */ 194static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, 195 nvme_completion_fn *fn) 196{ 197 void *ctx; 198 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 199 200 if (cmdid >= nvmeq->q_depth) { 201 *fn = special_completion; 202 return CMD_CTX_INVALID; 203 } 204 if (fn) 205 *fn = info[cmdid].fn; 206 ctx = info[cmdid].ctx; 207 info[cmdid].fn = special_completion; 208 info[cmdid].ctx = CMD_CTX_COMPLETED; 209 clear_bit(cmdid, nvmeq->cmdid_data); 210 wake_up(&nvmeq->sq_full); 211 return ctx; 212} 213 214static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, 215 nvme_completion_fn *fn) 216{ 217 void *ctx; 218 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 219 if (fn) 220 *fn = info[cmdid].fn; 221 ctx = info[cmdid].ctx; 222 info[cmdid].fn = special_completion; 223 info[cmdid].ctx = CMD_CTX_CANCELLED; 224 return ctx; 225} 226 227struct nvme_queue *get_nvmeq(struct nvme_dev *dev) 228{ 229 return dev->queues[get_cpu() + 1]; 230} 231 232void put_nvmeq(struct nvme_queue *nvmeq) 233{ 234 put_cpu(); 235} 236 237/** 238 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell 239 * @nvmeq: The queue to use 240 * @cmd: The command to send 241 * 242 * Safe to use from interrupt context 243 */ 244static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 245{ 246 unsigned long flags; 247 u16 tail; 248 spin_lock_irqsave(&nvmeq->q_lock, flags); 249 tail = nvmeq->sq_tail; 250 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); 251 if (++tail == nvmeq->q_depth) 252 tail = 0; 253 writel(tail, nvmeq->q_db); 254 nvmeq->sq_tail = tail; 255 spin_unlock_irqrestore(&nvmeq->q_lock, flags); 256 257 return 0; 258} 259 260static __le64 **iod_list(struct nvme_iod *iod) 261{ 262 return ((void *)iod) + iod->offset; 263} 264 265/* 266 * Will slightly overestimate the number of pages needed. This is OK 267 * as it only leads to a small amount of wasted memory for the lifetime of 268 * the I/O. 
269 */ 270static int nvme_npages(unsigned size) 271{ 272 unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); 273 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); 274} 275 276static struct nvme_iod * 277nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) 278{ 279 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + 280 sizeof(__le64 *) * nvme_npages(nbytes) + 281 sizeof(struct scatterlist) * nseg, gfp); 282 283 if (iod) { 284 iod->offset = offsetof(struct nvme_iod, sg[nseg]); 285 iod->npages = -1; 286 iod->length = nbytes; 287 iod->nents = 0; 288 } 289 290 return iod; 291} 292 293void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) 294{ 295 const int last_prp = PAGE_SIZE / 8 - 1; 296 int i; 297 __le64 **list = iod_list(iod); 298 dma_addr_t prp_dma = iod->first_dma; 299 300 if (iod->npages == 0) 301 dma_pool_free(dev->prp_small_pool, list[0], prp_dma); 302 for (i = 0; i < iod->npages; i++) { 303 __le64 *prp_list = list[i]; 304 dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); 305 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); 306 prp_dma = next_prp_dma; 307 } 308 kfree(iod); 309} 310 311static void bio_completion(struct nvme_dev *dev, void *ctx, 312 struct nvme_completion *cqe) 313{ 314 struct nvme_iod *iod = ctx; 315 struct bio *bio = iod->private; 316 u16 status = le16_to_cpup(&cqe->status) >> 1; 317 318 if (iod->nents) 319 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, 320 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 321 nvme_free_iod(dev, iod); 322 if (status) 323 bio_endio(bio, -EIO); 324 else 325 bio_endio(bio, 0); 326} 327 328/* length is in bytes. gfp flags indicates whether we may sleep. */ 329int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, 330 struct nvme_iod *iod, int total_len, gfp_t gfp) 331{ 332 struct dma_pool *pool; 333 int length = total_len; 334 struct scatterlist *sg = iod->sg; 335 int dma_len = sg_dma_len(sg); 336 u64 dma_addr = sg_dma_address(sg); 337 int offset = offset_in_page(dma_addr); 338 __le64 *prp_list; 339 __le64 **list = iod_list(iod); 340 dma_addr_t prp_dma; 341 int nprps, i; 342 343 cmd->prp1 = cpu_to_le64(dma_addr); 344 length -= (PAGE_SIZE - offset); 345 if (length <= 0) 346 return total_len; 347 348 dma_len -= (PAGE_SIZE - offset); 349 if (dma_len) { 350 dma_addr += (PAGE_SIZE - offset); 351 } else { 352 sg = sg_next(sg); 353 dma_addr = sg_dma_address(sg); 354 dma_len = sg_dma_len(sg); 355 } 356 357 if (length <= PAGE_SIZE) { 358 cmd->prp2 = cpu_to_le64(dma_addr); 359 return total_len; 360 } 361 362 nprps = DIV_ROUND_UP(length, PAGE_SIZE); 363 if (nprps <= (256 / 8)) { 364 pool = dev->prp_small_pool; 365 iod->npages = 0; 366 } else { 367 pool = dev->prp_page_pool; 368 iod->npages = 1; 369 } 370 371 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 372 if (!prp_list) { 373 cmd->prp2 = cpu_to_le64(dma_addr); 374 iod->npages = -1; 375 return (total_len - length) + PAGE_SIZE; 376 } 377 list[0] = prp_list; 378 iod->first_dma = prp_dma; 379 cmd->prp2 = cpu_to_le64(prp_dma); 380 i = 0; 381 for (;;) { 382 if (i == PAGE_SIZE / 8) { 383 __le64 *old_prp_list = prp_list; 384 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 385 if (!prp_list) 386 return total_len - length; 387 list[iod->npages++] = prp_list; 388 prp_list[0] = old_prp_list[i - 1]; 389 old_prp_list[i - 1] = cpu_to_le64(prp_dma); 390 i = 1; 391 } 392 prp_list[i++] = cpu_to_le64(dma_addr); 393 dma_len -= PAGE_SIZE; 394 dma_addr += PAGE_SIZE; 395 length -= PAGE_SIZE; 396 if (length <= 0) 397 break; 398 if (dma_len > 0) 399 
continue; 400 BUG_ON(dma_len < 0); 401 sg = sg_next(sg); 402 dma_addr = sg_dma_address(sg); 403 dma_len = sg_dma_len(sg); 404 } 405 406 return total_len; 407} 408 409struct nvme_bio_pair { 410 struct bio b1, b2, *parent; 411 struct bio_vec *bv1, *bv2; 412 int err; 413 atomic_t cnt; 414}; 415 416static void nvme_bio_pair_endio(struct bio *bio, int err) 417{ 418 struct nvme_bio_pair *bp = bio->bi_private; 419 420 if (err) 421 bp->err = err; 422 423 if (atomic_dec_and_test(&bp->cnt)) { 424 bio_endio(bp->parent, bp->err); 425 if (bp->bv1) 426 kfree(bp->bv1); 427 if (bp->bv2) 428 kfree(bp->bv2); 429 kfree(bp); 430 } 431} 432 433static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx, 434 int len, int offset) 435{ 436 struct nvme_bio_pair *bp; 437 438 BUG_ON(len > bio->bi_size); 439 BUG_ON(idx > bio->bi_vcnt); 440 441 bp = kmalloc(sizeof(*bp), GFP_ATOMIC); 442 if (!bp) 443 return NULL; 444 bp->err = 0; 445 446 bp->b1 = *bio; 447 bp->b2 = *bio; 448 449 bp->b1.bi_size = len; 450 bp->b2.bi_size -= len; 451 bp->b1.bi_vcnt = idx; 452 bp->b2.bi_idx = idx; 453 bp->b2.bi_sector += len >> 9; 454 455 if (offset) { 456 bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), 457 GFP_ATOMIC); 458 if (!bp->bv1) 459 goto split_fail_1; 460 461 bp->bv2 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), 462 GFP_ATOMIC); 463 if (!bp->bv2) 464 goto split_fail_2; 465 466 memcpy(bp->bv1, bio->bi_io_vec, 467 bio->bi_max_vecs * sizeof(struct bio_vec)); 468 memcpy(bp->bv2, bio->bi_io_vec, 469 bio->bi_max_vecs * sizeof(struct bio_vec)); 470 471 bp->b1.bi_io_vec = bp->bv1; 472 bp->b2.bi_io_vec = bp->bv2; 473 bp->b2.bi_io_vec[idx].bv_offset += offset; 474 bp->b2.bi_io_vec[idx].bv_len -= offset; 475 bp->b1.bi_io_vec[idx].bv_len = offset; 476 bp->b1.bi_vcnt++; 477 } else 478 bp->bv1 = bp->bv2 = NULL; 479 480 bp->b1.bi_private = bp; 481 bp->b2.bi_private = bp; 482 483 bp->b1.bi_end_io = nvme_bio_pair_endio; 484 bp->b2.bi_end_io = nvme_bio_pair_endio; 485 486 bp->parent = bio; 487 atomic_set(&bp->cnt, 2); 488 489 return bp; 490 491 split_fail_2: 492 kfree(bp->bv1); 493 split_fail_1: 494 kfree(bp); 495 return NULL; 496} 497 498static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, 499 int idx, int len, int offset) 500{ 501 struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset); 502 if (!bp) 503 return -ENOMEM; 504 505 if (bio_list_empty(&nvmeq->sq_cong)) 506 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 507 bio_list_add(&nvmeq->sq_cong, &bp->b1); 508 bio_list_add(&nvmeq->sq_cong, &bp->b2); 509 510 return 0; 511} 512 513/* NVMe scatterlists require no holes in the virtual address */ 514#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ 515 (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) 516 517static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, 518 struct bio *bio, enum dma_data_direction dma_dir, int psegs) 519{ 520 struct bio_vec *bvec, *bvprv = NULL; 521 struct scatterlist *sg = NULL; 522 int i, length = 0, nsegs = 0; 523 524 sg_init_table(iod->sg, psegs); 525 bio_for_each_segment(bvec, bio, i) { 526 if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { 527 sg->length += bvec->bv_len; 528 } else { 529 if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) 530 return nvme_split_and_submit(bio, nvmeq, i, 531 length, 0); 532 533 sg = sg ? 
sg + 1 : iod->sg; 534 sg_set_page(sg, bvec->bv_page, bvec->bv_len, 535 bvec->bv_offset); 536 nsegs++; 537 } 538 length += bvec->bv_len; 539 bvprv = bvec; 540 } 541 iod->nents = nsegs; 542 sg_mark_end(sg); 543 if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) 544 return -ENOMEM; 545 546 return length; 547} 548 549/* 550 * We reuse the small pool to allocate the 16-byte range here as it is not 551 * worth having a special pool for these or additional cases to handle freeing 552 * the iod. 553 */ 554static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, 555 struct bio *bio, struct nvme_iod *iod, int cmdid) 556{ 557 struct nvme_dsm_range *range; 558 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 559 560 range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, 561 &iod->first_dma); 562 if (!range) 563 return -ENOMEM; 564 565 iod_list(iod)[0] = (__le64 *)range; 566 iod->npages = 0; 567 568 range->cattr = cpu_to_le32(0); 569 range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift); 570 range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); 571 572 memset(cmnd, 0, sizeof(*cmnd)); 573 cmnd->dsm.opcode = nvme_cmd_dsm; 574 cmnd->dsm.command_id = cmdid; 575 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); 576 cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); 577 cmnd->dsm.nr = 0; 578 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 579 580 if (++nvmeq->sq_tail == nvmeq->q_depth) 581 nvmeq->sq_tail = 0; 582 writel(nvmeq->sq_tail, nvmeq->q_db); 583 584 return 0; 585} 586 587static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, 588 int cmdid) 589{ 590 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 591 592 memset(cmnd, 0, sizeof(*cmnd)); 593 cmnd->common.opcode = nvme_cmd_flush; 594 cmnd->common.command_id = cmdid; 595 cmnd->common.nsid = cpu_to_le32(ns->ns_id); 596 597 if (++nvmeq->sq_tail == nvmeq->q_depth) 598 nvmeq->sq_tail = 0; 599 writel(nvmeq->sq_tail, nvmeq->q_db); 600 601 return 0; 602} 603 604int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) 605{ 606 int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, 607 special_completion, NVME_IO_TIMEOUT); 608 if (unlikely(cmdid < 0)) 609 return cmdid; 610 611 return nvme_submit_flush(nvmeq, ns, cmdid); 612} 613 614/* 615 * Called with local interrupts disabled and the q_lock held. May not sleep. 
616 */ 617static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, 618 struct bio *bio) 619{ 620 struct nvme_command *cmnd; 621 struct nvme_iod *iod; 622 enum dma_data_direction dma_dir; 623 int cmdid, length, result = -ENOMEM; 624 u16 control; 625 u32 dsmgmt; 626 int psegs = bio_phys_segments(ns->queue, bio); 627 628 if ((bio->bi_rw & REQ_FLUSH) && psegs) { 629 result = nvme_submit_flush_data(nvmeq, ns); 630 if (result) 631 return result; 632 } 633 634 iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); 635 if (!iod) 636 goto nomem; 637 iod->private = bio; 638 639 result = -EBUSY; 640 cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); 641 if (unlikely(cmdid < 0)) 642 goto free_iod; 643 644 if (bio->bi_rw & REQ_DISCARD) { 645 result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); 646 if (result) 647 goto free_cmdid; 648 return result; 649 } 650 if ((bio->bi_rw & REQ_FLUSH) && !psegs) 651 return nvme_submit_flush(nvmeq, ns, cmdid); 652 653 control = 0; 654 if (bio->bi_rw & REQ_FUA) 655 control |= NVME_RW_FUA; 656 if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 657 control |= NVME_RW_LR; 658 659 dsmgmt = 0; 660 if (bio->bi_rw & REQ_RAHEAD) 661 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 662 663 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 664 665 memset(cmnd, 0, sizeof(*cmnd)); 666 if (bio_data_dir(bio)) { 667 cmnd->rw.opcode = nvme_cmd_write; 668 dma_dir = DMA_TO_DEVICE; 669 } else { 670 cmnd->rw.opcode = nvme_cmd_read; 671 dma_dir = DMA_FROM_DEVICE; 672 } 673 674 result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs); 675 if (result <= 0) 676 goto free_cmdid; 677 length = result; 678 679 cmnd->rw.command_id = cmdid; 680 cmnd->rw.nsid = cpu_to_le32(ns->ns_id); 681 length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, 682 GFP_ATOMIC); 683 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); 684 cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); 685 cmnd->rw.control = cpu_to_le16(control); 686 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 687 688 if (++nvmeq->sq_tail == nvmeq->q_depth) 689 nvmeq->sq_tail = 0; 690 writel(nvmeq->sq_tail, nvmeq->q_db); 691 692 return 0; 693 694 free_cmdid: 695 free_cmdid(nvmeq, cmdid, NULL); 696 free_iod: 697 nvme_free_iod(nvmeq->dev, iod); 698 nomem: 699 return result; 700} 701 702static void nvme_make_request(struct request_queue *q, struct bio *bio) 703{ 704 struct nvme_ns *ns = q->queuedata; 705 struct nvme_queue *nvmeq = get_nvmeq(ns->dev); 706 int result = -EBUSY; 707 708 spin_lock_irq(&nvmeq->q_lock); 709 if (bio_list_empty(&nvmeq->sq_cong)) 710 result = nvme_submit_bio_queue(nvmeq, ns, bio); 711 if (unlikely(result)) { 712 if (bio_list_empty(&nvmeq->sq_cong)) 713 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 714 bio_list_add(&nvmeq->sq_cong, bio); 715 } 716 717 spin_unlock_irq(&nvmeq->q_lock); 718 put_nvmeq(nvmeq); 719} 720 721static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) 722{ 723 u16 head, phase; 724 725 head = nvmeq->cq_head; 726 phase = nvmeq->cq_phase; 727 728 for (;;) { 729 void *ctx; 730 nvme_completion_fn fn; 731 struct nvme_completion cqe = nvmeq->cqes[head]; 732 if ((le16_to_cpu(cqe.status) & 1) != phase) 733 break; 734 nvmeq->sq_head = le16_to_cpu(cqe.sq_head); 735 if (++head == nvmeq->q_depth) { 736 head = 0; 737 phase = !phase; 738 } 739 740 ctx = free_cmdid(nvmeq, cqe.command_id, &fn); 741 fn(nvmeq->dev, ctx, &cqe); 742 } 743 744 /* If the controller ignores the cq head doorbell and continuously 745 * writes to the queue, it is theoretically possible to wrap 
around 746 * the queue twice and mistakenly return IRQ_NONE. Linux only 747 * requires that 0.1% of your interrupts are handled, so this isn't 748 * a big problem. 749 */ 750 if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) 751 return IRQ_NONE; 752 753 writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); 754 nvmeq->cq_head = head; 755 nvmeq->cq_phase = phase; 756 757 return IRQ_HANDLED; 758} 759 760static irqreturn_t nvme_irq(int irq, void *data) 761{ 762 irqreturn_t result; 763 struct nvme_queue *nvmeq = data; 764 spin_lock(&nvmeq->q_lock); 765 result = nvme_process_cq(nvmeq); 766 spin_unlock(&nvmeq->q_lock); 767 return result; 768} 769 770static irqreturn_t nvme_irq_check(int irq, void *data) 771{ 772 struct nvme_queue *nvmeq = data; 773 struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; 774 if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) 775 return IRQ_NONE; 776 return IRQ_WAKE_THREAD; 777} 778 779static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) 780{ 781 spin_lock_irq(&nvmeq->q_lock); 782 cancel_cmdid(nvmeq, cmdid, NULL); 783 spin_unlock_irq(&nvmeq->q_lock); 784} 785 786struct sync_cmd_info { 787 struct task_struct *task; 788 u32 result; 789 int status; 790}; 791 792static void sync_completion(struct nvme_dev *dev, void *ctx, 793 struct nvme_completion *cqe) 794{ 795 struct sync_cmd_info *cmdinfo = ctx; 796 cmdinfo->result = le32_to_cpup(&cqe->result); 797 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; 798 wake_up_process(cmdinfo->task); 799} 800 801/* 802 * Returns 0 on success. If the result is negative, it's a Linux error code; 803 * if the result is positive, it's an NVM Express status code 804 */ 805int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, 806 u32 *result, unsigned timeout) 807{ 808 int cmdid; 809 struct sync_cmd_info cmdinfo; 810 811 cmdinfo.task = current; 812 cmdinfo.status = -EINTR; 813 814 cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, 815 timeout); 816 if (cmdid < 0) 817 return cmdid; 818 cmd->common.command_id = cmdid; 819 820 set_current_state(TASK_KILLABLE); 821 nvme_submit_cmd(nvmeq, cmd); 822 schedule(); 823 824 if (cmdinfo.status == -EINTR) { 825 nvme_abort_command(nvmeq, cmdid); 826 return -EINTR; 827 } 828 829 if (result) 830 *result = cmdinfo.result; 831 832 return cmdinfo.status; 833} 834 835int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, 836 u32 *result) 837{ 838 return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); 839} 840 841static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 842{ 843 int status; 844 struct nvme_command c; 845 846 memset(&c, 0, sizeof(c)); 847 c.delete_queue.opcode = opcode; 848 c.delete_queue.qid = cpu_to_le16(id); 849 850 status = nvme_submit_admin_cmd(dev, &c, NULL); 851 if (status) 852 return -EIO; 853 return 0; 854} 855 856static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 857 struct nvme_queue *nvmeq) 858{ 859 int status; 860 struct nvme_command c; 861 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 862 863 memset(&c, 0, sizeof(c)); 864 c.create_cq.opcode = nvme_admin_create_cq; 865 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); 866 c.create_cq.cqid = cpu_to_le16(qid); 867 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 868 c.create_cq.cq_flags = cpu_to_le16(flags); 869 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); 870 871 status = nvme_submit_admin_cmd(dev, &c, NULL); 872 if (status) 873 return -EIO; 874 return 0; 875} 876 877static int 
adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 878 struct nvme_queue *nvmeq) 879{ 880 int status; 881 struct nvme_command c; 882 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; 883 884 memset(&c, 0, sizeof(c)); 885 c.create_sq.opcode = nvme_admin_create_sq; 886 c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); 887 c.create_sq.sqid = cpu_to_le16(qid); 888 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 889 c.create_sq.sq_flags = cpu_to_le16(flags); 890 c.create_sq.cqid = cpu_to_le16(qid); 891 892 status = nvme_submit_admin_cmd(dev, &c, NULL); 893 if (status) 894 return -EIO; 895 return 0; 896} 897 898static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 899{ 900 return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); 901} 902 903static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) 904{ 905 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); 906} 907 908int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, 909 dma_addr_t dma_addr) 910{ 911 struct nvme_command c; 912 913 memset(&c, 0, sizeof(c)); 914 c.identify.opcode = nvme_admin_identify; 915 c.identify.nsid = cpu_to_le32(nsid); 916 c.identify.prp1 = cpu_to_le64(dma_addr); 917 c.identify.cns = cpu_to_le32(cns); 918 919 return nvme_submit_admin_cmd(dev, &c, NULL); 920} 921 922int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, 923 dma_addr_t dma_addr, u32 *result) 924{ 925 struct nvme_command c; 926 927 memset(&c, 0, sizeof(c)); 928 c.features.opcode = nvme_admin_get_features; 929 c.features.nsid = cpu_to_le32(nsid); 930 c.features.prp1 = cpu_to_le64(dma_addr); 931 c.features.fid = cpu_to_le32(fid); 932 933 return nvme_submit_admin_cmd(dev, &c, result); 934} 935 936int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, 937 dma_addr_t dma_addr, u32 *result) 938{ 939 struct nvme_command c; 940 941 memset(&c, 0, sizeof(c)); 942 c.features.opcode = nvme_admin_set_features; 943 c.features.prp1 = cpu_to_le64(dma_addr); 944 c.features.fid = cpu_to_le32(fid); 945 c.features.dword11 = cpu_to_le32(dword11); 946 947 return nvme_submit_admin_cmd(dev, &c, result); 948} 949 950/** 951 * nvme_cancel_ios - Cancel outstanding I/Os 952 * @queue: The queue to cancel I/Os on 953 * @timeout: True to only cancel I/Os which have timed out 954 */ 955static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) 956{ 957 int depth = nvmeq->q_depth - 1; 958 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 959 unsigned long now = jiffies; 960 int cmdid; 961 962 for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { 963 void *ctx; 964 nvme_completion_fn fn; 965 static struct nvme_completion cqe = { 966 .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), 967 }; 968 969 if (timeout && !time_after(now, info[cmdid].timeout)) 970 continue; 971 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); 972 ctx = cancel_cmdid(nvmeq, cmdid, &fn); 973 fn(nvmeq->dev, ctx, &cqe); 974 } 975} 976 977static void nvme_free_queue_mem(struct nvme_queue *nvmeq) 978{ 979 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 980 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 981 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 982 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 983 kfree(nvmeq); 984} 985 986static void nvme_free_queue(struct nvme_dev *dev, int qid) 987{ 988 struct nvme_queue *nvmeq = dev->queues[qid]; 989 int vector = dev->entry[nvmeq->cq_vector].vector; 990 991 spin_lock_irq(&nvmeq->q_lock); 992 nvme_cancel_ios(nvmeq, false); 993 while (bio_list_peek(&nvmeq->sq_cong)) 
{ 994 struct bio *bio = bio_list_pop(&nvmeq->sq_cong); 995 bio_endio(bio, -EIO); 996 } 997 spin_unlock_irq(&nvmeq->q_lock); 998 999 irq_set_affinity_hint(vector, NULL); 1000 free_irq(vector, nvmeq); 1001 1002 /* Don't tell the adapter to delete the admin queue */ 1003 if (qid) { 1004 adapter_delete_sq(dev, qid); 1005 adapter_delete_cq(dev, qid); 1006 } 1007 1008 nvme_free_queue_mem(nvmeq); 1009} 1010 1011static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 1012 int depth, int vector) 1013{ 1014 struct device *dmadev = &dev->pci_dev->dev; 1015 unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * 1016 sizeof(struct nvme_cmd_info)); 1017 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); 1018 if (!nvmeq) 1019 return NULL; 1020 1021 nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), 1022 &nvmeq->cq_dma_addr, GFP_KERNEL); 1023 if (!nvmeq->cqes) 1024 goto free_nvmeq; 1025 memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); 1026 1027 nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), 1028 &nvmeq->sq_dma_addr, GFP_KERNEL); 1029 if (!nvmeq->sq_cmds) 1030 goto free_cqdma; 1031 1032 nvmeq->q_dmadev = dmadev; 1033 nvmeq->dev = dev; 1034 spin_lock_init(&nvmeq->q_lock); 1035 nvmeq->cq_head = 0; 1036 nvmeq->cq_phase = 1; 1037 init_waitqueue_head(&nvmeq->sq_full); 1038 init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); 1039 bio_list_init(&nvmeq->sq_cong); 1040 nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; 1041 nvmeq->q_depth = depth; 1042 nvmeq->cq_vector = vector; 1043 1044 return nvmeq; 1045 1046 free_cqdma: 1047 dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1048 nvmeq->cq_dma_addr); 1049 free_nvmeq: 1050 kfree(nvmeq); 1051 return NULL; 1052} 1053 1054static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1055 const char *name) 1056{ 1057 if (use_threaded_interrupts) 1058 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, 1059 nvme_irq_check, nvme_irq, 1060 IRQF_DISABLED | IRQF_SHARED, 1061 name, nvmeq); 1062 return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, 1063 IRQF_DISABLED | IRQF_SHARED, name, nvmeq); 1064} 1065 1066static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, 1067 int cq_size, int vector) 1068{ 1069 int result; 1070 struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); 1071 1072 if (!nvmeq) 1073 return ERR_PTR(-ENOMEM); 1074 1075 result = adapter_alloc_cq(dev, qid, nvmeq); 1076 if (result < 0) 1077 goto free_nvmeq; 1078 1079 result = adapter_alloc_sq(dev, qid, nvmeq); 1080 if (result < 0) 1081 goto release_cq; 1082 1083 result = queue_request_irq(dev, nvmeq, "nvme"); 1084 if (result < 0) 1085 goto release_sq; 1086 1087 return nvmeq; 1088 1089 release_sq: 1090 adapter_delete_sq(dev, qid); 1091 release_cq: 1092 adapter_delete_cq(dev, qid); 1093 free_nvmeq: 1094 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1095 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1096 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1097 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1098 kfree(nvmeq); 1099 return ERR_PTR(result); 1100} 1101 1102static int nvme_configure_admin_queue(struct nvme_dev *dev) 1103{ 1104 int result = 0; 1105 u32 aqa; 1106 u64 cap; 1107 unsigned long timeout; 1108 struct nvme_queue *nvmeq; 1109 1110 dev->dbs = ((void __iomem *)dev->bar) + 4096; 1111 1112 nvmeq = nvme_alloc_queue(dev, 0, 64, 0); 1113 if (!nvmeq) 1114 return -ENOMEM; 1115 1116 aqa = nvmeq->q_depth - 1; 1117 aqa |= aqa << 16; 1118 1119 
dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; 1120 dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; 1121 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; 1122 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 1123 1124 writel(0, &dev->bar->cc); 1125 writel(aqa, &dev->bar->aqa); 1126 writeq(nvmeq->sq_dma_addr, &dev->bar->asq); 1127 writeq(nvmeq->cq_dma_addr, &dev->bar->acq); 1128 writel(dev->ctrl_config, &dev->bar->cc); 1129 1130 cap = readq(&dev->bar->cap); 1131 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 1132 dev->db_stride = NVME_CAP_STRIDE(cap); 1133 1134 while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { 1135 msleep(100); 1136 if (fatal_signal_pending(current)) 1137 result = -EINTR; 1138 if (time_after(jiffies, timeout)) { 1139 dev_err(&dev->pci_dev->dev, 1140 "Device not ready; aborting initialisation\n"); 1141 result = -ENODEV; 1142 } 1143 } 1144 1145 if (result) 1146 goto free_q; 1147 1148 result = queue_request_irq(dev, nvmeq, "nvme admin"); 1149 if (result) 1150 goto free_q; 1151 1152 dev->queues[0] = nvmeq; 1153 return result; 1154 1155 free_q: 1156 nvme_free_queue_mem(nvmeq); 1157 return result; 1158} 1159 1160struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, 1161 unsigned long addr, unsigned length) 1162{ 1163 int i, err, count, nents, offset; 1164 struct scatterlist *sg; 1165 struct page **pages; 1166 struct nvme_iod *iod; 1167 1168 if (addr & 3) 1169 return ERR_PTR(-EINVAL); 1170 if (!length) 1171 return ERR_PTR(-EINVAL); 1172 1173 offset = offset_in_page(addr); 1174 count = DIV_ROUND_UP(offset + length, PAGE_SIZE); 1175 pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); 1176 if (!pages) 1177 return ERR_PTR(-ENOMEM); 1178 1179 err = get_user_pages_fast(addr, count, 1, pages); 1180 if (err < count) { 1181 count = err; 1182 err = -EFAULT; 1183 goto put_pages; 1184 } 1185 1186 iod = nvme_alloc_iod(count, length, GFP_KERNEL); 1187 sg = iod->sg; 1188 sg_init_table(sg, count); 1189 for (i = 0; i < count; i++) { 1190 sg_set_page(&sg[i], pages[i], 1191 min_t(int, length, PAGE_SIZE - offset), offset); 1192 length -= (PAGE_SIZE - offset); 1193 offset = 0; 1194 } 1195 sg_mark_end(&sg[i - 1]); 1196 iod->nents = count; 1197 1198 err = -ENOMEM; 1199 nents = dma_map_sg(&dev->pci_dev->dev, sg, count, 1200 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 1201 if (!nents) 1202 goto free_iod; 1203 1204 kfree(pages); 1205 return iod; 1206 1207 free_iod: 1208 kfree(iod); 1209 put_pages: 1210 for (i = 0; i < count; i++) 1211 put_page(pages[i]); 1212 kfree(pages); 1213 return ERR_PTR(err); 1214} 1215 1216void nvme_unmap_user_pages(struct nvme_dev *dev, int write, 1217 struct nvme_iod *iod) 1218{ 1219 int i; 1220 1221 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, 1222 write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); 1223 1224 for (i = 0; i < iod->nents; i++) 1225 put_page(sg_page(&iod->sg[i])); 1226} 1227 1228static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 1229{ 1230 struct nvme_dev *dev = ns->dev; 1231 struct nvme_queue *nvmeq; 1232 struct nvme_user_io io; 1233 struct nvme_command c; 1234 unsigned length; 1235 int status; 1236 struct nvme_iod *iod; 1237 1238 if (copy_from_user(&io, uio, sizeof(io))) 1239 return -EFAULT; 1240 length = (io.nblocks + 1) << ns->lba_shift; 1241 1242 switch (io.opcode) { 1243 case nvme_cmd_write: 1244 case nvme_cmd_read: 1245 case nvme_cmd_compare: 1246 iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); 1247 break; 1248 default: 1249 return -EINVAL; 1250 } 1251 1252 if (IS_ERR(iod)) 1253 return PTR_ERR(iod); 1254 1255 memset(&c, 0, sizeof(c)); 1256 c.rw.opcode = io.opcode; 1257 c.rw.flags = io.flags; 1258 c.rw.nsid = cpu_to_le32(ns->ns_id); 1259 c.rw.slba = cpu_to_le64(io.slba); 1260 c.rw.length = cpu_to_le16(io.nblocks); 1261 c.rw.control = cpu_to_le16(io.control); 1262 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 1263 c.rw.reftag = cpu_to_le32(io.reftag); 1264 c.rw.apptag = cpu_to_le16(io.apptag); 1265 c.rw.appmask = cpu_to_le16(io.appmask); 1266 /* XXX: metadata */ 1267 length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); 1268 1269 nvmeq = get_nvmeq(dev); 1270 /* 1271 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption 1272 * disabled. We may be preempted at any point, and be rescheduled 1273 * to a different CPU. That will cause cacheline bouncing, but no 1274 * additional races since q_lock already protects against other CPUs. 1275 */ 1276 put_nvmeq(nvmeq); 1277 if (length != (io.nblocks + 1) << ns->lba_shift) 1278 status = -ENOMEM; 1279 else 1280 status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); 1281 1282 nvme_unmap_user_pages(dev, io.opcode & 1, iod); 1283 nvme_free_iod(dev, iod); 1284 return status; 1285} 1286 1287static int nvme_user_admin_cmd(struct nvme_dev *dev, 1288 struct nvme_admin_cmd __user *ucmd) 1289{ 1290 struct nvme_admin_cmd cmd; 1291 struct nvme_command c; 1292 int status, length; 1293 struct nvme_iod *uninitialized_var(iod); 1294 1295 if (!capable(CAP_SYS_ADMIN)) 1296 return -EACCES; 1297 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1298 return -EFAULT; 1299 1300 memset(&c, 0, sizeof(c)); 1301 c.common.opcode = cmd.opcode; 1302 c.common.flags = cmd.flags; 1303 c.common.nsid = cpu_to_le32(cmd.nsid); 1304 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1305 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1306 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); 1307 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); 1308 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); 1309 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); 1310 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); 1311 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); 1312 1313 length = cmd.data_len; 1314 if (cmd.data_len) { 1315 iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, 1316 length); 1317 if (IS_ERR(iod)) 1318 return PTR_ERR(iod); 1319 length = nvme_setup_prps(dev, &c.common, iod, length, 1320 GFP_KERNEL); 1321 } 1322 1323 if (length != cmd.data_len) 1324 status = -ENOMEM; 1325 else 1326 status = nvme_submit_admin_cmd(dev, &c, &cmd.result); 1327 1328 if (cmd.data_len) { 1329 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); 1330 nvme_free_iod(dev, iod); 1331 } 1332 1333 if (!status && copy_to_user(&ucmd->result, &cmd.result, 1334 sizeof(cmd.result))) 1335 status = -EFAULT; 1336 1337 return status; 1338} 1339 
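/*
 * Descriptive comment (added): per-namespace block device ioctl dispatch.
 * NVME_IOCTL_ID reports the namespace ID, NVME_IOCTL_ADMIN_CMD and
 * NVME_IOCTL_SUBMIT_IO pass user-supplied commands through to the
 * controller, and SG_GET_VERSION_NUM/SG_IO are handled by the SCSI
 * translation layer (nvme_sg_io).
 */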
1340static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, 1341 unsigned long arg) 1342{ 1343 struct nvme_ns *ns = bdev->bd_disk->private_data; 1344 1345 switch (cmd) { 1346 case NVME_IOCTL_ID: 1347 return ns->ns_id; 1348 case NVME_IOCTL_ADMIN_CMD: 1349 return nvme_user_admin_cmd(ns->dev, (void __user *)arg); 1350 case NVME_IOCTL_SUBMIT_IO: 1351 return nvme_submit_io(ns, (void __user *)arg); 1352 case SG_GET_VERSION_NUM: 1353 return nvme_sg_get_version_num((void __user *)arg); 1354 case SG_IO: 1355 return nvme_sg_io(ns, (void __user *)arg); 1356 default: 1357 return -ENOTTY; 1358 } 1359} 1360 1361static const struct block_device_operations nvme_fops = { 1362 .owner = THIS_MODULE, 1363 .ioctl = nvme_ioctl, 1364 .compat_ioctl = nvme_ioctl, 1365}; 1366 1367static void nvme_resubmit_bios(struct nvme_queue *nvmeq) 1368{ 1369 while (bio_list_peek(&nvmeq->sq_cong)) { 1370 struct bio *bio = bio_list_pop(&nvmeq->sq_cong); 1371 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; 1372 1373 if (bio_list_empty(&nvmeq->sq_cong)) 1374 remove_wait_queue(&nvmeq->sq_full, 1375 &nvmeq->sq_cong_wait); 1376 if (nvme_submit_bio_queue(nvmeq, ns, bio)) { 1377 if (bio_list_empty(&nvmeq->sq_cong)) 1378 add_wait_queue(&nvmeq->sq_full, 1379 &nvmeq->sq_cong_wait); 1380 bio_list_add_head(&nvmeq->sq_cong, bio); 1381 break; 1382 } 1383 } 1384} 1385 1386static int nvme_kthread(void *data) 1387{ 1388 struct nvme_dev *dev; 1389 1390 while (!kthread_should_stop()) { 1391 set_current_state(TASK_INTERRUPTIBLE); 1392 spin_lock(&dev_list_lock); 1393 list_for_each_entry(dev, &dev_list, node) { 1394 int i; 1395 for (i = 0; i < dev->queue_count; i++) { 1396 struct nvme_queue *nvmeq = dev->queues[i]; 1397 if (!nvmeq) 1398 continue; 1399 spin_lock_irq(&nvmeq->q_lock); 1400 if (nvme_process_cq(nvmeq)) 1401 printk("process_cq did something\n"); 1402 nvme_cancel_ios(nvmeq, true); 1403 nvme_resubmit_bios(nvmeq); 1404 spin_unlock_irq(&nvmeq->q_lock); 1405 } 1406 } 1407 spin_unlock(&dev_list_lock); 1408 schedule_timeout(round_jiffies_relative(HZ)); 1409 } 1410 return 0; 1411} 1412 1413static DEFINE_IDA(nvme_index_ida); 1414 1415static int nvme_get_ns_idx(void) 1416{ 1417 int index, error; 1418 1419 do { 1420 if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL)) 1421 return -1; 1422 1423 spin_lock(&dev_list_lock); 1424 error = ida_get_new(&nvme_index_ida, &index); 1425 spin_unlock(&dev_list_lock); 1426 } while (error == -EAGAIN); 1427 1428 if (error) 1429 index = -1; 1430 return index; 1431} 1432 1433static void nvme_put_ns_idx(int index) 1434{ 1435 spin_lock(&dev_list_lock); 1436 ida_remove(&nvme_index_ida, index); 1437 spin_unlock(&dev_list_lock); 1438} 1439 1440static void nvme_config_discard(struct nvme_ns *ns) 1441{ 1442 u32 logical_block_size = queue_logical_block_size(ns->queue); 1443 ns->queue->limits.discard_zeroes_data = 0; 1444 ns->queue->limits.discard_alignment = logical_block_size; 1445 ns->queue->limits.discard_granularity = logical_block_size; 1446 ns->queue->limits.max_discard_sectors = 0xffffffff; 1447 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); 1448} 1449 1450static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, 1451 struct nvme_id_ns *id, struct nvme_lba_range_type *rt) 1452{ 1453 struct nvme_ns *ns; 1454 struct gendisk *disk; 1455 int lbaf; 1456 1457 if (rt->attributes & NVME_LBART_ATTRIB_HIDE) 1458 return NULL; 1459 1460 ns = kzalloc(sizeof(*ns), GFP_KERNEL); 1461 if (!ns) 1462 return NULL; 1463 ns->queue = blk_alloc_queue(GFP_KERNEL); 1464 if (!ns->queue) 1465 goto 
out_free_ns; 1466 ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; 1467 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); 1468 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 1469 blk_queue_make_request(ns->queue, nvme_make_request); 1470 ns->dev = dev; 1471 ns->queue->queuedata = ns; 1472 1473 disk = alloc_disk(NVME_MINORS); 1474 if (!disk) 1475 goto out_free_queue; 1476 ns->ns_id = nsid; 1477 ns->disk = disk; 1478 lbaf = id->flbas & 0xf; 1479 ns->lba_shift = id->lbaf[lbaf].ds; 1480 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1481 if (dev->max_hw_sectors) 1482 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); 1483 1484 disk->major = nvme_major; 1485 disk->minors = NVME_MINORS; 1486 disk->first_minor = NVME_MINORS * nvme_get_ns_idx(); 1487 disk->fops = &nvme_fops; 1488 disk->private_data = ns; 1489 disk->queue = ns->queue; 1490 disk->driverfs_dev = &dev->pci_dev->dev; 1491 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); 1492 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); 1493 1494 if (dev->oncs & NVME_CTRL_ONCS_DSM) 1495 nvme_config_discard(ns); 1496 1497 return ns; 1498 1499 out_free_queue: 1500 blk_cleanup_queue(ns->queue); 1501 out_free_ns: 1502 kfree(ns); 1503 return NULL; 1504} 1505 1506static void nvme_ns_free(struct nvme_ns *ns) 1507{ 1508 int index = ns->disk->first_minor / NVME_MINORS; 1509 put_disk(ns->disk); 1510 nvme_put_ns_idx(index); 1511 blk_cleanup_queue(ns->queue); 1512 kfree(ns); 1513} 1514 1515static int set_queue_count(struct nvme_dev *dev, int count) 1516{ 1517 int status; 1518 u32 result; 1519 u32 q_count = (count - 1) | ((count - 1) << 16); 1520 1521 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, 1522 &result); 1523 if (status) 1524 return -EIO; 1525 return min(result & 0xffff, result >> 16) + 1; 1526} 1527 1528static int nvme_setup_io_queues(struct nvme_dev *dev) 1529{ 1530 int result, cpu, i, nr_io_queues, db_bar_size, q_depth; 1531 1532 nr_io_queues = num_online_cpus(); 1533 result = set_queue_count(dev, nr_io_queues); 1534 if (result < 0) 1535 return result; 1536 if (result < nr_io_queues) 1537 nr_io_queues = result; 1538 1539 /* Deregister the admin queue's interrupt */ 1540 free_irq(dev->entry[0].vector, dev->queues[0]); 1541 1542 db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); 1543 if (db_bar_size > 8192) { 1544 iounmap(dev->bar); 1545 dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), 1546 db_bar_size); 1547 dev->dbs = ((void __iomem *)dev->bar) + 4096; 1548 dev->queues[0]->q_db = dev->dbs; 1549 } 1550 1551 for (i = 0; i < nr_io_queues; i++) 1552 dev->entry[i].entry = i; 1553 for (;;) { 1554 result = pci_enable_msix(dev->pci_dev, dev->entry, 1555 nr_io_queues); 1556 if (result == 0) { 1557 break; 1558 } else if (result > 0) { 1559 nr_io_queues = result; 1560 continue; 1561 } else { 1562 nr_io_queues = 1; 1563 break; 1564 } 1565 } 1566 1567 result = queue_request_irq(dev, dev->queues[0], "nvme admin"); 1568 /* XXX: handle failure here */ 1569 1570 cpu = cpumask_first(cpu_online_mask); 1571 for (i = 0; i < nr_io_queues; i++) { 1572 irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); 1573 cpu = cpumask_next(cpu, cpu_online_mask); 1574 } 1575 1576 q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, 1577 NVME_Q_DEPTH); 1578 for (i = 0; i < nr_io_queues; i++) { 1579 dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); 1580 if (IS_ERR(dev->queues[i + 1])) 1581 return PTR_ERR(dev->queues[i + 1]); 1582 dev->queue_count++; 
1583 } 1584 1585 for (; i < num_possible_cpus(); i++) { 1586 int target = i % rounddown_pow_of_two(dev->queue_count - 1); 1587 dev->queues[i + 1] = dev->queues[target + 1]; 1588 } 1589 1590 return 0; 1591} 1592 1593static void nvme_free_queues(struct nvme_dev *dev) 1594{ 1595 int i; 1596 1597 for (i = dev->queue_count - 1; i >= 0; i--) 1598 nvme_free_queue(dev, i); 1599} 1600 1601/* 1602 * Return: error value if an error occurred setting up the queues or calling 1603 * Identify Device. 0 if these succeeded, even if adding some of the 1604 * namespaces failed. At the moment, these failures are silent. TBD which 1605 * failures should be reported. 1606 */ 1607static int nvme_dev_add(struct nvme_dev *dev) 1608{ 1609 int res, nn, i; 1610 struct nvme_ns *ns; 1611 struct nvme_id_ctrl *ctrl; 1612 struct nvme_id_ns *id_ns; 1613 void *mem; 1614 dma_addr_t dma_addr; 1615 1616 res = nvme_setup_io_queues(dev); 1617 if (res) 1618 return res; 1619 1620 mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, 1621 GFP_KERNEL); 1622 if (!mem) 1623 return -ENOMEM; 1624 1625 res = nvme_identify(dev, 0, 1, dma_addr); 1626 if (res) { 1627 res = -EIO; 1628 goto out; 1629 } 1630 1631 ctrl = mem; 1632 nn = le32_to_cpup(&ctrl->nn); 1633 dev->oncs = le16_to_cpup(&ctrl->oncs); 1634 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); 1635 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); 1636 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); 1637 if (ctrl->mdts) { 1638 int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; 1639 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); 1640 } 1641 1642 id_ns = mem; 1643 for (i = 1; i <= nn; i++) { 1644 res = nvme_identify(dev, i, 0, dma_addr); 1645 if (res) 1646 continue; 1647 1648 if (id_ns->ncap == 0) 1649 continue; 1650 1651 res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, 1652 dma_addr + 4096, NULL); 1653 if (res) 1654 memset(mem + 4096, 0, 4096); 1655 1656 ns = nvme_alloc_ns(dev, i, mem, mem + 4096); 1657 if (ns) 1658 list_add_tail(&ns->list, &dev->namespaces); 1659 } 1660 list_for_each_entry(ns, &dev->namespaces, list) 1661 add_disk(ns->disk); 1662 res = 0; 1663 1664 out: 1665 dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); 1666 return res; 1667} 1668 1669static int nvme_dev_remove(struct nvme_dev *dev) 1670{ 1671 struct nvme_ns *ns, *next; 1672 1673 spin_lock(&dev_list_lock); 1674 list_del(&dev->node); 1675 spin_unlock(&dev_list_lock); 1676 1677 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 1678 list_del(&ns->list); 1679 del_gendisk(ns->disk); 1680 nvme_ns_free(ns); 1681 } 1682 1683 nvme_free_queues(dev); 1684 1685 return 0; 1686} 1687 1688static int nvme_setup_prp_pools(struct nvme_dev *dev) 1689{ 1690 struct device *dmadev = &dev->pci_dev->dev; 1691 dev->prp_page_pool = dma_pool_create("prp list page", dmadev, 1692 PAGE_SIZE, PAGE_SIZE, 0); 1693 if (!dev->prp_page_pool) 1694 return -ENOMEM; 1695 1696 /* Optimisation for I/Os between 4k and 128k */ 1697 dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, 1698 256, 256, 0); 1699 if (!dev->prp_small_pool) { 1700 dma_pool_destroy(dev->prp_page_pool); 1701 return -ENOMEM; 1702 } 1703 return 0; 1704} 1705 1706static void nvme_release_prp_pools(struct nvme_dev *dev) 1707{ 1708 dma_pool_destroy(dev->prp_page_pool); 1709 dma_pool_destroy(dev->prp_small_pool); 1710} 1711 1712static DEFINE_IDA(nvme_instance_ida); 1713 1714static int nvme_set_instance(struct nvme_dev *dev) 1715{ 1716 int instance, error; 1717 1718 do { 1719 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) 1720 
return -ENODEV; 1721 1722 spin_lock(&dev_list_lock); 1723 error = ida_get_new(&nvme_instance_ida, &instance); 1724 spin_unlock(&dev_list_lock); 1725 } while (error == -EAGAIN); 1726 1727 if (error) 1728 return -ENODEV; 1729 1730 dev->instance = instance; 1731 return 0; 1732} 1733 1734static void nvme_release_instance(struct nvme_dev *dev) 1735{ 1736 spin_lock(&dev_list_lock); 1737 ida_remove(&nvme_instance_ida, dev->instance); 1738 spin_unlock(&dev_list_lock); 1739} 1740 1741static void nvme_free_dev(struct kref *kref) 1742{ 1743 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); 1744 nvme_dev_remove(dev); 1745 pci_disable_msix(dev->pci_dev); 1746 iounmap(dev->bar); 1747 nvme_release_instance(dev); 1748 nvme_release_prp_pools(dev); 1749 pci_disable_device(dev->pci_dev); 1750 pci_release_regions(dev->pci_dev); 1751 kfree(dev->queues); 1752 kfree(dev->entry); 1753 kfree(dev); 1754} 1755 1756static int nvme_dev_open(struct inode *inode, struct file *f) 1757{ 1758 struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev, 1759 miscdev); 1760 kref_get(&dev->kref); 1761 f->private_data = dev; 1762 return 0; 1763} 1764 1765static int nvme_dev_release(struct inode *inode, struct file *f) 1766{ 1767 struct nvme_dev *dev = f->private_data; 1768 kref_put(&dev->kref, nvme_free_dev); 1769 return 0; 1770} 1771 1772static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1773{ 1774 struct nvme_dev *dev = f->private_data; 1775 switch (cmd) { 1776 case NVME_IOCTL_ADMIN_CMD: 1777 return nvme_user_admin_cmd(dev, (void __user *)arg); 1778 default: 1779 return -ENOTTY; 1780 } 1781} 1782 1783static const struct file_operations nvme_dev_fops = { 1784 .owner = THIS_MODULE, 1785 .open = nvme_dev_open, 1786 .release = nvme_dev_release, 1787 .unlocked_ioctl = nvme_dev_ioctl, 1788 .compat_ioctl = nvme_dev_ioctl, 1789}; 1790 1791static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 1792{ 1793 int bars, result = -ENOMEM; 1794 struct nvme_dev *dev; 1795 1796 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 1797 if (!dev) 1798 return -ENOMEM; 1799 dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), 1800 GFP_KERNEL); 1801 if (!dev->entry) 1802 goto free; 1803 dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), 1804 GFP_KERNEL); 1805 if (!dev->queues) 1806 goto free; 1807 1808 if (pci_enable_device_mem(pdev)) 1809 goto free; 1810 pci_set_master(pdev); 1811 bars = pci_select_bars(pdev, IORESOURCE_MEM); 1812 if (pci_request_selected_regions(pdev, bars, "nvme")) 1813 goto disable; 1814 1815 INIT_LIST_HEAD(&dev->namespaces); 1816 dev->pci_dev = pdev; 1817 pci_set_drvdata(pdev, dev); 1818 dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); 1819 dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); 1820 result = nvme_set_instance(dev); 1821 if (result) 1822 goto disable; 1823 1824 dev->entry[0].vector = pdev->irq; 1825 1826 result = nvme_setup_prp_pools(dev); 1827 if (result) 1828 goto disable_msix; 1829 1830 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 1831 if (!dev->bar) { 1832 result = -ENOMEM; 1833 goto disable_msix; 1834 } 1835 1836 result = nvme_configure_admin_queue(dev); 1837 if (result) 1838 goto unmap; 1839 dev->queue_count++; 1840 1841 spin_lock(&dev_list_lock); 1842 list_add(&dev->node, &dev_list); 1843 spin_unlock(&dev_list_lock); 1844 1845 result = nvme_dev_add(dev); 1846 if (result) 1847 goto delete; 1848 1849 scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); 1850 dev->miscdev.minor = MISC_DYNAMIC_MINOR; 1851 
dev->miscdev.parent = &pdev->dev; 1852 dev->miscdev.name = dev->name; 1853 dev->miscdev.fops = &nvme_dev_fops; 1854 result = misc_register(&dev->miscdev); 1855 if (result) 1856 goto remove; 1857 1858 kref_init(&dev->kref); 1859 return 0; 1860 1861 remove: 1862 nvme_dev_remove(dev); 1863 delete: 1864 spin_lock(&dev_list_lock); 1865 list_del(&dev->node); 1866 spin_unlock(&dev_list_lock); 1867 1868 nvme_free_queues(dev); 1869 unmap: 1870 iounmap(dev->bar); 1871 disable_msix: 1872 pci_disable_msix(pdev); 1873 nvme_release_instance(dev); 1874 nvme_release_prp_pools(dev); 1875 disable: 1876 pci_disable_device(pdev); 1877 pci_release_regions(pdev); 1878 free: 1879 kfree(dev->queues); 1880 kfree(dev->entry); 1881 kfree(dev); 1882 return result; 1883} 1884 1885static void nvme_remove(struct pci_dev *pdev) 1886{ 1887 struct nvme_dev *dev = pci_get_drvdata(pdev); 1888 misc_deregister(&dev->miscdev); 1889 kref_put(&dev->kref, nvme_free_dev); 1890} 1891 1892/* These functions are yet to be implemented */ 1893#define nvme_error_detected NULL 1894#define nvme_dump_registers NULL 1895#define nvme_link_reset NULL 1896#define nvme_slot_reset NULL 1897#define nvme_error_resume NULL 1898#define nvme_suspend NULL 1899#define nvme_resume NULL 1900 1901static const struct pci_error_handlers nvme_err_handler = { 1902 .error_detected = nvme_error_detected, 1903 .mmio_enabled = nvme_dump_registers, 1904 .link_reset = nvme_link_reset, 1905 .slot_reset = nvme_slot_reset, 1906 .resume = nvme_error_resume, 1907}; 1908 1909/* Move to pci_ids.h later */ 1910#define PCI_CLASS_STORAGE_EXPRESS 0x010802 1911 1912static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { 1913 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 1914 { 0, } 1915}; 1916MODULE_DEVICE_TABLE(pci, nvme_id_table); 1917 1918static struct pci_driver nvme_driver = { 1919 .name = "nvme", 1920 .id_table = nvme_id_table, 1921 .probe = nvme_probe, 1922 .remove = nvme_remove, 1923 .suspend = nvme_suspend, 1924 .resume = nvme_resume, 1925 .err_handler = &nvme_err_handler, 1926}; 1927 1928static int __init nvme_init(void) 1929{ 1930 int result; 1931 1932 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); 1933 if (IS_ERR(nvme_thread)) 1934 return PTR_ERR(nvme_thread); 1935 1936 result = register_blkdev(nvme_major, "nvme"); 1937 if (result < 0) 1938 goto kill_kthread; 1939 else if (result > 0) 1940 nvme_major = result; 1941 1942 result = pci_register_driver(&nvme_driver); 1943 if (result) 1944 goto unregister_blkdev; 1945 return 0; 1946 1947 unregister_blkdev: 1948 unregister_blkdev(nvme_major, "nvme"); 1949 kill_kthread: 1950 kthread_stop(nvme_thread); 1951 return result; 1952} 1953 1954static void __exit nvme_exit(void) 1955{ 1956 pci_unregister_driver(&nvme_driver); 1957 unregister_blkdev(nvme_major, "nvme"); 1958 kthread_stop(nvme_thread); 1959} 1960 1961MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); 1962MODULE_LICENSE("GPL"); 1963MODULE_VERSION("0.8"); 1964module_init(nvme_init); 1965module_exit(nvme_exit); 1966