/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>

#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_Q_DEPTH	1024
#define SQ_SIZE(depth)	(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)	(depth * sizeof(struct nvme_completion))
#define NVME_MINORS	64
#define NVME_IO_TIMEOUT	(5 * HZ)
#define ADMIN_TIMEOUT	(60 * HZ)

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	u32 __iomem *dbs;
	struct pci_dev *pci_dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	int instance;
	int queue_count;
	int db_stride;
	u32 ctrl_config;
	struct msix_entry *entry;
	struct nvme_bar __iomem *bar;
	struct list_head namespaces;
	char serial[20];
	char model[40];
	char firmware_rev[8];
};

/*
 * An NVM Express namespace is equivalent to a SCSI LUN
 */
struct nvme_ns {
	struct list_head list;

	struct nvme_dev *dev;
	struct request_queue *queue;
	struct gendisk *disk;

	int ns_id;
	int lba_shift;
};

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 cq_phase;
	unsigned long cmdid_data[];
};
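
/*
 * The cmdid_data[] trailer is laid out as a bitmap of q_depth command IDs
 * followed by an array of struct nvme_cmd_info, one entry per command ID
 * (see nvme_cmd_info() below and the 'extra' calculation in
 * nvme_alloc_queue()).
 */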

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
}

typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	unsigned long timeout;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The function to call on completion
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}

/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)
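
/*
 * special_completion() handles completions whose context is one of the
 * sentinel values above rather than a real command context: cancelled and
 * flush contexts need no further work, while a completed or invalid command
 * ID indicates a misbehaving controller and is logged.
 */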

static void special_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_FLUSH)
		return;
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(&dev->pci_dev->dev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(&dev->pci_dev->dev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}

	dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

	if (cmdid >= nvmeq->q_depth) {
		*fn = special_completion;
		return CMD_CTX_INVALID;
	}
	*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_COMPLETED;
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return ctx;
}

static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_CANCELLED;
	return ctx;
}

static struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
{
	return dev->queues[get_cpu() + 1];
}

static void put_nvmeq(struct nvme_queue *nvmeq)
{
	put_cpu();
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}

/*
 * The nvme_iod describes the data in an I/O, including the list of PRP
 * entries.  You can't see it in this data structure because C doesn't let
 * me express that.  Use nvme_alloc_iod to ensure there's enough space
 * allocated to store the PRP list.
 */
struct nvme_iod {
	void *private;		/* For the use of the submitter of the I/O */
	int npages;		/* In the PRP list. 0 means small pool in use */
	int offset;		/* Of PRP list */
	int nents;		/* Used in scatterlist */
	int length;		/* Of data, in bytes */
	dma_addr_t first_dma;
	struct scatterlist sg[0];
};

static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size)
{
	unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static struct nvme_iod *
nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(nbytes) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod) {
		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
		iod->npages = -1;
		iod->length = nbytes;
	}

	return iod;
}

static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = PAGE_SIZE / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}
	kfree(iod);
}
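
/*
 * If nvme_map_bio() stops at a bio_vec that can't be virtually merged into
 * the current scatterlist, only the leading part of the bio is submitted.
 * bio_completion() notices that bi_idx hasn't reached bi_vcnt and requeues
 * the remainder here so it can be submitted later.
 */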

static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
{
	struct nvme_queue *nvmeq = get_nvmeq(dev);
	if (bio_list_empty(&nvmeq->sq_cong))
		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
	bio_list_add(&nvmeq->sq_cong, bio);
	put_nvmeq(nvmeq);
	wake_up_process(nvme_thread);
}

static void bio_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct bio *bio = iod->private;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	nvme_free_iod(dev, iod);
	if (status) {
		bio_endio(bio, -EIO);
	} else if (bio->bi_vcnt > bio->bi_idx) {
		requeue_bio(dev, bio);
	} else {
		bio_endio(bio, 0);
	}
}
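
/*
 * PRP1 covers the first (possibly unaligned) page of the transfer.  For
 * transfers that span more than two pages, PRP2 points at a PRP list
 * allocated from one of the DMA pools; when a list page fills up, its final
 * entry is replaced with the address of the next list page, chaining the
 * lists together.  nvme_free_iod() walks the same chain to free them.
 */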

/* length is in bytes.  gfp flags indicate whether we may sleep. */
static int nvme_setup_prps(struct nvme_dev *dev,
			struct nvme_common_command *cmd, struct nvme_iod *iod,
			int total_len, gfp_t gfp)
{
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = offset_in_page(dma_addr);
	__le64 *prp_list;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma;
	int nprps, i;

	cmd->prp1 = cpu_to_le64(dma_addr);
	length -= (PAGE_SIZE - offset);
	if (length <= 0)
		return total_len;

	dma_len -= (PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= PAGE_SIZE) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		return total_len;
	}

	nprps = DIV_ROUND_UP(length, PAGE_SIZE);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
	if (!prp_list) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		iod->npages = -1;
		return (total_len - length) + PAGE_SIZE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	cmd->prp2 = cpu_to_le64(prp_dma);
	i = 0;
	for (;;) {
		if (i == PAGE_SIZE / 8) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
			if (!prp_list)
				return total_len - length;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= PAGE_SIZE;
		dma_addr += PAGE_SIZE;
		length -= PAGE_SIZE;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return total_len;
}

/* NVMe scatterlists require no holes in the virtual address */
#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))

static int nvme_map_bio(struct device *dev, struct nvme_iod *iod,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec *bvec, *bvprv = NULL;
	struct scatterlist *sg = NULL;
	int i, old_idx, length = 0, nsegs = 0;

	sg_init_table(iod->sg, psegs);
	old_idx = bio->bi_idx;
	bio_for_each_segment(bvec, bio, i) {
		if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
			sg->length += bvec->bv_len;
		} else {
			if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
				break;
			sg = sg ? sg + 1 : iod->sg;
			sg_set_page(sg, bvec->bv_page, bvec->bv_len,
							bvec->bv_offset);
			nsegs++;
		}
		length += bvec->bv_len;
		bvprv = bvec;
	}
	bio->bi_idx = i;
	iod->nents = nsegs;
	sg_mark_end(sg);
	if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) {
		bio->bi_idx = old_idx;
		return -ENOMEM;
	}
	return length;
}
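
/*
 * A bio with REQ_FLUSH set and a data payload is handled as two commands:
 * a flush submitted with the CMD_CTX_FLUSH sentinel (so its completion needs
 * no bio work) followed by the write itself.  An empty flush bio reuses the
 * bio's own command ID, so bio_endio() runs when the flush completes.
 */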

static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								int cmdid)
{
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.command_id = cmdid;
	cmnd->common.nsid = cpu_to_le32(ns->ns_id);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
{
	int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
					special_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		return cmdid;

	return nvme_submit_flush(nvmeq, ns, cmdid);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_command *cmnd;
	struct nvme_iod *iod;
	enum dma_data_direction dma_dir;
	int cmdid, length, result = -ENOMEM;
	u16 control;
	u32 dsmgmt;
	int psegs = bio_phys_segments(ns->queue, bio);

	if ((bio->bi_rw & REQ_FLUSH) && psegs) {
		result = nvme_submit_flush_data(nvmeq, ns);
		if (result)
			return result;
	}

	iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
	if (!iod)
		goto nomem;
	iod->private = bio;

	result = -EBUSY;
	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		goto free_iod;

	if ((bio->bi_rw & REQ_FLUSH) && !psegs)
		return nvme_submit_flush(nvmeq, ns, cmdid);

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
		cmnd->rw.opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->rw.opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs);
	if (result < 0)
		goto free_iod;
	length = result;

	cmnd->rw.command_id = cmdid;
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
								GFP_ATOMIC);
	cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
	cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);

	bio->bi_sector += length >> 9;

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;

 free_iod:
	nvme_free_iod(nvmeq->dev, iod);
 nomem:
	return result;
}

static void nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
	int result = -EBUSY;

	spin_lock_irq(&nvmeq->q_lock);
	if (bio_list_empty(&nvmeq->sq_cong))
		result = nvme_submit_bio_queue(nvmeq, ns, bio);
	if (unlikely(result)) {
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add(&nvmeq->sq_cong, bio);
	}

	spin_unlock_irq(&nvmeq->q_lock);
	put_nvmeq(nvmeq);
}
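
/*
 * A completion entry is new only if its phase bit matches cq_phase.  Each
 * time the head index wraps past the end of the queue the expected phase
 * flips, so entries left over from the previous pass are ignored.
 */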

static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return IRQ_NONE;

	writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	return IRQ_HANDLED;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	result = nvme_process_cq(nvmeq);
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
	spin_lock_irq(&nvmeq->q_lock);
	cancel_cmdid(nvmeq, cmdid, NULL);
	spin_unlock_irq(&nvmeq->q_lock);
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}
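
/*
 * Synchronous commands put the submitting task to sleep in TASK_KILLABLE
 * state; sync_completion() fills in the result and status from the
 * completion entry and wakes the task.  If the waiter is killed first, the
 * command is cancelled so a late completion is quietly dropped.
 */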

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
			struct nvme_command *cmd, u32 *result, unsigned timeout)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
								timeout);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_KILLABLE);
	nvme_submit_cmd(nvmeq, cmd);
	schedule();

	if (cmdinfo.status == -EINTR) {
		nvme_abort_command(nvmeq, cmdid);
		return -EINTR;
	}

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}

static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
							dma_addr_t dma_addr)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);
	c.identify.prp1 = cpu_to_le64(dma_addr);
	c.identify.cns = cpu_to_le32(cns);

	return nvme_submit_admin_cmd(dev, &c, NULL);
}

static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
				unsigned dword11, dma_addr_t dma_addr)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return nvme_submit_admin_cmd(dev, &c, NULL);
}

static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
			unsigned dword11, dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return nvme_submit_admin_cmd(dev, &c, result);
}

static void nvme_free_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];
	int vector = dev->entry[nvmeq->cq_vector].vector;

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	/* Don't tell the adapter to delete the admin queue */
	if (qid) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}
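
/*
 * Each queue owns a pair of doorbell registers: the submission queue tail
 * followed by the completion queue head, spaced 1 << db_stride dwords apart
 * (a stride of 0 means adjacent 32-bit registers).  q_db points at the SQ
 * tail doorbell; nvme_process_cq() writes the CQ head doorbell at
 * q_db + (1 << db_stride).
 */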

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info));
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;

	return nvmeq;

 free_cqdma:
	/* q_depth isn't set yet on this path, so free with the passed depth */
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq,
					IRQF_DISABLED | IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
}

static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
					int qid, int cq_size, int vector)
{
	int result;
	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);

	if (!nvmeq)
		return ERR_PTR(-ENOMEM);

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		goto free_nvmeq;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, "nvme");
	if (result < 0)
		goto release_sq;

	return nvmeq;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
 free_nvmeq:
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
	return ERR_PTR(result);
}
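
/*
 * The admin queue is not created with Create I/O Queue commands; its size
 * and base addresses are written straight into the AQA, ASQ and ACQ
 * registers.  After enabling the controller, CSTS.RDY is polled for up to
 * the timeout advertised in the CAP register (in 500ms units).
 */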

static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap;
	unsigned long timeout;
	struct nvme_queue *nvmeq;

	dev->dbs = ((void __iomem *)dev->bar) + 4096;

	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
	if (!nvmeq)
		return -ENOMEM;

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(0, &dev->bar->cc);
	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	cap = readq(&dev->bar->cap);
	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
	dev->db_stride = NVME_CAP_STRIDE(cap);

	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device not ready; aborting initialisation\n");
			return -ENODEV;
		}
	}

	result = queue_request_irq(dev, nvmeq, "nvme admin");
	dev->queues[0] = nvmeq;
	return result;
}
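
/*
 * Commands submitted through the ioctl interface DMA directly to and from
 * user memory: the buffer is pinned with get_user_pages_fast(), described
 * by a scatterlist and DMA-mapped.  nvme_unmap_user_pages() undoes the
 * mapping and drops the page references.
 */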

static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, unsigned length)
{
	int i, err, count, nents, offset;
	struct scatterlist *sg;
	struct page **pages;
	struct nvme_iod *iod;

	if (addr & 3)
		return ERR_PTR(-EINVAL);
	if (!length)
		return ERR_PTR(-EINVAL);

	offset = offset_in_page(addr);
	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);

	err = get_user_pages_fast(addr, count, 1, pages);
	if (err < count) {
		count = err;
		err = -EFAULT;
		goto put_pages;
	}

	iod = nvme_alloc_iod(count, length, GFP_KERNEL);
	sg = iod->sg;
	sg_init_table(sg, count);
	for (i = 0; i < count; i++) {
		sg_set_page(&sg[i], pages[i],
				min_t(int, length, PAGE_SIZE - offset), offset);
		length -= (PAGE_SIZE - offset);
		offset = 0;
	}
	sg_mark_end(&sg[i - 1]);
	iod->nents = count;

	err = -ENOMEM;
	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (!nents)
		goto free_iod;

	kfree(pages);
	return iod;

 free_iod:
	kfree(iod);
 put_pages:
	for (i = 0; i < count; i++)
		put_page(pages[i]);
	kfree(pages);
	return ERR_PTR(err);
}

static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
			struct nvme_iod *iod)
{
	int i;

	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);

	for (i = 0; i < iod->nents; i++)
		put_page(sg_page(&iod->sg[i]));
}

static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_queue *nvmeq;
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length;
	int status;
	struct nvme_iod *iod;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	length = (io.nblocks + 1) << ns->lba_shift;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
		break;
	default:
		return -EINVAL;
	}

	if (IS_ERR(iod))
		return PTR_ERR(iod);

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = io.reftag;
	c.rw.apptag = io.apptag;
	c.rw.appmask = io.appmask;
	/* XXX: metadata */
	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);

	nvmeq = get_nvmeq(dev);
	/*
	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
	 * disabled.  We may be preempted at any point, and be rescheduled
	 * to a different CPU.  That will cause cacheline bouncing, but no
	 * additional races since q_lock already protects against other CPUs.
	 */
	put_nvmeq(nvmeq);
	if (length != (io.nblocks + 1) << ns->lba_shift)
		status = -ENOMEM;
	else
		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);

	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
	nvme_free_iod(dev, iod);
	return status;
}

static int nvme_user_admin_cmd(struct nvme_ns *ns,
					struct nvme_admin_cmd __user *ucmd)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_admin_cmd cmd;
	struct nvme_command c;
	int status, length;
	struct nvme_iod *iod;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	length = cmd.data_len;
	if (cmd.data_len) {
		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
								length);
		if (IS_ERR(iod))
			return PTR_ERR(iod);
		length = nvme_setup_prps(dev, &c.common, iod, length,
								GFP_KERNEL);
	}

	if (length != cmd.data_len)
		status = -ENOMEM;
	else
		status = nvme_submit_admin_cmd(dev, &c, NULL);

	if (cmd.data_len) {
		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
		nvme_free_iod(dev, iod);
	}
	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(ns, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_ioctl,
};

static void nvme_timeout_ios(struct nvme_queue *nvmeq)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	unsigned long now = jiffies;
	int cmdid;

	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
		void *ctx;
		nvme_completion_fn fn;
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1,
		};

		if (!time_after(now, info[cmdid].timeout))
			continue;
		dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid);
		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}
}

static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
{
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
			bio_list_add_head(&nvmeq->sq_cong, bio);
			break;
		}
		if (bio_list_empty(&nvmeq->sq_cong))
			remove_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
	}
}
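
/*
 * One kernel thread polls every registered device roughly once a second.
 * It reaps completions in case an interrupt was missed, fails commands that
 * have exceeded their timeout, and resubmits bios parked on sq_cong while
 * the submission queue was full.
 */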

static int nvme_kthread(void *data)
{
	struct nvme_dev *dev;

	while (!kthread_should_stop()) {
		__set_current_state(TASK_RUNNING);
		spin_lock(&dev_list_lock);
		list_for_each_entry(dev, &dev_list, node) {
			int i;
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				if (nvme_process_cq(nvmeq))
					printk("process_cq did something\n");
				nvme_timeout_ios(nvmeq);
				nvme_resubmit_bios(nvmeq);
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);
	}
	return 0;
}

static DEFINE_IDA(nvme_index_ida);

static int nvme_get_ns_idx(void)
{
	int index, error;

	do {
		if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
			return -1;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_index_ida, &index);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		index = -1;
	return index;
}

static void nvme_put_ns_idx(int index)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_index_ida, index);
	spin_unlock(&dev_list_lock);
}

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
/*	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(NVME_MINORS);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = nsid;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;

	disk->major = nvme_major;
	disk->minors = NVME_MINORS;
	disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = &dev->pci_dev->dev;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}

static void nvme_ns_free(struct nvme_ns *ns)
{
	int index = ns->disk->first_minor / NVME_MINORS;
	put_disk(ns->disk);
	nvme_put_ns_idx(index);
	blk_cleanup_queue(ns->queue);
	kfree(ns);
}

static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
								&result);
	if (status)
		return -EIO;
	return min(result & 0xffff, result >> 16) + 1;
}
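
/*
 * One I/O queue pair is created per online CPU, limited by what the
 * controller grants through the Number of Queues feature, and each queue's
 * MSI-X vector is affinitised to its CPU.  If the doorbell registers for
 * that many queues don't fit in the initial 8K BAR mapping, the BAR is
 * remapped.  Possible-but-offline CPUs share the queues that were created.
 */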

static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
{
	int result, cpu, i, nr_io_queues, db_bar_size;

	nr_io_queues = num_online_cpus();
	result = set_queue_count(dev, nr_io_queues);
	if (result < 0)
		return result;
	if (result < nr_io_queues)
		nr_io_queues = result;

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, dev->queues[0]);

	db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
	if (db_bar_size > 8192) {
		iounmap(dev->bar);
		dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0),
								db_bar_size);
		dev->dbs = ((void __iomem *)dev->bar) + 4096;
		dev->queues[0]->q_db = dev->dbs;
	}

	for (i = 0; i < nr_io_queues; i++)
		dev->entry[i].entry = i;
	for (;;) {
		result = pci_enable_msix(dev->pci_dev, dev->entry,
								nr_io_queues);
		if (result == 0) {
			break;
		} else if (result > 0) {
			nr_io_queues = result;
			continue;
		} else {
			nr_io_queues = 1;
			break;
		}
	}

	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
	/* XXX: handle failure here */

	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_io_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	for (i = 0; i < nr_io_queues; i++) {
		dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
							NVME_Q_DEPTH, i);
		if (IS_ERR(dev->queues[i + 1]))
			return PTR_ERR(dev->queues[i + 1]);
		dev->queue_count++;
	}

	for (; i < num_possible_cpus(); i++) {
		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
		dev->queues[i + 1] = dev->queues[target + 1];
	}

	return 0;
}

static void nvme_free_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_free_queue(dev, i);
}
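
/*
 * Controller initialisation uses a single 8K DMA buffer: the first 4K
 * receives Identify data (the controller first, then each namespace in
 * turn) and the second 4K receives the LBA Range Type feature, which
 * decides whether a namespace is hidden from the block layer.
 */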

static int __devinit nvme_dev_add(struct nvme_dev *dev)
{
	int res, nn, i;
	struct nvme_ns *ns, *next;
	struct nvme_id_ctrl *ctrl;
	struct nvme_id_ns *id_ns;
	void *mem;
	dma_addr_t dma_addr;

	res = nvme_setup_io_queues(dev);
	if (res)
		return res;

	mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
								GFP_KERNEL);

	res = nvme_identify(dev, 0, 1, dma_addr);
	if (res) {
		res = -EIO;
		goto out_free;
	}

	ctrl = mem;
	nn = le32_to_cpup(&ctrl->nn);
	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));

	id_ns = mem;
	for (i = 1; i <= nn; i++) {
		res = nvme_identify(dev, i, 0, dma_addr);
		if (res)
			continue;

		if (id_ns->ncap == 0)
			continue;

		res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
							dma_addr + 4096);
		if (res)
			continue;

		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);

	goto out;

 out_free:
	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		nvme_ns_free(ns);
	}

 out:
	dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
	return res;
}

static int nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	spin_lock(&dev_list_lock);
	list_del(&dev->node);
	spin_unlock(&dev_list_lock);

	/* TODO: wait all I/O finished or cancel them */

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		del_gendisk(ns->disk);
		nvme_ns_free(ns);
	}

	nvme_free_queues(dev);

	return 0;
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	struct device *dmadev = &dev->pci_dev->dev;
	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

/* XXX: Use an ida or something to let remove / add work correctly */
static void nvme_set_instance(struct nvme_dev *dev)
{
	static int instance;
	dev->instance = instance++;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
}
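
/*
 * Probe order: allocate the per-CPU queue pointer array (entry 0 is the
 * admin queue, entry n + 1 the I/O queue used by CPU n; see get_nvmeq()),
 * enable the PCI device, set up 64-bit DMA masks and the PRP pools, map the
 * register BAR, bring up the admin queue, and only then add the device to
 * dev_list and create the I/O queues and namespaces via nvme_dev_add().
 */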

static int __devinit nvme_probe(struct pci_dev *pdev,
						const struct pci_device_id *id)
{
	int bars, result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
								GFP_KERNEL);
	if (!dev->queues)
		goto free;

	if (pci_enable_device_mem(pdev))
		goto free;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable;

	INIT_LIST_HEAD(&dev->namespaces);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
	dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
	nvme_set_instance(dev);
	dev->entry[0].vector = pdev->irq;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto disable_msix;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar) {
		result = -ENOMEM;
		goto disable_msix;
	}

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;
	dev->queue_count++;

	spin_lock(&dev_list_lock);
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	result = nvme_dev_add(dev);
	if (result)
		goto delete;

	return 0;

 delete:
	spin_lock(&dev_list_lock);
	list_del(&dev->node);
	spin_unlock(&dev_list_lock);

	nvme_free_queues(dev);
 unmap:
	iounmap(dev->bar);
 disable_msix:
	pci_disable_msix(pdev);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
 disable:
	pci_disable_device(pdev);
	pci_release_regions(pdev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void __devexit nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_remove(dev);
	pci_disable_msix(pdev);
	iounmap(dev->bar);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
	pci_disable_device(pdev);
	pci_release_regions(pdev);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL
#define nvme_suspend NULL
#define nvme_resume NULL

static struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= __devexit_p(nvme_remove),
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.err_handler	= &nvme_err_handler,
};
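
/*
 * The polling thread is started before the PCI driver is registered because
 * nvme_alloc_queue() records nvme_thread in each queue's congestion wait
 * queue entry during probe.
 */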

static int __init nvme_init(void)
{
	int result = -EBUSY;

	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
	if (IS_ERR(nvme_thread))
		return PTR_ERR(nvme_thread);

	nvme_major = register_blkdev(nvme_major, "nvme");
	if (nvme_major <= 0)
		goto kill_kthread;

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto unregister_blkdev;
	return 0;

 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_kthread:
	kthread_stop(nvme_thread);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	kthread_stop(nvme_thread);
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.8");
module_init(nvme_init);
module_exit(nvme_exit);