nvme-core.c revision d0ba1e497bca83a3d353eb47c9658afc54d83228
/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/version.h>

#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define NVME_MINORS 64
#define IO_TIMEOUT	(5 * HZ)
#define ADMIN_TIMEOUT	(60 * HZ)

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	u32 __iomem *dbs;
	struct pci_dev *pci_dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	int instance;
	int queue_count;
	u32 ctrl_config;
	struct msix_entry *entry;
	struct nvme_bar __iomem *bar;
	struct list_head namespaces;
	char serial[20];
	char model[40];
	char firmware_rev[8];
};

/*
 * An NVM Express namespace is equivalent to a SCSI LUN
 */
struct nvme_ns {
	struct list_head list;

	struct nvme_dev *dev;
	struct request_queue *queue;
	struct gendisk *disk;

	int ns_id;
	int lba_shift;
};

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 cq_phase;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
}

struct nvme_cmd_info {
	unsigned long ctx;
	unsigned long timeout;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The ID of the handler to call
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler,
							unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	BUG_ON((unsigned long)ctx & 3);

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].ctx = (unsigned long)ctx | handler;
	info[cmdid].timeout = jiffies + timeout;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
						int handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}

/*
 * If you need more than four handlers, you'll need to change how
 * alloc_cmdid and nvme_process_cq work.  Consider using a special
 * CMD_CTX value instead, if that works for your situation.
 */
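/*
 * Sketch of the cmdid context tagging (illustrative; it mirrors the code
 * above and in nvme_process_cq()): the completion context is a tagged
 * pointer.  alloc_cmdid() stores the handler index in the two low-order
 * bits of the ctx pointer (hence the 4-byte alignment BUG_ON), and the
 * completion path unpacks it again, roughly:
 *
 *	data = (unsigned long)ctx | handler;	-- pack
 *	handler = data & 3;			-- recover handler id
 *	ptr = (void *)(data & ~3UL);		-- recover context pointer
 *
 * The CMD_CTX_* special values below are deliberately multiples of 4 so
 * they survive the same packing without colliding with a handler id.
 */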
enum {
	sync_completion_id = 0,
	bio_completion_id,
};

/* Special values must be a multiple of 4, and less than 0x1000 */
#define CMD_CTX_BASE		(POISON_POINTER_DELTA + sync_completion_id)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid)
{
	unsigned long data;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

	if (cmdid >= nvmeq->q_depth)
		return CMD_CTX_INVALID;
	data = info[cmdid].ctx;
	info[cmdid].ctx = CMD_CTX_COMPLETED;
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return data;
}

static unsigned long cancel_cmdid(struct nvme_queue *nvmeq, int cmdid)
{
	unsigned long data;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	data = info[cmdid].ctx;
	info[cmdid].ctx = CMD_CTX_CANCELLED;
	return data;
}

static struct nvme_queue *get_nvmeq(struct nvme_ns *ns)
{
	return ns->dev->queues[get_cpu() + 1];
}

static void put_nvmeq(struct nvme_queue *nvmeq)
{
	put_cpu();
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}

struct nvme_prps {
	int npages;
	dma_addr_t first_dma;
	__le64 *list[0];
};

static void nvme_free_prps(struct nvme_dev *dev, struct nvme_prps *prps)
{
	const int last_prp = PAGE_SIZE / 8 - 1;
	int i;
	dma_addr_t prp_dma;

	if (!prps)
		return;

	prp_dma = prps->first_dma;

	if (prps->npages == 0)
		dma_pool_free(dev->prp_small_pool, prps->list[0], prp_dma);
	for (i = 0; i < prps->npages; i++) {
		__le64 *prp_list = prps->list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}
	kfree(prps);
}

struct nvme_bio {
	struct bio *bio;
	int nents;
	struct nvme_prps *prps;
	struct scatterlist sg[0];
};

/* XXX: use a mempool */
static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp)
{
	return kzalloc(sizeof(struct nvme_bio) +
			sizeof(struct scatterlist) * nseg, gfp);
}

static void free_nbio(struct nvme_queue *nvmeq, struct nvme_bio *nbio)
{
	nvme_free_prps(nvmeq->dev, nbio->prps);
	kfree(nbio);
}

static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_bio *nbio = ctx;
	struct bio *bio = nbio->bio;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	dma_unmap_sg(nvmeq->q_dmadev, nbio->sg, nbio->nents,
			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	free_nbio(nvmeq, nbio);
	if (status) {
		bio_endio(bio, -EIO);
	} else if (bio->bi_vcnt > bio->bi_idx) {
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add(&nvmeq->sq_cong, bio);
		wake_up_process(nvme_thread);
	} else {
		bio_endio(bio, 0);
	}
}
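/*
 * Background sketch on PRPs (Physical Region Pages), as used by
 * nvme_setup_prps() below: each command carries two PRP entries.  prp1
 * always points at the first (possibly unaligned) chunk of the data
 * buffer.  If the transfer fits in what remains of that page, prp2 is
 * left unused; if exactly one more page is needed, prp2 points at it
 * directly; otherwise prp2 points at a PRP list -- a page (or, for small
 * transfers, a 256-byte pool entry) of little-endian 64-bit page
 * addresses.  When one list page fills up, its final slot is overwritten
 * with the DMA address of the next list page, chaining the lists
 * together; nvme_free_prps() above walks that chain when freeing.
 */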
/* length is in bytes.  gfp flags indicates whether we may sleep. */
static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev,
					struct nvme_common_command *cmd,
					struct scatterlist *sg, int *len,
					gfp_t gfp)
{
	struct dma_pool *pool;
	int length = *len;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = offset_in_page(dma_addr);
	__le64 *prp_list;
	dma_addr_t prp_dma;
	int nprps, npages, i, prp_page;
	struct nvme_prps *prps = NULL;

	cmd->prp1 = cpu_to_le64(dma_addr);
	length -= (PAGE_SIZE - offset);
	if (length <= 0)
		return prps;

	dma_len -= (PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= PAGE_SIZE) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		return prps;
	}

	nprps = DIV_ROUND_UP(length, PAGE_SIZE);
	npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE);
	prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, gfp);
	if (!prps) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		*len = (*len - length) + PAGE_SIZE;
		return prps;
	}
	prp_page = 0;
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		prps->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		prps->npages = npages;
	}

	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
	if (!prp_list) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		*len = (*len - length) + PAGE_SIZE;
		kfree(prps);
		return NULL;
	}
	prps->list[prp_page++] = prp_list;
	prps->first_dma = prp_dma;
	cmd->prp2 = cpu_to_le64(prp_dma);
	i = 0;
	for (;;) {
		if (i == PAGE_SIZE / 8) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
			if (!prp_list) {
				*len = (*len - length);
				return prps;
			}
			prps->list[prp_page++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= PAGE_SIZE;
		dma_addr += PAGE_SIZE;
		length -= PAGE_SIZE;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return prps;
}
/* NVMe scatterlists require no holes in the virtual address */
#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))

static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec *bvec, *bvprv = NULL;
	struct scatterlist *sg = NULL;
	int i, old_idx, length = 0, nsegs = 0;

	sg_init_table(nbio->sg, psegs);
	old_idx = bio->bi_idx;
	bio_for_each_segment(bvec, bio, i) {
		if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
			sg->length += bvec->bv_len;
		} else {
			if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
				break;
			sg = sg ? sg + 1 : nbio->sg;
			sg_set_page(sg, bvec->bv_page, bvec->bv_len,
							bvec->bv_offset);
			nsegs++;
		}
		length += bvec->bv_len;
		bvprv = bvec;
	}
	bio->bi_idx = i;
	nbio->nents = nsegs;
	sg_mark_end(sg);
	if (dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir) == 0) {
		bio->bi_idx = old_idx;
		return -ENOMEM;
	}
	return length;
}

static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								int cmdid)
{
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.command_id = cmdid;
	cmnd->common.nsid = cpu_to_le32(ns->ns_id);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
{
	int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
					sync_completion_id, IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		return cmdid;

	return nvme_submit_flush(nvmeq, ns, cmdid);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_command *cmnd;
	struct nvme_bio *nbio;
	enum dma_data_direction dma_dir;
	int cmdid, length, result = -ENOMEM;
	u16 control;
	u32 dsmgmt;
	int psegs = bio_phys_segments(ns->queue, bio);

	if ((bio->bi_rw & REQ_FLUSH) && psegs) {
		result = nvme_submit_flush_data(nvmeq, ns);
		if (result)
			return result;
	}

	nbio = alloc_nbio(psegs, GFP_ATOMIC);
	if (!nbio)
		goto nomem;
	nbio->bio = bio;

	result = -EBUSY;
	cmdid = alloc_cmdid(nvmeq, nbio, bio_completion_id, IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		goto free_nbio;

	if ((bio->bi_rw & REQ_FLUSH) && !psegs)
		return nvme_submit_flush(nvmeq, ns, cmdid);

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
		cmnd->rw.opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->rw.opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	result = nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs);
	if (result < 0)
		goto free_nbio;
	length = result;

	cmnd->rw.command_id = cmdid;
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg,
							&length, GFP_ATOMIC);
	cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
	cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);

	bio->bi_sector += length >> 9;

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;

 free_nbio:
	free_nbio(nvmeq, nbio);
 nomem:
	return result;
}
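/*
 * Worked example (illustrative numbers) of the unit conversions above:
 * the block layer counts in 512-byte sectors, the drive in logical
 * blocks of 1 << lba_shift bytes.  For a namespace formatted with
 * 4096-byte blocks (lba_shift == 12), a bio starting at bi_sector 80
 * for 16384 bytes becomes slba = 80 >> (12 - 9) = 10 and a zero-based
 * length field of (16384 >> 12) - 1 = 3, i.e. four logical blocks.
 */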
/*
 * NB: return value of non-zero would mean that we were a stacking driver.
 * make_request must always succeed.
 */
static int nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns);
	int result = -EBUSY;

	spin_lock_irq(&nvmeq->q_lock);
	if (bio_list_empty(&nvmeq->sq_cong))
		result = nvme_submit_bio_queue(nvmeq, ns, bio);
	if (unlikely(result)) {
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add(&nvmeq->sq_cong, bio);
	}

	spin_unlock_irq(&nvmeq->q_lock);
	put_nvmeq(nvmeq);

	return 0;
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	if (unlikely((unsigned long)cmdinfo == CMD_CTX_CANCELLED))
		return;
	if ((unsigned long)cmdinfo == CMD_CTX_FLUSH)
		return;
	if (unlikely((unsigned long)cmdinfo == CMD_CTX_COMPLETED)) {
		dev_warn(nvmeq->q_dmadev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (unlikely((unsigned long)cmdinfo == CMD_CTX_INVALID)) {
		dev_warn(nvmeq->q_dmadev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}

typedef void (*completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

static const completion_fn nvme_completions[4] = {
	[sync_completion_id] = sync_completion,
	[bio_completion_id]  = bio_completion,
};

static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		unsigned long data;
		void *ptr;
		unsigned char handler;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		data = free_cmdid(nvmeq, cqe.command_id);
		handler = data & 3;
		ptr = (void *)(data & ~3UL);
		nvme_completions[handler](nvmeq, ptr, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return IRQ_NONE;

	writel(head, nvmeq->q_db + 1);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	return IRQ_HANDLED;
}
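/*
 * A note on the phase bit used above: the controller writes completion
 * entries with bit 0 of the status field set to the current "phase".
 * The driver starts with cq_phase = 1 and flips it every time cq_head
 * wraps, so an entry is new precisely when its phase bit matches
 * cq_phase; entries left over from the previous lap (or from the
 * initially zeroed queue) fail the comparison and terminate the loop.
 */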
static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	result = nvme_process_cq(nvmeq);
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
	spin_lock_irq(&nvmeq->q_lock);
	cancel_cmdid(nvmeq, cmdid);
	spin_unlock_irq(&nvmeq->q_lock);
}

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
			struct nvme_command *cmd, u32 *result, unsigned timeout)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id,
								timeout);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_KILLABLE);
	nvme_submit_cmd(nvmeq, cmd);
	schedule();

	if (cmdinfo.status == -EINTR) {
		nvme_abort_command(nvmeq, cmdid);
		return -EINTR;
	}

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}

static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
}
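/*
 * How the synchronous path above fits together (descriptive only):
 * nvme_submit_sync_cmd() puts the calling task to sleep in TASK_KILLABLE
 * after ringing the doorbell; sync_completion(), running from interrupt
 * (or threaded-irq) context, copies the result and status out of the CQ
 * entry and wakes the task.  If a fatal signal wakes the task instead,
 * cmdinfo.status is still -EINTR, so the command id is marked
 * CMD_CTX_CANCELLED and a late completion is quietly dropped rather than
 * dereferencing a stack frame that has already been unwound.
 */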
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void nvme_free_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];
	int vector = dev->entry[nvmeq->cq_vector].vector;

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	/* Don't tell the adapter to delete the admin queue */
	if (qid) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info));
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid * 2];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq,
					IRQF_DISABLED | IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
}

static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
					int qid, int cq_size, int vector)
{
	int result;
	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);

	if (!nvmeq)
		return ERR_PTR(-ENOMEM);

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		goto free_nvmeq;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, "nvme");
	if (result < 0)
		goto release_sq;

	return nvmeq;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
 free_nvmeq:
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
	return ERR_PTR(result);
}
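/*
 * Ordering note for the queue-pair setup above: the completion queue has
 * to exist before a submission queue can name it in create_sq.cqid, so
 * nvme_create_queue() issues Create CQ first and Create SQ second.  The
 * teardown paths (both the error unwinding there and nvme_free_queue())
 * delete them in the reverse order, SQ before CQ.
 */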
static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap;
	unsigned long timeout;
	struct nvme_queue *nvmeq;

	dev->dbs = ((void __iomem *)dev->bar) + 4096;

	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
	if (!nvmeq)
		return -ENOMEM;

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(0, &dev->bar->cc);
	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	cap = readq(&dev->bar->cap);
	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device not ready; aborting initialisation\n");
			return -ENODEV;
		}
	}

	result = queue_request_irq(dev, nvmeq, "nvme admin");
	dev->queues[0] = nvmeq;
	return result;
}
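/*
 * Summary of the bring-up sequence implemented above, for reference: the
 * controller is first disabled by clearing CC, the admin queue attributes
 * (AQA) and base addresses (ASQ/ACQ) are programmed, and only then is
 * CC.EN set.  CSTS.RDY is polled until the controller reports ready; the
 * CAP.TO field expresses the worst-case wait in 500 ms units, which is
 * why the timeout above is (TO + 1) * HZ / 2 jiffies.
 */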
static int nvme_map_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, unsigned length,
				struct scatterlist **sgp)
{
	int i, err, count, nents, offset;
	struct scatterlist *sg;
	struct page **pages;

	if (addr & 3)
		return -EINVAL;
	if (!length)
		return -EINVAL;

	offset = offset_in_page(addr);
	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);

	err = get_user_pages_fast(addr, count, 1, pages);
	if (err < count) {
		count = err;
		err = -EFAULT;
		goto put_pages;
	}

	sg = kcalloc(count, sizeof(*sg), GFP_KERNEL);
	sg_init_table(sg, count);
	for (i = 0; i < count; i++) {
		sg_set_page(&sg[i], pages[i],
				min_t(int, length, PAGE_SIZE - offset), offset);
		length -= (PAGE_SIZE - offset);
		offset = 0;
	}

	err = -ENOMEM;
	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (!nents)
		goto put_pages;

	kfree(pages);
	*sgp = sg;
	return nents;

 put_pages:
	for (i = 0; i < count; i++)
		put_page(pages[i]);
	kfree(pages);
	return err;
}

static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
			unsigned long addr, int length,
			struct scatterlist *sg, int nents)
{
	int i, count;

	count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE);
	dma_unmap_sg(&dev->pci_dev->dev, sg, nents, DMA_FROM_DEVICE);

	for (i = 0; i < count; i++)
		put_page(sg_page(&sg[i]));
}
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_queue *nvmeq;
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length;
	int nents, status;
	struct scatterlist *sg;
	struct nvme_prps *prps;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	length = (io.nblocks + 1) << ns->lba_shift;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr,
								length, &sg);
		break;
	default:
		return -EINVAL;
	}

	if (nents < 0)
		return nents;

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le16(io.dsmgmt);
	c.rw.reftag = io.reftag;
	c.rw.apptag = io.apptag;
	c.rw.appmask = io.appmask;
	/* XXX: metadata */
	prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL);

	nvmeq = get_nvmeq(ns);
	/*
	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
	 * disabled.  We may be preempted at any point, and be rescheduled
	 * to a different CPU.  That will cause cacheline bouncing, but no
	 * additional races since q_lock already protects against other CPUs.
	 */
	put_nvmeq(nvmeq);
	if (length != (io.nblocks + 1) << ns->lba_shift)
		status = -ENOMEM;
	else
		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT);

	nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents);
	nvme_free_prps(dev, prps);
	return status;
}

static int nvme_user_admin_cmd(struct nvme_ns *ns,
					struct nvme_admin_cmd __user *ucmd)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_admin_cmd cmd;
	struct nvme_command c;
	int status, length, nents = 0;
	struct scatterlist *sg;
	struct nvme_prps *prps = NULL;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	length = cmd.data_len;
	if (cmd.data_len) {
		nents = nvme_map_user_pages(dev, 1, cmd.addr, length, &sg);
		if (nents < 0)
			return nents;
		prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL);
	}

	if (length != cmd.data_len)
		status = -ENOMEM;
	else
		status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (cmd.data_len) {
		nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg,
									nents);
		nvme_free_prps(dev, prps);
	}
	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(ns, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_ioctl,
};
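/*
 * Rough userspace sketch of the ioctl interface above (illustrative only,
 * not part of the driver): a process with the block device open could
 * read four logical blocks starting at LBA 0 with something like
 *
 *	struct nvme_user_io io = {
 *		.opcode  = nvme_cmd_read,
 *		.addr    = (unsigned long)buf,	-- 4-byte aligned user buffer
 *		.slba    = 0,
 *		.nblocks = 3,			-- zero-based: 4 blocks
 *	};
 *	ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io);
 *
 * The field names match what nvme_submit_io() reads out of
 * struct nvme_user_io; the exact layout comes from <linux/nvme.h>.
 */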
static void nvme_timeout_ios(struct nvme_queue *nvmeq)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	unsigned long now = jiffies;
	int cmdid;

	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
		unsigned long data;
		void *ptr;
		unsigned char handler;
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1,
		};

		if (!time_after(now, info[cmdid].timeout))
			continue;
		dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid);
		data = cancel_cmdid(nvmeq, cmdid);
		handler = data & 3;
		ptr = (void *)(data & ~3UL);
		nvme_completions[handler](nvmeq, ptr, &cqe);
	}
}

static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
{
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
			bio_list_add_head(&nvmeq->sq_cong, bio);
			break;
		}
		if (bio_list_empty(&nvmeq->sq_cong))
			remove_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
	}
}

static int nvme_kthread(void *data)
{
	struct nvme_dev *dev;

	while (!kthread_should_stop()) {
		__set_current_state(TASK_RUNNING);
		spin_lock(&dev_list_lock);
		list_for_each_entry(dev, &dev_list, node) {
			int i;
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				if (nvme_process_cq(nvmeq))
					printk("process_cq did something\n");
				nvme_timeout_ios(nvmeq);
				nvme_resubmit_bios(nvmeq);
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);
	}
	return 0;
}
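/*
 * The single "nvme" kthread above acts as the driver's safety net:
 * roughly once a second it walks every registered device and queue,
 * reaps any completions an interrupt might have missed, fakes an Abort
 * Requested completion for commands whose per-cmdid timeout has expired,
 * and retries bios that were parked on sq_cong while the submission path
 * was congested.
 */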
static DEFINE_IDA(nvme_index_ida);

static int nvme_get_ns_idx(void)
{
	int index, error;

	do {
		if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
			return -1;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_index_ida, &index);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		index = -1;
	return index;
}

static void nvme_put_ns_idx(int index)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_index_ida, index);
	spin_unlock(&dev_list_lock);
}

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES |
				QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD;
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(NVME_MINORS);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = nsid;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;

	disk->major = nvme_major;
	disk->minors = NVME_MINORS;
	disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = &dev->pci_dev->dev;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}

static void nvme_ns_free(struct nvme_ns *ns)
{
	int index = ns->disk->first_minor / NVME_MINORS;
	put_disk(ns->disk);
	nvme_put_ns_idx(index);
	blk_cleanup_queue(ns->queue);
	kfree(ns);
}

static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	struct nvme_command c;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES);
	c.features.dword11 = cpu_to_le32(q_count);

	status = nvme_submit_admin_cmd(dev, &c, &result);
	if (status)
		return -EIO;
	return min(result & 0xffff, result >> 16) + 1;
}

static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
{
	int result, cpu, i, nr_io_queues;

	nr_io_queues = num_online_cpus();
	result = set_queue_count(dev, nr_io_queues);
	if (result < 0)
		return result;
	if (result < nr_io_queues)
		nr_io_queues = result;

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, dev->queues[0]);

	for (i = 0; i < nr_io_queues; i++)
		dev->entry[i].entry = i;
	for (;;) {
		result = pci_enable_msix(dev->pci_dev, dev->entry,
								nr_io_queues);
		if (result == 0) {
			break;
		} else if (result > 0) {
			nr_io_queues = result;
			continue;
		} else {
			nr_io_queues = 1;
			break;
		}
	}

	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
	/* XXX: handle failure here */

	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_io_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	for (i = 0; i < nr_io_queues; i++) {
		dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
							NVME_Q_DEPTH, i);
		if (IS_ERR(dev->queues[i + 1]))
			return PTR_ERR(dev->queues[i + 1]);
		dev->queue_count++;
	}

	for (; i < num_possible_cpus(); i++) {
		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
		dev->queues[i + 1] = dev->queues[target + 1];
	}

	return 0;
}
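/*
 * Two conventions worth spelling out for the code above: the Number of
 * Queues feature is zero-based in both directions, so asking for "count"
 * queues means writing count - 1 into each half of dword11, and the
 * returned dword0 packs the number of submission queues granted in its
 * low 16 bits and completion queues in the high 16 bits -- hence the
 * min(...) + 1 in set_queue_count().  When fewer I/O queues than possible
 * CPUs are available, the trailing loop makes the extra per-CPU slots
 * alias existing queues (modulo a power of two), so get_nvmeq() always
 * finds a valid queue for the current CPU.
 */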
static void nvme_free_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_free_queue(dev, i);
}

static int __devinit nvme_dev_add(struct nvme_dev *dev)
{
	int res, nn, i;
	struct nvme_ns *ns, *next;
	struct nvme_id_ctrl *ctrl;
	void *id;
	dma_addr_t dma_addr;
	struct nvme_command cid, crt;

	res = nvme_setup_io_queues(dev);
	if (res)
		return res;

	/* XXX: Switch to a SG list once prp2 works */
	id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
								GFP_KERNEL);

	memset(&cid, 0, sizeof(cid));
	cid.identify.opcode = nvme_admin_identify;
	cid.identify.nsid = 0;
	cid.identify.prp1 = cpu_to_le64(dma_addr);
	cid.identify.cns = cpu_to_le32(1);

	res = nvme_submit_admin_cmd(dev, &cid, NULL);
	if (res) {
		res = -EIO;
		goto out_free;
	}

	ctrl = id;
	nn = le32_to_cpup(&ctrl->nn);
	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));

	cid.identify.cns = 0;
	memset(&crt, 0, sizeof(crt));
	crt.features.opcode = nvme_admin_get_features;
	crt.features.prp1 = cpu_to_le64(dma_addr + 4096);
	crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);

	for (i = 0; i <= nn; i++) {
		cid.identify.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &cid, NULL);
		if (res)
			continue;

		if (((struct nvme_id_ns *)id)->ncap == 0)
			continue;

		crt.features.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &crt, NULL);
		if (res)
			continue;

		ns = nvme_alloc_ns(dev, i, id, id + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);

	dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
	return 0;

 out_free:
	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		nvme_ns_free(ns);
	}

	dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
	return res;
}

static int nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	spin_lock(&dev_list_lock);
	list_del(&dev->node);
	spin_unlock(&dev_list_lock);

	/* TODO: wait all I/O finished or cancel them */

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		del_gendisk(ns->disk);
		nvme_ns_free(ns);
	}

	nvme_free_queues(dev);

	return 0;
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	struct device *dmadev = &dev->pci_dev->dev;
	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

/* XXX: Use an ida or something to let remove / add work correctly */
static void nvme_set_instance(struct nvme_dev *dev)
{
	static int instance;
	dev->instance = instance++;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
}

static int __devinit nvme_probe(struct pci_dev *pdev,
						const struct pci_device_id *id)
{
	int bars, result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
								GFP_KERNEL);
	if (!dev->queues)
		goto free;

	if (pci_enable_device_mem(pdev))
		goto free;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable;

	INIT_LIST_HEAD(&dev->namespaces);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
	dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
	nvme_set_instance(dev);
	dev->entry[0].vector = pdev->irq;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto disable_msix;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar) {
		result = -ENOMEM;
		goto disable_msix;
	}

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;
	dev->queue_count++;

	spin_lock(&dev_list_lock);
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	result = nvme_dev_add(dev);
	if (result)
		goto delete;

	return 0;

 delete:
	spin_lock(&dev_list_lock);
	list_del(&dev->node);
	spin_unlock(&dev_list_lock);

	nvme_free_queues(dev);
 unmap:
	iounmap(dev->bar);
 disable_msix:
	pci_disable_msix(pdev);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
 disable:
	pci_disable_device(pdev);
	pci_release_regions(pdev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}
static void __devexit nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_remove(dev);
	pci_disable_msix(pdev);
	iounmap(dev->bar);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
	pci_disable_device(pdev);
	pci_release_regions(pdev);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL
#define nvme_suspend NULL
#define nvme_resume NULL

static struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= __devexit_p(nvme_remove),
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result = -EBUSY;

	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
	if (IS_ERR(nvme_thread))
		return PTR_ERR(nvme_thread);

	nvme_major = register_blkdev(nvme_major, "nvme");
	if (nvme_major <= 0)
		goto kill_kthread;

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto unregister_blkdev;
	return 0;

 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_kthread:
	kthread_stop(nvme_thread);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	kthread_stop(nvme_thread);
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.6");
module_init(nvme_init);
module_exit(nvme_exit);