nvme-core.c revision 388f037f4e7f0a24bac6b1a24f144f5d939f58cf
/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/version.h>

#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth)	((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)	((depth) * sizeof(struct nvme_completion))
#define NVME_MINORS 64

static int nvme_major;
module_param(nvme_major, int, 0);

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct nvme_queue **queues;
	u32 __iomem *dbs;
	struct pci_dev *pci_dev;
	int instance;
	int queue_count;
	u32 ctrl_config;
	struct msix_entry *entry;
	struct nvme_bar __iomem *bar;
	struct list_head namespaces;
	char serial[20];
	char model[40];
	char firmware_rev[8];
};

/*
 * An NVM Express namespace is equivalent to a SCSI LUN
 */
struct nvme_ns {
	struct list_head list;

	struct nvme_dev *dev;
	struct request_queue *queue;
	struct gendisk *disk;

	int ns_id;
	int lba_shift;
};
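/*
 * Doorbell layout (see nvme_alloc_queue() and nvme_process_cq() below):
 * dev->dbs points at the doorbell registers, which start 4096 bytes into
 * BAR 0.  Queue qid's submission queue tail doorbell is dbs[2 * qid] and
 * its completion queue head doorbell is dbs[2 * qid + 1]; q_db points at
 * the first of that pair.  (This assumes the minimum doorbell stride of
 * four bytes.)
 */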
/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 cq_phase;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
}

/**
 * alloc_cmdid - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The ID of the handler to call
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler)
{
	int depth = nvmeq->q_depth;
	unsigned long data = (unsigned long)ctx | handler;
	int cmdid;

	BUG_ON((unsigned long)ctx & 3);

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
							int handler)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}
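/*
 * Layout of cmdid_data: the first BITS_TO_LONGS(q_depth) longs are a bitmap
 * of command IDs in use; the q_depth longs that follow hold the context
 * pointer for each outstanding command, with the handler ID packed into its
 * two low-order bits (hence the 4-byte alignment requirement above).  For
 * example, a depth of 1024 on a 64-bit machine needs 16 longs of bitmap
 * followed by 1024 longs of context data.
 */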
/* If you need more than four handlers, you'll need to change how
 * alloc_cmdid and nvme_process_cq work
 */
enum {
	sync_completion_id = 0,
	bio_completion_id,
};

static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid)
{
	unsigned long data;

	data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)];
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return data;
}

static struct nvme_queue *get_nvmeq(struct nvme_ns *ns)
{
	int qid, cpu = get_cpu();
	if (cpu < ns->dev->queue_count)
		qid = cpu + 1;
	else
		qid = (cpu % rounddown_pow_of_two(ns->dev->queue_count)) + 1;
	return ns->dev->queues[qid];
}

static void put_nvmeq(struct nvme_queue *nvmeq)
{
	put_cpu();
}

/**
 * nvme_submit_cmd - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	/* XXX: Need to check tail isn't going to overrun head */
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	/* The tail doorbell must be written with the new tail, one past
	 * the entry just filled in. */
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}

struct nvme_req_info {
	struct bio *bio;
	int nents;
	struct scatterlist sg[0];
};

/* XXX: use a mempool */
static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp)
{
	return kmalloc(sizeof(struct nvme_req_info) +
			sizeof(struct scatterlist) * nseg, gfp);
}

static void free_info(struct nvme_req_info *info)
{
	kfree(info);
}

static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
					struct nvme_completion *cqe)
{
	struct nvme_req_info *info = ctx;
	struct bio *bio = info->bio;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents,
		bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	free_info(info);
	bio_endio(bio, status ? -EIO : 0);
}
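/*
 * PRP (Physical Region Page) entries describe the data buffer.  PRP1
 * points at the start of the buffer and covers it up to the end of the
 * first page; if the transfer continues, PRP2 points at the second page.
 * For example, with 4KiB pages a page-aligned 8KiB transfer needs only
 * PRP1 and PRP2.  Anything spanning more than two pages needs a PRP list,
 * which this version does not build yet (see the XXX below).
 */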
/* length is in bytes */
static void nvme_setup_prps(struct nvme_common_command *cmd,
				struct scatterlist *sg, int length)
{
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = offset_in_page(dma_addr);

	cmd->prp1 = cpu_to_le64(dma_addr);
	length -= (PAGE_SIZE - offset);
	if (length <= 0)
		return;

	dma_len -= (PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= PAGE_SIZE) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		return;
	}

	/* XXX: support PRP lists */
}

static int nvme_map_bio(struct device *dev, struct nvme_req_info *info,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec *bvec;
	struct scatterlist *sg = info->sg;
	int i, nsegs = 0;

	sg_init_table(sg, psegs);
	bio_for_each_segment(bvec, bio, i) {
		sg_set_page(&sg[nsegs], bvec->bv_page, bvec->bv_len,
							bvec->bv_offset);
		/* XXX: handle non-mergable here */
		nsegs++;
	}
	info->nents = nsegs;

	return dma_map_sg(dev, info->sg, info->nents, dma_dir);
}

static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_command *cmnd;
	struct nvme_req_info *info;
	enum dma_data_direction dma_dir;
	int cmdid;
	u16 control;
	u32 dsmgmt;
	unsigned long flags;
	int psegs = bio_phys_segments(ns->queue, bio);

	info = alloc_info(psegs, GFP_NOIO);
	if (!info)
		goto congestion;
	info->bio = bio;

	cmdid = alloc_cmdid(nvmeq, info, bio_completion_id);
	if (unlikely(cmdid < 0))
		goto free_info;

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	spin_lock_irqsave(&nvmeq->q_lock, flags);
	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
		cmnd->rw.opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->rw.opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs);

	cmnd->rw.flags = 1;
	cmnd->rw.command_id = cmdid;
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	nvme_setup_prps(&cmnd->common, info->sg, bio->bi_size);
	cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
	cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1);
	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;

 free_info:
	free_info(info);
 congestion:
	return -EBUSY;
}
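/*
 * bios are submitted directly from the make_request function rather than
 * going through the block layer's request queue.  get_nvmeq() picks the
 * submitting CPU's I/O queue (queue 0 is reserved for admin commands, so
 * CPU n uses queue n + 1, folded down if there are fewer queues than CPUs).
 * If the info allocation or the command ID bitmap is exhausted, the queue
 * is marked congested and the bio is parked on sq_cong.
 */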
/*
 * NB: return value of non-zero would mean that we were a stacking driver.
 * make_request must always succeed.
 */
static int nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns);

	if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
		blk_set_queue_congested(q, rw_is_sync(bio->bi_rw));
		bio_list_add(&nvmeq->sq_cong, bio);
	}
	put_nvmeq(nvmeq);

	return 0;
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}

typedef void (*completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	static const completion_fn completions[4] = {
		[sync_completion_id] = sync_completion,
		[bio_completion_id] = bio_completion,
	};

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		unsigned long data;
		void *ptr;
		unsigned char handler;
		struct nvme_completion cqe = nvmeq->cqes[head];
		/* An entry is valid only when its phase bit matches the
		 * queue's current phase; the phase flips each time the
		 * completion queue wraps. */
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		data = free_cmdid(nvmeq, cqe.command_id);
		handler = data & 3;
		ptr = (void *)(data & ~3UL);
		completions[handler](nvmeq, ptr, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return IRQ_NONE;

	writel(head, nvmeq->q_db + 1);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	return IRQ_HANDLED;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	return nvme_process_cq(data);
}
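/*
 * Synchronous commands: the caller's task_struct and a result/status slot
 * are stashed in a sync_cmd_info, a command ID is allocated with the
 * sync_completion handler, and the task sleeps in TASK_UNINTERRUPTIBLE
 * until nvme_process_cq() runs sync_completion() and wakes it.  The
 * completion's dword 0 is passed back through *result.
 */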
/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd,
								u32 *result)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(q, &cmdinfo, sync_completion_id);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_UNINTERRUPTIBLE);
	nvme_submit_cmd(q, cmd);
	schedule();

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}

static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev->queues[0], cmd, result);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void nvme_free_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];

	free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq);

	/* Don't tell the adapter to delete the admin queue */
	if (qid) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}
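/*
 * A queue's host-side state is a single allocation: the nvme_queue itself
 * plus the trailing cmdid_data bitmap/context array sized for the queue
 * depth.  The submission queue entries and completion queue entries live
 * in separate coherent DMA buffers, and q_db is pointed at this queue's
 * pair of doorbell registers.
 */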
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long);
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid * 2];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;

	return nvmeq;

 free_cqdma:
	/* q_depth hasn't been set yet, so free with the size we allocated */
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
}

static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
					int qid, int cq_size, int vector)
{
	int result;
	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);

	if (!nvmeq)
		return NULL;

	/* The completion queue must exist before the submission queue that
	 * posts to it, so create the CQ first and tear down in reverse. */
	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		goto free_nvmeq;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, "nvme");
	if (result < 0)
		goto release_sq;

	return nvmeq;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
 free_nvmeq:
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
	return NULL;
}

static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	dev->dbs = ((void __iomem *)dev->bar) + 4096;

	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
	if (!nvmeq)
		return -ENOMEM;

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;

	writel(0, &dev->bar->cc);
	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
	}

	result = queue_request_irq(dev, nvmeq, "nvme admin");
	dev->queues[0] = nvmeq;
	return result;
}
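/*
 * Pin a userspace buffer for the ioctl paths: get_user_pages_fast() takes
 * a reference on every page covering [addr, addr + length), the pages are
 * collected into a scatterlist, and the list is DMA-mapped for the device.
 * The buffer must be 4-byte aligned and non-empty; the matching unmap
 * routine below drops the page references again.
 */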
static int nvme_map_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, unsigned length,
				struct scatterlist **sgp)
{
	int i, err, count, nents, offset;
	struct scatterlist *sg;
	struct page **pages;

	if (addr & 3)
		return -EINVAL;
	if (!length)
		return -EINVAL;

	offset = offset_in_page(addr);
	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	err = get_user_pages_fast(addr, count, 1, pages);
	if (err < count) {
		count = err;
		err = -EFAULT;
		goto put_pages;
	}

	sg = kcalloc(count, sizeof(*sg), GFP_KERNEL);
	if (!sg) {
		err = -ENOMEM;
		goto put_pages;
	}
	sg_init_table(sg, count);
	sg_set_page(&sg[0], pages[0], PAGE_SIZE - offset, offset);
	length -= (PAGE_SIZE - offset);
	for (i = 1; i < count; i++) {
		sg_set_page(&sg[i], pages[i], min_t(int, length, PAGE_SIZE), 0);
		length -= PAGE_SIZE;
	}

	err = -ENOMEM;
	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (!nents)
		goto put_pages;

	kfree(pages);
	*sgp = sg;
	return nents;

 put_pages:
	for (i = 0; i < count; i++)
		put_page(pages[i]);
	kfree(pages);
	return err;
}

static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, int length,
				struct scatterlist *sg, int nents)
{
	int i, count;

	count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE);
	dma_unmap_sg(&dev->pci_dev->dev, sg, nents,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);

	for (i = 0; i < count; i++)
		put_page(sg_page(&sg[i]));
}

static int nvme_submit_user_admin_command(struct nvme_dev *dev,
					unsigned long addr, unsigned length,
					struct nvme_command *cmd)
{
	int err, nents;
	struct scatterlist *sg;

	nents = nvme_map_user_pages(dev, 0, addr, length, &sg);
	if (nents < 0)
		return nents;
	nvme_setup_prps(&cmd->common, sg, length);
	err = nvme_submit_admin_cmd(dev, cmd, NULL);
	nvme_unmap_user_pages(dev, 0, addr, length, sg, nents);
	return err ? -EIO : 0;
}

static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id);
	c.identify.cns = cpu_to_le32(cns);

	return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c);
}

static int nvme_get_range_type(struct nvme_ns *ns, unsigned long addr)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(ns->ns_id);
	c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);

	return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c);
}
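/*
 * NVME_IOCTL_SUBMIT_IO: copy an nvme_user_io descriptor from userspace,
 * pin and map the data buffer, build the corresponding read/write command,
 * and submit it synchronously on the calling CPU's I/O queue.  The
 * completion's dword 0 is copied back into uio->result.
 */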
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_queue *nvmeq;
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length;
	u32 result;
	int nents, status;
	struct scatterlist *sg;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	length = io.nblocks << io.block_shift;
	nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg);
	if (nents < 0)
		return nents;

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(io.nsid);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks - 1);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);	/* XXX: endian? */
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);
	/* XXX: metadata */
	nvme_setup_prps(&c.common, sg, length);

	nvmeq = get_nvmeq(ns);
	status = nvme_submit_sync_cmd(nvmeq, &c, &result);
	put_nvmeq(nvmeq);

	nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents);
	put_user(result, &uio->result);
	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_IDENTIFY_NS:
		return nvme_identify(ns, arg, 0);
	case NVME_IOCTL_IDENTIFY_CTRL:
		return nvme_identify(ns, arg, 1);
	case NVME_IOCTL_GET_RANGE_TYPE:
		return nvme_get_range_type(ns, arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
};

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT |
				(1 << QUEUE_FLAG_NOMERGES) |
				(1 << QUEUE_FLAG_NONROT) |
				(1 << QUEUE_FLAG_DISCARD);
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(NVME_MINORS);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = index;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;

	disk->major = nvme_major;
	disk->minors = NVME_MINORS;
	disk->first_minor = NVME_MINORS * index;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = &dev->pci_dev->dev;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}

static void nvme_ns_free(struct nvme_ns *ns)
{
	put_disk(ns->disk);
	blk_cleanup_queue(ns->queue);
	kfree(ns);
}

static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	struct nvme_command c;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES);
	c.features.dword11 = cpu_to_le32(q_count);

	status = nvme_submit_admin_cmd(dev, &c, &result);
	if (status)
		return -EIO;
	return min(result & 0xffff, result >> 16) + 1;
}
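/*
 * I/O queue setup: request one queue pair per online CPU and cap the count
 * at what the controller grants (set_queue_count() returns the smaller of
 * the granted submission and completion queue counts).  Then allocate that
 * many MSI-X vectors, retrying with fewer (or falling back to a single
 * queue) if pci_enable_msix() trims the request, spread the vectors across
 * the online CPUs, and create one SQ/CQ pair per vector.
 */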
static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
{
	int result, cpu, i, nr_queues;

	nr_queues = num_online_cpus();
	result = set_queue_count(dev, nr_queues);
	if (result < 0)
		return result;
	if (result < nr_queues)
		nr_queues = result;

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, dev->queues[0]);

	for (i = 0; i < nr_queues; i++)
		dev->entry[i].entry = i;
	for (;;) {
		result = pci_enable_msix(dev->pci_dev, dev->entry, nr_queues);
		if (result == 0) {
			break;
		} else if (result > 0) {
			nr_queues = result;
			continue;
		} else {
			nr_queues = 1;
			break;
		}
	}

	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
	/* XXX: handle failure here */

	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	for (i = 0; i < nr_queues; i++) {
		dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
							NVME_Q_DEPTH, i);
		if (!dev->queues[i + 1])
			return -ENOMEM;
		dev->queue_count++;
	}

	return 0;
}

static void nvme_free_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_free_queue(dev, i);
}

static int __devinit nvme_dev_add(struct nvme_dev *dev)
{
	int res, nn, i;
	struct nvme_ns *ns, *next;
	struct nvme_id_ctrl *ctrl;
	void *id;
	dma_addr_t dma_addr;
	struct nvme_command cid, crt;

	res = nvme_setup_io_queues(dev);
	if (res)
		return res;

	/* XXX: Switch to a SG list once prp2 works */
	id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
								GFP_KERNEL);
	if (!id)
		return -ENOMEM;

	memset(&cid, 0, sizeof(cid));
	cid.identify.opcode = nvme_admin_identify;
	cid.identify.nsid = 0;
	cid.identify.prp1 = cpu_to_le64(dma_addr);
	cid.identify.cns = cpu_to_le32(1);

	res = nvme_submit_admin_cmd(dev, &cid, NULL);
	if (res) {
		res = -EIO;
		goto out_free;
	}

	ctrl = id;
	nn = le32_to_cpup(&ctrl->nn);
	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));

	cid.identify.cns = 0;
	memset(&crt, 0, sizeof(crt));
	crt.features.opcode = nvme_admin_get_features;
	crt.features.prp1 = cpu_to_le64(dma_addr + 4096);
	crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);

	/* Namespace IDs are 1-based; 0 is not a valid NSID */
	for (i = 1; i <= nn; i++) {
		cid.identify.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &cid, NULL);
		if (res)
			continue;

		if (((struct nvme_id_ns *)id)->ncap == 0)
			continue;

		crt.features.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &crt, NULL);
		if (res)
			continue;

		ns = nvme_alloc_ns(dev, i, id, id + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);

	dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
	return 0;

 out_free:
	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		nvme_ns_free(ns);
	}

	dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
	return res;
}

static int nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	/* TODO: wait all I/O finished or cancel them */

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		del_gendisk(ns->disk);
		nvme_ns_free(ns);
	}

	nvme_free_queues(dev);

	return 0;
}

/* XXX: Use an ida or something to let remove / add work correctly */
static void nvme_set_instance(struct nvme_dev *dev)
{
	static int instance;
	dev->instance = instance++;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
}
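/*
 * Probe sequence: enable the PCI function, map the first 8KiB of BAR 0
 * (the controller registers followed by the doorbells), bring up the
 * admin queue, and then let nvme_dev_add() create the I/O queues and
 * register a gendisk for each active namespace.  The error labels below
 * unwind in reverse order.
 */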
static int __devinit nvme_probe(struct pci_dev *pdev,
						const struct pci_device_id *id)
{
	int bars, result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
								GFP_KERNEL);
	if (!dev->queues)
		goto free;

	if (pci_enable_device_mem(pdev))
		goto free;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable;

	INIT_LIST_HEAD(&dev->namespaces);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
	dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
	nvme_set_instance(dev);
	dev->entry[0].vector = pdev->irq;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar) {
		result = -ENOMEM;
		goto disable_msix;
	}

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;
	dev->queue_count++;

	result = nvme_dev_add(dev);
	if (result)
		goto delete;
	return 0;

 delete:
	nvme_free_queues(dev);
 unmap:
	iounmap(dev->bar);
 disable_msix:
	pci_disable_msix(pdev);
	nvme_release_instance(dev);
 disable:
	pci_disable_device(pdev);
	pci_release_regions(pdev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void __devexit nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_remove(dev);
	pci_disable_msix(pdev);
	iounmap(dev->bar);
	nvme_release_instance(dev);
	pci_disable_device(pdev);
	pci_release_regions(pdev);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL
#define nvme_suspend NULL
#define nvme_resume NULL

static struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= __devexit_p(nvme_remove),
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	nvme_major = register_blkdev(nvme_major, "nvme");
	if (nvme_major <= 0)
		return -EBUSY;

	result = pci_register_driver(&nvme_driver);
	if (!result)
		return 0;

	unregister_blkdev(nvme_major, "nvme");
	return result;
}
static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.1");
module_init(nvme_init);
module_exit(nvme_exit);