nvme-core.c revision cf90bc4830b858487fe4b9b9ecd0031e23ca3e83
/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define NVME_MINORS 64
#define ADMIN_TIMEOUT	(60 * HZ)

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 cq_phase;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	unsigned long timeout;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The function to call on completion
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;
	return cmdid;
}
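/*
 * Editor's note (illustrative, not from the original source): the per-queue
 * command-id bookkeeping above lives entirely in the cmdid_data[] flexible
 * array member.  For a queue of depth 1024 the trailing allocation looks
 * roughly like this:
 *
 *	cmdid_data[0 .. BITS_TO_LONGS(1024) - 1]   bitmap of in-use cmdids
 *	followed by 1024 * sizeof(struct nvme_cmd_info) bytes of
 *	{fn, ctx, timeout} slots, returned by nvme_cmd_info().
 *
 * Looking up the completion handler for a command id is then just:
 *
 *	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
 *	info[cmdid].fn(dev, info[cmdid].ctx, &cqe);
 *
 * which is what free_cmdid() and nvme_process_cq() do further down.
 */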
static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}

/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)

static void special_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_FLUSH)
		return;
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(&dev->pci_dev->dev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(&dev->pci_dev->dev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}

	dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

	if (cmdid >= nvmeq->q_depth) {
		*fn = special_completion;
		return CMD_CTX_INVALID;
	}
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_COMPLETED;
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return ctx;
}

static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_CANCELLED;
	return ctx;
}

struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
{
	return dev->queues[get_cpu() + 1];
}

void put_nvmeq(struct nvme_queue *nvmeq)
{
	put_cpu();
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}

static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}
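/*
 * Editor's note (illustrative, not from the original source): nvme_alloc_iod()
 * below makes a single allocation that iod_list() indexes into.  For a
 * request with nseg scatterlist segments and nbytes of data it is laid out as:
 *
 *	struct nvme_iod header, ending in its sg[] flexible array
 *	struct scatterlist sg[nseg]        mapping of the request's segments
 *	__le64 *[nvme_npages(nbytes)]      pointers to PRP list pages
 *
 * iod->offset records where the PRP-list pointer array starts, so
 * iod_list(iod)[n] is the nth PRP list page allocated by nvme_setup_prps().
 */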
 */
static int nvme_npages(unsigned size)
{
	unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static struct nvme_iod *
nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(nbytes) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod) {
		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
		iod->npages = -1;
		iod->length = nbytes;
		iod->nents = 0;
	}

	return iod;
}

void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = PAGE_SIZE / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}
	kfree(iod);
}

static void bio_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct bio *bio = iod->private;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	if (iod->nents)
		dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	nvme_free_iod(dev, iod);
	if (status)
		bio_endio(bio, -EIO);
	else
		bio_endio(bio, 0);
}

/* length is in bytes.  gfp flags indicate whether we may sleep. */
int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
			struct nvme_iod *iod, int total_len, gfp_t gfp)
{
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = offset_in_page(dma_addr);
	__le64 *prp_list;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma;
	int nprps, i;

	cmd->prp1 = cpu_to_le64(dma_addr);
	length -= (PAGE_SIZE - offset);
	if (length <= 0)
		return total_len;

	dma_len -= (PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= PAGE_SIZE) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		return total_len;
	}

	nprps = DIV_ROUND_UP(length, PAGE_SIZE);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
	if (!prp_list) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		iod->npages = -1;
		return (total_len - length) + PAGE_SIZE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	cmd->prp2 = cpu_to_le64(prp_dma);
	i = 0;
	for (;;) {
		if (i == PAGE_SIZE / 8) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
			if (!prp_list)
				return total_len - length;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= PAGE_SIZE;
		dma_addr += PAGE_SIZE;
		length -= PAGE_SIZE;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return total_len;
}
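/*
 * Editor's note: a worked example (illustrative only, 4K pages assumed) of
 * what nvme_setup_prps() builds.  For a page-aligned 16K transfer the command
 * ends up with:
 *
 *	prp1 = DMA address of page 0
 *	prp2 = DMA address of a PRP list from prp_small_pool, containing
 *	       { page 1, page 2, page 3 } as little-endian 64-bit entries
 *
 * For an 8K transfer no list is needed and prp2 simply points at the second
 * page.  Transfers needing more than 32 list entries use prp_page_pool, and
 * the last slot of each full list page is overwritten with the DMA address
 * of the next list page, which is the chain nvme_free_iod() walks when the
 * I/O is torn down.
 */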
struct nvme_bio_pair {
	struct bio b1, b2, *parent;
	struct bio_vec *bv1, *bv2;
	int err;
	atomic_t cnt;
};

static void nvme_bio_pair_endio(struct bio *bio, int err)
{
	struct nvme_bio_pair *bp = bio->bi_private;

	if (err)
		bp->err = err;

	if (atomic_dec_and_test(&bp->cnt)) {
		bio_endio(bp->parent, bp->err);
		if (bp->bv1)
			kfree(bp->bv1);
		if (bp->bv2)
			kfree(bp->bv2);
		kfree(bp);
	}
}

static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx,
							int len, int offset)
{
	struct nvme_bio_pair *bp;

	BUG_ON(len > bio->bi_size);
	BUG_ON(idx > bio->bi_vcnt);

	bp = kmalloc(sizeof(*bp), GFP_ATOMIC);
	if (!bp)
		return NULL;
	bp->err = 0;

	bp->b1 = *bio;
	bp->b2 = *bio;

	bp->b1.bi_size = len;
	bp->b2.bi_size -= len;
	bp->b1.bi_vcnt = idx;
	bp->b2.bi_idx = idx;
	bp->b2.bi_sector += len >> 9;

	if (offset) {
		bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec),
								GFP_ATOMIC);
		if (!bp->bv1)
			goto split_fail_1;

		bp->bv2 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec),
								GFP_ATOMIC);
		if (!bp->bv2)
			goto split_fail_2;

		memcpy(bp->bv1, bio->bi_io_vec,
			bio->bi_max_vecs * sizeof(struct bio_vec));
		memcpy(bp->bv2, bio->bi_io_vec,
			bio->bi_max_vecs * sizeof(struct bio_vec));

		bp->b1.bi_io_vec = bp->bv1;
		bp->b2.bi_io_vec = bp->bv2;
		bp->b2.bi_io_vec[idx].bv_offset += offset;
		bp->b2.bi_io_vec[idx].bv_len -= offset;
		bp->b1.bi_io_vec[idx].bv_len = offset;
		bp->b1.bi_vcnt++;
	} else
		bp->bv1 = bp->bv2 = NULL;

	bp->b1.bi_private = bp;
	bp->b2.bi_private = bp;

	bp->b1.bi_end_io = nvme_bio_pair_endio;
	bp->b2.bi_end_io = nvme_bio_pair_endio;

	bp->parent = bio;
	atomic_set(&bp->cnt, 2);

	return bp;

 split_fail_2:
	kfree(bp->bv1);
 split_fail_1:
	kfree(bp);
	return NULL;
}

static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
						int idx, int len, int offset)
{
	struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset);
	if (!bp)
		return -ENOMEM;

	if (bio_list_empty(&nvmeq->sq_cong))
		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
	bio_list_add(&nvmeq->sq_cong, &bp->b1);
	bio_list_add(&nvmeq->sq_cong, &bp->b2);

	return 0;
}

/* NVMe scatterlists require no holes in the virtual address */
#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
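/*
 * Editor's note (illustrative): the macro above flags a hole between two
 * biovecs.  Assuming 4K pages, a vec that ends exactly on a page boundary
 * (bv_offset + bv_len a multiple of PAGE_SIZE) followed by a vec starting at
 * bv_offset 0 leaves no hole and may continue the same request, whereas a
 * vec ending mid-page (say bv_offset 0, bv_len 0x200) or a successor with a
 * non-zero bv_offset does.  When nvme_map_bio() below hits such a hole it
 * splits the bio at that segment via nvme_split_and_submit().
 */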
static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec *bvec, *bvprv = NULL;
	struct scatterlist *sg = NULL;
	int i, length = 0, nsegs = 0, split_len = bio->bi_size;

	if (nvmeq->dev->stripe_size)
		split_len = nvmeq->dev->stripe_size -
			((bio->bi_sector << 9) & (nvmeq->dev->stripe_size - 1));

	sg_init_table(iod->sg, psegs);
	bio_for_each_segment(bvec, bio, i) {
		if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
			sg->length += bvec->bv_len;
		} else {
			if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
				return nvme_split_and_submit(bio, nvmeq, i,
								length, 0);

			sg = sg ? sg + 1 : iod->sg;
			sg_set_page(sg, bvec->bv_page, bvec->bv_len,
							bvec->bv_offset);
			nsegs++;
		}

		if (split_len - length < bvec->bv_len)
			return nvme_split_and_submit(bio, nvmeq, i, split_len,
							split_len - length);
		length += bvec->bv_len;
		bvprv = bvec;
	}
	iod->nents = nsegs;
	sg_mark_end(sg);
	if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
		return -ENOMEM;

	BUG_ON(length != bio->bi_size);
	return length;
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct bio *bio, struct nvme_iod *iod, int cmdid)
{
	struct nvme_dsm_range *range;
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
							&iod->first_dma);
	if (!range)
		return -ENOMEM;

	iod_list(iod)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.command_id = cmdid;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								int cmdid)
{
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.command_id = cmdid;
	cmnd->common.nsid = cpu_to_le32(ns->ns_id);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
{
	int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
					special_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		return cmdid;

	return nvme_submit_flush(nvmeq, ns, cmdid);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_command *cmnd;
	struct nvme_iod *iod;
	enum dma_data_direction dma_dir;
	int cmdid, length, result;
	u16 control;
	u32 dsmgmt;
	int psegs = bio_phys_segments(ns->queue, bio);

	if ((bio->bi_rw & REQ_FLUSH) && psegs) {
		result = nvme_submit_flush_data(nvmeq, ns);
		if (result)
			return result;
	}

	result = -ENOMEM;
	iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
	if (!iod)
		goto nomem;
	iod->private = bio;

	result = -EBUSY;
	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		goto free_iod;

	if (bio->bi_rw & REQ_DISCARD) {
		result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
		if (result)
			goto free_cmdid;
		return result;
	}
	if ((bio->bi_rw & REQ_FLUSH) && !psegs)
		return nvme_submit_flush(nvmeq, ns, cmdid);

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
		cmnd->rw.opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->rw.opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
	if (result <= 0)
		goto free_cmdid;
	length = result;

	cmnd->rw.command_id = cmdid;
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
								GFP_ATOMIC);
	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector));
	cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;

 free_cmdid:
	free_cmdid(nvmeq, cmdid, NULL);
 free_iod:
	nvme_free_iod(nvmeq->dev, iod);
 nomem:
	return result;
}

static void nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
	int result = -EBUSY;

	spin_lock_irq(&nvmeq->q_lock);
	if (bio_list_empty(&nvmeq->sq_cong))
		result = nvme_submit_bio_queue(nvmeq, ns, bio);
	if (unlikely(result)) {
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add(&nvmeq->sq_cong, bio);
	}

	spin_unlock_irq(&nvmeq->q_lock);
	put_nvmeq(nvmeq);
}
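/*
 * Editor's note (illustrative): nvme_process_cq() below relies on the NVMe
 * phase tag to spot new completion entries without a producer index.  The
 * queue starts with cq_phase == 1; the controller writes each entry with the
 * current phase in bit 0 of the status field, so an entry whose phase bit
 * differs from cq_phase has not been produced yet.  Each time cq_head wraps
 * past q_depth the expected phase is inverted, e.g. for a 4-entry queue:
 *
 *	pass 1: entries arrive with phase 1, consumed while cq_phase == 1
 *	pass 2: entries arrive with phase 0, consumed while cq_phase == 0
 */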
static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return IRQ_NONE;

	writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	return IRQ_HANDLED;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	result = nvme_process_cq(nvmeq);
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
	spin_lock_irq(&nvmeq->q_lock);
	cancel_cmdid(nvmeq, cmdid, NULL);
	spin_unlock_irq(&nvmeq->q_lock);
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
						u32 *result, unsigned timeout)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
								timeout);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_KILLABLE);
	nvme_submit_cmd(nvmeq, cmd);
	schedule_timeout(timeout);

	if (cmdinfo.status == -EINTR) {
		nvme_abort_command(nvmeq, cmdid);
		return -EINTR;
	}

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}

int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
							dma_addr_t dma_addr)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);
	c.identify.prp1 = cpu_to_le64(dma_addr);
	c.identify.cns = cpu_to_le32(cns);

	return nvme_submit_admin_cmd(dev, &c, NULL);
}

int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(nsid);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);

	return nvme_submit_admin_cmd(dev, &c, result);
}

int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return nvme_submit_admin_cmd(dev, &c, result);
}

/**
 * nvme_cancel_ios - Cancel outstanding I/Os
 * @queue: The queue to cancel I/Os on
 * @timeout: True to only cancel I/Os which have timed out
 */
static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	unsigned long now = jiffies;
	int cmdid;

	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
		void *ctx;
		nvme_completion_fn fn;
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
		};

		if (timeout && !time_after(now, info[cmdid].timeout))
			continue;
		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
			continue;
		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}
}

static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static void nvme_free_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];
	int vector = dev->entry[nvmeq->cq_vector].vector;

	spin_lock_irq(&nvmeq->q_lock);
	nvme_cancel_ios(nvmeq, false);
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		bio_endio(bio, -EIO);
	}
	spin_unlock_irq(&nvmeq->q_lock);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	/* Don't tell the adapter to delete the admin queue */
	if (qid) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	nvme_free_queue_mem(nvmeq);
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
						sizeof(struct nvme_cmd_info));
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq,
					IRQF_DISABLED | IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
}

static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid,
					    int cq_size, int vector)
{
	int result;
	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);

	if (!nvmeq)
		return ERR_PTR(-ENOMEM);

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		goto free_nvmeq;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, "nvme");
	if (result < 0)
		goto release_sq;

	return nvmeq;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
 free_nvmeq:
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
	return ERR_PTR(result);
}
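/*
 * Editor's note (illustrative): doorbell placement used above.  With a
 * doorbell stride of 0 (db_stride == 0, i.e. 4-byte registers back to back),
 * queue qid's submission doorbell sits at dev->dbs[qid * 2] and its
 * completion doorbell at dev->dbs[qid * 2 + 1].  That is what
 * nvme_alloc_queue()'s "qid << (dev->db_stride + 1)" and nvme_process_cq()'s
 * "q_db + (1 << db_stride)" compute; a non-zero CAP.DSTRD simply spreads the
 * pairs further apart, which is why nvme_setup_io_queues() later recomputes
 * the doorbell BAR size as 4096 + ((nr_io_queues + 1) << (db_stride + 3)).
 */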
static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
	unsigned long timeout;
	u32 bit = enabled ? NVME_CSTS_RDY : 0;

	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

	while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device not ready; aborting initialisation\n");
			return -ENODEV;
		}
	}

	return 0;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
{
	u32 cc = readl(&dev->bar->cc);

	if (cc & NVME_CC_ENABLE)
		writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc);
	return nvme_wait_ready(dev, cap, false);
}

static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
	return nvme_wait_ready(dev, cap, true);
}

static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = readq(&dev->bar->cap);
	struct nvme_queue *nvmeq;

	dev->dbs = ((void __iomem *)dev->bar) + 4096;
	dev->db_stride = NVME_CAP_STRIDE(cap);

	result = nvme_disable_ctrl(dev, cap);
	if (result < 0)
		return result;

	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
	if (!nvmeq)
		return -ENOMEM;

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	result = nvme_enable_ctrl(dev, cap);
	if (result)
		goto free_q;

	result = queue_request_irq(dev, nvmeq, "nvme admin");
	if (result)
		goto free_q;

	dev->queues[0] = nvmeq;
	return result;

 free_q:
	nvme_free_queue_mem(nvmeq);
	return result;
}

struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, unsigned length)
{
	int i, err, count, nents, offset;
	struct scatterlist *sg;
	struct page **pages;
	struct nvme_iod *iod;

	if (addr & 3)
		return ERR_PTR(-EINVAL);
	if (!length || length > INT_MAX - PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	offset = offset_in_page(addr);
	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	err = get_user_pages_fast(addr, count, 1, pages);
	if (err < count) {
		count = err;
		err = -EFAULT;
		goto put_pages;
	}

	iod = nvme_alloc_iod(count, length, GFP_KERNEL);
	sg = iod->sg;
	sg_init_table(sg, count);
	for (i = 0; i < count; i++) {
		sg_set_page(&sg[i], pages[i],
				min_t(unsigned, length, PAGE_SIZE - offset),
				offset);
		length -= (PAGE_SIZE - offset);
		offset = 0;
	}
	sg_mark_end(&sg[i - 1]);
	iod->nents = count;

	err = -ENOMEM;
	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (!nents)
		goto free_iod;

	kfree(pages);
	return iod;

 free_iod:
	kfree(iod);
 put_pages:
	for (i = 0; i < count; i++)
		put_page(pages[i]);
	kfree(pages);
	return ERR_PTR(err);
}

void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
			struct nvme_iod *iod)
{
	int i;

	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);

	for (i = 0; i < iod->nents; i++)
		put_page(sg_page(&iod->sg[i]));
}

static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_queue *nvmeq;
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	int status, i;
	struct nvme_iod *iod, *meta_iod = NULL;
	dma_addr_t meta_dma_addr;
	void *meta, *uninitialized_var(meta_mem);

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;

	if (meta_len && ((io.metadata & 3) || !io.metadata))
		return -EINVAL;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
		break;
	default:
		return -EINVAL;
	}

	if (IS_ERR(iod))
		return PTR_ERR(iod);

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	if (meta_len) {
		meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, meta_len);
		if (IS_ERR(meta_iod)) {
			status = PTR_ERR(meta_iod);
			meta_iod = NULL;
			goto unmap;
		}

		meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
						&meta_dma_addr, GFP_KERNEL);
		if (!meta_mem) {
			status = -ENOMEM;
			goto unmap;
		}

		if (io.opcode & 1) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta_mem + meta_offset, meta,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		c.rw.metadata = cpu_to_le64(meta_dma_addr);
	}

	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);

	nvmeq = get_nvmeq(dev);
	/*
	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
	 * disabled.  We may be preempted at any point, and be rescheduled
	 * to a different CPU.  That will cause cacheline bouncing, but no
	 * additional races since q_lock already protects against other CPUs.
	 */
	put_nvmeq(nvmeq);
	if (length != (io.nblocks + 1) << ns->lba_shift)
		status = -ENOMEM;
	else
		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);

	if (meta_len) {
		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta, meta_mem + meta_offset,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem,
								meta_dma_addr);
	}

 unmap:
	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
	nvme_free_iod(dev, iod);

	if (meta_iod) {
		nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod);
		nvme_free_iod(dev, meta_iod);
	}

	return status;
}
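/*
 * Editor's note: a minimal user-space sketch (illustrative only, not part of
 * the driver) of driving nvme_submit_io() above through NVME_IOCTL_SUBMIT_IO.
 * It assumes a 512-byte LBA format and no metadata; nblocks is zero-based,
 * so 0 means one block, and addr must be 4-byte aligned.
 *
 *	struct nvme_user_io io = {
 *		.opcode  = nvme_cmd_read,
 *		.nblocks = 7,				// eight LBAs
 *		.slba    = 0,
 *		.addr    = (__u64)(uintptr_t)buf,
 *	};
 *	int fd = open("/dev/nvme0n1", O_RDONLY);
 *	int err = ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io);
 *
 * A zero return is success, a positive return is an NVMe status code and a
 * negative one is a Linux errno, mirroring nvme_submit_sync_cmd()'s contract.
 */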
static int nvme_user_admin_cmd(struct nvme_dev *dev,
					struct nvme_admin_cmd __user *ucmd)
{
	struct nvme_admin_cmd cmd;
	struct nvme_command c;
	int status, length;
	struct nvme_iod *uninitialized_var(iod);
	unsigned timeout;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	length = cmd.data_len;
	if (cmd.data_len) {
		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
								length);
		if (IS_ERR(iod))
			return PTR_ERR(iod);
		length = nvme_setup_prps(dev, &c.common, iod, length,
								GFP_KERNEL);
	}

	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
								ADMIN_TIMEOUT;
	if (length != cmd.data_len)
		status = -ENOMEM;
	else
		status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result,
								timeout);

	if (cmd.data_len) {
		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
		nvme_free_iod(dev, iod);
	}

	if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
							sizeof(cmd.result)))
		status = -EFAULT;

	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num((void __user *)arg);
	case SG_IO:
		return nvme_sg_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_ioctl,
};

static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
{
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;

		if (bio_list_empty(&nvmeq->sq_cong))
			remove_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
			if (bio_list_empty(&nvmeq->sq_cong))
				add_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
			bio_list_add_head(&nvmeq->sq_cong, bio);
			break;
		}
	}
}

static int nvme_kthread(void *data)
{
	struct nvme_dev *dev;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry(dev, &dev_list, node) {
			int i;
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				if (nvme_process_cq(nvmeq))
					printk("process_cq did something\n");
				nvme_cancel_ios(nvmeq, true);
				nvme_resubmit_bios(nvmeq);
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		schedule_timeout(round_jiffies_relative(HZ));
	}
	return 0;
}

static DEFINE_IDA(nvme_index_ida);

static int nvme_get_ns_idx(void)
{
	int index, error;

	do {
		if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
			return -1;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_index_ida, &index);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		index = -1;
	return index;
}

static void nvme_put_ns_idx(int index)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_index_ida, index);
	spin_unlock(&dev_list_lock);
}

static void nvme_config_discard(struct nvme_ns *ns)
{
	u32 logical_block_size = queue_logical_block_size(ns->queue);
	ns->queue->limits.discard_zeroes_data = 0;
	ns->queue->limits.discard_alignment = logical_block_size;
	ns->queue->limits.discard_granularity = logical_block_size;
	ns->queue->limits.max_discard_sectors = 0xffffffff;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(NVME_MINORS);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = nsid;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;
	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	if (dev->max_hw_sectors)
		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);

	disk->major = nvme_major;
	disk->minors = NVME_MINORS;
	disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = &dev->pci_dev->dev;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	if (dev->oncs & NVME_CTRL_ONCS_DSM)
		nvme_config_discard(ns);

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}

static void nvme_ns_free(struct nvme_ns *ns)
{
	int index = ns->disk->first_minor / NVME_MINORS;
	put_disk(ns->disk);
	nvme_put_ns_idx(index);
	blk_cleanup_queue(ns->queue);
	kfree(ns);
}

static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
								&result);
	if (status)
		return -EIO;
	return min(result & 0xffff, result >> 16) + 1;
}
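/*
 * Editor's note (illustrative): the Number of Queues feature is zero-based in
 * both directions.  Asking for 4 I/O queues sends dword11 = 0x00030003
 * (NCQR = NSQR = 3); if the controller answers with, say, 0x00070003 it has
 * granted 4 submission queues and 8 completion queues, and the expression
 * above returns min(3, 7) + 1 = 4 usable queue pairs.
 */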
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	int result, cpu, i, nr_io_queues, db_bar_size, q_depth;

	nr_io_queues = num_online_cpus();
	result = set_queue_count(dev, nr_io_queues);
	if (result < 0)
		return result;
	if (result < nr_io_queues)
		nr_io_queues = result;

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, dev->queues[0]);

	db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
	if (db_bar_size > 8192) {
		iounmap(dev->bar);
		dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0),
								db_bar_size);
		dev->dbs = ((void __iomem *)dev->bar) + 4096;
		dev->queues[0]->q_db = dev->dbs;
	}

	for (i = 0; i < nr_io_queues; i++)
		dev->entry[i].entry = i;
	for (;;) {
		result = pci_enable_msix(dev->pci_dev, dev->entry,
								nr_io_queues);
		if (result == 0) {
			break;
		} else if (result > 0) {
			nr_io_queues = result;
			continue;
		} else {
			nr_io_queues = 1;
			break;
		}
	}

	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
	/* XXX: handle failure here */

	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_io_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
								NVME_Q_DEPTH);
	for (i = 0; i < nr_io_queues; i++) {
		dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
		if (IS_ERR(dev->queues[i + 1]))
			return PTR_ERR(dev->queues[i + 1]);
		dev->queue_count++;
	}

	for (; i < num_possible_cpus(); i++) {
		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
		dev->queues[i + 1] = dev->queues[target + 1];
	}

	return 0;
}

static void nvme_free_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_free_queue(dev, i);
}

/*
 * Return: error value if an error occurred setting up the queues or calling
 * Identify Device.  0 if these succeeded, even if adding some of the
 * namespaces failed.  At the moment, these failures are silent.  TBD which
 * failures should be reported.
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	int res, nn, i;
	struct nvme_ns *ns;
	struct nvme_id_ctrl *ctrl;
	struct nvme_id_ns *id_ns;
	void *mem;
	dma_addr_t dma_addr;
	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;

	res = nvme_setup_io_queues(dev);
	if (res)
		return res;

	mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
								GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	res = nvme_identify(dev, 0, 1, dma_addr);
	if (res) {
		res = -EIO;
		goto out;
	}

	ctrl = mem;
	nn = le32_to_cpup(&ctrl->nn);
	dev->oncs = le16_to_cpup(&ctrl->oncs);
	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
	if (ctrl->mdts)
		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
	if ((dev->pci_dev->vendor == PCI_VENDOR_ID_INTEL) &&
			(dev->pci_dev->device == 0x0953) && ctrl->vs[3])
		dev->stripe_size = 1 << (ctrl->vs[3] + shift);

	id_ns = mem;
	for (i = 1; i <= nn; i++) {
		res = nvme_identify(dev, i, 0, dma_addr);
		if (res)
			continue;

		if (id_ns->ncap == 0)
			continue;

		res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
							dma_addr + 4096, NULL);
		if (res)
			memset(mem + 4096, 0, 4096);

		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);
	res = 0;

 out:
	dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
	return res;
}

static int nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	spin_lock(&dev_list_lock);
	list_del(&dev->node);
	spin_unlock(&dev_list_lock);

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		del_gendisk(ns->disk);
		nvme_ns_free(ns);
	}

	nvme_free_queues(dev);

	return 0;
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	struct device *dmadev = &dev->pci_dev->dev;
	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static DEFINE_IDA(nvme_instance_ida);

static int nvme_set_instance(struct nvme_dev *dev)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	dev->instance = instance;
	return 0;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, dev->instance);
	spin_unlock(&dev_list_lock);
}

static void nvme_free_dev(struct kref *kref)
{
	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
	nvme_dev_remove(dev);
	pci_disable_msix(dev->pci_dev);
	iounmap(dev->bar);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
	pci_disable_device(dev->pci_dev);
	pci_release_regions(dev->pci_dev);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

static int nvme_dev_open(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev,
								miscdev);
	kref_get(&dev->kref);
	f->private_data = dev;
	return 0;
}

static int nvme_dev_release(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = f->private_data;
	kref_put(&dev->kref, nvme_free_dev);
	return 0;
}

static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	struct nvme_dev *dev = f->private_data;
	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(dev, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int bars, result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
								GFP_KERNEL);
	if (!dev->queues)
		goto free;

	if (pci_enable_device_mem(pdev))
		goto free;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable;

	INIT_LIST_HEAD(&dev->namespaces);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
	dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
	result = nvme_set_instance(dev);
	if (result)
		goto disable;

	dev->entry[0].vector = pdev->irq;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto disable_msix;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar) {
		result = -ENOMEM;
		goto disable_msix;
	}

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;
	dev->queue_count++;

	spin_lock(&dev_list_lock);
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	result = nvme_dev_add(dev);
	if (result)
		goto delete;

	scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
	dev->miscdev.parent = &pdev->dev;
	dev->miscdev.name = dev->name;
	dev->miscdev.fops = &nvme_dev_fops;
	result = misc_register(&dev->miscdev);
	if (result)
		goto remove;

	kref_init(&dev->kref);
	return 0;

 remove:
	nvme_dev_remove(dev);
 delete:
	spin_lock(&dev_list_lock);
	list_del(&dev->node);
	spin_unlock(&dev_list_lock);

	nvme_free_queues(dev);
 unmap:
	iounmap(dev->bar);
 disable_msix:
	pci_disable_msix(pdev);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
 disable:
	pci_disable_device(pdev);
	pci_release_regions(pdev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	misc_deregister(&dev->miscdev);
	kref_put(&dev->kref, nvme_free_dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL
#define nvme_suspend NULL
#define nvme_resume NULL

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
	if (IS_ERR(nvme_thread))
		return PTR_ERR(nvme_thread);

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		goto kill_kthread;
	else if (result > 0)
		nvme_major = result;

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto unregister_blkdev;
	return 0;

 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_kthread:
	kthread_stop(nvme_thread);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	kthread_stop(nvme_thread);
}
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.8");
module_init(nvme_init);
module_exit(nvme_exit);