nvme-core.c revision 7d8224574cbd2326a6be00f319f5f7597abec3f6
/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define NVME_MINORS 64
#define ADMIN_TIMEOUT	(60 * HZ)

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u8 cq_phase;
	u8 cqe_seen;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	unsigned long timeout;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The function to call on completion
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}

/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)

static void special_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_FLUSH)
		return;
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(&dev->pci_dev->dev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(&dev->pci_dev->dev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}

	dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

	if (cmdid >= nvmeq->q_depth) {
		*fn = special_completion;
		return CMD_CTX_INVALID;
	}
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_COMPLETED;
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return ctx;
}

static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_CANCELLED;
	return ctx;
}

struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
{
	return dev->queues[get_cpu() + 1];
}

void put_nvmeq(struct nvme_queue *nvmeq)
{
	put_cpu();
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}

static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size)
{
	unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static struct nvme_iod *
nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(nbytes) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod) {
		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
		iod->npages = -1;
		iod->length = nbytes;
		iod->nents = 0;
		iod->start_time = jiffies;
	}

	return iod;
}

void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = PAGE_SIZE / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}
	kfree(iod);
}

static void nvme_start_io_acct(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	int cpu = part_stat_lock();
	part_round_stats(cpu, &disk->part0);
	part_stat_inc(cpu, &disk->part0, ios[rw]);
	part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
	part_inc_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	unsigned long duration = jiffies - start_time;
	int cpu = part_stat_lock();
	part_stat_add(cpu, &disk->part0, ticks[rw], duration);
	part_round_stats(cpu, &disk->part0);
	part_dec_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void bio_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct bio *bio = iod->private;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	if (iod->nents)
		dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);

	nvme_end_io_acct(bio, iod->start_time);
	nvme_free_iod(dev, iod);
	if (status)
		bio_endio(bio, -EIO);
	else
		bio_endio(bio, 0);
}

/* length is in bytes.  gfp flags indicates whether we may sleep. */
int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
			struct nvme_iod *iod, int total_len, gfp_t gfp)
{
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = offset_in_page(dma_addr);
	__le64 *prp_list;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma;
	int nprps, i;

	cmd->prp1 = cpu_to_le64(dma_addr);
	length -= (PAGE_SIZE - offset);
	if (length <= 0)
		return total_len;

	dma_len -= (PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= PAGE_SIZE) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		return total_len;
	}

	nprps = DIV_ROUND_UP(length, PAGE_SIZE);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
	if (!prp_list) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		iod->npages = -1;
		return (total_len - length) + PAGE_SIZE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	cmd->prp2 = cpu_to_le64(prp_dma);
	i = 0;
	for (;;) {
		if (i == PAGE_SIZE / 8) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
			if (!prp_list)
				return total_len - length;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= PAGE_SIZE;
		dma_addr += PAGE_SIZE;
		length -= PAGE_SIZE;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return total_len;
}

struct nvme_bio_pair {
	struct bio b1, b2, *parent;
	struct bio_vec *bv1, *bv2;
	int err;
	atomic_t cnt;
};

static void nvme_bio_pair_endio(struct bio *bio, int err)
{
	struct nvme_bio_pair *bp = bio->bi_private;

	if (err)
		bp->err = err;

	if (atomic_dec_and_test(&bp->cnt)) {
		bio_endio(bp->parent, bp->err);
		if (bp->bv1)
			kfree(bp->bv1);
		if (bp->bv2)
			kfree(bp->bv2);
		kfree(bp);
	}
}

static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx,
							int len, int offset)
{
	struct nvme_bio_pair *bp;

	BUG_ON(len > bio->bi_size);
	BUG_ON(idx > bio->bi_vcnt);

	bp = kmalloc(sizeof(*bp), GFP_ATOMIC);
	if (!bp)
		return NULL;
	bp->err = 0;

	bp->b1 = *bio;
	bp->b2 = *bio;

	bp->b1.bi_size = len;
	bp->b2.bi_size -= len;
	bp->b1.bi_vcnt = idx;
	bp->b2.bi_idx = idx;
	bp->b2.bi_sector += len >> 9;

	if (offset) {
		bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec),
								GFP_ATOMIC);
		if (!bp->bv1)
			goto split_fail_1;

		bp->bv2 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec),
								GFP_ATOMIC);
		if (!bp->bv2)
			goto split_fail_2;

		memcpy(bp->bv1, bio->bi_io_vec,
			bio->bi_max_vecs * sizeof(struct bio_vec));
		memcpy(bp->bv2, bio->bi_io_vec,
			bio->bi_max_vecs * sizeof(struct bio_vec));

		bp->b1.bi_io_vec = bp->bv1;
		bp->b2.bi_io_vec = bp->bv2;
		bp->b2.bi_io_vec[idx].bv_offset += offset;
		bp->b2.bi_io_vec[idx].bv_len -= offset;
		bp->b1.bi_io_vec[idx].bv_len = offset;
		bp->b1.bi_vcnt++;
	} else
		bp->bv1 = bp->bv2 = NULL;

	bp->b1.bi_private = bp;
	bp->b2.bi_private = bp;

	bp->b1.bi_end_io = nvme_bio_pair_endio;
	bp->b2.bi_end_io = nvme_bio_pair_endio;

	bp->parent = bio;
	atomic_set(&bp->cnt, 2);

	return bp;

 split_fail_2:
	kfree(bp->bv1);
 split_fail_1:
	kfree(bp);
	return NULL;
}

static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
						int idx, int len, int offset)
{
	struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset);
	if (!bp)
		return -ENOMEM;

	if (bio_list_empty(&nvmeq->sq_cong))
		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
	bio_list_add(&nvmeq->sq_cong, &bp->b1);
	bio_list_add(&nvmeq->sq_cong, &bp->b2);

	return 0;
}

/* NVMe scatterlists require no holes in the virtual address */
#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))

static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec *bvec, *bvprv = NULL;
	struct scatterlist *sg = NULL;
	int i, length = 0, nsegs = 0, split_len = bio->bi_size;

	if (nvmeq->dev->stripe_size)
		split_len = nvmeq->dev->stripe_size -
			((bio->bi_sector << 9) & (nvmeq->dev->stripe_size - 1));

	sg_init_table(iod->sg, psegs);
	bio_for_each_segment(bvec, bio, i) {
		if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
			sg->length += bvec->bv_len;
		} else {
			if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
				return nvme_split_and_submit(bio, nvmeq, i,
								length, 0);

			sg = sg ? sg + 1 : iod->sg;
			sg_set_page(sg, bvec->bv_page, bvec->bv_len,
							bvec->bv_offset);
			nsegs++;
		}

		if (split_len - length < bvec->bv_len)
			return nvme_split_and_submit(bio, nvmeq, i, split_len,
							split_len - length);
		length += bvec->bv_len;
		bvprv = bvec;
	}
	iod->nents = nsegs;
	sg_mark_end(sg);
	if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
		return -ENOMEM;

	BUG_ON(length != bio->bi_size);
	return length;
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct bio *bio, struct nvme_iod *iod, int cmdid)
{
	struct nvme_dsm_range *range;
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
							&iod->first_dma);
	if (!range)
		return -ENOMEM;

	iod_list(iod)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.command_id = cmdid;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								int cmdid)
{
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.command_id = cmdid;
	cmnd->common.nsid = cpu_to_le32(ns->ns_id);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
{
	int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
					special_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		return cmdid;

	return nvme_submit_flush(nvmeq, ns, cmdid);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_command *cmnd;
	struct nvme_iod *iod;
	enum dma_data_direction dma_dir;
	int cmdid, length, result;
	u16 control;
	u32 dsmgmt;
	int psegs = bio_phys_segments(ns->queue, bio);

	if ((bio->bi_rw & REQ_FLUSH) && psegs) {
		result = nvme_submit_flush_data(nvmeq, ns);
		if (result)
			return result;
	}

	result = -ENOMEM;
	iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
	if (!iod)
		goto nomem;
	iod->private = bio;

	result = -EBUSY;
	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		goto free_iod;

	if (bio->bi_rw & REQ_DISCARD) {
		result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
		if (result)
			goto free_cmdid;
		return result;
	}
	if ((bio->bi_rw & REQ_FLUSH) && !psegs)
		return nvme_submit_flush(nvmeq, ns, cmdid);

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
		cmnd->rw.opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->rw.opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
	if (result <= 0)
		goto free_cmdid;
	length = result;

	cmnd->rw.command_id = cmdid;
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
								GFP_ATOMIC);
	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector));
	cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);

	nvme_start_io_acct(bio);
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;

 free_cmdid:
	free_cmdid(nvmeq, cmdid, NULL);
 free_iod:
	nvme_free_iod(nvmeq->dev, iod);
 nomem:
	return result;
}

static int nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return 0;

	writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
	return 1;
}

static void nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
	int result = -EBUSY;

	spin_lock_irq(&nvmeq->q_lock);
	if (bio_list_empty(&nvmeq->sq_cong))
		result = nvme_submit_bio_queue(nvmeq, ns, bio);
	if (unlikely(result)) {
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add(&nvmeq->sq_cong, bio);
	}

	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	put_nvmeq(nvmeq);
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
	nvmeq->cqe_seen = 0;
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
	spin_lock_irq(&nvmeq->q_lock);
	cancel_cmdid(nvmeq, cmdid, NULL);
	spin_unlock_irq(&nvmeq->q_lock);
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
						u32 *result, unsigned timeout)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
								timeout);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_KILLABLE);
	nvme_submit_cmd(nvmeq, cmd);
	schedule_timeout(timeout);

	if (cmdinfo.status == -EINTR) {
		nvme_abort_command(nvmeq, cmdid);
		return -EINTR;
	}

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}

int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
							dma_addr_t dma_addr)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);
	c.identify.prp1 = cpu_to_le64(dma_addr);
	c.identify.cns = cpu_to_le32(cns);

	return nvme_submit_admin_cmd(dev, &c, NULL);
}

int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(nsid);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);

	return nvme_submit_admin_cmd(dev, &c, result);
}

int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return nvme_submit_admin_cmd(dev, &c, result);
}

/**
 * nvme_cancel_ios - Cancel outstanding I/Os
 * @queue: The queue to cancel I/Os on
 * @timeout: True to only cancel I/Os which have timed out
 */
static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	unsigned long now = jiffies;
	int cmdid;

	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
		void *ctx;
		nvme_completion_fn fn;
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
		};

		if (timeout && !time_after(now, info[cmdid].timeout))
			continue;
		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
			continue;
		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}
}

static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static void nvme_free_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];
	int vector = dev->entry[nvmeq->cq_vector].vector;

	spin_lock_irq(&nvmeq->q_lock);
	nvme_cancel_ios(nvmeq, false);
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		bio_endio(bio, -EIO);
	}
	spin_unlock_irq(&nvmeq->q_lock);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	/* Don't tell the adapter to delete the admin queue */
	if (qid) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	nvme_free_queue_mem(nvmeq);
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
						sizeof(struct nvme_cmd_info));
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq,
					IRQF_DISABLED | IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
}

static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid,
					    int cq_size, int vector)
{
	int result;
	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);

	if (!nvmeq)
		return ERR_PTR(-ENOMEM);

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		goto free_nvmeq;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, "nvme");
	if (result < 0)
		goto release_sq;

	return nvmeq;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
 free_nvmeq:
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
	return ERR_PTR(result);
}

static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
	unsigned long timeout;
	u32 bit = enabled ? NVME_CSTS_RDY : 0;

	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

	while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device not ready; aborting initialisation\n");
			return -ENODEV;
		}
	}

	return 0;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
{
	u32 cc = readl(&dev->bar->cc);

	if (cc & NVME_CC_ENABLE)
		writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc);
	return nvme_wait_ready(dev, cap, false);
}

static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
	return nvme_wait_ready(dev, cap, true);
}

static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = readq(&dev->bar->cap);
	struct nvme_queue *nvmeq;

	dev->dbs = ((void __iomem *)dev->bar) + 4096;
	dev->db_stride = NVME_CAP_STRIDE(cap);

	result = nvme_disable_ctrl(dev, cap);
	if (result < 0)
		return result;

	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
	if (!nvmeq)
		return -ENOMEM;

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	result = nvme_enable_ctrl(dev, cap);
	if (result)
		goto free_q;

	result = queue_request_irq(dev, nvmeq, "nvme admin");
	if (result)
		goto free_q;

	dev->queues[0] = nvmeq;
	return result;

 free_q:
	nvme_free_queue_mem(nvmeq);
	return result;
}

struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, unsigned length)
{
	int i, err, count, nents, offset;
	struct scatterlist *sg;
	struct page **pages;
	struct nvme_iod *iod;

	if (addr & 3)
		return ERR_PTR(-EINVAL);
	if (!length || length > INT_MAX - PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	offset = offset_in_page(addr);
	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	err = get_user_pages_fast(addr, count, 1, pages);
	if (err < count) {
		count = err;
		err = -EFAULT;
		goto put_pages;
	}

	iod = nvme_alloc_iod(count, length, GFP_KERNEL);
	sg = iod->sg;
	sg_init_table(sg, count);
	for (i = 0; i < count; i++) {
		sg_set_page(&sg[i], pages[i],
				min_t(unsigned, length, PAGE_SIZE - offset),
				offset);
		length -= (PAGE_SIZE - offset);
		offset = 0;
	}
	sg_mark_end(&sg[i - 1]);
	iod->nents = count;

	err = -ENOMEM;
	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (!nents)
		goto free_iod;

	kfree(pages);
	return iod;

 free_iod:
	kfree(iod);
 put_pages:
	for (i = 0; i < count; i++)
		put_page(pages[i]);
	kfree(pages);
	return ERR_PTR(err);
}

void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
			struct nvme_iod *iod)
{
	int i;

	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);

	for (i = 0; i < iod->nents; i++)
		put_page(sg_page(&iod->sg[i]));
}

static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_queue *nvmeq;
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	int status, i;
	struct nvme_iod *iod, *meta_iod = NULL;
	dma_addr_t meta_dma_addr;
	void *meta, *uninitialized_var(meta_mem);

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;

	if (meta_len && ((io.metadata & 3) || !io.metadata))
		return -EINVAL;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
		break;
	default:
		return -EINVAL;
	}

	if (IS_ERR(iod))
		return PTR_ERR(iod);

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	if (meta_len) {
		meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata,
								meta_len);
		if (IS_ERR(meta_iod)) {
			status = PTR_ERR(meta_iod);
			meta_iod = NULL;
			goto unmap;
		}

		meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
						&meta_dma_addr, GFP_KERNEL);
		if (!meta_mem) {
			status = -ENOMEM;
			goto unmap;
		}

		if (io.opcode & 1) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta_mem + meta_offset, meta,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		c.rw.metadata = cpu_to_le64(meta_dma_addr);
	}

	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);

	nvmeq = get_nvmeq(dev);
	/*
	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
	 * disabled.  We may be preempted at any point, and be rescheduled
	 * to a different CPU.  That will cause cacheline bouncing, but no
	 * additional races since q_lock already protects against other CPUs.
	 */
	put_nvmeq(nvmeq);
	if (length != (io.nblocks + 1) << ns->lba_shift)
		status = -ENOMEM;
	else
		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);

	if (meta_len) {
		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta, meta_mem + meta_offset,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem,
								meta_dma_addr);
	}

 unmap:
	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
	nvme_free_iod(dev, iod);

	if (meta_iod) {
		nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod);
		nvme_free_iod(dev, meta_iod);
	}

	return status;
}

static int nvme_user_admin_cmd(struct nvme_dev *dev,
					struct nvme_admin_cmd __user *ucmd)
{
	struct nvme_admin_cmd cmd;
	struct nvme_command c;
	int status, length;
	struct nvme_iod *uninitialized_var(iod);
	unsigned timeout;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	length = cmd.data_len;
	if (cmd.data_len) {
		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
								length);
		if (IS_ERR(iod))
			return PTR_ERR(iod);
		length = nvme_setup_prps(dev, &c.common, iod, length,
								GFP_KERNEL);
	}

	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
								ADMIN_TIMEOUT;
	if (length != cmd.data_len)
		status = -ENOMEM;
	else
		status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result,
								timeout);

	if (cmd.data_len) {
		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
		nvme_free_iod(dev, iod);
	}

	if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
							sizeof(cmd.result)))
		status = -EFAULT;

	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num((void __user *)arg);
	case SG_IO:
		return nvme_sg_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_ioctl,
};

static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
{
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;

		if (bio_list_empty(&nvmeq->sq_cong))
			remove_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
			if (bio_list_empty(&nvmeq->sq_cong))
				add_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
			bio_list_add_head(&nvmeq->sq_cong, bio);
			break;
		}
	}
}

static int nvme_kthread(void *data)
{
	struct nvme_dev *dev;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry(dev, &dev_list, node) {
			int i;
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				nvme_process_cq(nvmeq);
				nvme_cancel_ios(nvmeq, true);
				nvme_resubmit_bios(nvmeq);
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		schedule_timeout(round_jiffies_relative(HZ));
	}
	return 0;
}

static DEFINE_IDA(nvme_index_ida);

static int nvme_get_ns_idx(void)
{
	int index, error;

	do {
		if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
			return -1;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_index_ida, &index);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		index = -1;
	return index;
}

static void nvme_put_ns_idx(int index)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_index_ida, index);
	spin_unlock(&dev_list_lock);
}

static void nvme_config_discard(struct nvme_ns *ns)
{
	u32 logical_block_size = queue_logical_block_size(ns->queue);
	ns->queue->limits.discard_zeroes_data = 0;
	ns->queue->limits.discard_alignment = logical_block_size;
	ns->queue->limits.discard_granularity = logical_block_size;
	ns->queue->limits.max_discard_sectors = 0xffffffff;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}
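/*
 * Userspace reaches the I/O path above through the block device ioctls
 * handled by nvme_ioctl().  The fragment below is illustrative only and is
 * not part of the driver: it assumes a namespace node such as /dev/nvme0n1
 * and uses the struct nvme_user_io fields referenced in nvme_submit_io() to
 * read a single logical block into a user buffer:
 *
 *	struct nvme_user_io io = { 0 };
 *	int fd = open("/dev/nvme0n1", O_RDWR);
 *	io.opcode = nvme_cmd_read;
 *	io.slba = 0;			// starting LBA
 *	io.nblocks = 0;			// zero's based count: one block
 *	io.addr = (unsigned long)buf;	// buffer of at least one block
 *	if (ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io) < 0)
 *		perror("NVME_IOCTL_SUBMIT_IO");
 *
 * NVME_IOCTL_ID simply returns ns->ns_id for the opened namespace.
 */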

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(NVME_MINORS);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = nsid;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;
	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	if (dev->max_hw_sectors)
		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);

	disk->major = nvme_major;
	disk->minors = NVME_MINORS;
	disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = &dev->pci_dev->dev;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	if (dev->oncs & NVME_CTRL_ONCS_DSM)
		nvme_config_discard(ns);

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}

static void nvme_ns_free(struct nvme_ns *ns)
{
	int index = ns->disk->first_minor / NVME_MINORS;
	put_disk(ns->disk);
	nvme_put_ns_idx(index);
	blk_cleanup_queue(ns->queue);
	kfree(ns);
}

static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
								&result);
	if (status)
		return -EIO;
	return min(result & 0xffff, result >> 16) + 1;
}

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct pci_dev *pdev = dev->pci_dev;
	int result, cpu, i, vecs, nr_io_queues, db_bar_size, q_depth;

	nr_io_queues = num_online_cpus();
	result = set_queue_count(dev, nr_io_queues);
	if (result < 0)
		return result;
	if (result < nr_io_queues)
		nr_io_queues = result;

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, dev->queues[0]);

	db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
	if (db_bar_size > 8192) {
		iounmap(dev->bar);
		dev->bar = ioremap(pci_resource_start(pdev, 0), db_bar_size);
		dev->dbs = ((void __iomem *)dev->bar) + 4096;
		dev->queues[0]->q_db = dev->dbs;
	}

	vecs = nr_io_queues;
	for (i = 0; i < vecs; i++)
		dev->entry[i].entry = i;
	for (;;) {
		result = pci_enable_msix(pdev, dev->entry, vecs);
		if (result <= 0)
			break;
		vecs = result;
	}

	if (result < 0) {
		vecs = nr_io_queues;
		if (vecs > 32)
			vecs = 32;
		for (;;) {
			result = pci_enable_msi_block(pdev, vecs);
			if (result == 0) {
				for (i = 0; i < vecs; i++)
					dev->entry[i].vector = i + pdev->irq;
				break;
			} else if (result < 0) {
				vecs = 1;
				break;
			}
			vecs = result;
		}
	}

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	nr_io_queues = vecs;

	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
	/* XXX: handle failure here */

	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_io_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
								NVME_Q_DEPTH);
	for (i = 0; i < nr_io_queues; i++) {
		dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
		if (IS_ERR(dev->queues[i + 1]))
			return PTR_ERR(dev->queues[i + 1]);
		dev->queue_count++;
	}

	for (; i < num_possible_cpus(); i++) {
		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
		dev->queues[i + 1] = dev->queues[target + 1];
	}

	return 0;
}

static void nvme_free_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_free_queue(dev, i);
}

/*
 * Return: error value if an error occurred setting up the queues or calling
 * Identify Device.  0 if these succeeded, even if adding some of the
 * namespaces failed.  At the moment, these failures are silent.  TBD which
 * failures should be reported.
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	int res, nn, i;
	struct nvme_ns *ns;
	struct nvme_id_ctrl *ctrl;
	struct nvme_id_ns *id_ns;
	void *mem;
	dma_addr_t dma_addr;
	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;

	res = nvme_setup_io_queues(dev);
	if (res)
		return res;

	mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
								GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	res = nvme_identify(dev, 0, 1, dma_addr);
	if (res) {
		res = -EIO;
		goto out;
	}

	ctrl = mem;
	nn = le32_to_cpup(&ctrl->nn);
	dev->oncs = le16_to_cpup(&ctrl->oncs);
	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
	if (ctrl->mdts)
		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
	if ((dev->pci_dev->vendor == PCI_VENDOR_ID_INTEL) &&
			(dev->pci_dev->device == 0x0953) && ctrl->vs[3])
		dev->stripe_size = 1 << (ctrl->vs[3] + shift);

	id_ns = mem;
	for (i = 1; i <= nn; i++) {
		res = nvme_identify(dev, i, 0, dma_addr);
		if (res)
			continue;

		if (id_ns->ncap == 0)
			continue;

		res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
							dma_addr + 4096, NULL);
		if (res)
			memset(mem + 4096, 0, 4096);

		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);
	res = 0;

 out:
	dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
	return res;
}

static int nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	spin_lock(&dev_list_lock);
	list_del(&dev->node);
	spin_unlock(&dev_list_lock);

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		del_gendisk(ns->disk);
		nvme_ns_free(ns);
	}

	nvme_free_queues(dev);

	return 0;
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	struct device *dmadev = &dev->pci_dev->dev;
	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static DEFINE_IDA(nvme_instance_ida);

static int nvme_set_instance(struct nvme_dev *dev)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	dev->instance = instance;
	return 0;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, dev->instance);
	spin_unlock(&dev_list_lock);
}

static void nvme_free_dev(struct kref *kref)
{
	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
	nvme_dev_remove(dev);
	if (dev->pci_dev->msi_enabled)
		pci_disable_msi(dev->pci_dev);
	else if (dev->pci_dev->msix_enabled)
		pci_disable_msix(dev->pci_dev);
	iounmap(dev->bar);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
	pci_disable_device(dev->pci_dev);
	pci_release_regions(dev->pci_dev);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

static int nvme_dev_open(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev,
								miscdev);
	kref_get(&dev->kref);
	f->private_data = dev;
	return 0;
}

static int nvme_dev_release(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = f->private_data;
	kref_put(&dev->kref, nvme_free_dev);
	return 0;
}

static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	struct nvme_dev *dev = f->private_data;
	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(dev, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int bars, result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
								GFP_KERNEL);
	if (!dev->queues)
		goto free;

	if (pci_enable_device_mem(pdev))
		goto free;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable;

	INIT_LIST_HEAD(&dev->namespaces);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);

	if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)))
		dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
	else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)))
		dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
	else
		goto disable;

	result = nvme_set_instance(dev);
	if (result)
		goto disable;

	dev->entry[0].vector = pdev->irq;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto disable_msix;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar) {
		result = -ENOMEM;
		goto disable_msix;
	}

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;
	dev->queue_count++;

	spin_lock(&dev_list_lock);
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	result = nvme_dev_add(dev);
	if (result)
		goto delete;

	scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
	dev->miscdev.parent = &pdev->dev;
	dev->miscdev.name = dev->name;
	dev->miscdev.fops = &nvme_dev_fops;
	result = misc_register(&dev->miscdev);
	if (result)
		goto remove;

	kref_init(&dev->kref);
	return 0;

 remove:
	nvme_dev_remove(dev);
 delete:
	spin_lock(&dev_list_lock);
	list_del(&dev->node);
	spin_unlock(&dev_list_lock);

	nvme_free_queues(dev);
 unmap:
	iounmap(dev->bar);
 disable_msix:
	if (dev->pci_dev->msi_enabled)
		pci_disable_msi(dev->pci_dev);
	else if (dev->pci_dev->msix_enabled)
		pci_disable_msix(dev->pci_dev);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
 disable:
	pci_disable_device(pdev);
	pci_release_regions(pdev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	misc_deregister(&dev->miscdev);
	kref_put(&dev->kref, nvme_free_dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL
#define nvme_suspend NULL
#define nvme_resume NULL

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
	if (IS_ERR(nvme_thread))
		return PTR_ERR(nvme_thread);

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		goto kill_kthread;
	else if (result > 0)
		nvme_major = result;

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto unregister_blkdev;
	return 0;

 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_kthread:
	kthread_stop(nvme_thread);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	kthread_stop(nvme_thread);
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.8");
module_init(nvme_init);
module_exit(nvme_exit);
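/*
 * Illustrative only, not part of the driver: the per-controller character
 * node registered above through misc_register() (named "nvme%d", e.g.
 * /dev/nvme0) accepts NVME_IOCTL_ADMIN_CMD from a CAP_SYS_ADMIN process.
 * Assuming the struct nvme_admin_cmd fields consumed by
 * nvme_user_admin_cmd(), an Identify Controller request (opcode 0x06,
 * CNS = 1 in cdw10, 4096-byte data buffer, per the NVMe specification)
 * could be issued roughly like this:
 *
 *	struct nvme_admin_cmd cmd = { 0 };
 *	char buf[4096];
 *	int fd = open("/dev/nvme0", O_RDWR);
 *	cmd.opcode = 0x06;		// Identify
 *	cmd.cdw10 = 1;			// CNS = 1: Identify Controller
 *	cmd.addr = (unsigned long)buf;
 *	cmd.data_len = sizeof(buf);
 *	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0)
 *		perror("NVME_IOCTL_ADMIN_CMD");
 *
 * The driver maps the user buffer with nvme_map_user_pages(), builds PRPs
 * with nvme_setup_prps(), and submits the command synchronously on the
 * admin queue with an ADMIN_TIMEOUT deadline.
 */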