nvme-core.c revision 0e53d18051725da46cbccfb7874a6422d4d4f274
/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define NVME_MINORS 64
#define ADMIN_TIMEOUT	(60 * HZ)

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;

static void nvme_reset_failed_dev(struct work_struct *ws);

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
	u8 q_suspended;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	unsigned long timeout;
	int aborted;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

static unsigned nvme_queue_extra(int depth)
{
	return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The function to call on completion
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;
	info[cmdid].aborted = 0;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}
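/*
 * A note on the sentinel ctx values defined below: they are built on
 * POISON_POINTER_DELTA so they can never collide with a real context
 * pointer, and special_completion() recognises them and skips the
 * normal per-command bookkeeping.
 */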
/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)
#define CMD_CTX_ABORT		(0x31C + CMD_CTX_BASE)

static void special_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_FLUSH)
		return;
	if (ctx == CMD_CTX_ABORT) {
		++dev->abort_limit;
		return;
	}
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(&dev->pci_dev->dev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(&dev->pci_dev->dev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}

	dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

	if (cmdid >= nvmeq->q_depth) {
		*fn = special_completion;
		return CMD_CTX_INVALID;
	}
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_COMPLETED;
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return ctx;
}

static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_CANCELLED;
	return ctx;
}

struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
{
	return dev->queues[get_cpu() + 1];
}

void put_nvmeq(struct nvme_queue *nvmeq)
{
	put_cpu();
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}

static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}
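/*
 * Worked example of the PRP accounting below (assuming 4K pages and
 * 8-byte PRP entries): a 64K transfer needs at most 17 PRPs once a
 * non-aligned first page is accounted for, and each PRP list page holds
 * 511 entries plus a chain pointer in its final slot, which is why
 * nvme_npages() divides by PAGE_SIZE - 8.
 */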
/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size)
{
	unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static struct nvme_iod *
nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(nbytes) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod) {
		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
		iod->npages = -1;
		iod->length = nbytes;
		iod->nents = 0;
		iod->start_time = jiffies;
	}

	return iod;
}

void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = PAGE_SIZE / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}
	kfree(iod);
}

static void nvme_start_io_acct(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	int cpu = part_stat_lock();
	part_round_stats(cpu, &disk->part0);
	part_stat_inc(cpu, &disk->part0, ios[rw]);
	part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
	part_inc_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	unsigned long duration = jiffies - start_time;
	int cpu = part_stat_lock();
	part_stat_add(cpu, &disk->part0, ticks[rw], duration);
	part_round_stats(cpu, &disk->part0);
	part_dec_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void bio_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct bio *bio = iod->private;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	if (iod->nents) {
		dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		nvme_end_io_acct(bio, iod->start_time);
	}
	nvme_free_iod(dev, iod);
	if (status)
		bio_endio(bio, -EIO);
	else
		bio_endio(bio, 0);
}
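/*
 * nvme_setup_prps() below walks the DMA-mapped scatterlist and fills in
 * PRP1/PRP2 per the NVMe spec: PRP1 covers the first (possibly unaligned)
 * page, and PRP2 is either the second page or, for longer transfers, the
 * DMA address of a PRP list.  List pages are chained by storing the next
 * page's DMA address in the final entry of the previous page.
 */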
/* length is in bytes.  gfp flags indicate whether we may sleep. */
int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
			struct nvme_iod *iod, int total_len, gfp_t gfp)
{
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = offset_in_page(dma_addr);
	__le64 *prp_list;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma;
	int nprps, i;

	cmd->prp1 = cpu_to_le64(dma_addr);
	length -= (PAGE_SIZE - offset);
	if (length <= 0)
		return total_len;

	dma_len -= (PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= PAGE_SIZE) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		return total_len;
	}

	nprps = DIV_ROUND_UP(length, PAGE_SIZE);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
	if (!prp_list) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		iod->npages = -1;
		return (total_len - length) + PAGE_SIZE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	cmd->prp2 = cpu_to_le64(prp_dma);
	i = 0;
	for (;;) {
		if (i == PAGE_SIZE / 8) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
			if (!prp_list)
				return total_len - length;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= PAGE_SIZE;
		dma_addr += PAGE_SIZE;
		length -= PAGE_SIZE;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return total_len;
}

struct nvme_bio_pair {
	struct bio b1, b2, *parent;
	struct bio_vec *bv1, *bv2;
	int err;
	atomic_t cnt;
};

static void nvme_bio_pair_endio(struct bio *bio, int err)
{
	struct nvme_bio_pair *bp = bio->bi_private;

	if (err)
		bp->err = err;

	if (atomic_dec_and_test(&bp->cnt)) {
		bio_endio(bp->parent, bp->err);
		kfree(bp->bv1);
		kfree(bp->bv2);
		kfree(bp);
	}
}

static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx,
							int len, int offset)
{
	struct nvme_bio_pair *bp;

	BUG_ON(len > bio->bi_size);
	BUG_ON(idx > bio->bi_vcnt);

	bp = kmalloc(sizeof(*bp), GFP_ATOMIC);
	if (!bp)
		return NULL;
	bp->err = 0;

	bp->b1 = *bio;
	bp->b2 = *bio;

	bp->b1.bi_size = len;
	bp->b2.bi_size -= len;
	bp->b1.bi_vcnt = idx;
	bp->b2.bi_idx = idx;
	bp->b2.bi_sector += len >> 9;

	if (offset) {
		bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec),
								GFP_ATOMIC);
		if (!bp->bv1)
			goto split_fail_1;

		bp->bv2 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec),
								GFP_ATOMIC);
		if (!bp->bv2)
			goto split_fail_2;

		memcpy(bp->bv1, bio->bi_io_vec,
			bio->bi_max_vecs * sizeof(struct bio_vec));
		memcpy(bp->bv2, bio->bi_io_vec,
			bio->bi_max_vecs * sizeof(struct bio_vec));

		bp->b1.bi_io_vec = bp->bv1;
		bp->b2.bi_io_vec = bp->bv2;
		bp->b2.bi_io_vec[idx].bv_offset += offset;
		bp->b2.bi_io_vec[idx].bv_len -= offset;
		bp->b1.bi_io_vec[idx].bv_len = offset;
		bp->b1.bi_vcnt++;
	} else
		bp->bv1 = bp->bv2 = NULL;

	bp->b1.bi_private = bp;
	bp->b2.bi_private = bp;

	bp->b1.bi_end_io = nvme_bio_pair_endio;
	bp->b2.bi_end_io = nvme_bio_pair_endio;

	bp->parent = bio;
	atomic_set(&bp->cnt, 2);

	return bp;

 split_fail_2:
	kfree(bp->bv1);
 split_fail_1:
	kfree(bp);
	return NULL;
}

static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
						int idx, int len, int offset)
{
	struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset);
	if (!bp)
		return -ENOMEM;

	if (bio_list_empty(&nvmeq->sq_cong))
		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
	bio_list_add(&nvmeq->sq_cong, &bp->b1);
	bio_list_add(&nvmeq->sq_cong, &bp->b2);

	return 0;
}

/* NVMe scatterlists require no holes in the virtual address */
#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))

static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec *bvec, *bvprv = NULL;
	struct scatterlist *sg = NULL;
	int i, length = 0, nsegs = 0, split_len = bio->bi_size;

	if (nvmeq->dev->stripe_size)
		split_len = nvmeq->dev->stripe_size -
			((bio->bi_sector << 9) & (nvmeq->dev->stripe_size - 1));

	sg_init_table(iod->sg, psegs);
	bio_for_each_segment(bvec, bio, i) {
		if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
			sg->length += bvec->bv_len;
		} else {
			if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
				return nvme_split_and_submit(bio, nvmeq, i,
								length, 0);

			sg = sg ? sg + 1 : iod->sg;
			sg_set_page(sg, bvec->bv_page, bvec->bv_len,
							bvec->bv_offset);
			nsegs++;
		}

		if (split_len - length < bvec->bv_len)
			return nvme_split_and_submit(bio, nvmeq, i, split_len,
							split_len - length);
		length += bvec->bv_len;
		bvprv = bvec;
	}
	iod->nents = nsegs;
	sg_mark_end(sg);
	if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
		return -ENOMEM;

	BUG_ON(length != bio->bi_size);
	return length;
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct bio *bio, struct nvme_iod *iod, int cmdid)
{
	struct nvme_dsm_range *range;
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
							&iod->first_dma);
	if (!range)
		return -ENOMEM;

	iod_list(iod)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.command_id = cmdid;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								int cmdid)
{
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.command_id = cmdid;
	cmnd->common.nsid = cpu_to_le32(ns->ns_id);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
{
	int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
					special_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		return cmdid;

	return nvme_submit_flush(nvmeq, ns, cmdid);
}
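/*
 * A bio carrying both REQ_FLUSH and data is issued as two separate
 * commands in nvme_submit_bio_queue() below: an explicit flush via
 * nvme_submit_flush_data() above, then the write itself.  A flush
 * without data consumes only the one cmdid.
 */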
/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_command *cmnd;
	struct nvme_iod *iod;
	enum dma_data_direction dma_dir;
	int cmdid, length, result;
	u16 control;
	u32 dsmgmt;
	int psegs = bio_phys_segments(ns->queue, bio);

	if ((bio->bi_rw & REQ_FLUSH) && psegs) {
		result = nvme_submit_flush_data(nvmeq, ns);
		if (result)
			return result;
	}

	result = -ENOMEM;
	iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
	if (!iod)
		goto nomem;
	iod->private = bio;

	result = -EBUSY;
	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		goto free_iod;

	if (bio->bi_rw & REQ_DISCARD) {
		result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
		if (result)
			goto free_cmdid;
		return result;
	}
	if ((bio->bi_rw & REQ_FLUSH) && !psegs)
		return nvme_submit_flush(nvmeq, ns, cmdid);

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
		cmnd->rw.opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->rw.opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
	if (result <= 0)
		goto free_cmdid;
	length = result;

	cmnd->rw.command_id = cmdid;
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
								GFP_ATOMIC);
	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector));
	cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);

	nvme_start_io_acct(bio);
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;

 free_cmdid:
	free_cmdid(nvmeq, cmdid, NULL);
 free_iod:
	nvme_free_iod(nvmeq->dev, iod);
 nomem:
	return result;
}
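/*
 * Completion queue entries carry a phase bit that the controller inverts
 * on each pass through the ring; nvme_process_cq() below consumes entries
 * until the phase of the entry at the head no longer matches the queue's
 * expected phase.
 */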
static int nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return 0;

	writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
	return 1;
}

static void nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
	int result = -EBUSY;

	if (!nvmeq) {
		put_nvmeq(NULL);
		bio_endio(bio, -EIO);
		return;
	}

	spin_lock_irq(&nvmeq->q_lock);
	if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
		result = nvme_submit_bio_queue(nvmeq, ns, bio);
	if (unlikely(result)) {
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add(&nvmeq->sq_cong, bio);
	}

	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	put_nvmeq(nvmeq);
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
	nvmeq->cqe_seen = 0;
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
	spin_lock_irq(&nvmeq->q_lock);
	cancel_cmdid(nvmeq, cmdid, NULL);
	spin_unlock_irq(&nvmeq->q_lock);
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}
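/*
 * Synchronous commands put the caller to sleep in TASK_KILLABLE and rely
 * on sync_completion() above to wake the task from interrupt context.
 * A fatal signal instead cancels the cmdid via nvme_abort_command(), so
 * a late completion is harmlessly dropped as CMD_CTX_CANCELLED.
 */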
/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
						u32 *result, unsigned timeout)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
								timeout);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_KILLABLE);
	nvme_submit_cmd(nvmeq, cmd);
	schedule_timeout(timeout);

	if (cmdinfo.status == -EINTR) {
		nvme_abort_command(nvmeq, cmdid);
		return -EINTR;
	}

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}

int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
							dma_addr_t dma_addr)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);
	c.identify.prp1 = cpu_to_le64(dma_addr);
	c.identify.cns = cpu_to_le32(cns);

	return nvme_submit_admin_cmd(dev, &c, NULL);
}

int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(nsid);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);

	return nvme_submit_admin_cmd(dev, &c, result);
}

int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return nvme_submit_admin_cmd(dev, &c, result);
}

/**
 * nvme_abort_cmd - Attempt aborting a command
 * @cmdid: Command id of a timed out IO
 * @queue: The queue with timed out IO
 *
 * Schedule controller reset if the command was already aborted once before and
 * still hasn't been returned to the driver, or if this is the admin queue.
 */
static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
{
	int a_cmdid;
	struct nvme_command cmd;
	struct nvme_dev *dev = nvmeq->dev;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

	if (!nvmeq->qid || info[cmdid].aborted) {
		if (work_busy(&dev->reset_work))
			return;
		list_del_init(&dev->node);
		dev_warn(&dev->pci_dev->dev,
			"I/O %d QID %d timeout, reset controller\n", cmdid,
								nvmeq->qid);
		INIT_WORK(&dev->reset_work, nvme_reset_failed_dev);
		queue_work(nvme_workq, &dev->reset_work);
		return;
	}

	if (!dev->abort_limit)
		return;

	a_cmdid = alloc_cmdid(dev->queues[0], CMD_CTX_ABORT, special_completion,
								ADMIN_TIMEOUT);
	if (a_cmdid < 0)
		return;

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = cmdid;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
	cmd.abort.command_id = a_cmdid;

	--dev->abort_limit;
	info[cmdid].aborted = 1;
	info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;

	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
							nvmeq->qid);
	nvme_submit_cmd(dev->queues[0], &cmd);
}

/**
 * nvme_cancel_ios - Cancel outstanding I/Os
 * @queue: The queue to cancel I/Os on
 * @timeout: True to only cancel I/Os which have timed out
 */
static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	unsigned long now = jiffies;
	int cmdid;

	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
		void *ctx;
		nvme_completion_fn fn;
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
		};

		if (timeout && !time_after(now, info[cmdid].timeout))
			continue;
		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
			continue;
		if (timeout && nvmeq->dev->initialized) {
			nvme_abort_cmd(cmdid, nvmeq);
			continue;
		}
		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
								nvmeq->qid);
		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}
}
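/*
 * nvme_cancel_ios() above serves two callers: the watchdog kthread passes
 * timeout=true to reap only expired commands (attempting an abort first
 * while the device is still initialized), and queue teardown passes
 * timeout=false to fail everything outstanding with ABORT_REQ status.
 */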
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	spin_lock_irq(&nvmeq->q_lock);
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		bio_endio(bio, -EIO);
	}
	spin_unlock_irq(&nvmeq->q_lock);

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->queue_count - 1; i >= 0; i--) {
		nvme_free_queue(dev->queues[i]);
		dev->queue_count--;
		dev->queues[i] = NULL;
	}
}

static void nvme_disable_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];
	int vector = dev->entry[nvmeq->cq_vector].vector;

	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->q_suspended) {
		spin_unlock_irq(&nvmeq->q_lock);
		return;
	}
	nvmeq->q_suspended = 1;
	spin_unlock_irq(&nvmeq->q_lock);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	/* Don't tell the adapter to delete the admin queue.
	 * Don't tell a removed adapter to delete IO queues. */
	if (qid && readl(&dev->bar->csts) != -1) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	nvme_cancel_ios(nvmeq, false);
	spin_unlock_irq(&nvmeq->q_lock);
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = nvme_queue_extra(depth);
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;
	nvmeq->qid = qid;
	nvmeq->q_suspended = 1;
	dev->queue_count++;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq, IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
					IRQF_SHARED, name, nvmeq);
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	unsigned extra = nvme_queue_extra(nvmeq->q_depth);

	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset(nvmeq->cmdid_data, 0, extra);
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	nvme_cancel_ios(nvmeq, false);
	nvmeq->q_suspended = 0;
}
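/*
 * Doorbell layout: each queue pair owns two 32-bit doorbells (SQ tail,
 * CQ head) spaced by the controller's doorbell stride from CAP.DSTRD,
 * hence the qid * 2 * db_stride indexing used by nvme_alloc_queue() and
 * nvme_init_queue() above.
 */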
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, "nvme");
	if (result < 0)
		goto release_sq;

	spin_lock_irq(&nvmeq->q_lock);
	nvme_init_queue(nvmeq, qid);
	spin_unlock_irq(&nvmeq->q_lock);

	return result;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
	unsigned long timeout;
	u32 bit = enabled ? NVME_CSTS_RDY : 0;

	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

	while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device not ready; aborting initialisation\n");
			return -ENODEV;
		}
	}

	return 0;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
{
	u32 cc = readl(&dev->bar->cc);

	if (cc & NVME_CC_ENABLE)
		writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc);
	return nvme_wait_ready(dev, cap, false);
}

static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
	return nvme_wait_ready(dev, cap, true);
}

static int nvme_shutdown_ctrl(struct nvme_dev *dev)
{
	unsigned long timeout;
	u32 cc;

	cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL;
	writel(cc, &dev->bar->cc);

	timeout = 2 * HZ + jiffies;
	while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
							NVME_CSTS_SHST_CMPLT) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device shutdown incomplete; abort shutdown\n");
			return -ENODEV;
		}
	}

	return 0;
}

static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = readq(&dev->bar->cap);
	struct nvme_queue *nvmeq;

	result = nvme_disable_ctrl(dev, cap);
	if (result < 0)
		return result;

	nvmeq = dev->queues[0];
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
		if (!nvmeq)
			return -ENOMEM;
		dev->queues[0] = nvmeq;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	result = nvme_enable_ctrl(dev, cap);
	if (result)
		return result;
	result = queue_request_irq(dev, nvmeq, "nvme admin");
	if (result)
		return result;

	spin_lock_irq(&nvmeq->q_lock);
	nvme_init_queue(nvmeq, 0);
	spin_unlock_irq(&nvmeq->q_lock);
	return result;
}

struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, unsigned length)
{
	int i, err, count, nents, offset;
	struct scatterlist *sg;
	struct page **pages;
	struct nvme_iod *iod;

	if (addr & 3)
		return ERR_PTR(-EINVAL);
	if (!length || length > INT_MAX - PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	offset = offset_in_page(addr);
	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	err = get_user_pages_fast(addr, count, 1, pages);
	if (err < count) {
		count = err;
		err = -EFAULT;
		goto put_pages;
	}

	iod = nvme_alloc_iod(count, length, GFP_KERNEL);
	sg = iod->sg;
	sg_init_table(sg, count);
	for (i = 0; i < count; i++) {
		sg_set_page(&sg[i], pages[i],
				min_t(unsigned, length, PAGE_SIZE - offset),
				offset);
		length -= (PAGE_SIZE - offset);
		offset = 0;
	}
	sg_mark_end(&sg[i - 1]);
	iod->nents = count;

	err = -ENOMEM;
	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (!nents)
		goto free_iod;

	kfree(pages);
	return iod;

 free_iod:
	kfree(iod);
 put_pages:
	for (i = 0; i < count; i++)
		put_page(pages[i]);
	kfree(pages);
	return ERR_PTR(err);
}
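/*
 * nvme_map_user_pages() above pins the user buffer with
 * get_user_pages_fast() and builds a scatterlist over the pinned pages;
 * nvme_unmap_user_pages() below is its required counterpart, dropping
 * the page references after the DMA unmap.
 */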
void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
			struct nvme_iod *iod)
{
	int i;

	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);

	for (i = 0; i < iod->nents; i++)
		put_page(sg_page(&iod->sg[i]));
}

static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_queue *nvmeq;
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	int status, i;
	struct nvme_iod *iod, *meta_iod = NULL;
	dma_addr_t meta_dma_addr;
	void *meta, *uninitialized_var(meta_mem);

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;

	if (meta_len && ((io.metadata & 3) || !io.metadata))
		return -EINVAL;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
		break;
	default:
		return -EINVAL;
	}

	if (IS_ERR(iod))
		return PTR_ERR(iod);

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	if (meta_len) {
		meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata,
								meta_len);
		if (IS_ERR(meta_iod)) {
			status = PTR_ERR(meta_iod);
			meta_iod = NULL;
			goto unmap;
		}

		meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
						&meta_dma_addr, GFP_KERNEL);
		if (!meta_mem) {
			status = -ENOMEM;
			goto unmap;
		}

		if (io.opcode & 1) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta_mem + meta_offset, meta,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		c.rw.metadata = cpu_to_le64(meta_dma_addr);
	}

	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);

	nvmeq = get_nvmeq(dev);
	/*
	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
	 * disabled.  We may be preempted at any point, and be rescheduled
	 * to a different CPU.  That will cause cacheline bouncing, but no
	 * additional races since q_lock already protects against other CPUs.
	 */
	put_nvmeq(nvmeq);
	if (length != (io.nblocks + 1) << ns->lba_shift)
		status = -ENOMEM;
	else if (!nvmeq || nvmeq->q_suspended)
		status = -EBUSY;
	else
		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);

	if (meta_len) {
		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta, meta_mem + meta_offset,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem,
								meta_dma_addr);
	}

 unmap:
	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
	nvme_free_iod(dev, iod);

	if (meta_iod) {
		nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod);
		nvme_free_iod(dev, meta_iod);
	}

	return status;
}

static int nvme_user_admin_cmd(struct nvme_dev *dev,
					struct nvme_admin_cmd __user *ucmd)
{
	struct nvme_admin_cmd cmd;
	struct nvme_command c;
	int status, length;
	struct nvme_iod *uninitialized_var(iod);
	unsigned timeout;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	length = cmd.data_len;
	if (cmd.data_len) {
		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
								length);
		if (IS_ERR(iod))
			return PTR_ERR(iod);
		length = nvme_setup_prps(dev, &c.common, iod, length,
								GFP_KERNEL);
	}
	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
								ADMIN_TIMEOUT;
	if (length != cmd.data_len)
		status = -ENOMEM;
	else
		status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result,
								timeout);

	if (cmd.data_len) {
		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
		nvme_free_iod(dev, iod);
	}

	if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
							sizeof(cmd.result)))
		status = -EFAULT;

	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num((void __user *)arg);
	case SG_IO:
		return nvme_sg_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
					unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case SG_IO:
		return nvme_sg_io32(ns, arg);
	}
	return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl	NULL
#endif

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_compat_ioctl,
};

static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
{
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;

		if (bio_list_empty(&nvmeq->sq_cong))
			remove_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
			if (bio_list_empty(&nvmeq->sq_cong))
				add_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
			bio_list_add_head(&nvmeq->sq_cong, bio);
			break;
		}
	}
}

static int nvme_kthread(void *data)
{
	struct nvme_dev *dev, *next;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry_safe(dev, next, &dev_list, node) {
			int i;
			if (readl(&dev->bar->csts) & NVME_CSTS_CFS &&
							dev->initialized) {
				if (work_busy(&dev->reset_work))
					continue;
				list_del_init(&dev->node);
				dev_warn(&dev->pci_dev->dev,
					"Failed status, reset controller\n");
				INIT_WORK(&dev->reset_work,
							nvme_reset_failed_dev);
				queue_work(nvme_workq, &dev->reset_work);
				continue;
			}
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				if (nvmeq->q_suspended)
					goto unlock;
				nvme_process_cq(nvmeq);
				nvme_cancel_ios(nvmeq, true);
				nvme_resubmit_bios(nvmeq);
 unlock:
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		schedule_timeout(round_jiffies_relative(HZ));
	}
	return 0;
}

static DEFINE_IDA(nvme_index_ida);
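/*
 * nvme_get_ns_idx() below hands out namespace disk indexes from
 * nvme_index_ida under dev_list_lock, using the old two-step
 * ida_pre_get()/ida_get_new() API so the allocation step itself can
 * run with the spinlock held.
 */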
static int nvme_get_ns_idx(void)
{
	int index, error;

	do {
		if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
			return -1;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_index_ida, &index);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		index = -1;
	return index;
}

static void nvme_put_ns_idx(int index)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_index_ida, index);
	spin_unlock(&dev_list_lock);
}

static void nvme_config_discard(struct nvme_ns *ns)
{
	u32 logical_block_size = queue_logical_block_size(ns->queue);
	ns->queue->limits.discard_zeroes_data = 0;
	ns->queue->limits.discard_alignment = logical_block_size;
	ns->queue->limits.discard_granularity = logical_block_size;
	ns->queue->limits.max_discard_sectors = 0xffffffff;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(NVME_MINORS);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = nsid;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;
	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	if (dev->max_hw_sectors)
		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);

	disk->major = nvme_major;
	disk->minors = NVME_MINORS;
	disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = &dev->pci_dev->dev;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	if (dev->oncs & NVME_CTRL_ONCS_DSM)
		nvme_config_discard(ns);

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}

static void nvme_ns_free(struct nvme_ns *ns)
{
	int index = ns->disk->first_minor / NVME_MINORS;
	put_disk(ns->disk);
	nvme_put_ns_idx(index);
	blk_cleanup_queue(ns->queue);
	kfree(ns);
}
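/*
 * The Number of Queues feature encodes (count - 1) for both submission
 * and completion queues in one dword, and the controller replies with
 * the counts it actually allocated; set_queue_count() below takes the
 * smaller of the two.
 */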
static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
								&result);
	if (status)
		return status < 0 ? -EIO : -EBUSY;
	return min(result & 0xffff, result >> 16) + 1;
}

static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct pci_dev *pdev = dev->pci_dev;
	int result, cpu, i, vecs, nr_io_queues, size, q_depth;

	nr_io_queues = num_online_cpus();
	result = set_queue_count(dev, nr_io_queues);
	if (result < 0)
		return result;
	if (result < nr_io_queues)
		nr_io_queues = result;

	size = db_bar_size(dev, nr_io_queues);
	if (size > 8192) {
		iounmap(dev->bar);
		do {
			dev->bar = ioremap(pci_resource_start(pdev, 0), size);
			if (dev->bar)
				break;
			if (!--nr_io_queues)
				return -ENOMEM;
			size = db_bar_size(dev, nr_io_queues);
		} while (1);
		dev->dbs = ((void __iomem *)dev->bar) + 4096;
		dev->queues[0]->q_db = dev->dbs;
	}

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, dev->queues[0]);

	vecs = nr_io_queues;
	for (i = 0; i < vecs; i++)
		dev->entry[i].entry = i;
	for (;;) {
		result = pci_enable_msix(pdev, dev->entry, vecs);
		if (result <= 0)
			break;
		vecs = result;
	}

	if (result < 0) {
		vecs = nr_io_queues;
		if (vecs > 32)
			vecs = 32;
		for (;;) {
			result = pci_enable_msi_block(pdev, vecs);
			if (result == 0) {
				for (i = 0; i < vecs; i++)
					dev->entry[i].vector = i + pdev->irq;
				break;
			} else if (result < 0) {
				vecs = 1;
				break;
			}
			vecs = result;
		}
	}

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	nr_io_queues = vecs;

	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
	if (result) {
		dev->queues[0]->q_suspended = 1;
		goto free_queues;
	}

	/* Free previously allocated queues that are no longer usable */
	spin_lock(&dev_list_lock);
	for (i = dev->queue_count - 1; i > nr_io_queues; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];

		spin_lock_irq(&nvmeq->q_lock);
		nvme_cancel_ios(nvmeq, false);
		spin_unlock_irq(&nvmeq->q_lock);

		nvme_free_queue(nvmeq);
		dev->queue_count--;
		dev->queues[i] = NULL;
	}
	spin_unlock(&dev_list_lock);

	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_io_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
								NVME_Q_DEPTH);
	for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
		dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i);
		if (!dev->queues[i + 1]) {
			result = -ENOMEM;
			goto free_queues;
		}
	}

	for (; i < num_possible_cpus(); i++) {
		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
		dev->queues[i + 1] = dev->queues[target + 1];
	}

	for (i = 1; i < dev->queue_count; i++) {
		result = nvme_create_queue(dev->queues[i], i);
		if (result) {
			for (--i; i > 0; i--)
				nvme_disable_queue(dev, i);
			goto free_queues;
		}
	}

	return 0;

 free_queues:
	nvme_free_queues(dev);
	return result;
}
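/*
 * nvme_setup_io_queues() above sizes the queue set to the number of
 * online CPUs (bounded by what the controller and the MSI-X/MSI
 * allocation grant), pins each vector to a CPU, and aliases any
 * remaining possible CPUs onto existing queues so get_nvmeq() always
 * finds one.
 */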
/*
 * Return: error value if an error occurred setting up the queues or calling
 * Identify Device.  0 if these succeeded, even if adding some of the
 * namespaces failed.  At the moment, these failures are silent.  TBD which
 * failures should be reported.
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	struct pci_dev *pdev = dev->pci_dev;
	int res;
	unsigned nn, i;
	struct nvme_ns *ns;
	struct nvme_id_ctrl *ctrl;
	struct nvme_id_ns *id_ns;
	void *mem;
	dma_addr_t dma_addr;
	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;

	mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	res = nvme_identify(dev, 0, 1, dma_addr);
	if (res) {
		res = -EIO;
		goto out;
	}

	ctrl = mem;
	nn = le32_to_cpup(&ctrl->nn);
	dev->oncs = le16_to_cpup(&ctrl->oncs);
	dev->abort_limit = ctrl->acl + 1;
	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
	if (ctrl->mdts)
		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
	if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
			(pdev->device == 0x0953) && ctrl->vs[3])
		dev->stripe_size = 1 << (ctrl->vs[3] + shift);

	id_ns = mem;
	for (i = 1; i <= nn; i++) {
		res = nvme_identify(dev, i, 0, dma_addr);
		if (res)
			continue;

		if (id_ns->ncap == 0)
			continue;

		res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
							dma_addr + 4096, NULL);
		if (res)
			memset(mem + 4096, 0, 4096);

		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);
	res = 0;

 out:
	dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
	return res;
}

static int nvme_dev_map(struct nvme_dev *dev)
{
	int bars, result = -ENOMEM;
	struct pci_dev *pdev = dev->pci_dev;

	if (pci_enable_device_mem(pdev))
		return result;

	dev->entry[0].vector = pdev->irq;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable_pci;

	if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
		goto disable;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar)
		goto disable;
	if (readl(&dev->bar->csts) == -1) {
		result = -ENODEV;
		goto unmap;
	}
	dev->db_stride = 1 << NVME_CAP_STRIDE(readq(&dev->bar->cap));
	dev->dbs = ((void __iomem *)dev->bar) + 4096;

	return 0;

 unmap:
	iounmap(dev->bar);
	dev->bar = NULL;
 disable:
	pci_release_regions(pdev);
 disable_pci:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->pci_dev->msi_enabled)
		pci_disable_msi(dev->pci_dev);
	else if (dev->pci_dev->msix_enabled)
		pci_disable_msix(dev->pci_dev);

	if (dev->bar) {
		iounmap(dev->bar);
		dev->bar = NULL;
		pci_release_regions(dev->pci_dev);
	}

	if (pci_is_enabled(dev->pci_dev))
		pci_disable_device(dev->pci_dev);
}
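/*
 * Teardown order in nvme_dev_shutdown() below matters: queues are
 * suspended and drained before the controller receives the shutdown
 * notification, and the BAR is unmapped last.
 */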
static void nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		del_gendisk(ns->disk);
		nvme_ns_free(ns);
	}
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	struct device *dmadev = &dev->pci_dev->dev;
	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static DEFINE_IDA(nvme_instance_ida);

static int nvme_set_instance(struct nvme_dev *dev)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	dev->instance = instance;
	return 0;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, dev->instance);
	spin_unlock(&dev_list_lock);
}

static void nvme_free_dev(struct kref *kref)
{
	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

static int nvme_dev_open(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev,
								miscdev);
	kref_get(&dev->kref);
	f->private_data = dev;
	return 0;
}

static int nvme_dev_release(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = f->private_data;
	kref_put(&dev->kref, nvme_free_dev);
	return 0;
}

static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	struct nvme_dev *dev = f->private_data;
	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(dev, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};

static int nvme_dev_start(struct nvme_dev *dev)
{
	int result;

	result = nvme_dev_map(dev);
	if (result)
		return result;

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;

	spin_lock(&dev_list_lock);
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	result = nvme_setup_io_queues(dev);
	if (result && result != -EBUSY)
		goto disable;

	return result;

 disable:
	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);
 unmap:
	nvme_dev_unmap(dev);
	return result;
}

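/*
 * Last-resort teardown for a controller that could not be reset.
 * nvme_remove() flushes reset_work, so detaching the PCI device from
 * the reset path itself would deadlock; presumably that is why this
 * runs in its own kthread.  The caller takes an extra kref, which is
 * dropped here once the device is gone.
 */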
static int nvme_remove_dead_ctrl(void *arg)
{
	struct nvme_dev *dev = (struct nvme_dev *)arg;
	struct pci_dev *pdev = dev->pci_dev;

	if (pci_get_drvdata(pdev))
		pci_stop_and_remove_bus_device(pdev);
	kref_put(&dev->kref, nvme_free_dev);
	return 0;
}

static void nvme_remove_disks(struct work_struct *ws)
{
	int i;
	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);

	nvme_dev_remove(dev);
	spin_lock(&dev_list_lock);
	for (i = dev->queue_count - 1; i > 0; i--) {
		BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended);
		nvme_free_queue(dev->queues[i]);
		dev->queue_count--;
		dev->queues[i] = NULL;
	}
	spin_unlock(&dev_list_lock);
}

static int nvme_dev_resume(struct nvme_dev *dev)
{
	int ret;

	ret = nvme_dev_start(dev);
	if (ret && ret != -EBUSY)
		return ret;
	if (ret == -EBUSY) {
		spin_lock(&dev_list_lock);
		INIT_WORK(&dev->reset_work, nvme_remove_disks);
		queue_work(nvme_workq, &dev->reset_work);
		spin_unlock(&dev_list_lock);
	}
	dev->initialized = 1;
	return 0;
}

static void nvme_dev_reset(struct nvme_dev *dev)
{
	nvme_dev_shutdown(dev);
	if (nvme_dev_resume(dev)) {
		dev_err(&dev->pci_dev->dev, "Device failed to resume\n");
		kref_get(&dev->kref);
		if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
							dev->instance))) {
			dev_err(&dev->pci_dev->dev,
				"Failed to start controller remove task\n");
			kref_put(&dev->kref, nvme_free_dev);
		}
	}
}

static void nvme_reset_failed_dev(struct work_struct *ws)
{
	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
	nvme_dev_reset(dev);
}

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
								GFP_KERNEL);
	if (!dev->queues)
		goto free;

	INIT_LIST_HEAD(&dev->namespaces);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	result = nvme_set_instance(dev);
	if (result)
		goto free;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto release;

	result = nvme_dev_start(dev);
	if (result) {
		if (result == -EBUSY)
			goto create_cdev;
		goto release_pools;
	}

	result = nvme_dev_add(dev);
	if (result)
		goto shutdown;

 create_cdev:
	scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
	dev->miscdev.parent = &pdev->dev;
	dev->miscdev.name = dev->name;
	dev->miscdev.fops = &nvme_dev_fops;
	result = misc_register(&dev->miscdev);
	if (result)
		goto remove;

	dev->initialized = 1;
	kref_init(&dev->kref);
	return 0;

 remove:
	nvme_dev_remove(dev);
 shutdown:
	nvme_dev_shutdown(dev);
 release_pools:
	nvme_free_queues(dev);
	nvme_release_prp_pools(dev);
 release:
	nvme_release_instance(dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

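/*
 * PCI ->remove() callback.  The teardown order matters: the device is
 * unlinked from dev_list first so the polling kthread stops visiting
 * it, drvdata is cleared so a racing nvme_remove_dead_ctrl() becomes
 * a no-op, and in-flight reset work is flushed before the namespaces,
 * queues and PRP pools are released.
 */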
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);

	pci_set_drvdata(pdev, NULL);
	flush_work(&dev->reset_work);
	misc_deregister(&dev->miscdev);
	nvme_dev_remove(dev);
	nvme_dev_shutdown(dev);
	nvme_free_queues(dev);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
	kref_put(&dev->kref, nvme_free_dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL

static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_shutdown(ndev);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
		INIT_WORK(&ndev->reset_work, nvme_reset_failed_dev);
		queue_work(nvme_workq, &ndev->reset_work);
	}
	return 0;
}

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
	if (IS_ERR(nvme_thread))
		return PTR_ERR(nvme_thread);

	result = -ENOMEM;
	nvme_workq = create_singlethread_workqueue("nvme");
	if (!nvme_workq)
		goto kill_kthread;

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		goto kill_workq;
	else if (result > 0)
		nvme_major = result;

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto unregister_blkdev;
	return 0;

 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_workq:
	destroy_workqueue(nvme_workq);
 kill_kthread:
	kthread_stop(nvme_thread);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	destroy_workqueue(nvme_workq);
	kthread_stop(nvme_thread);
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.8");
module_init(nvme_init);
module_exit(nvme_exit);
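
/*
 * Userspace sketch (illustrative only; assumes <linux/nvme.h> for
 * struct nvme_admin_cmd and NVME_IOCTL_ADMIN_CMD, error handling
 * omitted).  Admin commands can be passed through the per-controller
 * character device registered in nvme_probe(), e.g. Identify
 * Controller (CNS=1) into a 4096-byte buffer "buf":
 *
 *	int fd = open("/dev/nvme0", O_RDWR);
 *	struct nvme_admin_cmd cmd = {
 *		.opcode   = nvme_admin_identify,
 *		.addr     = (__u64)(uintptr_t)buf,
 *		.data_len = 4096,
 *		.cdw10    = 1,
 *	};
 *	ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 */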