/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>

#include "nvme.h"

#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))

#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
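/*
 * Worked example (assuming the common 4 KiB PAGE_SIZE): an NVMe SGL
 * descriptor is 16 bytes, so SGES_PER_PAGE = 4096 / 16 = 256 descriptors
 * per list page.
 */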

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");

static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
	.get = param_get_int,
};

static int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");

struct nvme_dev;
struct nvme_queue;

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct nvme_queue *queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
	unsigned int num_vecs;
	int q_depth;
	u32 db_stride;
	void __iomem *bar;
	unsigned long bar_mapped_size;
	struct work_struct remove_work;
	struct mutex shutdown_lock;
	bool subsystem;
	void __iomem *cmb;
	pci_bus_addr_t cmb_bus_addr;
	u64 cmb_size;
	u32 cmbsz;
	u32 cmbloc;
	struct nvme_ctrl ctrl;
	struct completion ioq_wait;

	/* shadow doorbell buffer support: */
	u32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	u32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
	dma_addr_t host_mem_descs_dma;
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	int n = 0, ret;

	ret = kstrtoint(val, 10, &n);
	if (ret != 0 || n < 2)
		return -EINVAL;

	return param_set_int(val, kp);
}

static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	return qid * 2 * stride;
}

static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	return (qid * 2 + 1) * stride;
}
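/*
 * Example of the shadow doorbell layout these helpers index into: with
 * db_stride == 1, queue qid's SQ entry lives at slot 2 * qid and its CQ
 * entry at slot 2 * qid + 1, so qid 3 uses slots 6 and 7 in both the
 * doorbell and event-index buffers.
 */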

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t sq_lock;
	struct nvme_command *sq_cmds;
	struct nvme_command __iomem *sq_cmds_io;
	spinlock_t cq_lock ____cacheline_aligned_in_smp;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	s16 cq_vector;
	u16 sq_tail;
	u16 cq_head;
	u16 last_cq_head;
	u16 qid;
	u8 cq_phase;
	u32 *dbbuf_sq_db;
	u32 *dbbuf_cq_db;
	u32 *dbbuf_sq_ei;
	u32 *dbbuf_cq_ei;
};

/*
 * The nvme_iod describes the data in an I/O, including the list of PRP
 * entries.  You can't see it in this data structure because C doesn't let
 * me express that.  Use nvme_init_iod to ensure there's enough space
 * allocated to store the PRP list.
 */
struct nvme_iod {
	struct nvme_request req;
	struct nvme_queue *nvmeq;
	bool use_sgl;
	int aborted;
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	int length;		/* Of data, in bytes */
	dma_addr_t first_dma;
	struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
	struct scatterlist *sg;
	struct scatterlist inline_sg[0];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
}

static inline unsigned int nvme_dbbuf_size(u32 stride)
{
	return ((num_possible_cpus() + 1) * 8 * stride);
}
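/*
 * Example: each queue needs two 32-bit shadow entries (SQ tail and CQ
 * head), i.e. 8 * stride bytes, and one buffer covers the admin queue
 * plus up to one I/O queue per possible CPU; with 16 possible CPUs and
 * a stride of 1 that is (16 + 1) * 8 = 136 bytes.
 */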

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);

	if (dev->dbbuf_dbs)
		return 0;

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	if (!dev->dbbuf_dbs || !qid)
		return;

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}

static void nvme_dbbuf_set(struct nvme_dev *dev)
{
	struct nvme_command c;

	if (!dev->dbbuf_dbs)
		return;

	memset(&c, 0, sizeof(c));
	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);
	}
}

static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}
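/*
 * Worked example of the wrap-safe check above: with event_idx = 5,
 * old = 4 and new_idx = 8, (u16)(8 - 5 - 1) = 2 is less than
 * (u16)(8 - 4) = 4, i.e. the doorbell moved past the device's event
 * index, so the caller still has to do the MMIO doorbell write.
 */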

/* Update dbbuf and return true if an MMIO is required */
static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
					      volatile u32 *dbbuf_ei)
{
	if (dbbuf_db) {
		u16 old_value;

		/*
		 * Ensure that the queue is written before updating
		 * the doorbell in memory
		 */
		wmb();

		old_value = *dbbuf_db;
		*dbbuf_db = value;

		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
			return false;
	}

	return true;
}

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->ctrl.page_size)
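/*
 * With the usual 4 KiB controller page size this lets requests of up to
 * two pages (8 KiB) and at most two segments keep their scatterlist and
 * PRP storage inline in the request payload (see nvme_init_iod below).
 */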

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
				      dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}
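/*
 * Example (assuming 4 KiB controller pages): a 128 KiB request needs
 * DIV_ROUND_UP(128K + 4K, 4K) = 33 PRP entries, and 33 * 8 = 264 bytes
 * of PRP-list space easily fits in a single page.
 */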

/*
 * Calculates the number of pages needed for the SGL segments. For example a 4k
 * page can accommodate 256 SGL descriptors.
 */
static int nvme_pci_npages_sgl(unsigned int num_seg)
{
	return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
}

static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
		unsigned int size, unsigned int nseg, bool use_sgl)
{
	size_t alloc_size;

	if (use_sgl)
		alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
	else
		alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);

	return alloc_size + sizeof(struct scatterlist) * nseg;
}

static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
{
	unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
				    NVME_INT_BYTES(dev), NVME_INT_PAGES,
				    use_sgl);

	return sizeof(struct nvme_iod) + alloc_size;
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

	hctx->driver_data = nvmeq;
	nvmeq->tags = &dev->admin_tagset.tags[0];
	return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];

	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct nvme_dev *dev = set->driver_data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;
	return 0;
}

static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;

	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev),
			dev->num_vecs > 1 ? 1 /* admin queue */ : 0);
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 */
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	spin_lock(&nvmeq->sq_lock);
	if (nvmeq->sq_cmds_io)
		memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd,
				sizeof(*cmd));
	else
		memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
	spin_unlock(&nvmeq->sq_lock);
}

static void **nvme_pci_iod_list(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}

static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int nseg = blk_rq_nr_phys_segments(req);
	unsigned int avg_seg_size;

	if (nseg == 0)
		return false;

	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);

	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
		return false;
	if (!iod->nvmeq->qid)
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}
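/*
 * Example: on an I/O queue of a controller that advertises SGL support,
 * a 256 KiB request split into 4 segments has an average segment size
 * of 64 KiB, which clears the default 32 KiB sgl_threshold, so the
 * command is built with SGLs instead of PRPs.
 */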

static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
	int nseg = blk_rq_nr_phys_segments(rq);
	unsigned int size = blk_rq_payload_bytes(rq);

	iod->use_sgl = nvme_pci_use_sgls(dev, rq);

	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
		size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
				iod->use_sgl);

		iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
		if (!iod->sg)
			return BLK_STS_RESOURCE;
	} else {
		iod->sg = iod->inline_sg;
	}

	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;
	iod->length = size;

	return BLK_STS_OK;
}

static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
	dma_addr_t dma_addr = iod->first_dma, next_dma_addr;

	int i;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
			dma_addr);

	for (i = 0; i < iod->npages; i++) {
		void *addr = nvme_pci_iod_list(req)[i];

		if (iod->use_sgl) {
			struct nvme_sgl_desc *sg_list = addr;

			next_dma_addr =
			    le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
		} else {
			__le64 *prp_list = addr;

			next_dma_addr = le64_to_cpu(prp_list[last_prp]);
		}

		dma_pool_free(dev->prp_page_pool, addr, dma_addr);
		dma_addr = next_dma_addr;
	}

	if (iod->sg != iod->inline_sg)
		kfree(iod->sg);
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == v)
		pi->ref_tag = cpu_to_be32(p);
}

static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == p)
		pi->ref_tag = cpu_to_be32(v);
}

/**
 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
 *
 * The virtual start sector is the one that was originally submitted by the
 * block layer.	Due to partitioning, MD/DM cloning, etc. the actual physical
 * start sector may be different. Remap protection information to match the
 * physical LBA on writes, and back to the original seed on reads.
 *
 * Type 0 and 3 do not have a ref tag, so no remapping required.
 */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
	struct nvme_ns *ns = req->rq_disk->private_data;
	struct bio_integrity_payload *bip;
	struct t10_pi_tuple *pi;
	void *p, *pmap;
	u32 i, nlb, ts, phys, virt;

	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
		return;

	bip = bio_integrity(req->bio);
	if (!bip)
		return;

	pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;

	p = pmap;
	virt = bip_get_seed(bip);
	phys = nvme_block_nr(ns, blk_rq_pos(req));
	nlb = (blk_rq_bytes(req) >> ns->lba_shift);
	ts = ns->disk->queue->integrity.tuple_size;

	for (i = 0; i < nlb; i++, virt++, phys++) {
		pi = (struct t10_pi_tuple *)p;
		dif_swap(phys, virt, pi);
		p += ts;
	}
	kunmap_atomic(pmap);
}
#else /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
}
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
#endif

static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t phys = sg_phys(sg);
		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
			"dma_address:%pad dma_length:%d\n",
			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
			sg_dma_len(sg));
	}
}

static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	int length = blk_rq_payload_bytes(req);
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	u32 page_size = dev->ctrl.page_size;
	int offset = dma_addr & (page_size - 1);
	__le64 *prp_list;
	void **list = nvme_pci_iod_list(req);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (page_size - offset);
	if (length <= 0) {
		iod->first_dma = 0;
		goto done;
	}

	dma_len -= (page_size - offset);
	if (dma_len) {
		dma_addr += (page_size - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= page_size) {
		iod->first_dma = dma_addr;
		goto done;
	}

	nprps = DIV_ROUND_UP(length, page_size);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == page_size >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
			if (!prp_list)
				return BLK_STS_RESOURCE;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= page_size;
		dma_addr += page_size;
		length -= page_size;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		if (unlikely(dma_len < 0))
			goto bad_sgl;
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

done:
	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);

	return BLK_STS_OK;

 bad_sgl:
	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
			"Invalid SGL for payload:%d nents:%d\n",
			blk_rq_payload_bytes(req), iod->nents);
	return BLK_STS_IOERR;
}
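/*
 * Note on the chaining above: when a PRP list page fills up, its last
 * slot is rewritten to point at the newly allocated page, and the data
 * PRP it used to hold is carried over into slot 0 of the new page, as
 * required for chained PRP lists.
 */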

static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	if (entries < SGES_PER_PAGE) {
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	} else {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
	}
}

static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmd, int entries)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
	int i = 0;

	/* setting the transfer type as SGL */
	cmd->flags = NVME_CMD_SGL_METABUF;

	if (entries == 1) {
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;
	}

	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}

	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);

	do {
		if (i == SGES_PER_PAGE) {
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
				return BLK_STS_RESOURCE;

			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}

		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
	} while (--entries > 0);

	return BLK_STS_OK;
}

static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct request_queue *q = req->q;
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;
	blk_status_t ret = BLK_STS_IOERR;
	int nr_mapped;

	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
	iod->nents = blk_rq_map_sg(q, req, iod->sg);
	if (!iod->nents)
		goto out;

	ret = BLK_STS_RESOURCE;
	nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
			DMA_ATTR_NO_WARN);
	if (!nr_mapped)
		goto out;

	if (iod->use_sgl)
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);

	if (ret != BLK_STS_OK)
		goto out_unmap;

	ret = BLK_STS_IOERR;
	if (blk_integrity_rq(req)) {
		if (blk_rq_count_integrity_sg(q, req->bio) != 1)
			goto out_unmap;

		sg_init_table(&iod->meta_sg, 1);
		if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
			goto out_unmap;

		if (req_op(req) == REQ_OP_WRITE)
			nvme_dif_remap(req, nvme_dif_prep);

		if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
			goto out_unmap;
	}

	if (blk_integrity_rq(req))
		cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
	return BLK_STS_OK;

out_unmap:
	dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
	return ret;
}

static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;

	if (iod->nents) {
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
		if (blk_integrity_rq(req)) {
			if (req_op(req) == REQ_OP_READ)
				nvme_dif_remap(req, nvme_dif_complete);
			dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
		}
	}

	nvme_cleanup_cmd(req);
	nvme_free_iod(dev, req);
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_command cmnd;
	blk_status_t ret;

	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
	if (unlikely(nvmeq->cq_vector < 0))
		return BLK_STS_IOERR;

	ret = nvme_setup_cmd(ns, req, &cmnd);
	if (ret)
		return ret;

	ret = nvme_init_iod(req, dev);
	if (ret)
		goto out_free_cmd;

	if (blk_rq_nr_phys_segments(req)) {
		ret = nvme_map_data(dev, req, &cmnd);
		if (ret)
			goto out_cleanup_iod;
	}

	blk_mq_start_request(req);
	nvme_submit_cmd(nvmeq, &cmnd);
	return BLK_STS_OK;
out_cleanup_iod:
	nvme_free_iod(dev, req);
out_free_cmd:
	nvme_cleanup_cmd(req);
	return ret;
}

static void nvme_pci_complete_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	nvme_unmap_data(iod->nvmeq->dev, req);
	nvme_complete_rq(req);
}

/* We read the CQE phase first to check if the rest of the entry is valid */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
	return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
			nvmeq->cq_phase;
}
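/*
 * The phase tag tells old entries from new ones: cq_phase is initialised
 * to 1 when the queue is set up, so entries the controller writes on the
 * first pass around the ring have their phase bit set; once the head
 * wraps, nvme_update_cq_head() flips cq_phase and the next pass is
 * recognized by a cleared bit.
 */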

static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 head = nvmeq->cq_head;

	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}

static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
	struct request *req;

	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvmeq->qid == 0 &&
			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
		return;
	}

	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
	nvme_end_request(req, cqe->status, cqe->result);
}

static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
{
	while (start != end) {
		nvme_handle_cqe(nvmeq, start);
		if (++start == nvmeq->q_depth)
			start = 0;
	}
}

static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
	if (++nvmeq->cq_head == nvmeq->q_depth) {
		nvmeq->cq_head = 0;
		nvmeq->cq_phase = !nvmeq->cq_phase;
	}
}

static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
		u16 *end, int tag)
{
	bool found = false;

	*start = nvmeq->cq_head;
	while (!found && nvme_cqe_pending(nvmeq)) {
		if (nvmeq->cqes[nvmeq->cq_head].command_id == tag)
			found = true;
		nvme_update_cq_head(nvmeq);
	}
	*end = nvmeq->cq_head;

	if (*start != *end)
		nvme_ring_cq_doorbell(nvmeq);
	return found;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	irqreturn_t ret = IRQ_NONE;
	u16 start, end;

	spin_lock(&nvmeq->cq_lock);
	if (nvmeq->cq_head != nvmeq->last_cq_head)
		ret = IRQ_HANDLED;
	nvme_process_cq(nvmeq, &start, &end, -1);
	nvmeq->last_cq_head = nvmeq->cq_head;
	spin_unlock(&nvmeq->cq_lock);

	if (start != end) {
		nvme_complete_cqes(nvmeq, start, end);
		return IRQ_HANDLED;
	}

	return ret;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	if (nvme_cqe_pending(nvmeq))
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
}

static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
{
	u16 start, end;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

	spin_lock_irq(&nvmeq->cq_lock);
	found = nvme_process_cq(nvmeq, &start, &end, tag);
	spin_unlock_irq(&nvmeq->cq_lock);