From 79478bf9ea9fa48d30836afa796ac13d8a0f320b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Nov 2021 07:13:54 +0100 Subject: block: move blk_rq_err_bytes to scsi blk_rq_err_bytes is only used by the scsi midlayer, so move it there. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20211117061404.331732-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 41 ----------------------------------------- 1 file changed, 41 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 1378d084c770..682b112f513f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1176,47 +1176,6 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); -/** - * blk_rq_err_bytes - determine number of bytes till the next failure boundary - * @rq: request to examine - * - * Description: - * A request could be merge of IOs which require different failure - * handling. This function determines the number of bytes which - * can be failed from the beginning of the request without - * crossing into area which need to be retried further. - * - * Return: - * The number of bytes to fail. - */ -unsigned int blk_rq_err_bytes(const struct request *rq) -{ - unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; - unsigned int bytes = 0; - struct bio *bio; - - if (!(rq->rq_flags & RQF_MIXED_MERGE)) - return blk_rq_bytes(rq); - - /* - * Currently the only 'mixing' which can happen is between - * different fastfail types. We can safely fail portions - * which have all the failfast bits that the first one has - - * the ones which are at least as eager to fail as the first - * one. - */ - for (bio = rq->bio; bio; bio = bio->bi_next) { - if ((bio->bi_opf & ff) != ff) - break; - bytes += bio->bi_iter.bi_size; - } - - /* this could lead to infinite loop */ - BUG_ON(blk_rq_bytes(rq) && !bytes); - return bytes; -} -EXPORT_SYMBOL_GPL(blk_rq_err_bytes); - static void update_io_ticks(struct block_device *part, unsigned long now, bool end) { -- cgit v1.2.1 From 786d4e01c550e8bb7c9f9f23bca0596a2a33483c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Nov 2021 07:13:55 +0100 Subject: block: remove rq_flush_dcache_pages This function is trivial, and flush_dcache_page is always defined, so just open code it in the 2.5 callers. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20211117061404.331732-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 682b112f513f..039e28509f59 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1300,25 +1300,6 @@ void blk_steal_bios(struct bio_list *list, struct request *rq) } EXPORT_SYMBOL_GPL(blk_steal_bios); -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -/** - * rq_flush_dcache_pages - Helper function to flush all pages in a request - * @rq: the request to be flushed - * - * Description: - * Flush all pages in @rq. 
- */
-void rq_flush_dcache_pages(struct request *rq)
-{
- struct req_iterator iter;
- struct bio_vec bvec;
-
- rq_for_each_segment(bvec, rq, iter)
- flush_dcache_page(bvec.bv_page);
-}
-EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
-#endif
-
 /**
  * blk_lld_busy - Check if underlying low-level drivers of a device are busy
  * @q : the queue of the device being checked
-- cgit v1.2.1

From 4054cff92c357813b6861b622122b344990f7e31 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 17 Nov 2021 07:13:56 +0100
Subject: block: remove blk-exec.c

All this code is tightly coupled to the blk-mq core, so move it there.

Signed-off-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20211117061404.331732-4-hch@lst.de
[axboe: remove doc generation for blk-exec.c]
Signed-off-by: Jens Axboe
---
 block/Makefile | 2 +-
 block/blk-exec.c | 116 ------------------------------------------------------
 block/blk-mq.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 117 deletions(-)
 delete mode 100644 block/blk-exec.c
(limited to 'block')

diff --git a/block/Makefile b/block/Makefile
index 44df57e562bf..f38eaa612929 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o blk-timeout.o \
+ blk-merge.o blk-timeout.o \
 blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
diff --git a/block/blk-exec.c b/block/blk-exec.c
deleted file mode 100644
index 1b8b47f6e79b..000000000000
--- a/block/blk-exec.c
+++ /dev/null
@@ -1,116 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions related to setting various queue properties from drivers
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
-#include <linux/sched/sysctl.h>
-
-#include "blk.h"
-#include "blk-mq-sched.h"
-
-/**
- * blk_end_sync_rq - executes a completion event on a request
- * @rq: request to complete
- * @error: end I/O status of the request
- */
-static void blk_end_sync_rq(struct request *rq, blk_status_t error)
-{
- struct completion *waiting = rq->end_io_data;
-
- rq->end_io_data = (void *)(uintptr_t)error;
-
- /*
- * complete last, if this is a stack request the process (and thus
- * the rq pointer) could be invalid right after this complete()
- */
- complete(waiting);
-}
-
-/**
- * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
- * @bd_disk: matching gendisk
- * @rq: request to insert
- * @at_head: insert request at head or tail of queue
- * @done: I/O completion handler
- *
- * Description:
- * Insert a fully prepared request at the back of the I/O scheduler queue
- * for execution. Don't wait for completion.
- *
- * Note:
- * This function will invoke @done directly if the queue is dead.
- */ -void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, - int at_head, rq_end_io_fn *done) -{ - WARN_ON(irqs_disabled()); - WARN_ON(!blk_rq_is_passthrough(rq)); - - rq->rq_disk = bd_disk; - rq->end_io = done; - - blk_account_io_start(rq); - - /* - * don't check dying flag for MQ because the request won't - * be reused after dying flag is set - */ - blk_mq_sched_insert_request(rq, at_head, true, false); -} -EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); - -static bool blk_rq_is_poll(struct request *rq) -{ - if (!rq->mq_hctx) - return false; - if (rq->mq_hctx->type != HCTX_TYPE_POLL) - return false; - if (WARN_ON_ONCE(!rq->bio)) - return false; - return true; -} - -static void blk_rq_poll_completion(struct request *rq, struct completion *wait) -{ - do { - bio_poll(rq->bio, NULL, 0); - cond_resched(); - } while (!completion_done(wait)); -} - -/** - * blk_execute_rq - insert a request into queue for execution - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * - * Description: - * Insert a fully prepared request at the back of the I/O scheduler queue - * for execution and wait for completion. - * Return: The blk_status_t result provided to blk_mq_end_request(). - */ -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head) -{ - DECLARE_COMPLETION_ONSTACK(wait); - unsigned long hang_check; - - rq->end_io_data = &wait; - blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - - if (blk_rq_is_poll(rq)) - blk_rq_poll_completion(rq, &wait); - else if (hang_check) - while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); - else - wait_for_completion_io(&wait); - - return (blk_status_t)(uintptr_t)rq->end_io_data; -} -EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-mq.c b/block/blk-mq.c index 8874a63ae952..3e5dc87e0cfc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -1058,6 +1059,112 @@ void blk_mq_start_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_start_request); +/** + * blk_end_sync_rq - executes a completion event on a request + * @rq: request to complete + * @error: end I/O status of the request + */ +static void blk_end_sync_rq(struct request *rq, blk_status_t error) +{ + struct completion *waiting = rq->end_io_data; + + rq->end_io_data = (void *)(uintptr_t)error; + + /* + * complete last, if this is a stack request the process (and thus + * the rq pointer) could be invalid right after this complete() + */ + complete(waiting); +} + +/** + * blk_execute_rq_nowait - insert a request to I/O scheduler for execution + * @bd_disk: matching gendisk + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * @done: I/O completion handler + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution. Don't wait for completion. + * + * Note: + * This function will invoke @done directly if the queue is dead. 
+ */
+void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq,
+ int at_head, rq_end_io_fn *done)
+{
+ WARN_ON(irqs_disabled());
+ WARN_ON(!blk_rq_is_passthrough(rq));
+
+ rq->rq_disk = bd_disk;
+ rq->end_io = done;
+
+ blk_account_io_start(rq);
+
+ /*
+ * don't check dying flag for MQ because the request won't
+ * be reused after dying flag is set
+ */
+ blk_mq_sched_insert_request(rq, at_head, true, false);
+}
+EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
+
+static bool blk_rq_is_poll(struct request *rq)
+{
+ if (!rq->mq_hctx)
+ return false;
+ if (rq->mq_hctx->type != HCTX_TYPE_POLL)
+ return false;
+ if (WARN_ON_ONCE(!rq->bio))
+ return false;
+ return true;
+}
+
+static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
+{
+ do {
+ bio_poll(rq->bio, NULL, 0);
+ cond_resched();
+ } while (!completion_done(wait));
+}
+
+/**
+ * blk_execute_rq - insert a request into queue for execution
+ * @bd_disk: matching gendisk
+ * @rq: request to insert
+ * @at_head: insert request at head or tail of queue
+ *
+ * Description:
+ * Insert a fully prepared request at the back of the I/O scheduler queue
+ * for execution and wait for completion.
+ * Return: The blk_status_t result provided to blk_mq_end_request().
+ */
+blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
+ int at_head)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ unsigned long hang_check;
+
+ rq->end_io_data = &wait;
+ blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq);
+
+ /* Prevent hang_check timer from firing at us during very long I/O */
+ hang_check = sysctl_hung_task_timeout_secs;
+
+ if (blk_rq_is_poll(rq))
+ blk_rq_poll_completion(rq, &wait);
+ else if (hang_check)
+ while (!wait_for_completion_io_timeout(&wait,
+ hang_check * (HZ/2)))
+ ;
+ else
+ wait_for_completion_io(&wait);
+
+ return (blk_status_t)(uintptr_t)rq->end_io_data;
+}
+EXPORT_SYMBOL(blk_execute_rq);
+
 static void __blk_mq_requeue_request(struct request *rq)
 {
 struct request_queue *q = rq->q;
-- cgit v1.2.1

From b84c5b50d329bf7cfdba6bd5c8a99f1b8604e301 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 17 Nov 2021 07:13:57 +0100
Subject: blk-mq: move blk_mq_flush_plug_list

Move blk_mq_flush_plug_list and blk_mq_plug_issue_direct down in
blk-mq.c to prepare for marking blk_mq_request_issue_directly static
without the need of a forward declaration.
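[note: illustrative aside, not part of the patch. In C a function must be declared before its first use, so once a callee is defined above its only caller the separate prototype becomes unnecessary; a minimal standalone sketch with hypothetical names:

    /* before the move: caller precedes callee, a prototype is required */
    static int issue_directly(int rq);          /* forward declaration */
    static int flush_list(int rq) { return issue_directly(rq); }
    static int issue_directly(int rq) { return rq; }

    /* after the move: the callee is defined first, so the forward
     * declaration can simply be dropped */
    static int issue_directly2(int rq) { return rq; }
    static int flush_list2(int rq) { return issue_directly2(rq); }
]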
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20211117061404.331732-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 184 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 92 insertions(+), 92 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3e5dc87e0cfc..df28e5ef8c2d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2309,98 +2309,6 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued, *queued = 0; } -static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) -{ - struct blk_mq_hw_ctx *hctx = NULL; - struct request *rq; - int queued = 0; - int errors = 0; - - while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(plug->mq_list); - blk_status_t ret; - - if (hctx != rq->mq_hctx) { - if (hctx) - blk_mq_commit_rqs(hctx, &queued, from_schedule); - hctx = rq->mq_hctx; - } - - ret = blk_mq_request_issue_directly(rq, last); - switch (ret) { - case BLK_STS_OK: - queued++; - break; - case BLK_STS_RESOURCE: - case BLK_STS_DEV_RESOURCE: - blk_mq_request_bypass_insert(rq, false, last); - blk_mq_commit_rqs(hctx, &queued, from_schedule); - return; - default: - blk_mq_end_request(rq, ret); - errors++; - break; - } - } - - /* - * If we didn't flush the entire list, we could have told the driver - * there was more coming, but that turned out to be a lie. - */ - if (errors) - blk_mq_commit_rqs(hctx, &queued, from_schedule); -} - -void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) -{ - struct blk_mq_hw_ctx *this_hctx; - struct blk_mq_ctx *this_ctx; - unsigned int depth; - LIST_HEAD(list); - - if (rq_list_empty(plug->mq_list)) - return; - plug->rq_count = 0; - - if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - blk_mq_plug_issue_direct(plug, false); - if (rq_list_empty(plug->mq_list)) - return; - } - - this_hctx = NULL; - this_ctx = NULL; - depth = 0; - do { - struct request *rq; - - rq = rq_list_pop(&plug->mq_list); - - if (!this_hctx) { - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { - trace_block_unplug(this_hctx->queue, depth, - !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, - &list, from_schedule); - depth = 0; - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - - } - - list_add(&rq->queuelist, &list); - depth++; - } while (!rq_list_empty(plug->mq_list)); - - if (!list_empty(&list)) { - trace_block_unplug(this_hctx->queue, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, - from_schedule); - } -} - static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { @@ -2540,6 +2448,98 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) return ret; } +static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_hw_ctx *hctx = NULL; + struct request *rq; + int queued = 0; + int errors = 0; + + while ((rq = rq_list_pop(&plug->mq_list))) { + bool last = rq_list_empty(plug->mq_list); + blk_status_t ret; + + if (hctx != rq->mq_hctx) { + if (hctx) + blk_mq_commit_rqs(hctx, &queued, from_schedule); + hctx = rq->mq_hctx; + } + + ret = blk_mq_request_issue_directly(rq, last); + switch (ret) { + case BLK_STS_OK: + queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, false, last); + blk_mq_commit_rqs(hctx, &queued, 
from_schedule); + return; + default: + blk_mq_end_request(rq, ret); + errors++; + break; + } + } + + /* + * If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. + */ + if (errors) + blk_mq_commit_rqs(hctx, &queued, from_schedule); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_hw_ctx *this_hctx; + struct blk_mq_ctx *this_ctx; + unsigned int depth; + LIST_HEAD(list); + + if (rq_list_empty(plug->mq_list)) + return; + plug->rq_count = 0; + + if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { + blk_mq_plug_issue_direct(plug, false); + if (rq_list_empty(plug->mq_list)) + return; + } + + this_hctx = NULL; + this_ctx = NULL; + depth = 0; + do { + struct request *rq; + + rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + trace_block_unplug(this_hctx->queue, depth, + !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &list, from_schedule); + depth = 0; + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + + } + + list_add(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + if (!list_empty(&list)) { + trace_block_unplug(this_hctx->queue, depth, !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, + from_schedule); + } +} + void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { -- cgit v1.2.1 From 06c8c691e2820077936e59ad334eb806e90b69eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Nov 2021 07:13:58 +0100 Subject: block: move request based cloning helpers to blk-mq.c Keep all the request based code together. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20211117061404.331732-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 184 +------------------------------------------------------ block/blk-mq.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++- block/blk-mq.h | 3 - block/blk.h | 10 +++ 4 files changed, 185 insertions(+), 187 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 039e28509f59..5d6017d7f84e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -594,7 +594,7 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct block_device *part, unsigned int bytes) +bool should_fail_request(struct block_device *part, unsigned int bytes) { return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } @@ -608,15 +608,6 @@ static int __init fail_make_request_debugfs(void) } late_initcall(fail_make_request_debugfs); - -#else /* CONFIG_FAIL_MAKE_REQUEST */ - -static inline bool should_fail_request(struct block_device *part, - unsigned int bytes) -{ - return false; -} - #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool bio_check_ro(struct bio *bio) @@ -1090,92 +1081,6 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); -/** - * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for the new queue limits - * @q: the queue - * @rq: the request being checked - * - * Description: - * @rq may have been made based on weaker limitations of upper-level queues - * in request stacking drivers, and it may violate the limitation of @q. - * Since the block layer and the underlying device driver trust @rq - * after it is inserted to @q, it should be checked against @q before - * the insertion using this generic function. - * - * Request stacking drivers like request-based dm may change the queue - * limits when retrying requests on other queues. Those requests need - * to be checked against the new queue limits again during dispatch. - */ -static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, - struct request *rq) -{ - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); - - if (blk_rq_sectors(rq) > max_sectors) { - /* - * SCSI device does not have a good way to return if - * Write Same/Zero is actually supported. If a device rejects - * a non-read/write command (discard, write same,etc.) the - * low-level device driver will set the relevant queue limit to - * 0 to prevent blk-lib from issuing more of the offending - * operations. Commands queued prior to the queue limit being - * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O - * errors being propagated to upper layers. - */ - if (max_sectors == 0) - return BLK_STS_NOTSUPP; - - printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", - __func__, blk_rq_sectors(rq), max_sectors); - return BLK_STS_IOERR; - } - - /* - * The queue settings related to segment counting may differ from the - * original queue. - */ - rq->nr_phys_segments = blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_segments(q)) { - printk(KERN_ERR "%s: over max segments limit. 
(%hu > %hu)\n", - __func__, rq->nr_phys_segments, queue_max_segments(q)); - return BLK_STS_IOERR; - } - - return BLK_STS_OK; -} - -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued - */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) -{ - blk_status_t ret; - - ret = blk_cloned_rq_check_limits(q, rq); - if (ret != BLK_STS_OK) - return ret; - - if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) - return BLK_STS_IOERR; - - if (blk_crypto_insert_cloned_request(rq)) - return BLK_STS_IOERR; - - blk_account_io_start(rq); - - /* - * Since we have a scheduler attached on the top device, - * bypass a potential scheduler on the bottom device for - * insert. - */ - return blk_mq_request_issue_directly(rq, true); -} -EXPORT_SYMBOL_GPL(blk_insert_cloned_request); - static void update_io_ticks(struct block_device *part, unsigned long now, bool end) { @@ -1328,93 +1233,6 @@ int blk_lld_busy(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_lld_busy); -/** - * blk_rq_unprep_clone - Helper function to free all bios in a cloned request - * @rq: the clone request to be cleaned up - * - * Description: - * Free all bios in @rq for a cloned request. - */ -void blk_rq_unprep_clone(struct request *rq) -{ - struct bio *bio; - - while ((bio = rq->bio) != NULL) { - rq->bio = bio->bi_next; - - bio_put(bio); - } -} -EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); - -/** - * blk_rq_prep_clone - Helper function to setup clone request - * @rq: the request to be setup - * @rq_src: original request to be cloned - * @bs: bio_set that bios for clone are allocated from - * @gfp_mask: memory allocation mask for bio - * @bio_ctr: setup function to be called for each clone bio. - * Returns %0 for success, non %0 for failure. - * @data: private data to be passed to @bio_ctr - * - * Description: - * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * Also, pages which the original bios are pointing to are not copied - * and the cloned bios just point same pages. - * So cloned bios must be completed before original bios, which means - * the caller must complete @rq before @rq_src. - */ -int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data) -{ - struct bio *bio, *bio_src; - - if (!bs) - bs = &fs_bio_set; - - __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); - if (!bio) - goto free_and_out; - - if (bio_ctr && bio_ctr(bio, bio_src, data)) - goto free_and_out; - - if (rq->bio) { - rq->biotail->bi_next = bio; - rq->biotail = bio; - } else { - rq->bio = rq->biotail = bio; - } - bio = NULL; - } - - /* Copy attributes of the original request to the clone request. 
*/ - rq->__sector = blk_rq_pos(rq_src); - rq->__data_len = blk_rq_bytes(rq_src); - if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; - rq->special_vec = rq_src->special_vec; - } - rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; - - if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) - goto free_and_out; - - return 0; - -free_and_out: - if (bio) - bio_put(bio); - blk_rq_unprep_clone(rq); - - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(blk_rq_prep_clone); - int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); diff --git a/block/blk-mq.c b/block/blk-mq.c index df28e5ef8c2d..812dac9ecb29 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2435,7 +2435,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, hctx_unlock(hctx, srcu_idx); } -blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) +static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { blk_status_t ret; int srcu_idx; @@ -2824,6 +2824,179 @@ void blk_mq_submit_bio(struct bio *bio) } } +/** + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for the new queue limits + * @q: the queue + * @rq: the request being checked + * + * Description: + * @rq may have been made based on weaker limitations of upper-level queues + * in request stacking drivers, and it may violate the limitation of @q. + * Since the block layer and the underlying device driver trust @rq + * after it is inserted to @q, it should be checked against @q before + * the insertion using this generic function. + * + * Request stacking drivers like request-based dm may change the queue + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. + */ +static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) +{ + unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + + if (blk_rq_sectors(rq) > max_sectors) { + /* + * SCSI device does not have a good way to return if + * Write Same/Zero is actually supported. If a device rejects + * a non-read/write command (discard, write same,etc.) the + * low-level device driver will set the relevant queue limit to + * 0 to prevent blk-lib from issuing more of the offending + * operations. Commands queued prior to the queue limit being + * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O + * errors being propagated to upper layers. + */ + if (max_sectors == 0) + return BLK_STS_NOTSUPP; + + printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", + __func__, blk_rq_sectors(rq), max_sectors); + return BLK_STS_IOERR; + } + + /* + * The queue settings related to segment counting may differ from the + * original queue. + */ + rq->nr_phys_segments = blk_recalc_rq_segments(rq); + if (rq->nr_phys_segments > queue_max_segments(q)) { + printk(KERN_ERR "%s: over max segments limit. 
(%hu > %hu)\n", + __func__, rq->nr_phys_segments, queue_max_segments(q)); + return BLK_STS_IOERR; + } + + return BLK_STS_OK; +} + +/** + * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @q: the queue to submit the request + * @rq: the request being queued + */ +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +{ + blk_status_t ret; + + ret = blk_cloned_rq_check_limits(q, rq); + if (ret != BLK_STS_OK) + return ret; + + if (rq->rq_disk && + should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) + return BLK_STS_IOERR; + + if (blk_crypto_insert_cloned_request(rq)) + return BLK_STS_IOERR; + + blk_account_io_start(rq); + + /* + * Since we have a scheduler attached on the top device, + * bypass a potential scheduler on the bottom device for + * insert. + */ + return blk_mq_request_issue_directly(rq, true); +} +EXPORT_SYMBOL_GPL(blk_insert_cloned_request); + +/** + * blk_rq_unprep_clone - Helper function to free all bios in a cloned request + * @rq: the clone request to be cleaned up + * + * Description: + * Free all bios in @rq for a cloned request. + */ +void blk_rq_unprep_clone(struct request *rq) +{ + struct bio *bio; + + while ((bio = rq->bio) != NULL) { + rq->bio = bio->bi_next; + + bio_put(bio); + } +} +EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); + +/** + * blk_rq_prep_clone - Helper function to setup clone request + * @rq: the request to be setup + * @rq_src: original request to be cloned + * @bs: bio_set that bios for clone are allocated from + * @gfp_mask: memory allocation mask for bio + * @bio_ctr: setup function to be called for each clone bio. + * Returns %0 for success, non %0 for failure. + * @data: private data to be passed to @bio_ctr + * + * Description: + * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. + * Also, pages which the original bios are pointing to are not copied + * and the cloned bios just point same pages. + * So cloned bios must be completed before original bios, which means + * the caller must complete @rq before @rq_src. + */ +int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data) +{ + struct bio *bio, *bio_src; + + if (!bs) + bs = &fs_bio_set; + + __rq_for_each_bio(bio_src, rq_src) { + bio = bio_clone_fast(bio_src, gfp_mask, bs); + if (!bio) + goto free_and_out; + + if (bio_ctr && bio_ctr(bio, bio_src, data)) + goto free_and_out; + + if (rq->bio) { + rq->biotail->bi_next = bio; + rq->biotail = bio; + } else { + rq->bio = rq->biotail = bio; + } + bio = NULL; + } + + /* Copy attributes of the original request to the clone request. 
*/
+ rq->__sector = blk_rq_pos(rq_src);
+ rq->__data_len = blk_rq_bytes(rq_src);
+ if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+ rq->special_vec = rq_src->special_vec;
+ }
+ rq->nr_phys_segments = rq_src->nr_phys_segments;
+ rq->ioprio = rq_src->ioprio;
+
+ if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
+ goto free_and_out;
+
+ return 0;
+
+free_and_out:
+ if (bio)
+ bio_put(bio);
+ blk_rq_unprep_clone(rq);
+
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
+
 static size_t order_to_size(unsigned int order)
 {
 return (size_t)PAGE_SIZE << order;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index afcf9931a489..d516c7a46f57 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -65,9 +65,6 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
 bool run_queue);
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 struct list_head *list);
-
-/* Used by blk_insert_cloned_request() to issue request directly */
-blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 struct list_head *list);
diff --git a/block/blk.h b/block/blk.h
index ccde6e6f1736..8a3761b6dc33 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -493,4 +493,14 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
 struct blk_independent_access_ranges *new_iars);
 void disk_unregister_independent_access_ranges(struct gendisk *disk);
 
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+bool should_fail_request(struct block_device *part, unsigned int bytes);
+#else /* CONFIG_FAIL_MAKE_REQUEST */
+static inline bool should_fail_request(struct block_device *part,
+ unsigned int bytes)
+{
+ return false;
+}
+#endif /* CONFIG_FAIL_MAKE_REQUEST */
+
 #endif /* BLK_INTERNAL_H */
-- cgit v1.2.1

From 52fdbbcc83f35da90f857668acc387470ed84606 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 17 Nov 2021 07:13:59 +0100
Subject: block: move blk_rq_init to blk-mq.c

blk_rq_init deals with a request structure, so move it to blk-mq.c

Signed-off-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20211117061404.331732-7-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 17 -----------------
 block/blk-mq.c | 17 +++++++++++++++++
 2 files changed, 17 insertions(+), 17 deletions(-)
(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 5d6017d7f84e..6c0bc2d895a1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -109,23 +109,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
 
-void blk_rq_init(struct request_queue *q, struct request *rq)
-{
- memset(rq, 0, sizeof(*rq));
-
- INIT_LIST_HEAD(&rq->queuelist);
- rq->q = q;
- rq->__sector = (sector_t) -1;
- INIT_HLIST_NODE(&rq->hash);
- RB_CLEAR_NODE(&rq->rb_node);
- rq->tag = BLK_MQ_NO_TAG;
- rq->internal_tag = BLK_MQ_NO_TAG;
- rq->start_time_ns = ktime_get_ns();
- rq->part = NULL;
- blk_crypto_rq_set_defaults(rq);
-}
-EXPORT_SYMBOL(blk_rq_init);
-
 #define REQ_OP_NAME(name) [REQ_OP_##name] = #name
 static const char *const blk_op_name[] = {
 REQ_OP_NAME(READ),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 812dac9ecb29..3217139d2e0b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -328,6 +328,23 @@ void blk_mq_wake_waiters(struct request_queue *q)
 blk_mq_tag_wakeup_all(hctx->tags, true);
 }
 
+void blk_rq_init(struct request_queue *q, struct request *rq)
+{
+ memset(rq, 0, sizeof(*rq));
+
+ INIT_LIST_HEAD(&rq->queuelist);
+ rq->q = q;
+ rq->__sector = (sector_t) -1;
+ INIT_HLIST_NODE(&rq->hash);
+ RB_CLEAR_NODE(&rq->rb_node);
+ rq->tag = BLK_MQ_NO_TAG;
+ rq->internal_tag = BLK_MQ_NO_TAG;
+ rq->start_time_ns = ktime_get_ns();
+ rq->part = NULL;
+ blk_crypto_rq_set_defaults(rq);
+}
+EXPORT_SYMBOL(blk_rq_init);
+
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
 {
-- cgit v1.2.1

From f2b8f3ce989d8066f2dedaad223b3e73c4485223 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 17 Nov 2021 07:14:00 +0100
Subject: block: move blk_steal_bios to blk-mq.c

Keep all the request based code together.

Signed-off-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20211117061404.331732-8-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 21 ---------------------
 block/blk-mq.c | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 21 deletions(-)
(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6c0bc2d895a1..65891f058f3d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1167,27 +1167,6 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
 }
 EXPORT_SYMBOL(disk_end_io_acct);
 
-/*
- * Steal bios from a request and add them to a bio list.
- * The request must not have been partially completed before.
- */
-void blk_steal_bios(struct bio_list *list, struct request *rq)
-{
- if (rq->bio) {
- if (list->tail)
- list->tail->bi_next = rq->bio;
- else
- list->head = rq->bio;
- list->tail = rq->biotail;
-
- rq->bio = NULL;
- rq->biotail = NULL;
- }
-
- rq->__data_len = 0;
-}
-EXPORT_SYMBOL_GPL(blk_steal_bios);
-
 /**
  * blk_lld_busy - Check if underlying low-level drivers of a device are busy
  * @q : the queue of the device being checked
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3217139d2e0b..cd5f31c4d2fd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3014,6 +3014,27 @@ free_and_out:
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
+/*
+ * Steal bios from a request and add them to a bio list.
+ * The request must not have been partially completed before.
+ */
+void blk_steal_bios(struct bio_list *list, struct request *rq)
+{
+ if (rq->bio) {
+ if (list->tail)
+ list->tail->bi_next = rq->bio;
+ else
+ list->head = rq->bio;
+ list->tail = rq->biotail;
+
+ rq->bio = NULL;
+ rq->biotail = NULL;
+ }
+
+ rq->__data_len = 0;
+}
+EXPORT_SYMBOL_GPL(blk_steal_bios);
+
 static size_t order_to_size(unsigned int order)
 {
 return (size_t)PAGE_SIZE << order;
-- cgit v1.2.1

From 450b7879e34517c3ebc3a35a53806fe40e60fac2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 17 Nov 2021 07:14:01 +0100
Subject: block: move blk_account_io_{start,done} to blk-mq.c

These are only used for request based I/O, so move them where they are used.
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20211117061404.331732-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 27 +-------------------------- block/blk-mq.c | 42 ++++++++++++++++++++++++++++++++++++++++++ block/blk.h | 21 +-------------------- 3 files changed, 44 insertions(+), 46 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 65891f058f3d..29c4db03742b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1064,8 +1064,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); -static void update_io_ticks(struct block_device *part, unsigned long now, - bool end) +void update_io_ticks(struct block_device *part, unsigned long now, bool end) { unsigned long stamp; again: @@ -1080,30 +1079,6 @@ again: } } -void __blk_account_io_done(struct request *req, u64 now) -{ - const int sgrp = op_stat_group(req_op(req)); - - part_stat_lock(); - update_io_ticks(req->part, jiffies, true); - part_stat_inc(req->part, ios[sgrp]); - part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); - part_stat_unlock(); -} - -void __blk_account_io_start(struct request *rq) -{ - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) - rq->part = rq->bio->bi_bdev; - else - rq->part = rq->rq_disk->part0; - - part_stat_lock(); - update_io_ticks(rq->part, jiffies, false); - part_stat_unlock(); -} - static unsigned long __part_start_io_acct(struct block_device *part, unsigned int sectors, unsigned int op) { diff --git a/block/blk-mq.c b/block/blk-mq.c index cd5f31c4d2fd..3921c7cfd64c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -809,6 +809,48 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); +static void __blk_account_io_done(struct request *req, u64 now) +{ + const int sgrp = op_stat_group(req_op(req)); + + part_stat_lock(); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); +} + +static inline void blk_account_io_done(struct request *req, u64 now) +{ + /* + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. 
+ */
+ if (blk_do_io_stat(req) && req->part &&
+ !(req->rq_flags & RQF_FLUSH_SEQ))
+ __blk_account_io_done(req, now);
+}
+
+static void __blk_account_io_start(struct request *rq)
+{
+ /* passthrough requests can hold bios that do not have ->bi_bdev set */
+ if (rq->bio && rq->bio->bi_bdev)
+ rq->part = rq->bio->bi_bdev;
+ else
+ rq->part = rq->rq_disk->part0;
+
+ part_stat_lock();
+ update_io_ticks(rq->part, jiffies, false);
+ part_stat_unlock();
+}
+
+static inline void blk_account_io_start(struct request *req)
+{
+ if (blk_do_io_stat(req))
+ __blk_account_io_start(req);
+}
+
 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
 {
 if (rq->rq_flags & RQF_STATS) {
diff --git a/block/blk.h b/block/blk.h
index 8a3761b6dc33..50aae8c0e03c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -257,9 +257,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
 struct bio *bio, unsigned int nr_segs);
 
-void __blk_account_io_start(struct request *req);
-void __blk_account_io_done(struct request *req, u64 now);
-
 /*
  * Plug flush limits
  */
@@ -350,23 +347,7 @@ static inline bool blk_do_io_stat(struct request *rq)
 return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk;
 }
 
-static inline void blk_account_io_done(struct request *req, u64 now)
-{
- /*
- * Account IO completion. flush_rq isn't accounted as a
- * normal IO on queueing nor completion. Accounting the
- * containing request is enough.
- */
- if (blk_do_io_stat(req) && req->part &&
- !(req->rq_flags & RQF_FLUSH_SEQ))
- __blk_account_io_done(req, now);
-}
-
-static inline void blk_account_io_start(struct request *req)
-{
- if (blk_do_io_stat(req))
- __blk_account_io_start(req);
-}
+void update_io_ticks(struct block_device *part, unsigned long now, bool end);
 
 static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 {
-- cgit v1.2.1

From 22350ad7f15931b65d288e3e1462a51bfbfa5c4b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 17 Nov 2021 07:14:02 +0100
Subject: block: move blk_dump_rq_flags to blk-mq.c

blk_dump_rq_flags deals with a request, so move it to blk-mq.c.

Signed-off-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20211117061404.331732-10-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 14 --------------
 block/blk-mq.c | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 14 deletions(-)
(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 29c4db03742b..ae9bea11695e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -217,20 +217,6 @@ void blk_print_req_error(struct request *req, blk_status_t status)
 IOPRIO_PRIO_CLASS(req->ioprio));
 }
 
-void blk_dump_rq_flags(struct request *rq, char *msg)
-{
- printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
- rq->rq_disk ? rq->rq_disk->disk_name : "?",
- (unsigned long long) rq->cmd_flags);
-
- printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
- (unsigned long long)blk_rq_pos(rq),
- blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
- printk(KERN_INFO " bio %p, biotail %p, len %u\n",
- rq->bio, rq->biotail, blk_rq_bytes(rq));
-}
-EXPORT_SYMBOL(blk_dump_rq_flags);
-
 /**
  * blk_sync_queue - cancel any pending callbacks on a queue
  * @q: the queue
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3921c7cfd64c..2a4eff639036 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -667,6 +667,20 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
 blk_mq_free_request(rq);
 }
 
+void blk_dump_rq_flags(struct request *rq, char *msg)
+{
+ printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
+ rq->rq_disk ? rq->rq_disk->disk_name : "?",
+ (unsigned long long) rq->cmd_flags);
+
+ printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
+ (unsigned long long)blk_rq_pos(rq),
+ blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
+ printk(KERN_INFO " bio %p, biotail %p, len %u\n",
+ rq->bio, rq->biotail, blk_rq_bytes(rq));
+}
+EXPORT_SYMBOL(blk_dump_rq_flags);
+
 static void req_bio_endio(struct request *rq, struct bio *bio,
 unsigned int nbytes, blk_status_t error)
 {
-- cgit v1.2.1

From 0d7a29a2b5eae30eff1e216badca76978a3de39f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 17 Nov 2021 07:14:03 +0100
Subject: block: move blk_print_req_error to blk-mq.c

This function is only used by the request completion path. Factor out
a blk_status_to_str to keep blk_errors private in blk-core.c.

Signed-off-by: Christoph Hellwig
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20211117061404.331732-11-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 15 +++------------
 block/blk-mq.c | 13 +++++++++++++
 block/blk.h | 2 +-
 3 files changed, 17 insertions(+), 13 deletions(-)
(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index ae9bea11695e..8e14438ef822 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -199,22 +199,13 @@ int blk_status_to_errno(blk_status_t status)
 }
 EXPORT_SYMBOL_GPL(blk_status_to_errno);
 
-void blk_print_req_error(struct request *req, blk_status_t status)
+const char *blk_status_to_str(blk_status_t status)
 {
 int idx = (__force int)status;
 
 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
- return;
-
- printk_ratelimited(KERN_ERR
- "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
- "phys_seg %u prio class %u\n",
- blk_errors[idx].name,
- req->rq_disk ? req->rq_disk->disk_name : "?",
- blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
- req->cmd_flags & ~REQ_OP_MASK,
- req->nr_phys_segments,
- IOPRIO_PRIO_CLASS(req->ioprio));
+ return "";
+ return blk_errors[idx].name;
 }
 
 /**
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2a4eff639036..00df1eb031c0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -717,6 +717,19 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 }
 }
 
+static void blk_print_req_error(struct request *req, blk_status_t status)
+{
+ printk_ratelimited(KERN_ERR
+ "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+ "phys_seg %u prio class %u\n",
+ blk_status_to_str(status),
+ req->rq_disk ? req->rq_disk->disk_name : "?",
+ blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
+ req->cmd_flags & ~REQ_OP_MASK,
+ req->nr_phys_segments,
+ IOPRIO_PRIO_CLASS(req->ioprio));
+}
+
 /**
  * blk_update_request - Complete multiple bytes without completing the request
  * @req: the request being processed
diff --git a/block/blk.h b/block/blk.h
index 50aae8c0e03c..31ac75413287 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -250,7 +250,7 @@ static inline void blk_integrity_del(struct gendisk *disk)
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
-void blk_print_req_error(struct request *req, blk_status_t status);
+const char *blk_status_to_str(blk_status_t status);
 
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 unsigned int nr_segs, bool *same_queue_rq);
-- cgit v1.2.1

From d9337a420aed38cb4ffa465e5a546360410bc0cb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 17 Nov 2021 07:14:04 +0100
Subject: block: don't include blk-mq headers in blk-core.c

All request based code is in the blk-mq files now.

Signed-off-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20211117061404.331732-12-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 3 ---
 1 file changed, 3 deletions(-)
(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 8e14438ef822..35a04d8c180a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,7 +16,6 @@
 #include
 #include
 #include
-#include <linux/blk-mq.h>
 #include
 #include
 #include
@@ -47,8 +46,6 @@
 #include
 #include "blk.h"
-#include "blk-mq.h"
-#include "blk-mq-sched.h"
 #include "blk-pm.h"
 #include "blk-throttle.h"
-- cgit v1.2.1

From 86416916466514e4ae0b7296d20133b6427c4c1f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 22 Nov 2021 14:06:12 +0100
Subject: block: move GENHD_FL_NATIVE_CAPACITY to disk->state

The flag to indicate an unlocked native capacity is dynamic state,
not a driver capability flag, so move it to disk->state.

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20211122130625.1136848-2-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/partitions/core.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)
(limited to 'block')

diff --git a/block/partitions/core.c b/block/partitions/core.c
index 334b72ef1d73..520292fee933 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -527,18 +527,15 @@ out_unlock:
 
 static bool disk_unlock_native_capacity(struct gendisk *disk)
 {
- const struct block_device_operations *bdops = disk->fops;
-
- if (bdops->unlock_native_capacity &&
- !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
- printk(KERN_CONT "enabling native capacity\n");
- bdops->unlock_native_capacity(disk);
- disk->flags |= GENHD_FL_NATIVE_CAPACITY;
- return true;
- } else {
+ if (!disk->fops->unlock_native_capacity ||
+ test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) {
 printk(KERN_CONT "truncated\n");
 return false;
 }
+
+ printk(KERN_CONT "enabling native capacity\n");
+ disk->fops->unlock_native_capacity(disk);
+ return true;
 }
 
 void blk_drop_partitions(struct gendisk *disk)
-- cgit v1.2.1

From 1545e0b419ba1d9b9bee4061d4826340afe6b0aa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 22 Nov 2021 14:06:13 +0100
Subject: block: move GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE to disk->event_flags

GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE is all about the event reporting
mechanism, so move it to the event_flags field.
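[note: illustrative sketch, not part of the patch — what the move means for a driver opting in to this behaviour; both flag names appear in the diff below, the disk variable is hypothetical:

    /* before: event behaviour expressed as a gendisk capability flag */
    disk->flags |= GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;

    /* after: it lives with the other event bits in event_flags */
    disk->event_flags |= DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE;
]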
Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20211122130625.1136848-3-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/bdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'block')

diff --git a/block/bdev.c b/block/bdev.c
index b1d087e5e205..dd84961bed7e 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -837,7 +837,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
 * used in blkdev_get/put().
 */
 if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
- (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+ (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
 bdev->bd_write_holder = true;
 unblock_events = false;
 }
-- cgit v1.2.1

From e3b3bad3f29878d13fdbc96f9e59674bd9b06bae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 22 Nov 2021 14:06:15 +0100
Subject: block: remove a dead check in show_partition

disk_max_parts never returns 0 given that ->minors for devices not
using the extended dev_t must be non-zero, and disk_max_parts always
returns DISK_MAX_PARTS for the latter.

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20211122130625.1136848-5-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/genhd.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 30362aeacac4..1c326d3b54b4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -814,9 +814,7 @@ static int show_partition(struct seq_file *seqf, void *v)
 struct block_device *part;
 unsigned long idx;
 
- /* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
- (sgp->flags & GENHD_FL_REMOVABLE)))
+ if (!get_capacity(sgp))
 return 0;
 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
 return 0;
-- cgit v1.2.1

From e16e506ccd673a3a888a34f8f694698305840044 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 22 Nov 2021 14:06:16 +0100
Subject: block: merge disk_scan_partitions and blkdev_reread_part

Unify the functionality that implements a partition rescan for a
gendisk.
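[note: illustrative usage sketch, not part of the patch — after the merge both callers shown in the diffs below funnel into the same helper:

    /* at disk registration: scan read-only if the disk has media */
    if (get_capacity(disk))
        disk_scan_partitions(disk, FMODE_READ);

    /* BLKRRPART ioctl: rescan with the caller's mode, minus FMODE_EXCL */
    return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
]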
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 1 + block/genhd.c | 19 ++++++++++++------- block/ioctl.c | 31 +++++-------------------------- 3 files changed, 18 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 31ac75413287..423cba8ea0a6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -449,6 +449,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, unsigned int max_sectors, bool *same_page); struct request_queue *blk_alloc_queue(int node_id); +int disk_scan_partitions(struct gendisk *disk, fmode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); diff --git a/block/genhd.c b/block/genhd.c index 1c326d3b54b4..94f39c4333b8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -372,17 +372,21 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -static void disk_scan_partitions(struct gendisk *disk) +int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; + if (!disk_part_scan_enabled(disk)) + return -EINVAL; + if (disk->open_partitions) + return -EBUSY; set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); + bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + blkdev_put(bdev, mode); + return 0; } /** @@ -509,7 +513,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, goto out_unregister_bdi; bdev_add(disk->part0, ddev->devt); - disk_scan_partitions(disk); + if (get_capacity(disk)) + disk_scan_partitions(disk, FMODE_READ); /* * Announce the disk and partitions after all partitions are diff --git a/block/ioctl.c b/block/ioctl.c index 0a1d10ac2e1a..4a86340133e4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -82,31 +82,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif -static int blkdev_reread_part(struct block_device *bdev, fmode_t mode) -{ - struct block_device *tmp; - - if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (bdev->bd_disk->open_partitions) - return -EBUSY; - - /* - * Reopen the device to revalidate the driver state and force a - * partition rescan. 
- */ - mode &= ~FMODE_EXCL; - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - - tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - blkdev_put(tmp, mode); - return 0; -} - static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, unsigned long arg, unsigned long flags) { @@ -522,7 +497,11 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: - return blkdev_reread_part(bdev, mode); + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (bdev_is_partition(bdev)) + return -EINVAL; + return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: -- cgit v1.2.1 From 46e7eac647b34ed4106a8262f8bedbb90801fadd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:17 +0100 Subject: block: rename GENHD_FL_NO_PART_SCAN to GENHD_FL_NO_PART The GENHD_FL_NO_PART_SCAN controls more than just partitions canning, so rename it to GENHD_FL_NO_PART. Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Link: https://lore.kernel.org/r/20211122130625.1136848-7-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 94f39c4333b8..685794a2ebb0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -500,7 +500,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, * and don't bother scanning for partitions either. */ disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->flags |= GENHD_FL_NO_PART_SCAN; + disk->flags |= GENHD_FL_NO_PART; } else { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); -- cgit v1.2.1 From 140862805affca32b3c92a9a7643a7ee6d6ab278 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Nov 2021 06:38:04 -0700 Subject: block: remove the GENHD_FL_HIDDEN check in blkdev_get_no_open Hidden gendisks never hash the block device inode, so this can't happen. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-8-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index dd84961bed7e..e9ada04e71be 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -750,14 +750,6 @@ struct block_device *blkdev_get_no_open(dev_t dev) if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) bdev = NULL; iput(inode); - - if (!bdev) - return NULL; - if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN)) { - put_device(&bdev->bd_device); - return NULL; - } - return bdev; } -- cgit v1.2.1 From 3b5149ac50970669ee0ddb9629ec77ffd5c0622d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:21 +0100 Subject: block: remove GENHD_FL_SUPPRESS_PARTITION_INFO This flag is not set directly anywhere and only inherited from GENHD_FL_HIDDEN. Just check for GENHD_FL_HIDDEN instead. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-11-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 685794a2ebb0..50a843ee18f6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -496,10 +496,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, if (disk->flags & GENHD_FL_HIDDEN) { /* - * Don't let hidden disks show up in /proc/partitions, - * and don't bother scanning for partitions either. + * Don't bother scanning for partitions. */ - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART; } else { ret = bdi_register(disk->bdi, "%u:%u", @@ -725,8 +723,7 @@ void __init printk_all_partitions(void) * Don't show empty devices or things that have been * suppressed */ - if (get_capacity(disk) == 0 || - (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) + if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN)) continue; /* @@ -819,9 +816,7 @@ static int show_partition(struct seq_file *seqf, void *v) struct block_device *part; unsigned long idx; - if (!get_capacity(sgp)) - return 0; - if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) + if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; rcu_read_lock(); -- cgit v1.2.1 From 1ebe2e5f9d68e94c524aba876f27b945669a7879 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:22 +0100 Subject: block: remove GENHD_FL_EXT_DEVT All modern drivers can support extra partitions using the extended dev_t. In fact except for the ioctl method drivers never even see partitions in normal operation. So remove the GENHD_FL_EXT_DEVT and allow extra partitions for all block devices that do support partitions, and require those that do not support partitions to explicit disallow them using GENHD_FL_NO_PART. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-12-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 +++--- block/partitions/core.c | 9 ++++----- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 50a843ee18f6..628632537129 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -376,7 +376,7 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; - if (!disk_part_scan_enabled(disk)) + if (disk->flags & GENHD_FL_NO_PART) return -EINVAL; if (disk->open_partitions) return -EBUSY; @@ -438,7 +438,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, return ret; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; - disk->flags |= GENHD_FL_EXT_DEVT; } ret = disk_alloc_events(disk); @@ -872,7 +871,8 @@ static ssize_t disk_ext_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk_max_parts(disk)); + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_NO_PART) ? 
1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, diff --git a/block/partitions/core.c b/block/partitions/core.c index 520292fee933..c2a1635922b1 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -98,13 +98,12 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; - int nr; + int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; - nr = disk_max_parts(hd); state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); @@ -326,7 +325,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, lockdep_assert_held(&disk->open_mutex); - if (partno >= disk_max_parts(disk)) + if (partno >= DISK_MAX_PARTS) return ERR_PTR(-EINVAL); /* @@ -604,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk) struct parsed_partitions *state; int ret = -EAGAIN, p; - if (!disk_part_scan_enabled(disk)) + if (disk->flags & GENHD_FL_NO_PART) return 0; state = check_partition(disk); @@ -687,7 +686,7 @@ rescan: * userspace for this particular setup. */ if (invalidate) { - if (disk_part_scan_enabled(disk) || + if (!(disk->flags & GENHD_FL_NO_PART) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } -- cgit v1.2.1 From 9f18db572c97bc327b63528d195fdb252f47e9de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:23 +0100 Subject: block: don't set GENHD_FL_NO_PART for hidden gendisks Hidden gendisks can't be opened using blkdev_get_*, so we can't really reach any of the partition scanning paths or partitioning ioctls except for the initial partition scan from add_disk. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-13-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 628632537129..8e9cbf23c510 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -376,7 +376,7 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; - if (disk->flags & GENHD_FL_NO_PART) + if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) return -EINVAL; if (disk->open_partitions) return -EBUSY; @@ -493,12 +493,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, if (ret) goto out_put_slave_dir; - if (disk->flags & GENHD_FL_HIDDEN) { - /* - * Don't bother scanning for partitions. - */ - disk->flags |= GENHD_FL_NO_PART; - } else { + if (!(disk->flags & GENHD_FL_HIDDEN)) { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); if (ret) -- cgit v1.2.1 From 0c5bcc92d94a8f578ab2b06c1274e076cc8aecd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 17:04:41 +0100 Subject: blk-mq: simplify the plug handling in blk_mq_submit_bio blk_mq_submit_bio has two different plug cases, one that uses full plugging and a limited plugging one. The limited plugging case is only used for a corner case that does not matter in real life: - no ->commit_rqs (so not NVMe) - no shared tags (so not SCSI) - not rotational (so no old disk or floppy driver) - must have multiple queues (so no eMMC) Remove the limited merging case and all the related junk to simplify blk_mq_submit_bio and the functions called from it. 
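Condensed from the resulting code (a sketch distilled from the diff that follows, not an addition to it; the plug branch keeps its inline batching details here until the next patch in this series consolidates them), the tail of blk_mq_submit_bio() reduces to a three-way dispatch:

	/* Sketch of the post-patch tail of blk_mq_submit_bio(): */
	if (plug)
		blk_add_rq_to_plug(plug, rq);	/* always batch when plugged */
	else if ((rq->rq_flags & RQF_ELV) ||
		 (rq->mq_hctx->dispatch_busy &&
		  (q->nr_hw_queues == 1 || !is_sync)))
		blk_mq_sched_insert_request(rq, false, true, true);
	else
		blk_mq_try_issue_directly(rq->mq_hctx, rq);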
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123160443.1315598-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 9 +------- block/blk-mq.c | 68 +++++++++++-------------------------------------------- block/blk.h | 2 +- 3 files changed, 15 insertions(+), 64 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 893c1a60b701..ba761c3f482b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -1067,7 +1067,6 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @nr_segs: number of segments in @bio - * @same_queue_rq: output value, will be true if there's an existing request * from the passed in @q already in the plug list * * Determine whether @bio being queued on @q can be merged with the previous @@ -1084,7 +1083,7 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, bool *same_queue_rq) + unsigned int nr_segs) { struct blk_plug *plug; struct request *rq; @@ -1096,12 +1095,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, /* check the previously added entry for a quick merge attempt */ rq = rq_list_peek(&plug->mq_list); if (rq->q == q) { - /* - * Only blk-mq multiple hardware queues case checks the rq in - * the same queue, there should be only one such rq in a queue - */ - *same_queue_rq = true; - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK) return true; diff --git a/block/blk-mq.c b/block/blk-mq.c index 00df1eb031c0..4a13900b640e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2690,11 +2690,10 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) } static bool blk_mq_attempt_bio_merge(struct request_queue *q, - struct bio *bio, unsigned int nr_segs, - bool *same_queue_rq) + struct bio *bio, unsigned int nr_segs) { if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { - if (blk_attempt_plug_merge(q, bio, nr_segs, same_queue_rq)) + if (blk_attempt_plug_merge(q, bio, nr_segs)) return true; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return true; @@ -2705,8 +2704,7 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, struct bio *bio, - unsigned int nsegs, - bool *same_queue_rq) + unsigned int nsegs) { struct blk_mq_alloc_data data = { .q = q, @@ -2715,7 +2713,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (blk_mq_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) return NULL; rq_qos_throttle(q, bio); @@ -2751,8 +2749,7 @@ static inline bool blk_mq_can_use_cached_rq(struct request *rq, struct bio *bio) static inline struct request *blk_mq_get_request(struct request_queue *q, struct blk_plug *plug, struct bio *bio, - unsigned int nsegs, - bool *same_queue_rq) + unsigned int nsegs) { struct request *rq; bool checked = false; @@ -2762,8 +2759,7 @@ static inline struct request *blk_mq_get_request(struct request_queue *q, if (rq && rq->q == q) { if (unlikely(!submit_bio_checks(bio))) return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs, - same_queue_rq)) + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) return NULL; checked = true; if 
(!blk_mq_can_use_cached_rq(rq, bio)) @@ -2781,7 +2777,7 @@ fallback: return NULL; if (unlikely(!checked && !submit_bio_checks(bio))) goto out_put; - rq = blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq); + rq = blk_mq_get_new_requests(q, plug, bio, nsegs); if (rq) return rq; out_put: @@ -2808,7 +2804,6 @@ void blk_mq_submit_bio(struct bio *bio) const int is_sync = op_is_sync(bio->bi_opf); struct request *rq; struct blk_plug *plug; - bool same_queue_rq = false; unsigned int nr_segs = 1; blk_status_t ret; @@ -2823,7 +2818,7 @@ void blk_mq_submit_bio(struct bio *bio) return; plug = blk_mq_plug(q, bio); - rq = blk_mq_get_request(q, plug, bio, nr_segs, &same_queue_rq); + rq = blk_mq_get_request(q, plug, bio, nr_segs); if (unlikely(!rq)) return; @@ -2846,16 +2841,7 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (plug && (q->nr_hw_queues == 1 || - blk_mq_is_shared_tags(rq->mq_hctx->flags) || - q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { - /* - * Use plugging if we have a ->commit_rqs() hook as well, as - * we know the driver uses bd->last in a smart fashion. - * - * Use normal plugging if this disk is slow HDD, as sequential - * IO may benefit a lot from plug merging. - */ + if (plug) { unsigned int request_count = plug->rq_count; struct request *last = NULL; @@ -2873,40 +2859,12 @@ void blk_mq_submit_bio(struct bio *bio) } blk_add_rq_to_plug(plug, rq); - } else if (rq->rq_flags & RQF_ELV) { - /* Insert the request at the IO scheduler queue */ + } else if ((rq->rq_flags & RQF_ELV) || + (rq->mq_hctx->dispatch_busy && + (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_sched_insert_request(rq, false, true, true); - } else if (plug && !blk_queue_nomerges(q)) { - struct request *next_rq = NULL; - - /* - * We do limited plugging. If the bio can be merged, do that. - * Otherwise the existing request in the plug list will be - * issued. So the plug list will have one request at most - * The plug list might get flushed before this. If that happens, - * the plug list is empty, and same_queue_rq is invalid. - */ - if (same_queue_rq) { - next_rq = rq_list_pop(&plug->mq_list); - plug->rq_count--; - } - blk_add_rq_to_plug(plug, rq); - trace_block_plug(q); - - if (next_rq) { - trace_block_unplug(q, 1, true); - blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq); - } - } else if ((q->nr_hw_queues > 1 && is_sync) || - !rq->mq_hctx->dispatch_busy) { - /* - * There is no scheduler and we can try to send directly - * to the hardware. - */ - blk_mq_try_issue_directly(rq->mq_hctx, rq); } else { - /* Default case. */ - blk_mq_sched_insert_request(rq, false, true, true); + blk_mq_try_issue_directly(rq->mq_hctx, rq); } } diff --git a/block/blk.h b/block/blk.h index 423cba8ea0a6..5d4d08df772b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -253,7 +253,7 @@ void blk_add_timer(struct request *req); const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, bool *same_queue_rq); + unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); -- cgit v1.2.1 From 1e9c23034d7b216b02f1491505779dfe9a1a6e23 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 17:04:42 +0100 Subject: blk-mq: move more plug handling from blk_mq_submit_bio into blk_add_rq_to_plug Keep all the functionality for adding a request to a plug in a single place. 
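As a rough userspace analogue of the consolidated helper (every name below is invented for illustration, and ordering is simplified; the kernel also flushes on a byte threshold via BLK_PLUG_FLUSH_SIZE), the plug pattern is "queue into a per-task list, flush the whole batch once a threshold is hit":

	#include <stdio.h>

	#define TOY_PLUG_MAX 4

	struct toy_rq { int id; struct toy_rq *next; };
	struct toy_plug { struct toy_rq *head; int count; };

	static void toy_flush(struct toy_plug *p)
	{
		for (struct toy_rq *rq = p->head; rq; rq = rq->next)
			printf("dispatching rq %d\n", rq->id);
		p->head = NULL;
		p->count = 0;
	}

	static void toy_add(struct toy_plug *p, struct toy_rq *rq)
	{
		if (p->count >= TOY_PLUG_MAX)
			toy_flush(p);	/* flush before exceeding the batch */
		rq->next = p->head;	/* the kernel uses rq_list_add() here */
		p->head = rq;
		p->count++;
	}

	int main(void)
	{
		struct toy_plug plug = { NULL, 0 };
		struct toy_rq rqs[6] = { {1}, {2}, {3}, {4}, {5}, {6} };

		for (int i = 0; i < 6; i++)
			toy_add(&plug, &rqs[i]);
		toy_flush(&plug);	/* final unplug */
		return 0;
	}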
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123160443.1315598-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 64 +++++++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4a13900b640e..3af88ffc9e2c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2662,21 +2662,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, hctx->queue->mq_ops->commit_rqs(hctx); } -static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) -{ - if (!plug->multiple_queues) { - struct request *nxt = rq_list_peek(&plug->mq_list); - - if (nxt && nxt->q != rq->q) - plug->multiple_queues = true; - } - if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) - plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); - plug->rq_count++; -} - /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. This is important for md arrays to benefit from merging @@ -2689,6 +2674,28 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) return BLK_MAX_REQUEST_COUNT; } +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + struct request *last = rq_list_peek(&plug->mq_list); + + if (!plug->rq_count) { + trace_block_plug(rq->q); + } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || + (!blk_queue_nomerges(rq->q) && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { + blk_mq_flush_plug_list(plug, false); + trace_block_plug(rq->q); + } + + if (!plug->multiple_queues && last && last->q != rq->q) + plug->multiple_queues = true; + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; +} + static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { @@ -2841,31 +2848,14 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (plug) { - unsigned int request_count = plug->rq_count; - struct request *last = NULL; - - if (!request_count) { - trace_block_plug(q); - } else if (!blk_queue_nomerges(q)) { - last = rq_list_peek(&plug->mq_list); - if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE) - last = NULL; - } - - if (request_count >= blk_plug_max_rq_count(plug) || last) { - blk_mq_flush_plug_list(plug, false); - trace_block_plug(q); - } - + if (plug) blk_add_rq_to_plug(plug, rq); - } else if ((rq->rq_flags & RQF_ELV) || - (rq->mq_hctx->dispatch_busy && - (q->nr_hw_queues == 1 || !is_sync))) { + else if ((rq->rq_flags & RQF_ELV) || + (rq->mq_hctx->dispatch_busy && + (q->nr_hw_queues == 1 || !is_sync))) blk_mq_sched_insert_request(rq, false, true, true); - } else { + else blk_mq_try_issue_directly(rq->mq_hctx, rq); - } } /** -- cgit v1.2.1 From 25c4b5e058578066db56d757ad3a7adeaff35856 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Nov 2021 13:37:38 -0700 Subject: blk-ioprio: don't set bio priority if not needed We don't need to write to the bio if: 1) No ioprio value has ever been assigned to the blkcg 2) We wouldn't anyway, depending on bio and blkcg IO priority Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-ioprio.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 332a07761bf8..2e7f10e1c03f 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -62,6 +62,7 @@ struct ioprio_blkg { 
struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; + bool prio_set; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) @@ -112,7 +113,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, if (ret < 0) return ret; blkcg->prio_policy = ret; - + blkcg->prio_set = true; return nbytes; } @@ -190,6 +191,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); + u16 prio; + + if (!blkcg->prio_set) + return; /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers @@ -199,8 +204,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, * bio I/O priority is not modified. If the bio I/O priority equals * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. */ - bio->bi_ioprio = max_t(u16, bio->bi_ioprio, - IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); + prio = max_t(u16, bio->bi_ioprio, + IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); + if (prio > bio->bi_ioprio) + bio->bi_ioprio = prio; } static void blkcg_ioprio_exit(struct rq_qos *rqos) -- cgit v1.2.1 From 48b5c1fbcd8c5bc6b91a56399a5257b801391dd8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Nov 2021 14:03:26 -0700 Subject: block: only allocate poll_stats if there's a user of them This is essentially never used, yet it's about 1/3rd of the total queue size. Allocate it when needed, and don't embed it in the queue. Kill the queue flag for this while at it, since we can just check the assigned pointer now. Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 10 ++++------ block/blk-stat.c | 18 ++++++++++++++++++ block/blk-stat.h | 1 + block/blk-sysfs.c | 3 ++- 5 files changed, 25 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4f2cf8399f3d..f4022b198580 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -122,7 +122,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), - QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(PCI_P2PDMA), diff --git a/block/blk-mq.c b/block/blk-mq.c index 3af88ffc9e2c..7cd408408a37 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4581,11 +4581,10 @@ EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); /* Enable polling stats and return whether they were already enabled. */ static bool blk_poll_stats_enable(struct request_queue *q) { - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) + if (q->poll_stat) return true; - blk_stat_add_callback(q, q->poll_cb); - return false; + + return blk_stats_alloc_enable(q); } static void blk_mq_poll_stats_start(struct request_queue *q) @@ -4594,8 +4593,7 @@ static void blk_mq_poll_stats_start(struct request_queue *q) * We don't arm the callback if polling stats are not enabled or the * callback is already active. 
*/ - if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_stat_is_active(q->poll_cb)) + if (!q->poll_stat || blk_stat_is_active(q->poll_cb)) return; blk_stat_activate_msecs(q->poll_cb, 100); diff --git a/block/blk-stat.c b/block/blk-stat.c index ae3dd1fb8e61..efb2a80db906 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -219,3 +219,21 @@ void blk_free_queue_stats(struct blk_queue_stats *stats) kfree(stats); } + +bool blk_stats_alloc_enable(struct request_queue *q) +{ + struct blk_rq_stat *poll_stat; + + poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat), + GFP_ATOMIC); + if (!poll_stat) + return false; + + if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) { + kfree(poll_stat); + return true; + } + + blk_stat_add_callback(q, q->poll_cb); + return false; +} diff --git a/block/blk-stat.h b/block/blk-stat.h index 17b47a86eefb..58f029af49e5 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -64,6 +64,7 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); +bool blk_stats_alloc_enable(struct request_queue *q); void blk_stat_add(struct request *rq, u64 now); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cd75b0f73dc6..c079be1c58a3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -785,11 +785,12 @@ static void blk_release_queue(struct kobject *kobj) might_sleep(); - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + if (q->poll_stat) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); blk_free_queue_stats(q->stats); + kfree(q->poll_stat); blk_exit_queue(q); -- cgit v1.2.1 From 5a9d041ba2f6da468c891ca0fe263758e2c12091 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Nov 2021 11:18:32 -0700 Subject: block: move io_context creation into where it's needed The only user of the io_context for IO is BFQ, yet we put the checking and logic of it into the normal IO path. Put the creation into blk_mq_sched_assign_ioc(), and have BFQ use that helper. Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 ++ block/blk-core.c | 9 --------- block/blk-mq-sched.c | 5 +++++ block/blk-mq.c | 3 --- 4 files changed, 7 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fec18118dc30..1ce1a99a7160 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6573,6 +6573,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, */ static void bfq_prepare_request(struct request *rq) { + blk_mq_sched_assign_ioc(rq); + /* * Regardless of whether we have an icq attached, we have to * clear the scheduler pointers, as they might point to diff --git a/block/blk-core.c b/block/blk-core.c index 35a04d8c180a..2053d1b0e90e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -750,15 +750,6 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio) break; } - /* - * Various block parts want %current->io_context, so allocate it up - * front rather than dealing with lots of pain to allocate it only - * where needed. This may fail and the block layer knows how to live - * with it. 
- */ - if (unlikely(!current->io_context)) - create_task_io_context(current, GFP_ATOMIC, q->node); - if (blk_throtl_bio(bio)) return false; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index ba21449439cc..b942b38000e5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -24,6 +24,10 @@ void blk_mq_sched_assign_ioc(struct request *rq) struct io_context *ioc; struct io_cq *icq; + /* create task io_context, if we don't have one already */ + if (unlikely(!current->io_context)) + create_task_io_context(current, GFP_ATOMIC, q->node); + /* * May not have an IO context if it's a passthrough request */ @@ -43,6 +47,7 @@ void blk_mq_sched_assign_ioc(struct request *rq) get_io_context(icq->ioc); rq->elv.icq = icq; } +EXPORT_SYMBOL_GPL(blk_mq_sched_assign_ioc); /* * Mark a hardware queue as needing a restart. For shared queues, maintain diff --git a/block/blk-mq.c b/block/blk-mq.c index 7cd408408a37..d6e7634e5e1f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -406,9 +406,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, if (!op_is_flush(data->cmd_flags) && e->type->ops.prepare_request) { - if (e->type->icq_cache) - blk_mq_sched_assign_ioc(rq); - e->type->ops.prepare_request(rq); rq->rq_flags |= RQF_ELVPRIV; } -- cgit v1.2.1 From 35c90e6ec9608d8225b82ce609489b531cfd0a40 Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Tue, 23 Nov 2021 14:33:40 +0800 Subject: blk_mq: remove repeated includes Remove a repeated "#include". Signed-off-by: Guo Zhengkui Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20211123063340.25882-1-guozhengkui@vivo.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d6e7634e5e1f..4dff401bc642 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,7 +28,6 @@ #include #include #include -#include #include -- cgit v1.2.1 From 0281ed3cf44d2a7061ec3c1680e1f86e55ad57b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 19:53:05 +0100 Subject: block: move blk_get_flush_queue to blk-flush.c blk_get_flush_queue is only used in blk-flush.c, so move it there. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123185312.1432157-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 6 ++++++ block/blk.h | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 1fce6d16e6d3..86ee50455e41 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,6 +95,12 @@ enum { static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, unsigned int flags); +static inline struct blk_flush_queue * +blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +{ + return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; +} + static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) { unsigned int policy = 0; diff --git a/block/blk.h b/block/blk.h index 5d4d08df772b..1346085d89ce 100644 --- a/block/blk.h +++ b/block/blk.h @@ -35,12 +35,6 @@ extern struct kmem_cache *blk_requestq_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; -static inline struct blk_flush_queue * -blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) -{ - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; -} - static inline void __blk_get_queue(struct request_queue *q) { kobject_get(&q->kobj); } -- cgit v1.2.1 From f46b81c54b241bf1ec01c8936a37201c99f94fc7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 19:53:06 +0100 Subject: block: remove elevator_exit Open code elevator_exit in its only caller, and rename __elevator_exit to elevator_exit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123185312.1432157-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 +- block/blk.h | 11 +---------- block/elevator.c | 4 +++- 3 files changed, 5 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c079be1c58a3..1677eb4a680c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -747,7 +747,7 @@ static void blk_exit_queue(struct request_queue *q) */ if (q->elevator) { ioc_clear_queue(q); - __elevator_exit(q, q->elevator); + elevator_exit(q, q->elevator); } /* diff --git a/block/blk.h b/block/blk.h index 1346085d89ce..2266cb1f7df5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -266,19 +266,10 @@ void blk_insert_flush(struct request *rq); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); -void __elevator_exit(struct request_queue *, struct elevator_queue *); +void elevator_exit(struct request_queue *, struct elevator_queue *); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); -static inline void elevator_exit(struct request_queue *q, - struct elevator_queue *e) -{ - lockdep_assert_held(&q->sysfs_lock); - - blk_mq_sched_free_rqs(q); - __elevator_exit(q, e); -} - ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, diff --git a/block/elevator.c b/block/elevator.c index 19a78d5516ba..3536cdd5fa12 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -188,7 +188,7 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void __elevator_exit(struct request_queue *q, struct elevator_queue *e) +void elevator_exit(struct request_queue *q, struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); @@ -595,6 +595,7 @@ int elevator_switch_mq(struct request_queue *q,
elv_unregister_queue(q); ioc_clear_queue(q); + blk_mq_sched_free_rqs(q); elevator_exit(q, q->elevator); } @@ -605,6 +606,7 @@ int elevator_switch_mq(struct request_queue *q, if (new_e) { ret = elv_register_queue(q, true); if (ret) { + blk_mq_sched_free_rqs(q); elevator_exit(q, q->elevator); goto out; } } -- cgit v1.2.1 From 0c6cb3a293fa9adc7be24d67ff7d201aefa50362 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 19:53:07 +0100 Subject: block: remove the e argument to elevator_exit All callers pass q->elevator. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123185312.1432157-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 +- block/blk.h | 2 +- block/elevator.c | 8 +++++--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1677eb4a680c..772347adc56b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -747,7 +747,7 @@ static void blk_exit_queue(struct request_queue *q) */ if (q->elevator) { ioc_clear_queue(q); - elevator_exit(q, q->elevator); + elevator_exit(q); } /* diff --git a/block/blk.h b/block/blk.h index 2266cb1f7df5..4df2ce8d4999 100644 --- a/block/blk.h +++ b/block/blk.h @@ -266,7 +266,7 @@ void blk_insert_flush(struct request *rq); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); -void elevator_exit(struct request_queue *, struct elevator_queue *); +void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index 3536cdd5fa12..ec98aed39c4f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -188,8 +188,10 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void elevator_exit(struct request_queue *q, struct elevator_queue *e) +void elevator_exit(struct request_queue *q) { + struct elevator_queue *e = q->elevator; + mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); @@ -596,7 +598,7 @@ int elevator_switch_mq(struct request_queue *q, ioc_clear_queue(q); blk_mq_sched_free_rqs(q); - elevator_exit(q, q->elevator); + elevator_exit(q); } ret = blk_mq_init_sched(q, new_e); @@ -607,7 +609,7 @@ int elevator_switch_mq(struct request_queue *q, ret = elv_register_queue(q, true); if (ret) { blk_mq_sched_free_rqs(q); - elevator_exit(q, q->elevator); + elevator_exit(q); goto out; } } -- cgit v1.2.1 From 2aa7745bf6db55b7800c4433e102b07c649fd001 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 19:53:08 +0100 Subject: block: don't include blk-mq-sched.h in blk.h Not needed, shift it into the source files that need it instead.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123185312.1432157-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-ioc.c | 1 + block/blk-merge.c | 1 + block/blk-mq-debugfs.c | 1 + block/blk-sysfs.c | 1 + block/blk.h | 1 - block/genhd.c | 1 + 7 files changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2053d1b0e90e..1acd94ba10c7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -46,6 +46,7 @@ #include #include "blk.h" +#include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-throttle.h" diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 57299f860d41..70c99e85aee5 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -11,6 +11,7 @@ #include #include "blk.h" +#include "blk-mq-sched.h" /* * For io context allocations diff --git a/block/blk-merge.c b/block/blk-merge.c index ba761c3f482b..456fb88c49b1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,6 +12,7 @@ #include #include "blk.h" +#include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-throttle.h" diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f4022b198580..7f27dca3a45e 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -11,6 +11,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-rq-qos.h" diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 772347adc56b..4622da4bb992 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -16,6 +16,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" #include "blk-wbt.h" #include "blk-throttle.h" diff --git a/block/blk.h b/block/blk.h index 4df2ce8d4999..db6efa351d3e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -10,7 +10,6 @@ #include #include "blk-crypto-internal.h" #include "blk-mq.h" -#include "blk-mq-sched.h" struct elevator_type; diff --git a/block/genhd.c b/block/genhd.c index 8e9cbf23c510..01606db8c625 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,6 +27,7 @@ #include #include "blk.h" +#include "blk-mq-sched.h" #include "blk-rq-qos.h" static struct kobject *block_depr; -- cgit v1.2.1 From e4a19f7289f3fa9b2a4e65d0f4b3a7816e8e445b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 19:53:09 +0100 Subject: block: don't include blk-mq.h in blk.h Not needed, shift a blk-stat.h include into the source file that needs it instead. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123185312.1432157-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-throttle.c | 1 + block/blk.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 39bb6e68a9a2..7c462c006b26 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -13,6 +13,7 @@ #include #include "blk.h" #include "blk-cgroup-rwstat.h" +#include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 round */ diff --git a/block/blk.h b/block/blk.h index db6efa351d3e..0f9472bea616 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,7 +9,6 @@ #include /* for max_pfn/max_low_pfn */ #include #include "blk-crypto-internal.h" -#include "blk-mq.h" struct elevator_type; -- cgit v1.2.1 From a2ff7781cfe6a2104c64deb8311532ccd4d4c57d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 19:53:10 +0100 Subject: block: don't include <linux/idr.h> in blk.h Not needed.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123185312.1432157-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 0f9472bea616..a6e9ce376780 100644 --- a/block/blk.h +++ b/block/blk.h @@ -3,7 +3,6 @@ #define BLK_INTERNAL_H #include -#include <linux/idr.h> #include #include /* for max_pfn/max_low_pfn */ -- cgit v1.2.1 From ca5b304cabef55c670a22bbe53a883fe5c3b2620 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 19:53:11 +0100 Subject: block: don't include <linux/blk-crypto.h> in blk.h Not needed. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123185312.1432157-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index a6e9ce376780..4089aeffca4b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,7 +2,6 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H -#include <linux/blk-crypto.h> #include #include /* for max_pfn/max_low_pfn */ -- cgit v1.2.1 From 82d981d4230bc0a19540fc540d4bdf49a3769f05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Nov 2021 19:53:12 +0100 Subject: block: don't include <linux/part_stat.h> in blk.h Not needed, shift it into the source files that need it instead. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211123185312.1432157-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 1 + block/blk-core.c | 1 + block/blk-flush.c | 1 + block/blk-merge.c | 1 + block/blk-mq.c | 1 + block/blk.h | 1 - block/genhd.c | 1 + 7 files changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 663aabfeba18..650f7e27989f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -30,6 +30,7 @@ #include #include #include +#include <linux/part_stat.h> #include "blk.h" #include "blk-ioprio.h" #include "blk-throttle.h" diff --git a/block/blk-core.c b/block/blk-core.c index 1acd94ba10c7..b0660c9df852 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include #include #include +#include <linux/part_stat.h> #include #include diff --git a/block/blk-flush.c b/block/blk-flush.c index 86ee50455e41..902e80e48e4a 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,6 +69,7 @@ #include #include #include +#include <linux/part_stat.h> #include "blk.h" #include "blk-mq.h" diff --git a/block/blk-merge.c b/block/blk-merge.c index 456fb88c49b1..e07f5a1ae86e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -8,6 +8,7 @@ #include #include #include +#include <linux/part_stat.h> #include diff --git a/block/blk-mq.c b/block/blk-mq.c index 4dff401bc642..4bdc3bc54ea3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,6 +28,7 @@ #include #include #include +#include <linux/part_stat.h> #include diff --git a/block/blk.h b/block/blk.h index 4089aeffca4b..a57c84654d0a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,7 +2,6 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H -#include <linux/part_stat.h> #include /* for max_pfn/max_low_pfn */ #include diff --git a/block/genhd.c b/block/genhd.c index 01606db8c625..5179a4f00fba 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,6 +25,7 @@ #include #include #include +#include <linux/part_stat.h> #include "blk.h" #include "blk-mq-sched.h" -- cgit v1.2.1 From 5b13bc8a3fd519d86e5b1a0b1d1b996cace62f3f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Nov 2021 07:28:56 +0100 Subject: blk-mq: cleanup request allocation Refactor the request allocation so that blk_mq_get_cached_request tries to find a cached request first, and the entirely
separate and now self contained blk_mq_get_new_requests allocates one or more requests if that is not possible. There is a small change in behavior as submit_bio_checks is called twice now if a cached request is present but can't be used, but that is a small price to pay for unwinding this code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211124062856.1444266-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 90 +++++++++++++++++++++++++--------------------------------- 1 file changed, 38 insertions(+), 52 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4bdc3bc54ea3..a89a624dd1df 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2717,8 +2717,12 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + if (unlikely(bio_queue_enter(bio))) return NULL; + if (unlikely(!submit_bio_checks(bio))) + goto queue_exit; + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + goto queue_exit; rq_qos_throttle(q, bio); @@ -2729,64 +2733,44 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, } rq = __blk_mq_alloc_requests(&data); - if (rq) - return rq; + if (!rq) + goto fail; + return rq; +fail: rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - +queue_exit: + blk_queue_exit(q); return NULL; } -static inline bool blk_mq_can_use_cached_rq(struct request *rq, struct bio *bio) -{ - if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) - return false; - - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) - return false; - - return true; -} - -static inline struct request *blk_mq_get_request(struct request_queue *q, - struct blk_plug *plug, - struct bio *bio, - unsigned int nsegs) +static inline struct request *blk_mq_get_cached_request(struct request_queue *q, + struct blk_plug *plug, struct bio *bio, unsigned int nsegs) { struct request *rq; - bool checked = false; - if (plug) { - rq = rq_list_peek(&plug->cached_rq); - if (rq && rq->q == q) { - if (unlikely(!submit_bio_checks(bio))) - return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - return NULL; - checked = true; - if (!blk_mq_can_use_cached_rq(rq, bio)) - goto fallback; - rq->cmd_flags = bio->bi_opf; - plug->cached_rq = rq_list_next(rq); - INIT_LIST_HEAD(&rq->queuelist); - rq_qos_throttle(q, bio); - return rq; - } - } + if (!plug) + return NULL; + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; -fallback: - if (unlikely(bio_queue_enter(bio))) + if (unlikely(!submit_bio_checks(bio))) return NULL; - if (unlikely(!checked && !submit_bio_checks(bio))) - goto out_put; - rq = blk_mq_get_new_requests(q, plug, bio, nsegs); - if (rq) - return rq; -out_put: - blk_queue_exit(q); - return NULL; + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + return NULL; + if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + return NULL; + + rq->cmd_flags = bio->bi_opf; + plug->cached_rq = rq_list_next(rq); + INIT_LIST_HEAD(&rq->queuelist); + rq_qos_throttle(q, bio); + return rq; } /** @@ -2805,9 +2789,9 @@ out_put: void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct blk_plug *plug = blk_mq_plug(q, bio); const int is_sync = op_is_sync(bio->bi_opf); struct request *rq; - struct blk_plug *plug; unsigned int nr_segs = 1; blk_status_t ret; @@ -2821,10 +2805,12 @@ void 
blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - plug = blk_mq_plug(q, bio); - rq = blk_mq_get_request(q, plug, bio, nr_segs); - if (unlikely(!rq)) - return; + rq = blk_mq_get_cached_request(q, plug, bio, nr_segs); + if (!rq) { + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) + return; + } trace_block_getrq(bio); -- cgit v1.2.1 From 72cd9df2ef788d88c138d51223a01ca6281f232d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 23 Nov 2021 17:37:33 -0800 Subject: blk-crypto: remove blk_crypto_unregister() This function is trivial and is only used in one place. Having this function is misleading because it implies that blk_crypto_register() needs to be paired with blk_crypto_unregister(), which is not the case. Just set disk->queue->crypto_profile to NULL directly. Signed-off-by: Eric Biggers Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211124013733.347612-1-ebiggers@kernel.org Signed-off-by: Jens Axboe --- block/blk-crypto-profile.c | 5 ----- block/blk-integrity.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 605ba0626a5c..96c511967386 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -463,11 +463,6 @@ bool blk_crypto_register(struct blk_crypto_profile *profile, } EXPORT_SYMBOL_GPL(blk_crypto_register); -void blk_crypto_unregister(struct request_queue *q) -{ - q->crypto_profile = NULL; -} - /** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d670d54e5f7a..69eed260a823 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -411,7 +411,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (disk->queue->crypto_profile) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - blk_crypto_unregister(disk->queue); + disk->queue->crypto_profile = NULL; } #endif } -- cgit v1.2.1 From 790cf9c84837b232eb413b8b6b5d57817176cb23 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Nov 2021 14:36:34 +0100 Subject: block: Provide blk_mq_sched_get_icq() Currently we lookup ICQ only after the request is allocated. However BFQ will want to decide how many scheduler tags it allows a given bfq queue (effectively a process) to consume based on cgroup weight. So provide a function blk_mq_sched_get_icq() so that BFQ can lookup ICQ earlier. 
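The consumer added later in this series (bfq_limit_depth()) illustrates the intended calling pattern: look the ICQ up before any scheduler tag is taken, and treat a NULL return as "no per-process state yet". A fragment, as it appears further down in this series:

	struct bfq_io_cq *bic = icq_to_bic(blk_mq_sched_get_icq(data->q));
	struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;

	/* bfqq (if any) can now be consulted before a tag is allocated */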
Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20211125133645.27483-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 26 +++++++++++++++----------- block/blk-mq-sched.h | 1 + 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index b942b38000e5..98c6a97729f2 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -18,9 +18,8 @@ #include "blk-mq-tag.h" #include "blk-wbt.h" -void blk_mq_sched_assign_ioc(struct request *rq) +struct io_cq *blk_mq_sched_get_icq(struct request_queue *q) { - struct request_queue *q = rq->q; struct io_context *ioc; struct io_cq *icq; @@ -28,22 +27,27 @@ void blk_mq_sched_assign_ioc(struct request *rq) if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); - /* - * May not have an IO context if it's a passthrough request - */ + /* May not have an IO context if context creation failed */ ioc = current->io_context; if (!ioc) - return; + return NULL; spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(ioc, q); spin_unlock_irq(&q->queue_lock); + if (icq) + return icq; + return ioc_create_icq(ioc, q, GFP_ATOMIC); +} +EXPORT_SYMBOL(blk_mq_sched_get_icq); - if (!icq) { - icq = ioc_create_icq(ioc, q, GFP_ATOMIC); - if (!icq) - return; - } +void blk_mq_sched_assign_ioc(struct request *rq) +{ + struct io_cq *icq; + + icq = blk_mq_sched_get_icq(rq->q); + if (!icq) + return; get_io_context(icq->ioc); rq->elv.icq = icq; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 25d1034952b6..add651ec06da 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -8,6 +8,7 @@ #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) +struct io_cq *blk_mq_sched_get_icq(struct request_queue *q); void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, -- cgit v1.2.1 From 98f044999ba141a6b6b79cb3a996a73f05a46820 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Nov 2021 14:36:35 +0100 Subject: bfq: Track number of allocated requests in bfq_entity When we want to limit number of requests used by each bfqq and also cgroup, we need to track also number of requests used by each cgroup. So track number of allocated requests for each bfq_entity. 
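As a userspace toy of the same bookkeeping (illustration only; the kernel walks the bfq_entity parent chain with for_each_entity(), and every name below is invented):

	#include <stdio.h>

	struct entity {
		int allocated;		/* requests charged to this node */
		struct entity *parent;
	};

	static void request_allocated(struct entity *e)
	{
		for (; e; e = e->parent)	/* queue, cgroup, ancestors */
			e->allocated++;
	}

	static void request_freed(struct entity *e)
	{
		for (; e; e = e->parent)
			e->allocated--;
	}

	int main(void)
	{
		struct entity root = { 0, NULL };
		struct entity cgroup = { 0, &root };
		struct entity queue = { 0, &cgroup };

		request_allocated(&queue);
		request_allocated(&queue);
		request_freed(&queue);
		/* prints: queue=1 cgroup=1 root=1 */
		printf("queue=%d cgroup=%d root=%d\n",
		       queue.allocated, cgroup.allocated, root.allocated);
		return 0;
	}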
Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20211125133645.27483-2-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 28 ++++++++++++++++++++++------ block/bfq-iosched.h | 5 +++-- 2 files changed, 25 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1ce1a99a7160..1d564499614e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1113,7 +1113,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { - return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - + return bfqq->ref - bfqq->entity.allocated - + bfqq->entity.on_st_or_in_serv - (bfqq->weight_counter != NULL) - bfqq->stable_ref; } @@ -5878,6 +5879,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, } } +static void bfqq_request_allocated(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated++; +} + +static void bfqq_request_freed(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated--; +} + /* returns true if it causes the idle timer to be disabled */ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { @@ -5891,8 +5908,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. */ - new_bfqq->allocated++; - bfqq->allocated--; + bfqq_request_allocated(new_bfqq); + bfqq_request_freed(bfqq); new_bfqq->ref++; /* * If the bic associated with the process @@ -6251,8 +6268,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) { - bfqq->allocated--; - + bfqq_request_freed(bfqq); bfq_put_queue(bfqq); } @@ -6674,7 +6690,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) } } - bfqq->allocated++; + bfqq_request_allocated(bfqq); bfqq->ref++; bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index a73488eec8a4..3787cfb0febb 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -170,6 +170,9 @@ struct bfq_entity { /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ int budget; + /* Number of requests allocated in the subtree of this entity */ + int allocated; + /* device weight, if non-zero, it overrides the default weight of * bfq_group_data */ int dev_weight; @@ -266,8 +269,6 @@ struct bfq_queue { struct request *next_rq; /* number of sync and async requests queued */ int queued[2]; - /* number of requests currently allocated */ - int allocated; /* number of pending metadata requests */ int meta_pending; /* fifo list of requests in sort_list */ -- cgit v1.2.1 From 44dfa279f117646163db0c8760addb45dd6a0e8c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Nov 2021 14:36:36 +0100 Subject: bfq: Store full bitmap depth in bfq_data Store bitmap depth shift inside bfq_data so that we can use it in bfq_limit_depth() for proportioning when limiting number of available request tags for a cgroup. 
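To make the proportions concrete, here is the same arithmetic worked through for a 64-tag bitmap (sb.shift == 6) as a standalone illustration, not kernel code ("wr" means some queue is being weight-raised):

	#include <stdio.h>

	#define MAX(a, b) ((a) > (b) ? (a) : (b))

	int main(void)
	{
		unsigned int shift = 6;
		unsigned int depth = 1U << shift;	/* 64 tags */

		printf("async, no wr:      %u\n", MAX(depth >> 1, 1U));       /* 32 */
		printf("sync write, no wr: %u\n", MAX((depth * 3) >> 2, 1U)); /* 48 */
		printf("async, wr:         %u\n", MAX((depth * 3) >> 4, 1U)); /* 12 */
		printf("sync write, wr:    %u\n", MAX((depth * 6) >> 4, 1U)); /* 24 */
		return 0;
	}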
Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20211125133645.27483-3-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 10 ++++++---- block/bfq-iosched.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1d564499614e..cf9247301e3c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6857,7 +6857,9 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { unsigned int i, j, min_shallow = UINT_MAX; + unsigned int depth = 1U << bt->sb.shift; + bfqd->full_depth_shift = bt->sb.shift; /* * In-word depths if no bfq_queue is being weight-raised: * leaving 25% of tags only for sync reads. @@ -6869,13 +6871,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * limit 'something'. */ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); + bfqd->word_depths[0][0] = max(depth >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); + bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -6885,9 +6887,9 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * shortage. */ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); + bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); + bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); for (i = 0; i < 2; i++) for (j = 0; j < 2; j++) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 3787cfb0febb..820cb8c2d1fe 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -769,6 +769,7 @@ struct bfq_data { * function) */ unsigned int word_depths[2][2]; + unsigned int full_depth_shift; }; enum bfqq_state_flags { -- cgit v1.2.1 From 76f1df88bbc2f984eb0418cc90de0a8384e63604 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Nov 2021 14:36:37 +0100 Subject: bfq: Limit number of requests consumed by each cgroup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cgroup IO scheduling is used with BFQ it does not really provide service differentiation if the cgroup drives a big IO depth. That for example happens with writeback which asynchronously submits lots of IO but it can happen with AIO as well. The problem is that if we have two cgroups that submit IO with different weights, the cgroup with higher weight properly gets more IO time and is able to dispatch more IO. However this causes lower weight cgroup to accumulate more requests inside BFQ and eventually lower weight cgroup consumes most of IO scheduler tags. At that point higher weight cgroup stops getting better service as it is mostly blocked waiting for a scheduler tag while its queues inside BFQ are empty and thus lower weight cgroup gets served. Check how many requests submitting cgroup has allocated in bfq_limit_depth() and if it consumes more requests than what would correspond to its weight limit available depth to 1 so that the cgroup cannot consume many more requests. With this limitation the higher weight cgroup gets proper service even with writeback. 
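A worked example of the proportional trimming (standalone illustration; the macro below is a simplified stand-in for the kernel's DIV_ROUND_CLOSEST): with 64 scheduler tags and two sibling cgroups of weights 100 and 900 (wsum 1000), the computed shares come out to 6 and 58 tags, and a queue already at or above its share has its depth cut to 1:

	#include <stdio.h>

	/* simplified for unsigned operands only */
	#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

	int main(void)
	{
		unsigned int limit = 64, wsum = 1000;
		unsigned int weights[] = { 100, 900 };

		for (int i = 0; i < 2; i++)
			printf("weight %4u -> limit %2u\n", weights[i],
			       DIV_ROUND_CLOSEST(limit * weights[i], wsum));
		/* prints 6 and 58 */
		return 0;
	}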
Reviewed-by: Michal Koutný Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20211125133645.27483-4-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 137 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 118 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cf9247301e3c..95a19d1fbedf 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -565,26 +565,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +#define BFQ_LIMIT_INLINE_DEPTH 16 + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + struct bfq_data *bfqd = bfqq->bfqd; + struct bfq_entity *entity = &bfqq->entity; + struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; + struct bfq_entity **entities = inline_entities; + int depth, level; + int class_idx = bfqq->ioprio_class - 1; + struct bfq_sched_data *sched_data; + unsigned long wsum; + bool ret = false; + + if (!entity->on_st_or_in_serv) + return false; + + /* +1 for bfqq entity, root cgroup not included */ + depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; + if (depth > BFQ_LIMIT_INLINE_DEPTH) { + entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO); + if (!entities) + return false; + } + + spin_lock_irq(&bfqd->lock); + sched_data = entity->sched_data; + /* Gather our ancestors as we need to traverse them in reverse order */ + level = 0; + for_each_entity(entity) { + /* + * If at some level entity is not even active, allow request + * queueing so that BFQ knows there's work to do and activate + * entities. + */ + if (!entity->on_st_or_in_serv) + goto out; + /* Uh, more parents than cgroup subsystem thinks? */ + if (WARN_ON_ONCE(level >= depth)) + break; + entities[level++] = entity; + } + WARN_ON_ONCE(level != depth); + for (level--; level >= 0; level--) { + entity = entities[level]; + if (level > 0) { + wsum = bfq_entity_service_tree(entity)->wsum; + } else { + int i; + /* + * For bfqq itself we take into account service trees + * of all higher priority classes and multiply their + * weights so that low prio queue from higher class + * gets more requests than high prio queue from lower + * class. + */ + wsum = 0; + for (i = 0; i <= class_idx; i++) { + wsum = wsum * IOPRIO_BE_NR + + sched_data->service_tree[i].wsum; + } + } + limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum); + if (entity->allocated >= limit) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "too many requests: allocated %d limit %d level %d", + entity->allocated, limit, level); + ret = true; + break; + } + } +out: + spin_unlock_irq(&bfqd->lock); + if (entities != inline_entities) + kfree(entities); + return ret; +} +#else +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + return false; +} +#endif + /* * Async I/O can easily starve sync I/O (both sync reads and sync * writes), by consuming all tags. Similarly, storms of sync writes, * such as those that sync(2) may trigger, can starve sync reads. * Limit depths of async I/O and sync writes so as to counter both * problems. + * + * Also if a bfq queue or its parent cgroup consume more tags than would be + * appropriate for their weight, we trim the available tag depth to 1. This + * avoids a situation where one cgroup can starve another cgroup from tags and + * thus block service differentiation among cgroups. 
Note that because the + * queue / cgroup already has many requests allocated and queued, this does not + * significantly affect service guarantees coming from the BFQ scheduling + * algorithm. */ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(blk_mq_sched_get_icq(data->q)); + struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL; + int depth; + unsigned limit = data->q->nr_requests; + + /* Sync reads have full depth available */ + if (op_is_sync(op) && !op_is_write(op)) { + depth = 0; + } else { + depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + limit = (limit * depth) >> bfqd->full_depth_shift; + } - if (op_is_sync(op) && !op_is_write(op)) - return; - - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + /* + * Does queue (or any parent entity) exceed number of requests that + * should be available to it? Heavily limit depth so that it cannot + * consume more available requests and thus starve other entities. + */ + if (bfqq && bfqq_request_over_limit(bfqq, limit)) + depth = 1; bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(op), - data->shallow_depth); + __func__, bfqd->wr_busy_queues, op_is_sync(op), depth); + if (depth) + data->shallow_depth = depth; } static struct bfq_queue * @@ -6853,10 +6961,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) * See the comments on bfq_limit_depth for the purpose of * the depths set in the function. Return minimum shallow depth we'll use. */ -static unsigned int bfq_update_depths(struct bfq_data *bfqd, - struct sbitmap_queue *bt) +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { - unsigned int i, j, min_shallow = UINT_MAX; unsigned int depth = 1U << bt->sb.shift; bfqd->full_depth_shift = bt->sb.shift; @@ -6890,22 +6996,15 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - min_shallow = min(min_shallow, bfqd->word_depths[i][j]); - - return min_shallow; } static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int min_shallow; - min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); + bfq_update_depths(bfqd, &tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) -- cgit v1.2.1 From 1f18b7005b49b96782cd984babd59c286973b526 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Nov 2021 14:36:38 +0100 Subject: bfq: Limit waker detection in time Currently, when process A starts issuing requests shortly after process B has completed some IO three times in a row, we decide that B is a "waker" of A meaning that completing IO of B is needed for A to make progress and generally stop separating A's and B's IO much. This logic is useful to avoid unnecessary idling and thus throughput loss for cases where workload needs to switch e.g. between the process and the journaling thread doing IO. 
However the detection heuristic tends to frequently give false positives when A and B are fighting for IO bandwidth and other processes aren't doing much IO, as we are then basically bound to eventually accumulate three occurrences of a situation where one process starts issuing requests after the other has completed some IO. To reduce these false positives, cancel the waker detection also if we didn't accumulate three detected wakeups within a given timeout. The rationale is that if wakeups are really rare, the pointless idling doesn't hurt throughput that much anyway. This significantly reduces false waker detection for workloads like: [global] directory=/mnt/repro/ rw=write size=8g time_based runtime=30 ramp_time=10 blocksize=1m direct=0 ioengine=sync [slowwriter] numjobs=1 fsync=200 [fastwriter] numjobs=1 fsync=200 Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20211125133645.27483-5-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 38 +++++++++++++++++++++++--------------- block/bfq-iosched.h | 2 ++ 2 files changed, 25 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 95a19d1fbedf..83a2225e407b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2091,20 +2091,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) * aspect, see the comments on the choice of the queue for injection * in bfq_select_queue(). * - * Turning back to the detection of a waker queue, a queue Q is deemed - * as a waker queue for bfqq if, for three consecutive times, bfqq - * happens to become non empty right after a request of Q has been - * completed. In this respect, even if bfqq is empty, we do not check - * for a waker if it still has some in-flight I/O. In fact, in this - * case bfqq is actually still being served by the drive, and may - * receive new I/O on the completion of some of the in-flight - * requests. In particular, on the first time, Q is tentatively set as - * a candidate waker queue, while on the third consecutive time that Q - * is detected, the field waker_bfqq is set to Q, to confirm that Q is - * a waker queue for bfqq. These detection steps are performed only if - * bfqq has a long think time, so as to make it more likely that - * bfqq's I/O is actually being blocked by a synchronization. This - * last filter, plus the above three-times requirement, make false + * Turning back to the detection of a waker queue, a queue Q is deemed as a + * waker queue for bfqq if, for three consecutive times, bfqq happens to become + * non empty right after a request of Q has been completed within given + * timeout. In this respect, even if bfqq is empty, we do not check for a waker + * if it still has some in-flight I/O. In fact, in this case bfqq is actually + * still being served by the drive, and may receive new I/O on the completion + * of some of the in-flight requests. In particular, on the first time, Q is + * tentatively set as a candidate waker queue, while on the third consecutive + * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q + * is a waker queue for bfqq. These detection steps are performed only if bfqq + * has a long think time, so as to make it more likely that bfqq's I/O is + * actually being blocked by a synchronization. This last filter, plus the + * above three-times requirement and time limit for detection, make false + * positives less likely.
* * NOTE @@ -2136,8 +2135,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) return; + /* + * We reset waker detection logic also if too much time has passed + * since the first detection. If wakeups are rare, pointless idling + * doesn't hurt throughput that much. The condition below makes sure + * we do not uselessly idle blocking waker in more than 1/64 cases. + */ if (bfqd->last_completed_rq_bfqq != - bfqq->tentative_waker_bfqq) { + bfqq->tentative_waker_bfqq || + now_ns > bfqq->waker_detection_started + + 128 * (u64)bfqd->bfq_slice_idle) { /* * First synchronization detected with a * candidate waker queue, or with a different @@ -2146,6 +2153,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->tentative_waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->num_waker_detections = 1; + bfqq->waker_detection_started = now_ns; } else /* Same tentative waker queue detected again */ bfqq->num_waker_detections++; diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 820cb8c2d1fe..bb8180c52a31 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -388,6 +388,8 @@ struct bfq_queue { struct bfq_queue *tentative_waker_bfqq; /* number of times the same tentative waker has been detected */ unsigned int num_waker_detections; + /* time when we started considering this waker */ + u64 waker_detection_started; /* node for woken_list, see below */ struct hlist_node woken_list_node; -- cgit v1.2.1 From 582f04e19ad7b41df993c669805e48a01bcd9c5b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Nov 2021 14:36:39 +0100 Subject: bfq: Provide helper to generate bfqq name Instead of having a helper that formats the bfqq pid, provide a helper that generates the full bfqq name as used in the traces. This saves some code duplication and will save more in the coming tracepoints. Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20211125133645.27483-6-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index bb8180c52a31..07288b9da389 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -25,7 +25,7 @@ #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -#define MAX_PID_STR_LENGTH 12 +#define MAX_BFQQ_NAME_LENGTH 16 /* * Soft real-time applications are extremely more latency sensitive @@ -1083,26 +1083,27 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ /* Logging facilities. */ -static inline void bfq_pid_to_str(int pid, char *str, int len) +static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len) { - if (pid != -1) - snprintf(str, len, "%d", pid); + char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A'; + + if (bfqq->pid != -1) + snprintf(str, len, "bfq%d%c", bfqq->pid, type); else - snprintf(str, len, "SHARED-"); + snprintf(str, len, "bfqSHARED-%c", type); } #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...)
do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ - "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ + "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ @@ -1113,13 +1114,11 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #else /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ - blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - ##args); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -- cgit v1.2.1 From 1eb17f5e15b73669df635fb07df2853cb1244a69 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Nov 2021 14:36:40 +0100 Subject: bfq: Log waker detections Waker - wakee relationships are important in deciding whether one queue can preempt the other one. Print information about detected waker-wakee relationships so that scheduling decisions can be better understood from block traces. Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20211125133645.27483-7-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 83a2225e407b..69144003a694 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2127,6 +2127,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, u64 now_ns) { + char waker_name[MAX_BFQQ_NAME_LENGTH]; + if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || @@ -2154,12 +2156,18 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->last_completed_rq_bfqq; bfqq->num_waker_detections = 1; bfqq->waker_detection_started = now_ns; + bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name); } else /* Same tentative waker queue detected again */ bfqq->num_waker_detections++; if (bfqq->num_waker_detections == 3) { bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->tentative_waker_bfqq = NULL; + bfq_bfqq_name(bfqq->waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name); /* * If the waker queue disappears, then -- cgit v1.2.1 From c65e6fd460b4df796ecd6ea22e132076ed1f2820 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Nov 2021 14:36:41 +0100 Subject: bfq: Do not let waker requests skip proper accounting Commit 7cc4ffc55564 ("block, bfq: put reqs of waker and woken in dispatch list") added a condition to bfq_insert_request() which added waker's requests directly to dispatch list. The rationale was that completing waker's IO is needed to get more IO for the current queue. 
Although this rationale is valid, there is a hole in it. The waker does not necessarily serve IO only for the current queue, and maybe its current IO is not needed for the current queue to make progress. Furthermore injecting IO like this completely bypasses any service accounting within bfq, and thus we do not properly track how much service the waker's queue is getting or whether the waker is actually doing any IO. Depending on the conditions this can result in the waker getting too much or too little service. Consider for example the following job file: [global] directory=/mnt/repro/ rw=write size=8g time_based runtime=30 ramp_time=10 blocksize=1m direct=0 ioengine=sync [slowwriter] numjobs=1 prioclass=2 prio=7 fsync=200 [fastwriter] numjobs=1 prioclass=2 prio=0 fsync=200 Despite the processes having very different IO priorities, they get about the same amount of service. The reason is that bfq identifies these processes as having a waker-wakee relationship and, once that happens, IO from fastwriter gets injected during slowwriter's time slice. As a result bfq is not aware that fastwriter has any IO to do and constantly schedules only slowwriter's queue. Thus fastwriter is forced to compete with slowwriter's IO all the time instead of getting its share of time based on IO priority. Drop the special injection condition from bfq_insert_request(). As a result, requests will be tracked and queued in a normal way and on next dispatch bfq_select_queue() can decide whether the waker's inserted requests should be injected during the current queue's timeslice or not. Fixes: 7cc4ffc55564 ("block, bfq: put reqs of waker and woken in dispatch list") Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20211125133645.27483-8-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 44 +------------------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 69144003a694..85554b800970 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6132,48 +6132,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); bfqq = bfq_init_rq(rq); - - /* - * Reqs with at_head or passthrough flags set are to be put - * directly into dispatch list. Additional case for putting rq - * directly into the dispatch queue: the only active - * bfq_queues are bfqq and either its waker bfq_queue or one - * of its woken bfq_queues. The rationale behind this - * additional condition is as follows: - * - consider a bfq_queue, say Q1, detected as a waker of - * another bfq_queue, say Q2 - * - by definition of a waker, Q1 blocks the I/O of Q2, i.e., - * some I/O of Q1 needs to be completed for new I/O of Q2 - * to arrive. A notable example of waker is journald - * - so, Q1 and Q2 are in any respect the queues of two - * cooperating processes (or of two cooperating sets of - * processes): the goal of Q1's I/O is doing what needs to - * be done so that new Q2's I/O can finally be - * issued. Therefore, if the service of Q1's I/O is delayed, - * then Q2's I/O is delayed too. Conversely, if Q2's I/O is - * delayed, the goal of Q1's I/O is hindered. - * - as a consequence, if some I/O of Q1/Q2 arrives while - * Q2/Q1 is the only queue in service, there is absolutely - * no point in delaying the service of such an I/O.
The - * only possible result is a throughput loss - * - so, when the above condition holds, the best option is to - * have the new I/O dispatched as soon as possible - * - the most effective and efficient way to attain the above - * goal is to put the new I/O directly in the dispatch - * list - * - as an additional restriction, Q1 and Q2 must be the only - * busy queues for this commit to put the I/O of Q2/Q1 in - * the dispatch list. This is necessary, because, if also - * other queues are waiting for service, then putting new - * I/O directly in the dispatch list may evidently cause a - * violation of service guarantees for the other queues - */ - if (!bfqq || - (bfqq != bfqd->in_service_queue && - bfqd->in_service_queue != NULL && - bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) && - (bfqq->waker_bfqq == bfqd->in_service_queue || - bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) { + if (!bfqq || at_head) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else @@ -6200,7 +6159,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * merge). */ cmd_flags = rq->cmd_flags; - spin_unlock_irq(&bfqd->lock); bfq_update_insert_stats(q, bfqq, idle_timer_disabled, -- cgit v1.2.1 From 5f480b1a6325748f26999e2151c9912e00cc4087 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 27 Nov 2021 00:19:43 +0800 Subject: blk-mq: use bio->bi_opf after bio is checked bio->bi_opf isn't finalized before checking the bio, so use it after submit_bio_checks() returns. Fixes: 5b13bc8a3fd5 ("blk-mq: cleanup request allocation") Cc: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a89a624dd1df..143a8edf6300 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2713,7 +2713,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .nr_tags = 1, - .cmd_flags = bio->bi_opf, }; struct request *rq; @@ -2726,6 +2725,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_throttle(q, bio); + /* ->bi_opf is finalized after submit_bio_checks() returns */ + data.cmd_flags = bio->bi_opf; if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; -- cgit v1.2.1 From 88c9a2ce520ba381bb70658c80ec704f4d60f728 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:05 +0100 Subject: fork: move copy_io to block/blk-ioc.c Move the copying of the I/O context to the block layer as that is where we can use the proper low-level interfaces. 
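For illustration, a minimal sketch of what the fork-side caller might reduce to after this move (the wrapper shape and its header placement are assumptions here, not quoted from the patch):

static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
	/* nothing to copy if the parent never did any I/O */
	if (!current->io_context)
		return 0;
	/* the CLONE_IO sharing logic now lives in block/blk-ioc.c */
	return __copy_io(clone_flags, tsk);
#else
	return 0;
#endif
}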
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 70c99e85aee5..3b31cfad4b75 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -322,6 +322,33 @@ struct io_context *get_task_io_context(struct task_struct *task, return NULL; } +int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + struct io_context *ioc = current->io_context; + struct io_context *new_ioc; + + /* + * Share io context with parent, if CLONE_IO is set + */ + if (clone_flags & CLONE_IO) { + get_io_context_active(ioc); + + WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); + atomic_inc(&ioc->nr_tasks); + + tsk->io_context = ioc; + } else if (ioprio_valid(ioc->ioprio)) { + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!new_ioc)) + return -ENOMEM; + + new_ioc->ioprio = ioc->ioprio; + put_io_context(new_ioc); + } + + return 0; +} + /** * ioc_lookup_icq - lookup io_cq from ioc * @ioc: the associated io_context -- cgit v1.2.1 From 836b394b633e3d618fa44292290bf3d9a1761e0c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:06 +0100 Subject: bfq: simplify bfq_bic_lookup Remove the unused bfqd argument, and hardcode ioc to current->io_context. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 85554b800970..c990c6409c11 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) /** * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. - * @bfqd: the lookup key. - * @ioc: the io_context of the process doing I/O. * @q: the request queue. */ -static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc, - struct request_queue *q) +static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) { - if (ioc) { - unsigned long flags; - struct bfq_io_cq *icq; + struct bfq_io_cq *icq; + unsigned long flags; - spin_lock_irqsave(&q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(ioc, q)); - spin_unlock_irqrestore(&q->queue_lock, flags); + if (!current->io_context) + return NULL; - return icq; - } + spin_lock_irqsave(&q->queue_lock, flags); + icq = icq_to_bic(ioc_lookup_icq(current->io_context, q)); + spin_unlock_irqrestore(&q->queue_lock, flags); - return NULL; + return icq; } /* @@ -2457,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, * returned by bfq_bic_lookup does not go away before * bfqd->lock is taken. */ - struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); + struct bfq_io_cq *bic = bfq_bic_lookup(q); bool ret; spin_lock_irq(&bfqd->lock); -- cgit v1.2.1 From a0725c22cd8487f107a80ef87abf03c6379ec927 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:07 +0100 Subject: bfq: use bfq_bic_lookup in bfq_limit_depth No need to create a new I/O context if there is none present yet in ->limit_depth. 
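Sketched side by side (illustrative only, using names that appear elsewhere in this series): the depth-limiting hook runs on every tag allocation, so a pure lookup suffices there, while icq creation stays on the request-preparation path:

	/* ->limit_depth: hot path, must never allocate */
	struct bfq_io_cq *bic = bfq_bic_lookup(data->q);

	/* ->prepare_request: a request already exists, creation is fine */
	rq->elv.icq = ioc_find_get_icq(rq->q);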
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c990c6409c11..ecc2e57e6863 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -663,7 +663,7 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(blk_mq_sched_get_icq(data->q)); + struct bfq_io_cq *bic = bfq_bic_lookup(data->q); struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL; int depth; unsigned limit = data->q->nr_requests; -- cgit v1.2.1 From c2a32464f449370bff27a21b64b1b7d2e1d037f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:08 +0100 Subject: Revert "block: Provide blk_mq_sched_get_icq()" This reverts commit 4896c4e64ba5d5d5acdbcf68c5910dd4f6d8fa62. The helper is not needed any more. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 26 +++++++++++--------------- block/blk-mq-sched.h | 1 - 2 files changed, 11 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 98c6a97729f2..b942b38000e5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -18,8 +18,9 @@ #include "blk-mq-tag.h" #include "blk-wbt.h" -struct io_cq *blk_mq_sched_get_icq(struct request_queue *q) +void blk_mq_sched_assign_ioc(struct request *rq) { + struct request_queue *q = rq->q; struct io_context *ioc; struct io_cq *icq; @@ -27,27 +28,22 @@ struct io_cq *blk_mq_sched_get_icq(struct request_queue *q) if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); - /* May not have an IO context if context creation failed */ + /* + * May not have an IO context if it's a passthrough request + */ ioc = current->io_context; if (!ioc) - return NULL; + return; spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(ioc, q); spin_unlock_irq(&q->queue_lock); - if (icq) - return icq; - return ioc_create_icq(ioc, q, GFP_ATOMIC); -} -EXPORT_SYMBOL(blk_mq_sched_get_icq); -void blk_mq_sched_assign_ioc(struct request *rq) -{ - struct io_cq *icq; - - icq = blk_mq_sched_get_icq(rq->q); - if (!icq) - return; + if (!icq) { + icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + if (!icq) + return; + } get_io_context(icq->ioc); rq->elv.icq = icq; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index add651ec06da..25d1034952b6 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -8,7 +8,6 @@ #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) -struct io_cq *blk_mq_sched_get_icq(struct request_queue *q); void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, -- cgit v1.2.1 From 3304742562d27fb87a6d8291cc48824dd20f6964 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:09 +0100 Subject: block: mark put_io_context_active static Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 
3b31cfad4b75..f3ff495756cb 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -175,7 +175,7 @@ void put_io_context(struct io_context *ioc) * Undo get_io_context_active(). If active reference reaches zero after * put, @ioc can never issue further IOs and ioscheds are notified. */ -void put_io_context_active(struct io_context *ioc) +static void put_io_context_active(struct io_context *ioc) { struct io_cq *icq; -- cgit v1.2.1 From 87dd1d63dcbd0f508a8b23785752e78d082fd176 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:10 +0100 Subject: block: move blk_mq_sched_assign_ioc to blk-ioc.c Move blk_mq_sched_assign_ioc so that many interfaces from the file can be marked static. Rename the function to ioc_find_get_icq as well and return the icq to simplify the interface. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-8-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/blk-ioc.c | 39 +++++++++++++++++++++++++++++++++++---- block/blk-mq-sched.c | 31 ------------------------------- block/blk-mq-sched.h | 2 -- block/blk.h | 6 +----- 5 files changed, 37 insertions(+), 43 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ecc2e57e6863..2d484d3f7f22 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6666,7 +6666,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, */ static void bfq_prepare_request(struct request *rq) { - blk_mq_sched_assign_ioc(rq); + rq->elv.icq = ioc_find_get_icq(rq->q); /* * Regardless of whether we have an icq attached, we have to diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f3ff495756cb..f4f84a2072be 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -24,7 +24,7 @@ static struct kmem_cache *iocontext_cachep; * * Increment reference count to @ioc. */ -void get_io_context(struct io_context *ioc) +static void get_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); @@ -248,7 +248,8 @@ void ioc_clear_queue(struct request_queue *q) __ioc_clear_queue(&icq_list); } -int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) +static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, + int node) { struct io_context *ioc; int ret; @@ -397,8 +398,8 @@ EXPORT_SYMBOL(ioc_lookup_icq); * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. 
*/ -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask) +static struct io_cq *ioc_create_icq(struct io_context *ioc, + struct request_queue *q, gfp_t gfp_mask) { struct elevator_type *et = q->elevator->type; struct io_cq *icq; @@ -441,6 +442,36 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, return icq; } +struct io_cq *ioc_find_get_icq(struct request_queue *q) +{ + struct io_context *ioc; + struct io_cq *icq; + + /* create task io_context, if we don't have one already */ + if (unlikely(!current->io_context)) + create_task_io_context(current, GFP_ATOMIC, q->node); + + /* + * May not have an IO context if it's a passthrough request + */ + ioc = current->io_context; + if (!ioc) + return NULL; + + spin_lock_irq(&q->queue_lock); + icq = ioc_lookup_icq(ioc, q); + spin_unlock_irq(&q->queue_lock); + + if (!icq) { + icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + if (!icq) + return NULL; + } + get_io_context(icq->ioc); + return icq; +} +EXPORT_SYMBOL_GPL(ioc_find_get_icq); + static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index b942b38000e5..0d7257848f7e 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -18,37 +18,6 @@ #include "blk-mq-tag.h" #include "blk-wbt.h" -void blk_mq_sched_assign_ioc(struct request *rq) -{ - struct request_queue *q = rq->q; - struct io_context *ioc; - struct io_cq *icq; - - /* create task io_context, if we don't have one already */ - if (unlikely(!current->io_context)) - create_task_io_context(current, GFP_ATOMIC, q->node); - - /* - * May not have an IO context if it's a passthrough request - */ - ioc = current->io_context; - if (!ioc) - return; - - spin_lock_irq(&q->queue_lock); - icq = ioc_lookup_icq(ioc, q); - spin_unlock_irq(&q->queue_lock); - - if (!icq) { - icq = ioc_create_icq(ioc, q, GFP_ATOMIC); - if (!icq) - return; - } - get_io_context(icq->ioc); - rq->elv.icq = icq; -} -EXPORT_SYMBOL_GPL(blk_mq_sched_assign_ioc); - /* * Mark a hardware queue as needing a restart. For shared queues, maintain * a count of how many hardware queues are marked for restart. 
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 25d1034952b6..025013972453 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -8,8 +8,6 @@ #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) -void blk_mq_sched_assign_ioc(struct request *rq); - bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk.h b/block/blk.h index a57c84654d0a..187cb2654ffd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -363,14 +363,10 @@ static inline unsigned int bio_aligned_discard_max_sectors( /* * Internal io_context interface */ -void get_io_context(struct io_context *ioc); +struct io_cq *ioc_find_get_icq(struct request_queue *q); struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask); void ioc_clear_queue(struct request_queue *q); -int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, -- cgit v1.2.1 From 222ee581b84582dc472d5395b77d7e0cb5268d1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:11 +0100 Subject: block: move the remaining elv.icq handling to the I/O scheduler After the prepare side has been moved to the only I/O scheduler that cares, do the same for the cleanup and the NULL initialization. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-9-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 +++++++++++- block/blk-ioc.c | 1 + block/blk-mq.c | 14 +++----------- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2d484d3f7f22..8295b0f96cbf 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6569,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq) rq->elv.priv[1] = NULL; } +static void bfq_finish_request(struct request *rq) +{ + bfq_finish_requeue_request(rq); + + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } +} + /* * Removes the association between the current task and bfqq, assuming * that bic points to the bfq iocontext of the task. 
@@ -7388,7 +7398,7 @@ static struct elevator_type iosched_bfq_mq = { .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .requeue_request = bfq_finish_requeue_request, - .finish_request = bfq_finish_requeue_request, + .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f4f84a2072be..3ba15c867dfa 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -167,6 +167,7 @@ void put_io_context(struct io_context *ioc) if (free_ioc) kmem_cache_free(iocontext_cachep, ioc); } +EXPORT_SYMBOL_GPL(put_io_context); /** * put_io_context_active - put active reference on ioc diff --git a/block/blk-mq.c b/block/blk-mq.c index 143a8edf6300..3e67662f7801 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -400,7 +400,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, if (rq->rq_flags & RQF_ELV) { struct elevator_queue *e = data->q->elevator; - rq->elv.icq = NULL; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); @@ -631,16 +630,9 @@ void blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - if (rq->rq_flags & RQF_ELVPRIV) { - struct elevator_queue *e = q->elevator; - - if (e->type->ops.finish_request) - e->type->ops.finish_request(rq); - if (rq->elv.icq) { - put_io_context(rq->elv.icq->ioc); - rq->elv.icq = NULL; - } - } + if ((rq->rq_flags & RQF_ELVPRIV) && + q->elevator->type->ops.finish_request) + q->elevator->type->ops.finish_request(rq); if (rq->rq_flags & RQF_MQ_INFLIGHT) __blk_mq_dec_active_requests(hctx); -- cgit v1.2.1 From 50569c24be61eafb3efa06e2a3ccd447f75ae1b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:12 +0100 Subject: block: remove get_io_context_active Fold it into its only caller, and remove a lot of the debug checks that are not needed. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 3ba15c867dfa..cc4eb2ba87f7 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -173,7 +173,7 @@ EXPORT_SYMBOL_GPL(put_io_context); * put_io_context_active - put active reference on ioc * @ioc: ioc of interest * - * Undo get_io_context_active(). If active reference reaches zero after + * Put an active reference to an ioc. If active reference reaches zero after * put, @ioc can never issue further IOs and ioscheds are notified. */ static void put_io_context_active(struct io_context *ioc) @@ -333,11 +333,9 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - get_io_context_active(ioc); - - WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); + atomic_long_inc(&ioc->refcount); + atomic_inc(&ioc->active_ref); atomic_inc(&ioc->nr_tasks); - tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); if (unlikely(!new_ioc)) -- cgit v1.2.1 From a0f14d8baaca3e2f3e57bdb062eb476175c90e83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:13 +0100 Subject: block: factor out an alloc_io_context helper Factor out a helper that just allocates an I/O context.
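A hedged sketch of the intended usage (the second caller of this helper arrives two patches later in the series):

	/* allocate a bare, fully initialized io_context; installing it
	 * is the caller's job */
	struct io_context *ioc = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE);
	if (!ioc)
		return -ENOMEM;
	tsk->io_context = ioc;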
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index cc4eb2ba87f7..b42fbb82d5c0 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -249,18 +249,15 @@ void ioc_clear_queue(struct request_queue *q) __ioc_clear_queue(&icq_list); } -static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, - int node) +static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; - int ret; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) - return -ENOMEM; + return NULL; - /* initialize */ atomic_long_set(&ioc->refcount, 1); atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); @@ -268,6 +265,18 @@ static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); + return ioc; +} + +static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, + int node) +{ + struct io_context *ioc; + int ret; + + ioc = alloc_io_context(gfp_flags, node); + if (!ioc) + return -ENOMEM; /* * Try to install. ioc shouldn't be installed if someone else -- cgit v1.2.1 From 8ffc13680eac16a1eec86275b65fc6f0e27a30d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:14 +0100 Subject: block: use alloc_io_context in __copy_io In __copy_io we know that the newly allocated task_struct does not have an I/O context yet and is not exiting. So just allocate the I/O context struct and install it directly. There is no need to lock the task either as it is just being created. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-12-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index b42fbb82d5c0..f06d1040442c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -336,7 +336,6 @@ struct io_context *get_task_io_context(struct task_struct *task, int __copy_io(unsigned long clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; - struct io_context *new_ioc; /* * Share io context with parent, if CLONE_IO is set @@ -347,12 +346,10 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) atomic_inc(&ioc->nr_tasks); tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) + tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); + if (!tsk->io_context) return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); + tsk->io_context->ioprio = ioc->ioprio; } return 0; -- cgit v1.2.1 From d538ea4cb8e7241af8091eee30483fabf64444a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:15 +0100 Subject: block: return the io_context from create_task_io_context Grab a reference to the newly allocated or existing io_context in create_task_io_context and return it. This simplifies the callers and removes the need for double lookups.
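The resulting caller pattern, sketched (roughly what get_task_io_context and ioc_find_get_icq collapse to in the diff below; locking elided):

	struct io_context *ioc = current->io_context;
	if (unlikely(!ioc))	/* returns with a reference already held */
		return create_task_io_context(current, gfp_flags, node);
	get_io_context(ioc);
	return ioc;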
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 66 ++++++++++++++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 36 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f06d1040442c..5bfe810496fc 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -268,15 +268,14 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) return ioc; } -static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, - int node) +static struct io_context *create_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node) { struct io_context *ioc; - int ret; ioc = alloc_io_context(gfp_flags, node); if (!ioc) - return -ENOMEM; + return NULL; /* * Try to install. ioc shouldn't be installed if someone else @@ -292,11 +291,11 @@ static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, else kmem_cache_free(iocontext_cachep, ioc); - ret = task->io_context ? 0 : -EBUSY; - + ioc = task->io_context; + if (ioc) + get_io_context(ioc); task_unlock(task); - - return ret; + return ioc; } /** @@ -319,18 +318,15 @@ struct io_context *get_task_io_context(struct task_struct *task, might_sleep_if(gfpflags_allow_blocking(gfp_flags)); - do { - task_lock(task); - ioc = task->io_context; - if (likely(ioc)) { - get_io_context(ioc); - task_unlock(task); - return ioc; - } + task_lock(task); + ioc = task->io_context; + if (unlikely(!ioc)) { task_unlock(task); - } while (!create_task_io_context(task, gfp_flags, node)); - - return NULL; + return create_task_io_context(task, gfp_flags, node); + } + get_io_context(ioc); + task_unlock(task); + return ioc; } int __copy_io(unsigned long clone_flags, struct task_struct *tsk) @@ -449,30 +445,28 @@ static struct io_cq *ioc_create_icq(struct io_context *ioc, struct io_cq *ioc_find_get_icq(struct request_queue *q) { - struct io_context *ioc; - struct io_cq *icq; - - /* create task io_context, if we don't have one already */ - if (unlikely(!current->io_context)) - create_task_io_context(current, GFP_ATOMIC, q->node); + struct io_context *ioc = current->io_context; + struct io_cq *icq = NULL; - /* - * May not have an IO context if it's a passthrough request - */ - ioc = current->io_context; - if (!ioc) - return NULL; + if (unlikely(!ioc)) { + ioc = create_task_io_context(current, GFP_ATOMIC, q->node); + if (!ioc) + return NULL; + } else { + get_io_context(ioc); - spin_lock_irq(&q->queue_lock); - icq = ioc_lookup_icq(ioc, q); - spin_unlock_irq(&q->queue_lock); + spin_lock_irq(&q->queue_lock); + icq = ioc_lookup_icq(ioc, q); + spin_unlock_irq(&q->queue_lock); + } if (!icq) { icq = ioc_create_icq(ioc, q, GFP_ATOMIC); - if (!icq) + if (!icq) { + put_io_context(ioc); return NULL; + } } - get_io_context(icq->ioc); return icq; } EXPORT_SYMBOL_GPL(ioc_find_get_icq); -- cgit v1.2.1 From 18b74c4dcad8150e855755697d4d594506e3de78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:16 +0100 Subject: block: simplify ioc_create_icq Remove the ioc and gfp_mask argument, which are hard coded by the caller. 
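In effect, the call site shrinks as follows (illustrative):

-	icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
+	icq = ioc_create_icq(q);	/* ioc taken from current, GFP_ATOMIC implied */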
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-14-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 5bfe810496fc..c56648f7cad4 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -389,9 +389,7 @@ EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq - * @ioc: io_context of interest * @q: request_queue of interest - * @gfp_mask: allocation mask * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. @@ -399,19 +397,19 @@ EXPORT_SYMBOL(ioc_lookup_icq); * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. */ -static struct io_cq *ioc_create_icq(struct io_context *ioc, - struct request_queue *q, gfp_t gfp_mask) +static struct io_cq *ioc_create_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ - icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, + icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO, q->node); if (!icq) return NULL; - if (radix_tree_maybe_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } @@ -461,7 +459,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) } if (!icq) { - icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + icq = ioc_create_icq(q); if (!icq) { put_io_context(ioc); return NULL; -- cgit v1.2.1 From eca5892a5d616d39185d652820931f21cab2f190 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:17 +0100 Subject: block: simplify ioc_lookup_icq Remove the ioc argument as it always points to current->io_context. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-15-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/blk-ioc.c | 8 ++++---- block/blk.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 8295b0f96cbf..0c612a911696 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -444,7 +444,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) return NULL; spin_lock_irqsave(&q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(current->io_context, q)); + icq = icq_to_bic(ioc_lookup_icq(q)); spin_unlock_irqrestore(&q->queue_lock, flags); return icq; diff --git a/block/blk-ioc.c b/block/blk-ioc.c index c56648f7cad4..536fb496ad76 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -353,14 +353,14 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) /** * ioc_lookup_icq - lookup io_cq from ioc - * @ioc: the associated io_context * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * with @q->queue_lock held. 
*/ -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) +struct io_cq *ioc_lookup_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct io_cq *icq; lockdep_assert_held(&q->queue_lock); @@ -430,7 +430,7 @@ static struct io_cq *ioc_create_icq(struct request_queue *q) et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); - icq = ioc_lookup_icq(ioc, q); + icq = ioc_lookup_icq(q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } @@ -454,7 +454,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) get_io_context(ioc); spin_lock_irq(&q->queue_lock); - icq = ioc_lookup_icq(ioc, q); + icq = ioc_lookup_icq(q); spin_unlock_irq(&q->queue_lock); } diff --git a/block/blk.h b/block/blk.h index 187cb2654ffd..3be0fdf76c9a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -364,7 +364,7 @@ static inline unsigned int bio_aligned_discard_max_sectors( * Internal io_context interface */ struct io_cq *ioc_find_get_icq(struct request_queue *q); -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); +struct io_cq *ioc_lookup_icq(struct request_queue *q); void ioc_clear_queue(struct request_queue *q); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW -- cgit v1.2.1 From af22fef3e7a51cbd339814a0e196086e2bb2aa26 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 26 Nov 2021 23:06:52 +0000 Subject: block: Remove redundant initialization of variable ret The variable ret is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20211126230652.1175636-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- block/bdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index e9ada04e71be..587645231d60 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -665,7 +665,7 @@ static void blkdev_flush_mapping(struct block_device *bdev) static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; - int ret = 0; + int ret; if (disk->fops->open) { ret = disk->fops->open(bdev, mode); -- cgit v1.2.1 From 79bb1dbd12005f2143670a9a4f13d91e64725717 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 13:17:59 +0100 Subject: block: don't check ->rq_disk in merges There is a 1:1 relationship between request_queues and gendisks now, so no need for these extra checks. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20211126121802.2090656-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index e07f5a1ae86e..4de34a332c9f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -777,8 +777,7 @@ static struct request *attempt_merge(struct request_queue *q, if (req_op(req) != req_op(next)) return NULL; - if (rq_data_dir(req) != rq_data_dir(next) - || req->rq_disk != next->rq_disk) + if (rq_data_dir(req) != rq_data_dir(next)) return NULL; if (req_op(req) == REQ_OP_WRITE_SAME && @@ -905,10 +904,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; - /* must be same device */ - if (rq->rq_disk != bio->bi_bdev->bd_disk) - return false; - /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; -- cgit v1.2.1 From f3fa33acca9f0058157214800f68b10d8e71ab7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 13:18:00 +0100 Subject: block: remove the ->rq_disk field in struct request Just use the disk attached to the request_queue instead. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211126121802.2090656-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 3 +-- block/blk-mq.c | 14 ++++++-------- block/blk.h | 2 +- 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 902e80e48e4a..fd5187a0898d 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -145,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct block_device *part = rq->rq_disk->part0; + struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); @@ -339,7 +339,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; - flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one diff --git a/block/blk-mq.c b/block/blk-mq.c index 3e67662f7801..f1abfd2e24f7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -377,7 +377,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; - rq->rq_disk = NULL; rq->part = NULL; #ifdef CONFIG_BLK_RQ_ALLOC_TIME rq->alloc_time_ns = alloc_time_ns; @@ -659,7 +658,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, - rq->rq_disk ? rq->rq_disk->disk_name : "?", + rq->q->disk ? rq->q->disk->disk_name : "?", (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", @@ -712,7 +711,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status) "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", blk_status_to_str(status), - req->rq_disk ? req->rq_disk->disk_name : "?", + req->q->disk ? 
req->q->disk->disk_name : "?", blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), req->cmd_flags & ~REQ_OP_MASK, req->nr_phys_segments, @@ -853,8 +852,8 @@ static void __blk_account_io_start(struct request *rq) /* passthrough requests can hold bios that do not have ->bi_bdev set */ if (rq->bio && rq->bio->bi_bdev) rq->part = rq->bio->bi_bdev; - else - rq->part = rq->rq_disk->part0; + else if (rq->q->disk) + rq->part = rq->q->disk->part0; part_stat_lock(); update_io_ticks(rq->part, jiffies, false); @@ -1172,7 +1171,6 @@ void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); - rq->rq_disk = bd_disk; rq->end_io = done; blk_account_io_start(rq); @@ -2902,8 +2900,8 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * if (ret != BLK_STS_OK) return ret; - if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) + if (rq->q->disk && + should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) diff --git a/block/blk.h b/block/blk.h index 3be0fdf76c9a..a55d82c3d1c2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -324,7 +324,7 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk; + return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk; } void update_io_ticks(struct block_device *part, unsigned long now, bool end); -- cgit v1.2.1 From b84ba30b6c7a75babdf73b83bc3c7b59b944501a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 13:18:01 +0100 Subject: block: remove the gendisk argument to blk_execute_rq Remove the gendisk argument to blk_execute_rq and blk_execute_rq_nowait given that it is unused now. Also convert the boolean at_head parameter to actually use the bool type while touching the prototype. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211126121802.2090656-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 +++------- block/bsg-lib.c | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index f1abfd2e24f7..ecfc47fad236 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1153,7 +1153,6 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution - * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue * @done: I/O completion handler @@ -1165,8 +1164,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) * Note: * This function will invoke @done directly if the queue is dead. */ -void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, - int at_head, rq_end_io_fn *done) +void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done) { WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -1204,7 +1202,6 @@ static void blk_rq_poll_completion(struct request *rq, struct completion *wait) /** * blk_execute_rq - insert a request into queue for execution - * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion.
* Return: The blk_status_t result provided to blk_mq_end_request(). */ -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, - int at_head) +blk_status_t blk_execute_rq(struct request *rq, bool at_head) { DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; rq->end_io_data = &wait; - blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); + blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 10aa378702fa..acfe1357bf6c 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -92,7 +92,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, goto out_unmap_bidi_rq; bio = rq->bio; - blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); /* * The assignments below don't make much sense, but are kept for -- cgit v1.2.1 From 8a7518931baa8ea023700987f3db31cb0a80610b Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 29 Nov 2021 09:26:59 +0800 Subject: block: Fix fsync always failed if once failed We ran tests with error fault injection based on v4.19; after testing for some time we found that sync /dev/sda always failed. [root@localhost] sync /dev/sda sync: error syncing '/dev/sda': Input/output error scsi log as follows: [19069.812296] sd 0:0:0:0: [sda] tag#64 Send: scmd 0x00000000d03a0b6b [19069.812302] sd 0:0:0:0: [sda] tag#64 CDB: Synchronize Cache(10) 35 00 00 00 00 00 00 00 00 00 [19069.812533] sd 0:0:0:0: [sda] tag#64 Done: SUCCESS Result: hostbyte=DID_OK driverbyte=DRIVER_OK [19069.812536] sd 0:0:0:0: [sda] tag#64 CDB: Synchronize Cache(10) 35 00 00 00 00 00 00 00 00 00 [19069.812539] sd 0:0:0:0: [sda] tag#64 scsi host busy 1 failed 0 [19069.812542] sd 0:0:0:0: Notifying upper driver of completion (result 0) [19069.812546] sd 0:0:0:0: [sda] tag#64 sd_done: completed 0 of 0 bytes [19069.812549] sd 0:0:0:0: [sda] tag#64 0 sectors total, 0 bytes done. [19069.812564] print_req_error: I/O error, dev sda, sector 0 ftrace log as follows: rep-306069 [007] .... 19654.923315: block_bio_queue: 8,0 FWS 0 + 0 [rep] rep-306069 [007] .... 19654.923333: block_getrq: 8,0 FWS 0 + 0 [rep] kworker/7:1H-250 [007] .... 19654.923352: block_rq_issue: 8,0 FF 0 () 0 + 0 [kworker/7:1H] -0 [007] ..s. 19654.923562: block_rq_complete: 8,0 FF () 18446744073709551615 + 0 [0] -0 [007] d.s. 19654.923576: block_rq_complete: 8,0 WS () 0 + 0 [-5] Commit 8d6996630c03 introduced 'fq->rq_status', which is only updated while the 'flush_rq' reference count isn't zero. Once a flush request fails, its error code is recorded in 'fq->rq_status'; if 'fq->rq_status' then never gets another chance to be updated, fsync will always fail from then on. To address this issue, reset 'fq->rq_status' after returning the error code to the upper layer. Fixes: 8d6996630c03 ("block: fix null pointer dereference in blk_mq_rq_timed_out()") Signed-off-by: Ye Bin Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20211129012659.1553733-1-yebin10@huawei.com Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index fd5187a0898d..f78bb39e589e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -242,8 +242,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) * avoiding use-after-free.
*/ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); - if (fq->rq_status != BLK_STS_OK) + if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; + fq->rq_status = BLK_STS_OK; + } if (!q->elevator) { flush_rq->tag = BLK_MQ_NO_TAG; -- cgit v1.2.1 From 18d78171c061889a9a43152f60d6a27a10fc7656 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Dec 2021 17:07:16 +0800 Subject: blk-mq: check q->poll_stat in queue_poll_stat_show Without checking q->poll_stat in queue_poll_stat_show(), a kernel panic may be caused if q->poll_stat isn't allocated. Fixes: 48b5c1fbcd8c ("block: only allocate poll_stats if there's a user of them") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211202090716.3292244-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f27dca3a45e..3a790eb4995c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -30,6 +30,9 @@ static int queue_poll_stat_show(void *data, struct seq_file *m) struct request_queue *q = data; int bucket; + if (!q->poll_stat) + return 0; + for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); print_stat(m, &q->poll_stat[2 * bucket]); -- cgit v1.2.1 From 373b5416b4b03ebda5d8f0605b81eff0dc76ebcf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Dec 2021 12:42:58 -0700 Subject: block: get rid of useless goto and label in blk_mq_get_new_requests() The expected case is returning a request; just check for success and return the request rather than having an error label. Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ecfc47fad236..ca33cb755c5f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2720,11 +2720,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, } rq = __blk_mq_alloc_requests(&data); - if (!rq) - goto fail; - return rq; - -fail: + if (rq) + return rq; rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); -- cgit v1.2.1 From a08ed9aae8a3d2321ef378d6581cc87a3fb75b44 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Dec 2021 12:43:46 -0700 Subject: block: fix double bio queue when merging in cached request path When we attempt to merge off the cached request path, we return NULL if successful. This makes the caller believe that it should allocate a new request, and hence we end up with the bio both merged and associated with a new request. This, predictably, leads to all sorts of crashes. Pass in a pointer to the bio pointer, and clear it for the merge case. Then the caller knows that the bio is already queued, and no new requests need to get allocated.
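The convention in a distilled sketch (hypothetical helper names, not the kernel code itself):

static struct request *get_cached_request(struct request_queue *q,
					  struct bio **bio)
{
	if (attempt_merge(q, *bio)) {
		*bio = NULL;	/* consumed by the merge: caller must not queue it again */
		return NULL;
	}
	return lookup_cached_rq(q, *bio);	/* may still be NULL */
}

/* caller */
rq = get_cached_request(q, &bio);
if (!rq) {
	if (!bio)
		return;		/* bio already merged and queued */
	rq = get_new_request(q, bio);	/* only now allocate */
}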
Fixes: 5b13bc8a3fd5 ("blk-mq: cleanup request allocation") Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ca33cb755c5f..fc4520e992b1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2731,7 +2731,7 @@ queue_exit: } static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio *bio, unsigned int nsegs) + struct blk_plug *plug, struct bio **bio, unsigned int nsegs) { struct request *rq; @@ -2741,19 +2741,21 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!rq || rq->q != q) return NULL; - if (unlikely(!submit_bio_checks(bio))) + if (unlikely(!submit_bio_checks(*bio))) return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { + *bio = NULL; return NULL; - if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) + } + if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) return NULL; - rq->cmd_flags = bio->bi_opf; + rq->cmd_flags = (*bio)->bi_opf; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); - rq_qos_throttle(q, bio); + rq_qos_throttle(q, *bio); return rq; } @@ -2789,8 +2791,10 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - rq = blk_mq_get_cached_request(q, plug, bio, nr_segs); + rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); if (!rq) { + if (!bio) + return; rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) return; -- cgit v1.2.1 From ceaa762527f41a431b552bc000de4b626d2d8cb7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Oct 2021 08:57:09 -0600 Subject: block: move direct_IO into our own read_iter handler Don't call into generic_file_read_iter() if we know it's O_DIRECT, just set it up ourselves and call our own handler. This avoids an indirect call for O_DIRECT. Fall back to filemap_read() if we fail. 
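As a rough sketch of the resulting flow (the complete hunk follows; error handling and the iov reexpand are elided here):

	if (iocb->ki_flags & IOCB_DIRECT) {
		/* write back, or bail out on, dirty page cache first */
		ret = blkdev_direct_IO(iocb, to);
		if (ret >= 0) {
			iocb->ki_pos += ret;
			count -= ret;
		}
		if (ret < 0 || !count)
			return ret;
	}
	/* buffered fallback, e.g. for a short direct read */
	ret = filemap_read(iocb, to, ret);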
Signed-off-by: Jens Axboe --- block/fops.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index ad732a36f9b3..4d0e220f379e 100644 --- a/block/fops.c +++ b/block/fops.c @@ -566,21 +566,48 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct block_device *bdev = iocb->ki_filp->private_data; loff_t size = bdev_nr_bytes(bdev); + size_t count = iov_iter_count(to); loff_t pos = iocb->ki_pos; size_t shorted = 0; - ssize_t ret; + ssize_t ret = 0; - if (unlikely(pos + iov_iter_count(to) > size)) { + if (unlikely(pos + count > size)) { if (pos >= size) return 0; size -= pos; - if (iov_iter_count(to) > size) { - shorted = iov_iter_count(to) - size; + if (count > size) { + shorted = count - size; iov_iter_truncate(to, size); } } - ret = generic_file_read_iter(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) + return -EAGAIN; + } else { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + return ret; + } + + file_accessed(iocb->ki_filp); + + ret = blkdev_direct_IO(iocb, to); + if (ret >= 0) { + iocb->ki_pos += ret; + count -= ret; + } + if (ret < 0 || !count) + return ret; + } + + ret = filemap_read(iocb, to, ret); if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); -- cgit v1.2.1 From 0a467d0fdd9594fbb449ebc93852533332c528fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Oct 2021 14:39:59 -0600 Subject: block: switch to atomic_t for request references refcount_t is not as expensive as it used to be, but it's still more expensive than the io_uring method of using atomic_t and just checking for potential over/underflow. This borrows that same implementation, which in turn is based on the mm implementation from Linus. 
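To see why the single unsigned comparison in the helpers below catches both failure modes, consider the values it trips on (sketch of the check, worked by hand):

	/* (unsigned int)atomic_read(&req->ref) + 127u <= 127u */
	ref ==  0  ->   0 + 127 == 127, warns: put on an already freed request
	ref == -1  ->  wraps to 126, warns: underflow/overflow territory
	ref ==  1  ->   1 + 127 == 128, passes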
Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 ++-- block/blk-mq-tag.c | 2 +- block/blk-mq.c | 12 ++++++------ block/blk.h | 31 +++++++++++++++++++++++++++++++ 4 files changed, 40 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index f78bb39e589e..e4df894189ce 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -229,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (!refcount_dec_and_test(&flush_rq->ref)) { + if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return; @@ -349,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, * and READ flush_rq->end_io */ smp_wmb(); - refcount_set(&flush_rq->ref, 1); + req_ref_set(flush_rq, 1); blk_flush_queue_rq(flush_rq, false); } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 995336abee33..380e2dd31bfc 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -228,7 +228,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; - if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref)) + if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; spin_unlock_irqrestore(&tags->lock, flags); return rq; diff --git a/block/blk-mq.c b/block/blk-mq.c index fc4520e992b1..8c7cab75229e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -394,7 +394,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); - refcount_set(&rq->ref, 1); + req_ref_set(rq, 1); if (rq->rq_flags & RQF_ELV) { struct elevator_queue *e = data->q->elevator; @@ -642,7 +642,7 @@ void blk_mq_free_request(struct request *rq) rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); - if (refcount_dec_and_test(&rq->ref)) + if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -938,7 +938,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) rq_qos_done(rq->q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); - if (!refcount_dec_and_test(&rq->ref)) + if (!req_ref_put_and_test(rq)) continue; blk_crypto_free_request(rq); @@ -1401,7 +1401,7 @@ void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) + else if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } @@ -3049,7 +3049,7 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { - WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } @@ -3383,7 +3383,7 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, if (!tags) return; - WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); diff --git a/block/blk.h b/block/blk.h index a55d82c3d1c2..24d8b333bb03 100644 --- a/block/blk.h +++ b/block/blk.h @@ -461,4 +461,35 @@ static inline bool should_fail_request(struct block_device *part, } #endif /* CONFIG_FAIL_MAKE_REQUEST */ +/* + * Optimized request reference counting. 
Ideally we'd make timeouts be more + * clever, as that's the only reason we need references at all... But until + * this happens, this is faster than using refcount_t. Also see: + * + * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count") + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct request *req) +{ + return atomic_inc_not_zero(&req->ref); +} + +static inline bool req_ref_put_and_test(struct request *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->ref); +} + +static inline void req_ref_set(struct request *req, int value) +{ + atomic_set(&req->ref, value); +} + +static inline int req_ref_read(struct request *req) +{ + return atomic_read(&req->ref); +} + #endif /* BLK_INTERNAL_H */ -- cgit v1.2.1 From 2a904d00855f94cb85751e45fa494f225d44ae0d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 3 Dec 2021 21:15:31 +0800 Subject: blk-mq: remove hctx_lock and hctx_unlock Remove hctx_lock and hctx_unlock, and add a helper, blk_mq_run_dispatch_ops(), to run the code block defined in dispatch_ops with the rcu/srcu read lock held. Compared with hctx_lock()/hctx_unlock(): 1) two branches are reduced to one, so we only need to check (hctx->flags & BLK_MQ_F_BLOCKING) once when running one dispatch_ops 2) srcu_idx needn't be touched in the non-blocking case 3) might_sleep_if() can be moved to the blocking branch Also put the added blk_mq_run_dispatch_ops() in a private header, so that the following patch can use it outside of blk-mq.c. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211203131534.3668411-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 57 ++++++++++----------------------------------------------- block/blk-mq.h | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 47 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8c7cab75229e..494da31dc1a5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1071,26 +1071,6 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) - __releases(hctx->srcu) -{ - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) - rcu_read_unlock(); - else - srcu_read_unlock(hctx->srcu, srcu_idx); -} - -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) - __acquires(hctx->srcu) -{ - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - /* shut up gcc false positive */ - *srcu_idx = 0; - rcu_read_lock(); - } else - *srcu_idx = srcu_read_lock(hctx->srcu); -} - /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started @@ -1947,19 +1927,13 @@ out: */ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { - int srcu_idx; - /* * We can't run the queue inline with ints disabled. Ensure that * we catch bad users of this early.
 */ WARN_ON_ONCE(in_interrupt()); - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - blk_mq_sched_dispatch_requests(hctx); - hctx_unlock(hctx, srcu_idx); + blk_mq_run_dispatch_ops(hctx, blk_mq_sched_dispatch_requests(hctx)); } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) @@ -2071,7 +2045,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - int srcu_idx; bool need_run; /* @@ -2082,10 +2055,9 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ - hctx_lock(hctx, &srcu_idx); - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx); - hctx_unlock(hctx, srcu_idx); + blk_mq_run_dispatch_ops(hctx, + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx)); if (need_run) __blk_mq_delay_run_hw_queue(hctx, async, 0); @@ -2488,32 +2460,22 @@ insert: static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { - blk_status_t ret; - int srcu_idx; - - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + blk_status_t ret = + __blk_mq_try_issue_directly(hctx, rq, false, true); - hctx_lock(hctx, &srcu_idx); - - ret = __blk_mq_try_issue_directly(hctx, rq, false, true); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); - - hctx_unlock(hctx, srcu_idx); } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { blk_status_t ret; - int srcu_idx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, true, last); - hctx_unlock(hctx, srcu_idx); - + blk_mq_run_dispatch_ops(hctx, + ret = __blk_mq_try_issue_directly(hctx, rq, true, last)); return ret; } @@ -2826,7 +2788,8 @@ void blk_mq_submit_bio(struct bio *bio) (q->nr_hw_queues == 1 || !is_sync))) blk_mq_sched_insert_request(rq, false, true, true); else - blk_mq_try_issue_directly(rq->mq_hctx, rq); + blk_mq_run_dispatch_ops(rq->mq_hctx, + blk_mq_try_issue_directly(rq->mq_hctx, rq)); } /** diff --git a/block/blk-mq.h b/block/blk-mq.h index d516c7a46f57..e4c396204928 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -374,5 +374,21 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return __blk_mq_active_requests(hctx) < depth; } +/* run the code block in @dispatch_ops with rcu/srcu read lock held */ +#define blk_mq_run_dispatch_ops(hctx, dispatch_ops) \ +do { \ + if (!((hctx)->flags & BLK_MQ_F_BLOCKING)) { \ + rcu_read_lock(); \ + (dispatch_ops); \ + rcu_read_unlock(); \ + } else { \ + int srcu_idx; \ + \ + might_sleep(); \ + srcu_idx = srcu_read_lock((hctx)->srcu); \ + (dispatch_ops); \ + srcu_read_unlock((hctx)->srcu, srcu_idx); \ + } \ +} while (0) #endif -- cgit v1.2.1 From 704b914f15fb7daaf517e3acc4bed472b50ca19e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 3 Dec 2021 21:15:32 +0800 Subject: blk-mq: move srcu from blk_mq_hw_ctx to request_queue In case of BLK_MQ_F_BLOCKING, a per-hctx srcu is used to protect the dispatch critical area. However, this srcu instance stays at the end of the hctx and often occupies a standalone, often cold, cacheline. Inside srcu_read_lock() and srcu_read_unlock(), writes always go to the indirect percpu variable, which is allocated from the heap instead of being embedded; srcu->srcu_idx is only read, and only in srcu_read_lock().
It therefore doesn't matter whether the srcu structure stays in the hctx or in the request queue. So switch to a per-request-queue srcu for protecting dispatch; this simplifies quiesce a lot, not to mention that quiesce is always done request-queue-wide anyway. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211203131534.3668411-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 27 ++++++++++++++++++++++----- block/blk-mq-sysfs.c | 2 -- block/blk-mq.c | 37 ++++++++----------------------------- block/blk-mq.h | 4 ++-- block/blk-sysfs.c | 3 ++- block/blk.h | 10 +++++++++- block/genhd.c | 2 +- 7 files changed, 44 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b0660c9df852..10619fd83c1b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -66,6 +66,7 @@ DEFINE_IDA(blk_queue_ida); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; +struct kmem_cache *blk_requestq_srcu_cachep; /* * Controlling structure to kblockd @@ -437,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) { struct request_queue *q; int ret; - q = kmem_cache_alloc_node(blk_requestq_cachep, - GFP_KERNEL | __GFP_ZERO, node_id); + q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu), + GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; + if (alloc_srcu) { + blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q); + if (init_srcu_struct(q->srcu) != 0) + goto fail_q; + } + q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) - goto fail_q; + goto fail_srcu; ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); if (ret) @@ -508,8 +515,11 @@ fail_split: bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_srcu: + if (alloc_srcu) + cleanup_srcu_struct(q->srcu); fail_q: - kmem_cache_free(blk_requestq_cachep, q); + kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q); return NULL; } @@ -1301,6 +1311,9 @@ int __init blk_dev_init(void) sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); + BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu), + __alignof__(struct request_queue)) != + sizeof(struct request_queue)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", @@ -1311,6 +1324,10 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu", + sizeof(struct request_queue) + + sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL); + blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 253c857cba47..674786574075 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->srcu); blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); diff --git a/block/blk-mq.c b/block/blk-mq.c index 494da31dc1a5..6a2c2704454e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -260,17 +260,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); */ void
blk_mq_wait_quiesce_done(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; - bool rcu = false; - - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->srcu); - else - rcu = true; - } - if (rcu) + if (blk_queue_has_srcu(q)) + synchronize_srcu(q->srcu); + else synchronize_rcu(); } EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); @@ -3400,20 +3392,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, } } -static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) -{ - int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), - __alignof__(struct blk_mq_hw_ctx)) != - sizeof(struct blk_mq_hw_ctx)); - - if (tag_set->flags & BLK_MQ_F_BLOCKING) - hw_ctx_size += sizeof(struct srcu_struct); - - return hw_ctx_size; -} - static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) @@ -3451,7 +3429,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; - hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node); + hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; @@ -3493,8 +3471,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, if (!hctx->fq) goto free_bitmap; - if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->srcu); blk_mq_hctx_kobj_init(hctx); return hctx; @@ -3830,7 +3806,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node); + q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING); if (!q) return ERR_PTR(-ENOMEM); q->queuedata = queuedata; @@ -3979,6 +3955,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { + WARN_ON_ONCE(blk_queue_has_srcu(q) != + !!(set->flags & BLK_MQ_F_BLOCKING)); + /* mark the queue as mq asap */ q->mq_ops = set->ops; diff --git a/block/blk-mq.h b/block/blk-mq.h index e4c396204928..792f0b29c6eb 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -385,9 +385,9 @@ do { \ int srcu_idx; \ \ might_sleep(); \ - srcu_idx = srcu_read_lock((hctx)->srcu); \ + srcu_idx = srcu_read_lock((hctx)->queue->srcu); \ (dispatch_ops); \ - srcu_read_unlock((hctx)->srcu, srcu_idx); \ + srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \ } \ } while (0) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4622da4bb992..3e6357321225 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -735,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); - kmem_cache_free(blk_requestq_cachep, q); + + kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); } /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. 
*/ diff --git a/block/blk.h b/block/blk.h index 24d8b333bb03..7ccb7c7d86b3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -27,6 +27,7 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *blk_requestq_srcu_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; @@ -424,7 +425,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); -struct request_queue *blk_alloc_queue(int node_id); +static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu) +{ + if (srcu) + return blk_requestq_srcu_cachep; + return blk_requestq_cachep; +} +struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu); + int disk_scan_partitions(struct gendisk *disk, fmode_t mode); int disk_alloc_events(struct gendisk *disk); diff --git a/block/genhd.c b/block/genhd.c index 5179a4f00fba..3c139a1b6f04 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1338,7 +1338,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node); + q = blk_alloc_queue(node, false); if (!q) return NULL; -- cgit v1.2.1 From bcc330f42f442a98d61f153d16c0b6487461ee81 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 3 Dec 2021 21:15:33 +0800 Subject: blk-mq: pass request queue to blk_mq_run_dispatch_ops We have switched to allocate srcu into request queue, so it is fine to pass request queue to blk_mq_run_dispatch_ops(). Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211203131534.3668411-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +++++---- block/blk-mq.h | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 6a2c2704454e..24c65bb8719b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1925,7 +1925,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ WARN_ON_ONCE(in_interrupt()); - blk_mq_run_dispatch_ops(hctx, blk_mq_sched_dispatch_requests(hctx)); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) @@ -2047,7 +2048,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. 
 */ - blk_mq_run_dispatch_ops(hctx, + blk_mq_run_dispatch_ops(hctx->queue, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); @@ -2466,7 +2467,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) blk_status_t ret; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - blk_mq_run_dispatch_ops(hctx, + blk_mq_run_dispatch_ops(rq->q, ret = __blk_mq_try_issue_directly(hctx, rq, true, last)); return ret; } @@ -2780,7 +2781,7 @@ void blk_mq_submit_bio(struct bio *bio) (q->nr_hw_queues == 1 || !is_sync))) blk_mq_sched_insert_request(rq, false, true, true); else - blk_mq_run_dispatch_ops(rq->mq_hctx, + blk_mq_run_dispatch_ops(rq->q, blk_mq_try_issue_directly(rq->mq_hctx, rq)); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 792f0b29c6eb..d62004e2d531 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -375,9 +375,9 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ -#define blk_mq_run_dispatch_ops(hctx, dispatch_ops) \ +#define blk_mq_run_dispatch_ops(q, dispatch_ops) \ do { \ - if (!((hctx)->flags & BLK_MQ_F_BLOCKING)) { \ + if (!blk_queue_has_srcu(q)) { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ @@ -385,9 +385,9 @@ do { \ int srcu_idx; \ \ might_sleep(); \ - srcu_idx = srcu_read_lock((hctx)->queue->srcu); \ + srcu_idx = srcu_read_lock((q)->srcu); \ (dispatch_ops); \ - srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \ + srcu_read_unlock((q)->srcu, srcu_idx); \ } \ } while (0) -- cgit v1.2.1 From 4cafe86c9267f9dd5819df946ba8c038ba958370 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 3 Dec 2021 21:15:34 +0800 Subject: blk-mq: run dispatch lock once in case of issuing from list It isn't necessary to call blk_mq_run_dispatch_ops() for each request issued directly; it is enough to do it once when issuing the whole list. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211203131534.3668411-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 3 ++- block/blk-mq.c | 14 ++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0d7257848f7e..55488ba97823 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -475,7 +475,8 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, * us one extra enqueue & dequeue to sw queue.
*/ if (!hctx->dispatch_busy && !run_queue_async) { - blk_mq_try_issue_list_directly(hctx, list); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 24c65bb8719b..22ec21aa0c22 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2464,12 +2464,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { - blk_status_t ret; - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - blk_mq_run_dispatch_ops(rq->q, - ret = __blk_mq_try_issue_directly(hctx, rq, true, last)); - return ret; + return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last); } static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) @@ -2526,7 +2521,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - blk_mq_plug_issue_direct(plug, false); + blk_mq_run_dispatch_ops(plug->mq_list->q, + blk_mq_plug_issue_direct(plug, false)); if (rq_list_empty(plug->mq_list)) return; } @@ -2867,7 +2863,9 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - return blk_mq_request_issue_directly(rq, true); + blk_mq_run_dispatch_ops(rq->q, + ret = blk_mq_request_issue_directly(rq, true)); + return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); -- cgit v1.2.1 From 41adf531e390e7969f00a560b8971cbf42f5a6da Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 6 Dec 2021 19:12:13 +0800 Subject: blk-mq: don't run might_sleep() if the operation needn't blocking The operation protected via blk_mq_run_dispatch_ops() in blk_mq_run_hw_queue won't sleep, so don't run might_sleep() for it. Reported-and-tested-by: Marek Szyprowski Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/blk-mq.h | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 22ec21aa0c22..706e9a836fe6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2048,7 +2048,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. 
 */ - blk_mq_run_dispatch_ops(hctx->queue, + __blk_mq_run_dispatch_ops(hctx->queue, false, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); diff --git a/block/blk-mq.h b/block/blk-mq.h index d62004e2d531..948791ea2a3e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -375,7 +375,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ -#define blk_mq_run_dispatch_ops(q, dispatch_ops) \ +#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if (!blk_queue_has_srcu(q)) { \ rcu_read_lock(); \ @@ -384,11 +384,14 @@ do { \ } else { \ int srcu_idx; \ \ - might_sleep(); \ + might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock((q)->srcu); \ (dispatch_ops); \ srcu_read_unlock((q)->srcu, srcu_idx); \ } \ } while (0) +#define blk_mq_run_dispatch_ops(q, dispatch_ops) \ + __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ + #endif -- cgit v1.2.1 From 73f3760eddc9bc32c207fff06537f98f94bef451 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 6 Dec 2021 11:33:50 +0800 Subject: blk-mq: don't use plug->mq_list->q directly in blk_mq_run_dispatch_ops() blk_mq_run_dispatch_ops() is defined as a macro, and plug->mq_list may change while 'dispatch_ops' runs, so add a local variable to hold the request queue. Reported-and-tested-by: Yi Zhang Fixes: 4cafe86c9267 ("blk-mq: run dispatch lock once in case of issuing from list") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 706e9a836fe6..0bf3523dd1f5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2521,7 +2521,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - blk_mq_run_dispatch_ops(plug->mq_list->q, + struct request_queue *q = rq_list_peek(&plug->mq_list)->q; + + blk_mq_run_dispatch_ops(q, blk_mq_plug_issue_direct(plug, false)); if (rq_list_empty(plug->mq_list)) return; -- cgit v1.2.1 From 8ab30a331946c34e4ba022c44df8624acea1c74e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 6 Dec 2021 20:49:48 +0800 Subject: blk-mq: Drop busy_iter_fn blk_mq_hw_ctx argument The only user of the busy_iter_fn blk_mq_hw_ctx argument is blk_mq_rq_inflight(). Function blk_mq_rq_inflight() uses the hctx to find the associated request queue to match against the request. However, this same check is already done in the caller, bt_iter(), so drop this check. With that change there are no more users of the busy_iter_fn blk_mq_hw_ctx argument, so drop the argument.
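The callback signature change, sketched side by side (illustration only; the real hunks follow):

	/* before */
	bool fn(struct blk_mq_hw_ctx *hctx, struct request *rq,
		void *priv, bool reserved);
	/* after */
	bool fn(struct request *rq, void *priv, bool reserved);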
Reviewed-by: Hannes Reinecke Signed-off-by: John Garry Reviewed-by: Ming Lei Tested-by: Kashyap Desai Link: https://lore.kernel.org/r/1638794990-137490-2-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 2 +- block/blk-mq.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 380e2dd31bfc..d3cf91d764d5 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -254,7 +254,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) return true; if (rq->q == hctx->queue && rq->mq_hctx == hctx) - ret = iter_data->fn(hctx, rq, iter_data->data, reserved); + ret = iter_data->fn(rq, iter_data->data, reserved); blk_mq_put_rq_ref(rq); return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 0bf3523dd1f5..103c0f58853c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -127,8 +127,7 @@ struct mq_inflight { unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, +static bool blk_mq_check_inflight(struct request *rq, void *priv, bool reserved) { struct mq_inflight *mi = priv; @@ -1308,14 +1307,15 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, - void *priv, bool reserved) +static bool blk_mq_rq_inflight(struct request *rq, void *priv, + bool reserved) { /* - * If we find a request that isn't idle and the queue matches, - * we know the queue is busy. Return false to stop the iteration. + * If we find a request that isn't idle we know the queue is busy + * as it's checked in the iter. + * Return false to stop the iteration. */ - if (blk_mq_request_started(rq) && rq->q == hctx->queue) { + if (blk_mq_request_started(rq)) { bool *busy = priv; *busy = true; @@ -1377,8 +1377,7 @@ void blk_mq_put_rq_ref(struct request *rq) __blk_mq_free_request(rq); } -static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, bool reserved) +static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved) { unsigned long *next = priv; -- cgit v1.2.1 From fc39f8d2d1c10ac04976b0a247865bb0cec4dd88 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 6 Dec 2021 20:49:49 +0800 Subject: blk-mq: Delete busy_iter_fn Typedefs busy_iter_fn and busy_tag_iter_fn are now identical, so delete busy_iter_fn to reduce duplication. It would be nicer to delete busy_tag_iter_fn, as the name busy_iter_fn is less specific. However, busy_tag_iter_fn is used in many different parts of the tree, unlike busy_iter_fn, which is only used in block/, so take the straightforward path now so that we can rename treewide later.
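After the previous patch the two typedefs are character-for-character identical; sketched here from the blk-mq headers of this era (an assumption for illustration, not part of the hunks below):

	typedef bool (busy_iter_fn)(struct request *, void *, bool);
	typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);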
Signed-off-by: John Garry Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Tested-by: Kashyap Desai Link: https://lore.kernel.org/r/1638794990-137490-3-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 6 +++--- block/blk-mq-tag.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d3cf91d764d5..58b80d4b7a07 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -215,7 +215,7 @@ void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) struct bt_iter_data { struct blk_mq_hw_ctx *hctx; - busy_iter_fn *fn; + busy_tag_iter_fn *fn; void *data; bool reserved; }; @@ -274,7 +274,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * bitmap_tags member of struct blk_mq_tags. */ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - busy_iter_fn *fn, void *data, bool reserved) + busy_tag_iter_fn *fn, void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, @@ -457,7 +457,7 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { struct blk_mq_hw_ctx *hctx; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index df787b5a23bd..5668e28be0b7 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -28,7 +28,7 @@ extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); -- cgit v1.2.1 From fea9f92f1748083cb82049ed503be30c3d3a9b69 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 6 Dec 2021 20:49:50 +0800 Subject: blk-mq: Optimise blk_mq_queue_tag_busy_iter() for shared tags Kashyap reports high CPU usage in blk_mq_queue_tag_busy_iter() and callees using megaraid SAS RAID card since moving to shared tags [0]. Previously, when shared tags was shared sbitmap, this function was less than optimal since we would iterate through all tags for all hctx's, yet only ever match up to tagset-depth number of rqs. Since the change to shared tags, things are even less efficient if we have parallel callers of blk_mq_queue_tag_busy_iter(). This is because in bt_iter() -> blk_mq_find_and_get_req() there would be more contention on accessing each request ref and tags->lock since they are now shared among all HW queues. Optimise by having separate calls to bt_for_each() for when we're using shared tags. In this case no longer pass a hctx, as it is no longer relevant, and teach bt_iter() about this. Ming suggested something along the lines of this change, apart from a different implementation.
[0] https://lore.kernel.org/linux-block/e4e92abbe9d52bcba6b8cc6c91c442cc@mail.gmail.com/ Signed-off-by: John Garry Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reported-and-tested-by: Kashyap Desai Fixes: e155b0c238b2 ("blk-mq: Use shared tags for shared sbitmap support") Link: https://lore.kernel.org/r/1638794990-137490-4-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 59 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 58b80d4b7a07..e55a6834c9a6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -215,6 +215,7 @@ void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) struct bt_iter_data { struct blk_mq_hw_ctx *hctx; + struct request_queue *q; busy_tag_iter_fn *fn; void *data; bool reserved; @@ -238,11 +239,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; - struct blk_mq_tags *tags = hctx->tags; + struct request_queue *q = iter_data->q; + struct blk_mq_tag_set *set = q->tag_set; bool reserved = iter_data->reserved; + struct blk_mq_tags *tags; struct request *rq; bool ret = true; + if (blk_mq_is_shared_tags(set->flags)) + tags = set->shared_tags; + else + tags = hctx->tags; + if (!reserved) bitnr += tags->nr_reserved_tags; /* @@ -253,7 +261,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) if (!rq) return true; - if (rq->q == hctx->queue && rq->mq_hctx == hctx) + if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) ret = iter_data->fn(rq, iter_data->data, reserved); blk_mq_put_rq_ref(rq); return ret; @@ -262,6 +270,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. + * @q: Request queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request @@ -273,14 +282,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ -static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - busy_tag_iter_fn *fn, void *data, bool reserved) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, + struct sbitmap_queue *bt, busy_tag_iter_fn *fn, + void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, + .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); @@ -460,9 +471,6 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { - struct blk_mq_hw_ctx *hctx; - int i; - /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * while the queue is frozen. 
So we can use q_usage_counter to avoid @@ -471,19 +479,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, if (!percpu_ref_tryget(&q->q_usage_counter)) return; - queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_tags *tags = hctx->tags; - - /* - * If no software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + struct blk_mq_tags *tags = q->tag_set->shared_tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); + bt_for_each(NULL, q, bresv, fn, priv, true); + bt_for_each(NULL, q, btags, fn, priv, false); + } else { + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; + + /* + * If no software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, q, bresv, fn, priv, true); + bt_for_each(hctx, q, btags, fn, priv, false); + } } blk_queue_exit(q); } -- cgit v1.2.1 From 0ba4566cd8a4e645b542e6ddbe3dd26c85ad2408 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 13 Dec 2021 17:11:13 +0000 Subject: bdev: Improve lookup_bdev documentation Add a Context section and rewrite the rest to be clearer. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/20211213171113.3097631-1-willy@infradead.org Signed-off-by: Jens Axboe --- block/bdev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index 587645231d60..8bf93a19041b 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -955,15 +955,15 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) EXPORT_SYMBOL(blkdev_put); /** - * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device - * @dev: return value of the block device's dev_t + * lookup_bdev() - Look up a struct block_device by name. + * @pathname: Name of the block device in the filesystem. + * @dev: Pointer to the block device's dev_t, if found. * * Lookup the block device's dev_t at @pathname in the current - * namespace if possible and return it by @dev. + * namespace if possible and return it in @dev. * - * RETURNS: - * 0 if succeeded, errno otherwise. + * Context: May sleep. + * Return: 0 if succeeded, negative errno otherwise. */ int lookup_bdev(const char *pathname, dev_t *dev) { -- cgit v1.2.1 From 68497092bde9f53e35cafeb52fa9a267ebe0d9b1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Dec 2021 17:23:05 -0700 Subject: block: make queue stat accounting a reference kyber turns on IO statistics when it is loaded on a queue, which means that even if kyber is later unloaded, we're still stuck with stats enabled on the queue. Change the accounting enabled state from a bool to an int, and pair the enable call with an equivalent disable call. This ensures that stats get turned off again appropriately.
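The intended pairing, sketched (kyber already enables accounting when it attaches to a queue; only the disable side is new in the hunks below):

	blk_stat_enable_accounting(q);	/* ++accounting, sets QUEUE_FLAG_STATS */
	...
	blk_stat_disable_accounting(q);	/* --accounting, clears the flag at zero */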
Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-stat.c | 21 ++++++++++++++++----- block/blk-stat.h | 1 + block/kyber-iosched.c | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-stat.c b/block/blk-stat.c index efb2a80db906..2ea01b5c1aca 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -15,7 +15,7 @@ struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; - bool enable_accounting; + int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) @@ -161,7 +161,7 @@ void blk_stat_remove_callback(struct request_queue *q, spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); - if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) + if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); @@ -184,13 +184,24 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } +void blk_stat_disable_accounting(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); + if (!--q->stats->accounting) + blk_queue_flag_clear(QUEUE_FLAG_STATS, q); + spin_unlock_irqrestore(&q->stats->lock, flags); +} +EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); + void blk_stat_enable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); - q->stats->enable_accounting = true; - blk_queue_flag_set(QUEUE_FLAG_STATS, q); + if (!q->stats->accounting++) + blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); @@ -205,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void) INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); - stats->enable_accounting = false; + stats->accounting = 0; return stats; } diff --git a/block/blk-stat.h b/block/blk-stat.h index 58f029af49e5..17e1eb4ec7e2 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -70,6 +70,7 @@ void blk_stat_add(struct request *rq, u64 now); /* record time/size info in request but not add a callback */ void blk_stat_enable_accounting(struct request_queue *q); +void blk_stat_disable_accounting(struct request_queue *q); /** * blk_stat_alloc_callback() - Allocate a block statistics callback. diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index fdd74a4df56f..70ff2a599ef6 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -433,6 +433,7 @@ static void kyber_exit_sched(struct elevator_queue *e) int i; del_timer_sync(&kqd->timer); + blk_stat_disable_accounting(kqd->q); for (i = 0; i < KYBER_NUM_DOMAINS; i++) sbitmap_queue_free(&kqd->domain_tokens[i]); -- cgit v1.2.1 From 5581a5ddfe8d0ede1749bae1f7662fa739f83813 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Dec 2021 15:01:51 -0700 Subject: block: add completion handler for fast path The batched completions only deal with non-partial requests anyway, and they don't deal with any requests that have errors. Add a completion handler that assumes it's a full request and that it's all being ended successfully.
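Sketched contrast with the general path (the full hunk follows; illustration only):

	blk_update_request(rq, error, nr_bytes);	/* handles partial completion and errors */
	blk_complete_request(rq);			/* assumes BLK_STS_OK for all of blk_rq_bytes(rq) */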
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 103c0f58853c..75154cc788db 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -709,6 +709,47 @@ static void blk_print_req_error(struct request *req, blk_status_t status) IOPRIO_PRIO_CLASS(req->ioprio)); } +/* + * Fully end IO on a request. Does not support partial completions, or + * errors. + */ +static void blk_complete_request(struct request *req) +{ + const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; + int total_bytes = blk_rq_bytes(req); + struct bio *bio = req->bio; + + trace_block_rq_complete(req, BLK_STS_OK, total_bytes); + + if (!bio) + return; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) + req->q->integrity.profile->complete_fn(req, total_bytes); +#endif + + blk_account_io_completion(req, total_bytes); + + do { + struct bio *next = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + if (!is_flush) + bio_endio(bio); + bio = next; + } while (bio); + + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + req->bio = NULL; + req->__data_len = 0; +} + /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed @@ -922,7 +963,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) prefetch(rq->bio); prefetch(rq->rq_next); - blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq)); + blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); -- cgit v1.2.1 From fcade2ce06ffebee5c2f6629ddbf2086c0f5ba5a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Dec 2021 16:19:18 -0700 Subject: block: use singly linked list for bio cache Pointless to maintain a head/tail for the list, as we never need to access the tail. Entries are always LIFO for cache hotness reasons. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 15ab0d6d1c06..6fadc977cd7f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -26,7 +26,7 @@ #include "blk-rq-qos.h" struct bio_alloc_cache { - struct bio_list free_list; + struct bio *free_list; unsigned int nr; }; @@ -630,7 +630,8 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int i = 0; struct bio *bio; - while ((bio = bio_list_pop(&cache->free_list)) != NULL) { + while ((bio = cache->free_list) != NULL) { + cache->free_list = bio->bi_next; cache->nr--; bio_free(bio); if (++i == nr) @@ -689,7 +690,8 @@ void bio_put(struct bio *bio) bio_uninit(bio); cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); - bio_list_add_head(&cache->free_list, bio); + bio->bi_next = cache->free_list; + cache->free_list = bio; if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); put_cpu(); @@ -1704,8 +1706,9 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); cache = per_cpu_ptr(bs->cache, get_cpu()); - bio = bio_list_pop(&cache->free_list); - if (bio) { + if (cache->free_list) { + bio = cache->free_list; + cache->free_list = bio->bi_next; cache->nr--; put_cpu(); bio_init(bio, nr_vecs ? 
bio->bi_inline_vecs : NULL, nr_vecs); -- cgit v1.2.1 From 3c67d44de787dff288d7f2a51c372b22f7356db6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Dec 2021 06:48:53 -0700 Subject: block: add mq_ops->queue_rqs hook If we have a list of requests in our plug list, send it to the driver in one go, if possible. The driver must set mq_ops->queue_rqs() to support this, if not the usual one-by-one path is used. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 75154cc788db..51991232824a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2553,6 +2553,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct blk_mq_hw_ctx *this_hctx; struct blk_mq_ctx *this_ctx; + struct request *rq; unsigned int depth; LIST_HEAD(list); @@ -2561,7 +2562,28 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - struct request_queue *q = rq_list_peek(&plug->mq_list)->q; + struct request_queue *q; + + rq = rq_list_peek(&plug->mq_list); + q = rq->q; + + /* + * Peek first request and see if we have a ->queue_rqs() hook. + * If we do, we can dispatch the whole plug list in one go. We + * already know at this point that all requests belong to the + * same queue, caller must ensure that's the case. + * + * Since we pass off the full list to the driver at this point, + * we do not increment the active request count for the queue. + * Bypass shared tags for now because of that. + */ + if (q->mq_ops->queue_rqs && + !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + blk_mq_run_dispatch_ops(q, + q->mq_ops->queue_rqs(&plug->mq_list)); + if (rq_list_empty(plug->mq_list)) + return; + } blk_mq_run_dispatch_ops(q, blk_mq_plug_issue_direct(plug, false)); @@ -2573,8 +2595,6 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) this_ctx = NULL; depth = 0; do { - struct request *rq; - rq = rq_list_pop(&plug->mq_list); if (!this_hctx) { -- cgit v1.2.1 From 8a2ba1785c5803d59a63b6320ff54fd4a37a41ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:21 +0100 Subject: block: remove the nr_task field from struct io_context Nothing ever looks at ->nr_tasks, so remove it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 536fb496ad76..96336c2134ef 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -207,7 +207,6 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - atomic_dec(&ioc->nr_tasks); put_io_context_active(ioc); } @@ -259,7 +258,6 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) return NULL; atomic_long_set(&ioc->refcount, 1); - atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); @@ -339,7 +337,6 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_IO) { atomic_long_inc(&ioc->refcount); atomic_inc(&ioc->active_ref); - atomic_inc(&ioc->nr_tasks); tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); -- cgit v1.2.1 From 0aed2f162bbc7853fe91c0d70492ea73c4e9cb07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:22 +0100 Subject: block: simplify struct io_context refcounting Don't hold a reference to ->refcount for each active reference, but just one for all active references. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 96336c2134ef..9cde3906be3c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -180,10 +180,8 @@ static void put_io_context_active(struct io_context *ioc) { struct io_cq *icq; - if (!atomic_dec_and_test(&ioc->active_ref)) { - put_io_context(ioc); + if (!atomic_dec_and_test(&ioc->active_ref)) return; - } spin_lock_irq(&ioc->lock); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { @@ -335,7 +333,6 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - atomic_long_inc(&ioc->refcount); atomic_inc(&ioc->active_ref); tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { -- cgit v1.2.1 From 4be8a2eaff2e4473b6e8ad9a3857bc9b1e79c8ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:23 +0100 Subject: block: refactor put_iocontext_active Factor out a ioc_exit_icqs helper to tear down the icqs and the fold the rest of put_iocontext_active into exit_io_context. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9cde3906be3c..0380e33930e3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -54,6 +54,16 @@ static void ioc_exit_icq(struct io_cq *icq) icq->flags |= ICQ_EXITED; } +static void ioc_exit_icqs(struct io_context *ioc) +{ + struct io_cq *icq; + + spin_lock_irq(&ioc->lock); + hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) + ioc_exit_icq(icq); + spin_unlock_irq(&ioc->lock); +} + /* * Release an icq. 
Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. @@ -169,32 +179,6 @@ void put_io_context(struct io_context *ioc) } EXPORT_SYMBOL_GPL(put_io_context); -/** - * put_io_context_active - put active reference on ioc - * @ioc: ioc of interest - * - * Put an active reference to an ioc. If active reference reaches zero after - * put, @ioc can never issue further IOs and ioscheds are notified. - */ -static void put_io_context_active(struct io_context *ioc) -{ - struct io_cq *icq; - - if (!atomic_dec_and_test(&ioc->active_ref)) - return; - - spin_lock_irq(&ioc->lock); - hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { - if (icq->flags & ICQ_EXITED) - continue; - - ioc_exit_icq(icq); - } - spin_unlock_irq(&ioc->lock); - - put_io_context(ioc); -} - /* Called by the exiting task */ void exit_io_context(struct task_struct *task) { @@ -205,7 +189,10 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - put_io_context_active(ioc); + if (atomic_dec_and_test(&ioc->active_ref)) { + ioc_exit_icqs(ioc); + put_io_context(ioc); + } } static void __ioc_clear_queue(struct list_head *icq_list) -- cgit v1.2.1 From 8a20c0c7e0cea7eb0c32fd6b63ff514c9ac32b8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:24 +0100 Subject: block: remove the NULL ioc check in put_io_context No caller passes in a NULL pointer, so remove the check. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 0380e33930e3..04f3d2b0ca7d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -155,9 +155,6 @@ void put_io_context(struct io_context *ioc) unsigned long flags; bool free_ioc = false; - if (ioc == NULL) - return; - BUG_ON(atomic_long_read(&ioc->refcount) <= 0); /* -- cgit v1.2.1 From edf70ff5a1ed9769da35178454d743828061a6a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:25 +0100 Subject: block: refactor put_io_context Move the code to delay freeing the icqs into a separate helper. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 04f3d2b0ca7d..ca996214c10a 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -143,6 +143,24 @@ static void ioc_release_fn(struct work_struct *work) kmem_cache_free(iocontext_cachep, ioc); } +/* + * Releasing icqs requires reverse order double locking and we may already be + * holding a queue_lock. Do it asynchronously from a workqueue. 
+ */ +static bool ioc_delay_free(struct io_context *ioc) +{ + unsigned long flags; + + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) { + queue_work(system_power_efficient_wq, &ioc->release_work); + spin_unlock_irqrestore(&ioc->lock, flags); + return true; + } + spin_unlock_irqrestore(&ioc->lock, flags); + return false; +} + /** * put_io_context - put a reference of io_context * @ioc: io_context to put @@ -152,26 +170,8 @@ static void ioc_release_fn(struct work_struct *work) */ void put_io_context(struct io_context *ioc) { - unsigned long flags; - bool free_ioc = false; - BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - - /* - * Releasing ioc requires reverse order double locking and we may - * already be holding a queue_lock. Do it asynchronously from wq. - */ - if (atomic_long_dec_and_test(&ioc->refcount)) { - spin_lock_irqsave(&ioc->lock, flags); - if (!hlist_empty(&ioc->icq_list)) - queue_work(system_power_efficient_wq, - &ioc->release_work); - else - free_ioc = true; - spin_unlock_irqrestore(&ioc->lock, flags); - } - - if (free_ioc) + if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc)) kmem_cache_free(iocontext_cachep, ioc); } EXPORT_SYMBOL_GPL(put_io_context); -- cgit v1.2.1 From 091abcb3efd71cb18e80c8f040d9e4a634d8906d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:26 +0100 Subject: block: cleanup ioc_clear_queue Fold __ioc_clear_queue into ioc_clear_queue and switch to always use plain _irq locking instead of the more expensive _irqsave that is not needed here. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index ca996214c10a..f98a29ee8f36 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -192,27 +192,6 @@ void exit_io_context(struct task_struct *task) } } -static void __ioc_clear_queue(struct list_head *icq_list) -{ - unsigned long flags; - - rcu_read_lock(); - while (!list_empty(icq_list)) { - struct io_cq *icq = list_entry(icq_list->next, - struct io_cq, q_node); - struct io_context *ioc = icq->ioc; - - spin_lock_irqsave(&ioc->lock, flags); - if (icq->flags & ICQ_DESTROYED) { - spin_unlock_irqrestore(&ioc->lock, flags); - continue; - } - ioc_destroy_icq(icq); - spin_unlock_irqrestore(&ioc->lock, flags); - } - rcu_read_unlock(); -} - /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared @@ -227,7 +206,17 @@ void ioc_clear_queue(struct request_queue *q) list_splice_init(&q->icq_list, &icq_list); spin_unlock_irq(&q->queue_lock); - __ioc_clear_queue(&icq_list); + rcu_read_lock(); + while (!list_empty(&icq_list)) { + struct io_cq *icq = + list_entry(icq_list.next, struct io_cq, q_node); + + spin_lock_irq(&icq->ioc->lock); + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + spin_unlock_irq(&icq->ioc->lock); + } + rcu_read_unlock(); } static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) -- cgit v1.2.1 From a411cd3cfdc5bbd1329d5b33dbf39e2b5213969d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:27 +0100 Subject: block: move set_task_ioprio to blk-ioc.c Keep set_task_ioprio with the other low-level code that accesses the io_context structure. 
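For context: set_task_ioprio is the helper behind the ioprio_set(2) system call. The following is a minimal, illustrative userspace sketch of that interface, not part of this patch; the IOPRIO_* values are copied from the kernel's UAPI header so the example builds standalone.

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Values from include/uapi/linux/ioprio.h, copied for a self-contained build. */
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	/* Best-effort class, priority level 4, applied to the calling process. */
	int ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);

	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, ioprio) < 0) {
		perror("ioprio_set");
		return 1;
	}
	return 0;
}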
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 34 ++++++++++++++++++++++++++++++++-- block/ioprio.c | 32 -------------------------------- 2 files changed, 32 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f98a29ee8f36..c25ce2f3eb19 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -8,6 +8,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/security.h> #include <linux/sched/task.h> #include "blk.h" @@ -280,8 +281,8 @@ static struct io_context *create_task_io_context(struct task_struct *task, * This function always goes through task_lock() and it's better to use * %current->io_context + get_io_context() for %current. */ -struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node) +static struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node) { struct io_context *ioc; @@ -298,6 +299,35 @@ struct io_context *get_task_io_context(struct task_struct *task, return ioc; } +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; + + rcu_read_lock(); + tcred = __task_cred(task); + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + err = security_task_setioprio(task, ioprio); + if (err) + return err; + + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc->ioprio = ioprio; + put_io_context(ioc); + } + + return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + int __copy_io(unsigned long clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; diff --git a/block/ioprio.c b/block/ioprio.c index 313c14a70bbd..e118f4bf2dc6 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -22,46 +22,14 @@ */ #include <linux/gfp.h> #include <linux/kernel.h> -#include <linux/export.h> #include <linux/ioprio.h> #include <linux/cred.h> #include <linux/blkdev.h> #include <linux/capability.h> -#include <linux/sched/user.h> -#include <linux/sched/task.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/pid_namespace.h> -int set_task_ioprio(struct task_struct *task, int ioprio) -{ - int err; - struct io_context *ioc; - const struct cred *cred = current_cred(), *tcred; - - rcu_read_lock(); - tcred = __task_cred(task); - if (!uid_eq(tcred->uid, cred->euid) && - !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); - - err = security_task_setioprio(task, ioprio); - if (err) - return err; - - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } - - return err; -} -EXPORT_SYMBOL_GPL(set_task_ioprio); - int ioprio_check_cap(int ioprio) { int class = IOPRIO_PRIO_CLASS(ioprio); -- cgit v1.2.1 From 8472161b77c41d260c5ba0af6bf940269b297bb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:28 +0100 Subject: block: fold get_task_io_context into set_task_ioprio Fold get_task_io_context into its only caller, and simplify the code as no reference to the I/O context is required to just set the ioprio field.
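Why no reference is needed, as a sketch (assumption: task->io_context is only cleared by exit_io_context(), which runs under task_lock(), so holding task_lock() pins the context):

	/* Holding task_lock(task) keeps task->io_context alive, so a
	 * plain field update is safe without a get/put round trip: */
	task_lock(task);
	if (task->io_context)
		task->io_context->ioprio = ioprio;
	task_unlock(task);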
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 52 ++++++++++++++-------------------------------------- 1 file changed, 14 insertions(+), 38 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index c25ce2f3eb19..1ba7cfedca2d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -268,41 +268,9 @@ static struct io_context *create_task_io_context(struct task_struct *task, return ioc; } -/** - * get_task_io_context - get io_context of a task - * @task: task of interest - * @gfp_flags: allocation flags, used if allocation is necessary - * @node: allocation node, used if allocation is necessary - * - * Return io_context of @task. If it doesn't exist, it is created with - * @gfp_flags and @node. The returned io_context has its reference count - * incremented. - * - * This function always goes through task_lock() and it's better to use - * %current->io_context + get_io_context() for %current. - */ -static struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node) -{ - struct io_context *ioc; - - might_sleep_if(gfpflags_allow_blocking(gfp_flags)); - - task_lock(task); - ioc = task->io_context; - if (unlikely(!ioc)) { - task_unlock(task); - return create_task_io_context(task, gfp_flags, node); - } - get_io_context(ioc); - task_unlock(task); - return ioc; -} - int set_task_ioprio(struct task_struct *task, int ioprio) { int err; - struct io_context *ioc; const struct cred *cred = current_cred(), *tcred; rcu_read_lock(); @@ -318,13 +286,21 @@ int set_task_ioprio(struct task_struct *task, int ioprio) if (err) return err; - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } + task_lock(task); + if (unlikely(!task->io_context)) { + struct io_context *ioc; - return err; + task_unlock(task); + ioc = create_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc->ioprio = ioprio; + put_io_context(ioc); + } + return 0; + } + task->io_context->ioprio = ioprio; + task_unlock(task); + return 0; } EXPORT_SYMBOL_GPL(set_task_ioprio); -- cgit v1.2.1 From 5fc11eebb4a98df5324a4de369bb5ab7f0007ff7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:29 +0100 Subject: block: open code create_task_io_context in set_task_ioprio The flow in set_task_ioprio can be simplified by simply open coding create_task_io_context, which removes a refcount roundtrip on the I/O context. 
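The open-coded flow follows a common pattern: allocate optimistically without the lock held, recheck under the lock, and free the allocation if another installer won the race. A self-contained userspace analogue of that pattern (illustrative only, not kernel code):

#include <pthread.h>
#include <stdlib.h>

struct ioctx { int ioprio; };

static struct ioctx *installed;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int set_prio(int prio)
{
	struct ioctx *fresh = malloc(sizeof(*fresh));	/* allocate outside the lock */

	if (!fresh)
		return -1;
	pthread_mutex_lock(&lock);
	if (installed)
		free(fresh);		/* lost the race: someone installed one first */
	else
		installed = fresh;	/* won the race: publish while holding the lock */
	installed->ioprio = prio;
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	return set_prio(4) ? 1 : 0;
}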
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1ba7cfedca2d..cff0e3bdae53 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -291,12 +291,18 @@ int set_task_ioprio(struct task_struct *task, int ioprio) struct io_context *ioc; task_unlock(task); - ioc = create_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); + + ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE); + if (!ioc) + return -ENOMEM; + + task_lock(task); + if (task->io_context || (task->flags & PF_EXITING)) { + kmem_cache_free(iocontext_cachep, ioc); + ioc = task->io_context; + } else { + task->io_context = ioc; } - return 0; } task->io_context->ioprio = ioprio; task_unlock(task); -- cgit v1.2.1 From 90b627f5426ce144cdd4ea585d1f7812359a1a6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:30 +0100 Subject: block: fold create_task_io_context into ioc_find_get_icq Fold create_task_io_context into the only remaining caller. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index cff0e3bdae53..dc7fb064fd5f 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -238,36 +238,6 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) return ioc; } -static struct io_context *create_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node) -{ - struct io_context *ioc; - - ioc = alloc_io_context(gfp_flags, node); - if (!ioc) - return NULL; - - /* - * Try to install. ioc shouldn't be installed if someone else - * already did or @task, which isn't %current, is exiting. Note - * that we need to allow ioc creation on exiting %current as exit - * path may issue IOs from e.g. exit_files(). The exit path is - * responsible for not issuing IO after exit_io_context(). - */ - task_lock(task); - if (!task->io_context && - (task == current || !(task->flags & PF_EXITING))) - task->io_context = ioc; - else - kmem_cache_free(iocontext_cachep, ioc); - - ioc = task->io_context; - if (ioc) - get_io_context(ioc); - task_unlock(task); - return ioc; -} - int set_task_ioprio(struct task_struct *task, int ioprio) { int err; @@ -426,9 +396,20 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) struct io_cq *icq = NULL; if (unlikely(!ioc)) { - ioc = create_task_io_context(current, GFP_ATOMIC, q->node); + ioc = alloc_io_context(GFP_ATOMIC, q->node); if (!ioc) return NULL; + + task_lock(current); + if (current->io_context) { + kmem_cache_free(iocontext_cachep, ioc); + ioc = current->io_context; + } else { + current->io_context = ioc; + } + + get_io_context(ioc); + task_unlock(current); } else { get_io_context(ioc); -- cgit v1.2.1 From 5ef1630586317e92c9ebd7b4ce48f393b7ff790f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:31 +0100 Subject: block: only build the icq tracking code when needed Only bfq needs the code to track icqs, so make it conditional.
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-12-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 3 +++ block/Kconfig.iosched | 1 + block/blk-ioc.c | 68 +++++++++++++++++++++++++++++++-------------------- block/blk.h | 6 +++++ 4 files changed, 51 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index c6ce41a5e5b2..d5d4197b7ed2 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -35,6 +35,9 @@ config BLK_CGROUP_RWSTAT config BLK_DEV_BSG_COMMON tristate +config BLK_ICQ + bool + config BLK_DEV_BSGLIB bool "Block layer SG support v4 helper lib" select BLK_DEV_BSG_COMMON diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 885fee86dfca..615516146086 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -18,6 +18,7 @@ config MQ_IOSCHED_KYBER config IOSCHED_BFQ tristate "BFQ I/O scheduler" + select BLK_ICQ help BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of of the device among all processes according to their weights, diff --git a/block/blk-ioc.c b/block/blk-ioc.c index dc7fb064fd5f..87bdc9ca8295 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -19,6 +19,7 @@ */ static struct kmem_cache *iocontext_cachep; +#ifdef CONFIG_BLK_ICQ /** * get_io_context - increment reference count to io_context * @ioc: io_context to get @@ -162,6 +163,42 @@ static bool ioc_delay_free(struct io_context *ioc) return false; } +/** + * ioc_clear_queue - break any ioc association with the specified queue + * @q: request_queue being cleared + * + * Walk @q->icq_list and exit all io_cq's. + */ +void ioc_clear_queue(struct request_queue *q) +{ + LIST_HEAD(icq_list); + + spin_lock_irq(&q->queue_lock); + list_splice_init(&q->icq_list, &icq_list); + spin_unlock_irq(&q->queue_lock); + + rcu_read_lock(); + while (!list_empty(&icq_list)) { + struct io_cq *icq = + list_entry(icq_list.next, struct io_cq, q_node); + + spin_lock_irq(&icq->ioc->lock); + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + spin_unlock_irq(&icq->ioc->lock); + } + rcu_read_unlock(); +} +#else /* CONFIG_BLK_ICQ */ +static inline void ioc_exit_icqs(struct io_context *ioc) +{ +} +static inline bool ioc_delay_free(struct io_context *ioc) +{ + return false; +} +#endif /* CONFIG_BLK_ICQ */ + /** * put_io_context - put a reference of io_context * @ioc: io_context to put @@ -193,33 +230,6 @@ void exit_io_context(struct task_struct *task) } } -/** - * ioc_clear_queue - break any ioc association with the specified queue - * @q: request_queue being cleared - * - * Walk @q->icq_list and exit all io_cq's. 
- */ -void ioc_clear_queue(struct request_queue *q) -{ - LIST_HEAD(icq_list); - - spin_lock_irq(&q->queue_lock); - list_splice_init(&q->icq_list, &icq_list); - spin_unlock_irq(&q->queue_lock); - - rcu_read_lock(); - while (!list_empty(&icq_list)) { - struct io_cq *icq = - list_entry(icq_list.next, struct io_cq, q_node); - - spin_lock_irq(&icq->ioc->lock); - if (!(icq->flags & ICQ_DESTROYED)) - ioc_destroy_icq(icq); - spin_unlock_irq(&icq->ioc->lock); - } - rcu_read_unlock(); -} - static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; @@ -231,10 +241,12 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) atomic_long_set(&ioc->refcount, 1); atomic_set(&ioc->active_ref, 1); +#ifdef CONFIG_BLK_ICQ spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); +#endif return ioc; } @@ -300,6 +312,7 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) return 0; } +#ifdef CONFIG_BLK_ICQ /** * ioc_lookup_icq - lookup io_cq from ioc * @q: the associated request_queue @@ -428,6 +441,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) return icq; } EXPORT_SYMBOL_GPL(ioc_find_get_icq); +#endif /* CONFIG_BLK_ICQ */ static int __init blk_ioc_init(void) { diff --git a/block/blk.h b/block/blk.h index 7ccb7c7d86b3..8bd43b3ad33d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -366,7 +366,13 @@ static inline unsigned int bio_aligned_discard_max_sectors( */ struct io_cq *ioc_find_get_icq(struct request_queue *q); struct io_cq *ioc_lookup_icq(struct request_queue *q); +#ifdef CONFIG_BLK_ICQ void ioc_clear_queue(struct request_queue *q); +#else +static inline void ioc_clear_queue(struct request_queue *q) +{ +} +#endif /* CONFIG_BLK_ICQ */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); -- cgit v1.2.1 From 518579a9af10a4b7b952a8366566fdcc7cfce3ca Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 20 Dec 2021 12:59:19 -0800 Subject: blk-mq: check quiesce state before queue_rqs Low-level drivers don't expect to see new requests after a successful quiesce completes. Check the queue quiesce state within the RCU-protected area prior to calling the driver's queue_rqs().
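For reference, the contract this protects, as a hedged sketch: blk_mq_quiesce_queue() and blk_mq_unquiesce_queue() are the real blk-mq APIs, while the surrounding driver code is hypothetical.

	/* Hypothetical driver reconfiguration path relying on quiesce: */
	blk_mq_quiesce_queue(q);	/* after this returns, blk-mq must not
					 * invoke ->queue_rq() or ->queue_rqs() */
	reconfigure_hw_queues(dev);	/* hypothetical driver-private work */
	blk_mq_unquiesce_queue(q);	/* dispatch may resume */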
Fixes: 3c67d44de787 ("block: add mq_ops->queue_rqs hook") Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20211220205919.180191-1-kbusch@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 51991232824a..0d7c9d3e0329 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2549,6 +2549,14 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) blk_mq_commit_rqs(hctx, &queued, from_schedule); } +static void __blk_mq_flush_plug_list(struct request_queue *q, + struct blk_plug *plug) +{ + if (blk_queue_quiesced(q)) + return; + q->mq_ops->queue_rqs(&plug->mq_list); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct blk_mq_hw_ctx *this_hctx; @@ -2580,7 +2588,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) if (q->mq_ops->queue_rqs && !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { blk_mq_run_dispatch_ops(q, - q->mq_ops->queue_rqs(&plug->mq_list)); + __blk_mq_flush_plug_list(q, plug)); if (rq_list_empty(plug->mq_list)) return; } -- cgit v1.2.1 From a957b61254a7d59a6c14ee2ac2db20a62eb299a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Dec 2021 20:32:24 -0700 Subject: block: fix error in handling dead task for ioprio setting Don't combine the task-exiting and the "already have io_context" cases; we need to just abort if the task is marked as dead. Return -ESRCH, which is the documented value for ioprio_set() if the specified task could not be found. Reported-by: Dan Carpenter Reported-by: syzbot+8836466a79f4175961b0@syzkaller.appspotmail.com Fixes: 5fc11eebb4a9 ("block: open code create_task_io_context in set_task_ioprio") Signed-off-by: Jens Axboe --- block/blk-ioc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 87bdc9ca8295..71c3a933cf16 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -279,7 +279,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio) return -ENOMEM; task_lock(task); - if (task->io_context || (task->flags & PF_EXITING)) { + if (task->flags & PF_EXITING) { + err = -ESRCH; + kmem_cache_free(iocontext_cachep, ioc); + goto out; + } + if (task->io_context) { kmem_cache_free(iocontext_cachep, ioc); ioc = task->io_context; } else { @@ -287,8 +292,9 @@ int set_task_ioprio(struct task_struct *task, int ioprio) } } task->io_context->ioprio = ioprio; +out: task_unlock(task); - return 0; + return err; } EXPORT_SYMBOL_GPL(set_task_ioprio); -- cgit v1.2.1 From 37e11c3616f6182b6bd7f95a04df035b43464f39 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 21 Dec 2021 12:04:36 +0800 Subject: block: call blk_exit_queue() before freeing q->stats blk_stat_disable_accounting() was added in commit 68497092bde9 ("block: make queue stat accounting a reference") and is called in kyber_exit_sched(). So we have to free q->stats only after the elevator is unloaded by blk_exit_queue() in blk_release_queue(); otherwise a kernel panic is caused.
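The ordering requirement, spelled out (the two calls below are the ones reordered by the patch; the comments are illustrative):

	blk_exit_queue(q);		/* unloads the elevator; kyber's exit path
					 * calls blk_stat_disable_accounting(q),
					 * which still dereferences q->stats */
	blk_free_queue_stats(q->stats);	/* only safe once the elevator is gone */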
Fixes: 68497092bde9 ("block: make queue stat accounting a reference") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211221040436.1333880-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3e6357321225..e20eadfcf5c8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -791,11 +791,11 @@ static void blk_release_queue(struct kobject *kobj) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); + blk_exit_queue(q); + blk_free_queue_stats(q->stats); kfree(q->poll_stat); - blk_exit_queue(q); - blk_queue_free_zone_bitmaps(q); if (queue_is_mq(q)) -- cgit v1.2.1 From 99d8690aae4b2f0d1d90075de355ac087f820a66 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Dec 2021 17:18:51 +0100 Subject: block: fix error unwinding in device_add_disk Once device_add is called, disk->ev will be freed by disk_release, so we should not free it twice. Fix this by allocating disk->ev after device_add so that the extra local unwinding can be removed entirely. Based on an earlier patch from Tetsuo Handa. Reported-by: syzbot Tested-by: syzbot Fixes: 83cbce9574462c6b ("block: add error handling for device_add_disk / add_disk") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211221161851.788424-1-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 3c139a1b6f04..603db5d6f10c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -442,10 +442,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, disk->first_minor = ret; } - ret = disk_alloc_events(disk); - if (ret) - goto out_free_ext_minor; - /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); @@ -456,7 +452,12 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); if (ret) - goto out_disk_release_events; + goto out_free_ext_minor; + + ret = disk_alloc_events(disk); + if (ret) + goto out_device_del; + if (!sysfs_deprecated) { ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); @@ -538,8 +539,6 @@ out_del_block_link: sysfs_remove_link(block_depr, dev_name(ddev)); out_device_del: device_del(ddev); -out_disk_release_events: - disk_release_events(disk); out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); -- cgit v1.2.1 From e338924bd05d6e71574bc13e310c89e10e49a8a5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 17 Dec 2021 23:51:25 +0900 Subject: block: check minor range in device_add_disk() ioctl(fd, LOOP_CTL_ADD, 1048576) causes a "sysfs: cannot create duplicate filename '/dev/block/7:0'" message, because such a request is treated as if it were ioctl(fd, LOOP_CTL_ADD, 0) due to MINORMASK == 1048575. Verify that all minor numbers for that device fit in the minor range.
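A minimal userspace repro sketch of the report above (assumes the loop driver is loaded; 1048576 == MINORMASK + 1, so the requested minor wraps to 0 and collides with loop0):

#include <fcntl.h>
#include <linux/loop.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/loop-control", O_RDWR);

	if (fd < 0) {
		perror("open /dev/loop-control");
		return 1;
	}
	/* Unpatched: aliases minor 0 and triggers the duplicate sysfs name.
	 * With this patch, device_add_disk() should reject it with -EINVAL. */
	if (ioctl(fd, LOOP_CTL_ADD, 1048576) < 0)
		perror("LOOP_CTL_ADD");
	close(fd);
	return 0;
}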
Reported-by: wangyangbo Signed-off-by: Tetsuo Handa Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/b1b19379-23ee-5379-0eb5-94bf5f79f1b4@i-love.sakura.ne.jp Signed-off-by: Jens Axboe --- block/genhd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 603db5d6f10c..626c8406f21a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -431,6 +431,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, DISK_MAX_PARTS); disk->minors = DISK_MAX_PARTS; } + if (disk->first_minor + disk->minors > MINORMASK + 1) + return -EINVAL; } else { if (WARN_ON(disk->minors)) return -EINVAL; -- cgit v1.2.1 From 669a064625fa3a06ddf8a4ac1f35b7436b99f133 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 23 Dec 2021 13:53:00 +0100 Subject: block: drop needless assignment in set_task_ioprio() Commit 5fc11eebb4a9 ("block: open code create_task_io_context in set_task_ioprio") introduces a needless assignment 'ioc = task->io_context', as the local variable ioc is not further used before returning. Even after the further fix, commit a957b61254a7 ("block: fix error in handling dead task for ioprio setting"), the assignment still remains needless. Drop this needless assignment in set_task_ioprio(). This code smell was identified with 'make clang-analyzer'. Fixes: 5fc11eebb4a9 ("block: open code create_task_io_context in set_task_ioprio") Signed-off-by: Lukas Bulwahn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211223125300.20691-1-lukas.bulwahn@gmail.com Signed-off-by: Jens Axboe --- block/blk-ioc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 71c3a933cf16..11f49f78db32 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -284,12 +284,10 @@ int set_task_ioprio(struct task_struct *task, int ioprio) kmem_cache_free(iocontext_cachep, ioc); goto out; } - if (task->io_context) { + if (task->io_context) kmem_cache_free(iocontext_cachep, ioc); - ioc = task->io_context; - } else { + else task->io_context = ioc; - } } task->io_context->ioprio = ioprio; out: -- cgit v1.2.1 From 9d497e2941c30a060ba62d5485b3bc9d91ffb09e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 4 Jan 2022 21:42:23 +0800 Subject: block: don't protect submit_bio_checks by q_usage_counter Commit cc9c884dd7f4 ("block: call submit_bio_checks under q_usage_counter") uses q_usage_counter to protect submit_bio_checks, to avoid IO after the disk is deleted by del_gendisk(). It turns out the protection isn't necessary, because once blk_mq_freeze_queue_wait() in del_gendisk() returns: 1) all in-flight IO has been done; 2) all new IO will be failed in __bio_queue_enter(), because q_usage_counter is dead and GD_DEAD is set; 3) both the disk and the request queue instance are safe, since the caller of submit_bio() guarantees that the disk can't be closed. Since submit_bio_checks() doesn't need the protection of q_usage_counter, we can move it before calling blk_mq_submit_bio() and ->submit_bio(). With this change, we no longer need to throttle the queue while holding one allocated request, so driver tags or requests won't be wasted on throttling. Meanwhile, we can unify the bio checks for both bio-based and request-based drivers.
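Point 2) relies on __bio_queue_enter() failing new IO on a dead disk. An abridged sketch of that behavior follows (an assumption-laden simplification, not the exact kernel source; the REQ_NOWAIT and power-management handling are omitted):

	static int __bio_queue_enter(struct request_queue *q, struct bio *bio)
	{
		while (!blk_try_enter_queue(q, false)) {
			struct gendisk *disk = bio->bi_bdev->bd_disk;

			/* Frozen queue: wait, unless the disk was deleted. */
			wait_event(q->mq_freeze_wq,
				   !q->mq_freeze_depth ||
				   test_bit(GD_DEAD, &disk->state));
			if (test_bit(GD_DEAD, &disk->state)) {
				bio_io_error(bio);	/* fail instead of hanging */
				return -ENODEV;
			}
		}
		return 0;
	}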
Cc: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220104134223.590803-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 14 +++++++++----- block/blk-mq.c | 39 +++++++++++++-------------------------- 2 files changed, 22 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 10619fd83c1b..97f8bc8d3a79 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -787,17 +787,21 @@ end_io: static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) { - if (unlikely(bio_queue_enter(bio) != 0)) - return; - if (submit_bio_checks(bio) && blk_crypto_bio_prep(&bio)) - disk->fops->submit_bio(bio); - blk_queue_exit(disk->queue); + if (blk_crypto_bio_prep(&bio)) { + if (likely(bio_queue_enter(bio) == 0)) { + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); + } + } } static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; + if (unlikely(!submit_bio_checks(bio))) + return; + if (!disk->fops->submit_bio) blk_mq_submit_bio(bio); else diff --git a/block/blk-mq.c b/block/blk-mq.c index 0d7c9d3e0329..a6d4780580fc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2714,26 +2714,18 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio, - unsigned int nsegs) + struct bio *bio) { struct blk_mq_alloc_data data = { .q = q, .nr_tags = 1, + .cmd_flags = bio->bi_opf, }; struct request *rq; if (unlikely(bio_queue_enter(bio))) return NULL; - if (unlikely(!submit_bio_checks(bio))) - goto queue_exit; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - goto queue_exit; - rq_qos_throttle(q, bio); - - /* ->bi_opf is finalized after submit_bio_checks() returns */ - data.cmd_flags = bio->bi_opf; if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; @@ -2746,13 +2738,12 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); -queue_exit: blk_queue_exit(q); return NULL; } static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio **bio, unsigned int nsegs) + struct blk_plug *plug, struct bio *bio) { struct request *rq; @@ -2762,21 +2753,14 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!rq || rq->q != q) return NULL; - if (unlikely(!submit_bio_checks(*bio))) - return NULL; - if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { - *bio = NULL; + if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) return NULL; - } - if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) - return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) return NULL; - rq->cmd_flags = (*bio)->bi_opf; + rq->cmd_flags = bio->bi_opf; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); - rq_qos_throttle(q, *bio); return rq; } @@ -2812,11 +2796,14 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) + return; + + rq_qos_throttle(q, bio); + + rq = blk_mq_get_cached_request(q, plug, bio); if (!rq) { - if (!bio) - return; - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + rq = blk_mq_get_new_requests(q, plug, bio); if (unlikely(!rq)) return; } -- cgit 
v1.2.1