
Commit 33a1d14

Mohamed Khalfella authored and kawasaki committed
nvme: Convert tag_list mutex to rwsemaphore to avoid deadlock
blk_mq_{add,del}_queue_tag_set() add and remove queues from a tagset. These functions make sure that the tagset and its queues are marked as shared when two or more queues are attached to the same tagset. Initially a tagset starts out unshared, and when the number of attached queues reaches two, blk_mq_add_queue_tag_set() marks it as shared along with all the queues attached to it. When the number of attached queues drops to one, blk_mq_del_queue_tag_set() needs to mark both the tagset and the remaining queue as unshared. Both functions need to freeze the queues currently in the tagset before setting or unsetting the BLK_MQ_F_TAG_QUEUE_SHARED flag. While doing so, both functions hold the set->tag_list_lock mutex, which makes sense as we do not want queues to be added or deleted in the process.

This used to work fine until commit 98d81f0 ("nvme: use blk_mq_[un]quiesce_tagset") made the nvme driver quiesce the tagset instead of quiescing individual queues. blk_mq_quiesce_tagset() quiesces the queues in set->tag_list while also holding set->tag_list_lock. This results in a deadlock between two threads with these stack traces:

  __schedule+0x48e/0xed0
  schedule+0x5a/0xc0
  schedule_preempt_disabled+0x11/0x20
  __mutex_lock.constprop.0+0x3cc/0x760
  blk_mq_quiesce_tagset+0x26/0xd0
  nvme_dev_disable_locked+0x77/0x280 [nvme]
  nvme_timeout+0x268/0x320 [nvme]
  blk_mq_handle_expired+0x5d/0x90
  bt_iter+0x7e/0x90
  blk_mq_queue_tag_busy_iter+0x2b2/0x590
  ? __blk_mq_complete_request_remote+0x10/0x10
  ? __blk_mq_complete_request_remote+0x10/0x10
  blk_mq_timeout_work+0x15b/0x1a0
  process_one_work+0x133/0x2f0
  ? mod_delayed_work_on+0x90/0x90
  worker_thread+0x2ec/0x400
  ? mod_delayed_work_on+0x90/0x90
  kthread+0xe2/0x110
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x2d/0x50
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork_asm+0x11/0x20

  __schedule+0x48e/0xed0
  schedule+0x5a/0xc0
  blk_mq_freeze_queue_wait+0x62/0x90
  ? destroy_sched_domains_rcu+0x30/0x30
  blk_mq_exit_queue+0x151/0x180
  disk_release+0xe3/0xf0
  device_release+0x31/0x90
  kobject_put+0x6d/0x180
  nvme_scan_ns+0x858/0xc90 [nvme_core]
  ? nvme_scan_work+0x281/0x560 [nvme_core]
  nvme_scan_work+0x281/0x560 [nvme_core]
  process_one_work+0x133/0x2f0
  ? mod_delayed_work_on+0x90/0x90
  worker_thread+0x2ec/0x400
  ? mod_delayed_work_on+0x90/0x90
  kthread+0xe2/0x110
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x2d/0x50
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork_asm+0x11/0x20

The top stack trace shows nvme_timeout() called to handle an nvme command timeout. The timeout handler is trying to disable the controller, and as a first step it needs blk_mq_quiesce_tagset() to tell blk-mq not to call queue callback handlers. The thread is stuck waiting for set->tag_list_lock as it tries to walk the queues in set->tag_list. The lock is held by the second thread, in the bottom stack trace, which is waiting for one of the queues to be frozen. The queue usage counter would drop to zero once nvme_timeout() finishes, but that will never happen because the timeout thread is waiting for this mutex forever.

Convert the set->tag_list_lock mutex to a set->tag_list_rwsem rw_semaphore to avoid the deadlock. Update blk_mq_[un]quiesce_tagset() to take the semaphore for read, since this is enough to guarantee that no queues will be added or removed. Update blk_mq_{add,del}_queue_tag_set() to take the semaphore for write while updating set->tag_list and downgrade it to read while freezing the queues. It should be safe to update set->flags and hctx->flags while holding the semaphore for read since the queues are already frozen.
Fixes: 98d81f0 ("nvme: use blk_mq_[un]quiesce_tagset")
Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
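For readers skimming the diff, the quiesce and del-queue paths as they read after this patch are consolidated below (taken from the diff that follows, without the +/- markers; the add-queue side follows the same pattern, and queue_set_hctx_shared() is an internal block/blk-mq.c helper). The reader side only walks set->tag_list, so a shared lock is enough; the writer side holds the semaphore exclusively only while editing the list and downgrades it to read before freezing the remaining queue, so a concurrent blk_mq_quiesce_tagset() from the timeout path is no longer blocked.

/*
 * Condensed view of the new locking flow, consolidated from the patch below.
 */
#include <linux/blk-mq.h>
#include <linux/list.h>
#include <linux/rwsem.h>

/* Reader side: only walks the list, so taking the rwsem shared is enough. */
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
{
	struct request_queue *q;

	down_read(&set->tag_list_rwsem);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		if (!blk_queue_skip_tagset_quiesce(q))
			blk_mq_quiesce_queue_nowait(q);
	}
	up_read(&set->tag_list_rwsem);

	blk_mq_wait_quiesce_done(set);
}

/* Writer side: exclusive only while editing the list, shared while freezing. */
static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct request_queue *firstq;
	unsigned int memflags;

	down_write(&set->tag_list_rwsem);
	list_del(&q->tag_set_list);
	if (!list_is_singular(&set->tag_list)) {
		up_write(&set->tag_list_rwsem);
		goto out;
	}
	set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
	/*
	 * Downgrade before freezing: the timeout handler can now take the
	 * rwsem for read in blk_mq_quiesce_tagset() above and complete the
	 * requests that the freeze below is waiting for.
	 */
	downgrade_write(&set->tag_list_rwsem);
	firstq = list_first_entry(&set->tag_list, struct request_queue,
				  tag_set_list);
	memflags = blk_mq_freeze_queue(firstq);
	queue_set_hctx_shared(firstq, false);	/* internal blk-mq.c helper */
	blk_mq_unfreeze_queue(firstq, memflags);
	up_read(&set->tag_list_rwsem);
out:
	INIT_LIST_HEAD(&q->tag_set_list);
}

The key piece is downgrade_write(): the list edit stays exclusive, but the potentially long freeze wait happens under a read lock that the quiesce path can share.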
1 parent 4458758 commit 33a1d14

File tree

3 files changed: +58 -51 lines changed


block/blk-mq-sysfs.c

Lines changed: 5 additions & 5 deletions
@@ -230,21 +230,21 @@ int blk_mq_sysfs_register(struct gendisk *disk)
 
 	kobject_uevent(q->mq_kobj, KOBJ_ADD);
 
-	mutex_lock(&q->tag_set->tag_list_lock);
+	down_read(&q->tag_set->tag_list_rwsem);
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_register_hctx(hctx);
 		if (ret)
 			goto out_unreg;
 	}
-	mutex_unlock(&q->tag_set->tag_list_lock);
+	up_read(&q->tag_set->tag_list_rwsem);
 	return 0;
 
 out_unreg:
 	queue_for_each_hw_ctx(q, hctx, j) {
 		if (j < i)
 			blk_mq_unregister_hctx(hctx);
 	}
-	mutex_unlock(&q->tag_set->tag_list_lock);
+	up_read(&q->tag_set->tag_list_rwsem);
 
 	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
 	kobject_del(q->mq_kobj);

@@ -257,10 +257,10 @@ void blk_mq_sysfs_unregister(struct gendisk *disk)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	mutex_lock(&q->tag_set->tag_list_lock);
+	down_read(&q->tag_set->tag_list_rwsem);
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_unregister_hctx(hctx);
-	mutex_unlock(&q->tag_set->tag_list_lock);
+	up_read(&q->tag_set->tag_list_rwsem);
 
 	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
 	kobject_del(q->mq_kobj);

block/blk-mq.c

Lines changed: 51 additions & 44 deletions
@@ -335,12 +335,12 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
 {
 	struct request_queue *q;
 
-	mutex_lock(&set->tag_list_lock);
+	down_read(&set->tag_list_rwsem);
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		if (!blk_queue_skip_tagset_quiesce(q))
 			blk_mq_quiesce_queue_nowait(q);
 	}
-	mutex_unlock(&set->tag_list_lock);
+	up_read(&set->tag_list_rwsem);
 
 	blk_mq_wait_quiesce_done(set);
 }

@@ -350,12 +350,12 @@ void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
 {
 	struct request_queue *q;
 
-	mutex_lock(&set->tag_list_lock);
+	down_read(&set->tag_list_rwsem);
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		if (!blk_queue_skip_tagset_quiesce(q))
 			blk_mq_unquiesce_queue(q);
 	}
-	mutex_unlock(&set->tag_list_lock);
+	up_read(&set->tag_list_rwsem);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
 
@@ -4274,56 +4274,63 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 	}
 }
 
-static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
-					 bool shared)
-{
-	struct request_queue *q;
-	unsigned int memflags;
-
-	lockdep_assert_held(&set->tag_list_lock);
-
-	list_for_each_entry(q, &set->tag_list, tag_set_list) {
-		memflags = blk_mq_freeze_queue(q);
-		queue_set_hctx_shared(q, shared);
-		blk_mq_unfreeze_queue(q, memflags);
-	}
-}
-
 static void blk_mq_del_queue_tag_set(struct request_queue *q)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
+	struct request_queue *firstq;
+	unsigned int memflags;
 
-	mutex_lock(&set->tag_list_lock);
+	down_write(&set->tag_list_rwsem);
 	list_del(&q->tag_set_list);
-	if (list_is_singular(&set->tag_list)) {
-		/* just transitioned to unshared */
-		set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
-		/* update existing queue */
-		blk_mq_update_tag_set_shared(set, false);
+	if (!list_is_singular(&set->tag_list)) {
+		up_write(&set->tag_list_rwsem);
+		goto out;
 	}
-	mutex_unlock(&set->tag_list_lock);
+
+	/*
+	 * Transitioning the remaining firstq to unshared.
+	 * Also, downgrade the semaphore to avoid deadlock
	 * with blk_mq_quiesce_tagset() while waiting for
+	 * firstq to be frozen.
+	 */
+	set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
+	downgrade_write(&set->tag_list_rwsem);
+	firstq = list_first_entry(&set->tag_list, struct request_queue,
+				  tag_set_list);
+	memflags = blk_mq_freeze_queue(firstq);
+	queue_set_hctx_shared(firstq, false);
+	blk_mq_unfreeze_queue(firstq, memflags);
+	up_read(&set->tag_list_rwsem);
+out:
 	INIT_LIST_HEAD(&q->tag_set_list);
 }
 
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 				     struct request_queue *q)
 {
-	mutex_lock(&set->tag_list_lock);
+	struct request_queue *firstq;
+	unsigned int memflags;
 
-	/*
-	 * Check to see if we're transitioning to shared (from 1 to 2 queues).
-	 */
-	if (!list_empty(&set->tag_list) &&
-	    !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
-		set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
-		/* update existing queue */
-		blk_mq_update_tag_set_shared(set, true);
-	}
-	if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-		queue_set_hctx_shared(q, true);
-	list_add_tail(&q->tag_set_list, &set->tag_list);
+	down_write(&set->tag_list_rwsem);
+	if (!list_is_singular(&set->tag_list)) {
+		if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+			queue_set_hctx_shared(q, true);
+		list_add_tail(&q->tag_set_list, &set->tag_list);
+		up_write(&set->tag_list_rwsem);
+		return;
+	}
 
-	mutex_unlock(&set->tag_list_lock);
+	/* Transitioning firstq and q to shared. */
+	set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
+	list_add_tail(&q->tag_set_list, &set->tag_list);
+	downgrade_write(&set->tag_list_rwsem);
+	queue_set_hctx_shared(q, true);
+	firstq = list_first_entry(&set->tag_list, struct request_queue,
+				  tag_set_list);
+	memflags = blk_mq_freeze_queue(firstq);
+	queue_set_hctx_shared(firstq, true);
+	blk_mq_unfreeze_queue(firstq, memflags);
+	up_read(&set->tag_list_rwsem);
 }
 
 /* All allocations will be freed in release handler of q->mq_kobj */

@@ -4855,7 +4862,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (ret)
 		goto out_free_mq_map;
 
-	mutex_init(&set->tag_list_lock);
+	init_rwsem(&set->tag_list_rwsem);
 	INIT_LIST_HEAD(&set->tag_list);
 
 	return 0;

@@ -5044,7 +5051,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	struct xarray elv_tbl, et_tbl;
 	bool queues_frozen = false;
 
-	lockdep_assert_held(&set->tag_list_lock);
+	lockdep_assert_held_write(&set->tag_list_rwsem);
 
 	if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
 		nr_hw_queues = nr_cpu_ids;

@@ -5129,9 +5136,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 {
 	down_write(&set->update_nr_hwq_lock);
-	mutex_lock(&set->tag_list_lock);
+	down_write(&set->tag_list_rwsem);
 	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
-	mutex_unlock(&set->tag_list_lock);
+	up_write(&set->tag_list_rwsem);
 	up_write(&set->update_nr_hwq_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

include/linux/blk-mq.h

Lines changed: 2 additions & 2 deletions
@@ -502,7 +502,7 @@ enum hctx_type {
  * @shared_tags:
  *		   Shared set of tags. Has @nr_hw_queues elements. If set,
  *		   shared by all @tags.
- * @tag_list_lock: Serializes tag_list accesses.
+ * @tag_list_rwsem: Serializes tag_list accesses.
  * @tag_list:	   List of the request queues that use this tag set. See also
  *		   request_queue.tag_set_list.
  * @srcu:	   Use as lock when type of the request queue is blocking

@@ -530,7 +530,7 @@ struct blk_mq_tag_set {
 
 	struct blk_mq_tags	*shared_tags;
 
-	struct mutex		tag_list_lock;
+	struct rw_semaphore	tag_list_rwsem;
 	struct list_head	tag_list;
 	struct srcu_struct	*srcu;
 	struct srcu_struct	tags_srcu;
