From 796d5116c407690b14fd5bda136aa67a39e7061a Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 2 Jun 2011 21:19:05 +0200 Subject: iosched: prevent aliased requests from starving other I/O Hi, Jens, If you recall, I posted an RFC patch for this back in July of last year: http://lkml.org/lkml/2010/7/13/279 The basic problem is that a process can issue a never-ending stream of async direct I/Os to the same sector on a device, thus starving out other I/O in the system (due to the way the alias handling works in both cfq and deadline). The solution I proposed back then was to start dispatching from the fifo after a certain number of aliases had been dispatched. Vivek asked why we had to treat aliases differently at all, and I never had a good answer. So, I put together a simple patch which allows aliases to be added to the rb tree (it adds them to the right, though that doesn't matter as the order isn't guaranteed anyway). I think this is the preferred solution, as it doesn't break up time slices in CFQ or batches in deadline. I've tested it, and it does solve the starvation issue. Let me know what you think. Cheers, Jeff Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 9 ++------- block/deadline-iosched.c | 4 +--- block/elevator.c | 7 ++----- 3 files changed, 5 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7c52d68..a2fb14b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1501,16 +1501,11 @@ static void cfq_add_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); struct cfq_data *cfqd = cfqq->cfqd; - struct request *__alias, *prev; + struct request *prev; cfqq->queued[rq_is_sync(rq)]++; - /* - * looks a little odd, but the first insert might return an alias. - * if that happens, put the alias on the dispatch list - */ - while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) - cfq_dispatch_insert(cfqd->queue, __alias); + elv_rb_add(&cfqq->sort_list, rq); if (!cfq_cfqq_on_rr(cfqq)) cfq_add_cfqq_rr(cfqd, cfqq); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 5139c0e..c644137 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -77,10 +77,8 @@ static void deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) { struct rb_root *root = deadline_rb_root(dd, rq); - struct request *__alias; - while (unlikely(__alias = elv_rb_add(root, rq))) - deadline_move_request(dd, __alias); + elv_rb_add(root, rq); } static inline void diff --git a/block/elevator.c b/block/elevator.c index b0b38ce..a3b64bc 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -353,7 +353,7 @@ static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) * RB-tree support functions for inserting/lookup/removal of requests * in a sorted RB tree. */ -struct request *elv_rb_add(struct rb_root *root, struct request *rq) +void elv_rb_add(struct rb_root *root, struct request *rq) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -365,15 +365,12 @@ struct request *elv_rb_add(struct rb_root *root, struct request *rq) if (blk_rq_pos(rq) < blk_rq_pos(__rq)) p = &(*p)->rb_left; - else if (blk_rq_pos(rq) > blk_rq_pos(__rq)) + else if (blk_rq_pos(rq) >= blk_rq_pos(__rq)) p = &(*p)->rb_right; - else - return __rq; } rb_link_node(&rq->rb_node, parent, p); rb_insert_color(&rq->rb_node, root); - return NULL; } EXPORT_SYMBOL(elv_rb_add); -- cgit v1.1 From 9b50902db5eb8a220160fb89e95aa11967998d12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Jun 2011 06:01:13 +0200 Subject: cfq-iosched: fix locking around ioc->ioc_data assignment Since we are modifying this RCU pointer, we need to hold the lock protecting it around it. This fixes a potential reuse and double free of a cfq io_context structure. The bug has been in CFQ for a long time, it hit very few people but those it did hit seemed to see it a lot. Tracked in RH bugzilla here: https://bugzilla.redhat.com/show_bug.cgi?id=577968 Credit goes to Paul Bolle for figuring out that the issue was around the one-hit ioc->ioc_data cache. Thanks to his hard work the issue is now fixed. Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a2fb14b..000719c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2767,8 +2767,11 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, smp_wmb(); cic->key = cfqd_dead_key(cfqd); - if (ioc->ioc_data == cic) + if (rcu_dereference(ioc->ioc_data) == cic) { + spin_lock(&ioc->lock); rcu_assign_pointer(ioc->ioc_data, NULL); + spin_unlock(&ioc->lock); + } if (cic->cfqq[BLK_RW_ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); -- cgit v1.1 From 8aea45451b252e4be09ee9974c5658bb47c81625 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 6 Jun 2011 05:07:54 +0200 Subject: CFQ: make two functions static Correctly suggested by sparse. Signed-off-by: Paul Bolle Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 000719c..2b2d7a9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1004,8 +1004,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } -void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, - unsigned int weight) +static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) { struct cfq_group *cfqg = cfqg_of_blkg(blkg); cfqg->new_weight = weight; @@ -1234,7 +1234,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd) * it should not be NULL as even if elevator was exiting, cgroup deltion * path got to it first. */ -void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) +static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) { unsigned long flags; struct cfq_data *cfqd = key; -- cgit v1.1 From df4156569d4ace581bd1581e7aa4a9dd7d2f0775 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 6 Jun 2011 05:11:34 +0200 Subject: block: rename the return of two functions If we rename the return of alloc_io_context() and get_io_context() from "ret" to "ioc" the code get's (a bit) more readable and (a lot) more grepable. Signed-off-by: Paul Bolle Signed-off-by: Jens Axboe --- block/blk-ioc.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index c898049..bfb0493 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -82,26 +82,26 @@ void exit_io_context(struct task_struct *task) struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { - struct io_context *ret; + struct io_context *ioc; - ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); - if (ret) { - atomic_long_set(&ret->refcount, 1); - atomic_set(&ret->nr_tasks, 1); - spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; - ret->ioprio = 0; - ret->last_waited = 0; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); - INIT_HLIST_HEAD(&ret->cic_list); - ret->ioc_data = NULL; + ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); + if (ioc) { + atomic_long_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); + spin_lock_init(&ioc->lock); + ioc->ioprio_changed = 0; + ioc->ioprio = 0; + ioc->last_waited = 0; /* doesn't matter... */ + ioc->nr_batch_requests = 0; /* because this is 0 */ + INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ioc->cic_list); + ioc->ioc_data = NULL; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) - ret->cgroup_changed = 0; + ioc->cgroup_changed = 0; #endif } - return ret; + return ioc; } /* @@ -139,19 +139,19 @@ struct io_context *current_io_context(gfp_t gfp_flags, int node) */ struct io_context *get_io_context(gfp_t gfp_flags, int node) { - struct io_context *ret = NULL; + struct io_context *ioc = NULL; /* * Check for unlikely race with exiting task. ioc ref count is * zero when ioc is being detached. */ do { - ret = current_io_context(gfp_flags, node); - if (unlikely(!ret)) + ioc = current_io_context(gfp_flags, node); + if (unlikely(!ioc)) break; - } while (!atomic_long_inc_not_zero(&ret->refcount)); + } while (!atomic_long_inc_not_zero(&ioc->refcount)); - return ret; + return ioc; } EXPORT_SYMBOL(get_io_context); -- cgit v1.1 From 9f5e486550456587348e2048eecda5bee778250a Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Mon, 13 Jun 2011 10:45:43 +0200 Subject: block:remove some spare spaces in genhd.c Remove the end-of-line spaces in genhd.c. Signed-off-by: Wanlong Gao Signed-off-by: Jens Axboe --- block/genhd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 95822ae..ed3fe82 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -602,7 +602,7 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); - /* Register BDI before referencing it from bdev */ + /* Register BDI before referencing it from bdev */ bdi = &disk->queue->backing_dev_info; bdi_register_dev(bdi, disk_devt(disk)); @@ -1148,7 +1148,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) "wsect wuse running use aveq" "\n\n"); */ - + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { cpu = part_stat_lock(); @@ -1172,7 +1172,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) ); } disk_part_iter_exit(&piter); - + return 0; } -- cgit v1.1 From fd16d263194aa6b50b215eb593a567b59d744d6e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 13 Jun 2011 10:42:49 +0200 Subject: block: Add __attribute__((format(printf...) and fix fallout Use the compiler to verify format strings and arguments. Fix fallout. Signed-off-by: Joe Perches Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 ++-- block/cfq-iosched.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a62be8d..3689f83 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q) bio_list_init(&bio_list_on_stack); - throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + throtl_log(td, "dispatch nr_queued=%d read=%u write=%u", total_nr_queued(td), td->nr_queued[READ], td->nr_queued[WRITE]); @@ -1204,7 +1204,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) } queue_bio: - throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" " iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2b2d7a9..3d403a1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -988,9 +988,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" - " sect=%u", used_sl, cfqq->slice_dispatch, charge, - iops_mode(cfqd), cfqq->nr_sectors); + cfq_log_cfqq(cfqq->cfqd, cfqq, + "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", + used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, unaccounted_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); @@ -2018,8 +2019,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) */ if (sample_valid(cic->ttime_samples) && (cfqq->slice_end - jiffies < cic->ttime_mean)) { - cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", - cic->ttime_mean); + cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", + cic->ttime_mean); return; } -- cgit v1.1 From d2f31a5fd60d168b00fc4f7617b68a1287b21e90 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 13 Jun 2011 20:19:27 +0200 Subject: blk-throttle: Make total_nr_queued unsigned The total of two unsigned values should also be unsigned. Update throtl_log output to unsigned. Update total_nr_queued test to non-zero to be the same as the other total_nr_queued tests. Signed-off-by: Joe Perches Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 3689f83..f6a7941 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -142,9 +142,9 @@ static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) return NULL; } -static inline int total_nr_queued(struct throtl_data *td) +static inline unsigned int total_nr_queued(struct throtl_data *td) { - return (td->nr_queued[0] + td->nr_queued[1]); + return td->nr_queued[0] + td->nr_queued[1]; } static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) @@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q) bio_list_init(&bio_list_on_stack); - throtl_log(td, "dispatch nr_queued=%d read=%u write=%u", + throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", total_nr_queued(td), td->nr_queued[READ], td->nr_queued[WRITE]); @@ -970,7 +970,7 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) struct delayed_work *dwork = &td->throtl_work; /* schedule work if limits changed even if no bio is queued */ - if (total_nr_queued(td) > 0 || td->limits_changed) { + if (total_nr_queued(td) || td->limits_changed) { /* * We might have a work scheduled to be executed in future. * Cancel that and schedule a new one. -- cgit v1.1 From 80ceb057135ad77f513277f3bcd04b885501877a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 20 Jun 2011 13:27:44 +0200 Subject: bsg: fix bsg_poll() to return POLLOUT properly POLLOUT should be returned only if bd->queued_cmds < bd->max_queue so that bsg_alloc_command() can proceed. Signed-off-by: Namhyung Kim Acked-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- block/bsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index 0c8b64a..c4f49e2 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -878,7 +878,7 @@ static unsigned int bsg_poll(struct file *file, poll_table *wait) spin_lock_irq(&bd->lock); if (!list_empty(&bd->done_list)) mask |= POLLIN | POLLRDNORM; - if (bd->queued_cmds >= bd->max_queue) + if (bd->queued_cmds < bd->max_queue) mask |= POLLOUT; spin_unlock_irq(&bd->lock); -- cgit v1.1 From 44194e3e88fcfe77a2a6e2333847d4f27816259a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 20 Jun 2011 13:27:44 +0200 Subject: bsg: remove unnecessary conditional expressions Second condition in OR always implies first condition is false thus bytes_read in the second is not needed. The same goes to bytes_written. Signed-off-by: Namhyung Kim Acked-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- block/bsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index c4f49e2..b7e42ad 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -606,7 +606,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) ret = __bsg_read(buf, count, bd, NULL, &bytes_read); *ppos = bytes_read; - if (!bytes_read || (bytes_read && err_block_err(ret))) + if (!bytes_read || err_block_err(ret)) bytes_read = ret; return bytes_read; @@ -686,7 +686,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) /* * return bytes written on non-fatal errors */ - if (!bytes_written || (bytes_written && err_block_err(ret))) + if (!bytes_written || err_block_err(ret)) bytes_written = ret; dprintk("%s: returning %Zd\n", bd->name, bytes_written); -- cgit v1.1 From 2b727c6300b49352f80f63704bb50c256949e95e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 20 Jun 2011 13:27:45 +0200 Subject: bsg: fix address space warning from sparse copy_from/to_user() and blk_rq_map_user() want __user pointer. This patch fixes following warnings from sparse: CHECK block/bsg.c block/bsg.c:185:38: warning: incorrect type in argument 2 (different address spaces) block/bsg.c:185:38: expected void const [noderef] *from block/bsg.c:185:38: got void * block/bsg.c:295:58: warning: incorrect type in argument 4 (different address spaces) block/bsg.c:295:58: expected void [noderef] * block/bsg.c:295:58: got void *[assigned] dxferp block/bsg.c:311:52: warning: incorrect type in argument 4 (different address spaces) block/bsg.c:311:52: expected void [noderef] * block/bsg.c:311:52: got void *[assigned] dxferp block/bsg.c:448:37: warning: incorrect type in argument 1 (different address spaces) block/bsg.c:448:37: expected void [noderef] *dst block/bsg.c:448:37: got void * Signed-off-by: Namhyung Kim Acked-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- block/bsg.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index b7e42ad..702f131 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -182,7 +182,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -ENOMEM; } - if (copy_from_user(rq->cmd, (void *)(unsigned long)hdr->request, + if (copy_from_user(rq->cmd, (void __user *)(unsigned long)hdr->request, hdr->request_len)) return -EFAULT; @@ -249,7 +249,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, struct request *rq, *next_rq = NULL; int ret, rw; unsigned int dxfer_len; - void *dxferp = NULL; + void __user *dxferp = NULL; struct bsg_class_device *bcd = &q->bsg_dev; /* if the LLD has been removed then the bsg_unregister_queue will @@ -291,7 +291,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, rq->next_rq = next_rq; next_rq->cmd_type = rq->cmd_type; - dxferp = (void*)(unsigned long)hdr->din_xferp; + dxferp = (void __user *)(unsigned long)hdr->din_xferp; ret = blk_rq_map_user(q, next_rq, NULL, dxferp, hdr->din_xfer_len, GFP_KERNEL); if (ret) @@ -300,10 +300,10 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, if (hdr->dout_xfer_len) { dxfer_len = hdr->dout_xfer_len; - dxferp = (void*)(unsigned long)hdr->dout_xferp; + dxferp = (void __user *)(unsigned long)hdr->dout_xferp; } else if (hdr->din_xfer_len) { dxfer_len = hdr->din_xfer_len; - dxferp = (void*)(unsigned long)hdr->din_xferp; + dxferp = (void __user *)(unsigned long)hdr->din_xferp; } else dxfer_len = 0; @@ -445,7 +445,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, int len = min_t(unsigned int, hdr->max_response_len, rq->sense_len); - ret = copy_to_user((void*)(unsigned long)hdr->response, + ret = copy_to_user((void __user *)(unsigned long)hdr->response, rq->sense, len); if (!ret) hdr->response_len = len; -- cgit v1.1 From 85ef06d1d252f6a2e73b678591ab71caad4667bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 Jul 2011 16:17:47 +0200 Subject: block: flush MEDIA_CHANGE from drivers on close(2) Currently, only open(2) is defined as the 'clearing' point. It has two roles - first, it's an acknowledgement from userland indicating that the event has been received and kernel can clear pending states and proceed to generate more events. Secondly, it's passed on to device drivers as a hint indicating that a synchronization point has been reached and it might want to take a deeper look at the device. The latter currently is only used by sr which uses two different mechanisms - GET_EVENT_MEDIA_STATUS_NOTIFICATION and TEST_UNIT_READY to discover events, where the former is lighter weight and safe to be used repeatedly but may not provide full coverage. Among other things, GET_EVENT can't detect media removal while TUR can. This patch makes close(2) - blkdev_put() - indicate clearing hint for MEDIA_CHANGE to drivers. disk_check_events() is renamed to disk_flush_events() and updated to take @mask for events to flush which is or'd to ev->clearing and will be passed to the driver on the next ->check_events() invocation. This change makes sr generate MEDIA_CHANGE when media is ejected from userland - e.g. with eject(1). Note: Given the current usage, it seems @clearing hint is needlessly complex. disk_clear_events() can simply clear all events and the hint can be boolean @flush. Signed-off-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Jens Axboe --- block/genhd.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 82d97c3..fbd7605 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1500,30 +1500,32 @@ void disk_unblock_events(struct gendisk *disk) } /** - * disk_check_events - schedule immediate event checking - * @disk: disk to check events for + * disk_flush_events - schedule immediate event checking and flushing + * @disk: disk to check and flush events for + * @mask: events to flush * - * Schedule immediate event checking on @disk if not blocked. + * Schedule immediate event checking on @disk if not blocked. Events in + * @mask are scheduled to be cleared from the driver. Note that this + * doesn't clear the events from @disk->ev. * * CONTEXT: - * Don't care. Safe to call from irq context. + * If @mask is non-zero must be called with bdev->bd_mutex held. */ -void disk_check_events(struct gendisk *disk) +void disk_flush_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; - unsigned long flags; if (!ev) return; - spin_lock_irqsave(&ev->lock, flags); + spin_lock_irq(&ev->lock); + ev->clearing |= mask; if (!ev->block) { cancel_delayed_work(&ev->dwork); queue_delayed_work(system_nrt_wq, &ev->dwork, 0); } - spin_unlock_irqrestore(&ev->lock, flags); + spin_unlock_irq(&ev->lock); } -EXPORT_SYMBOL_GPL(disk_check_events); /** * disk_clear_events - synchronously check, clear and return pending events @@ -1713,7 +1715,7 @@ static int disk_events_set_dfl_poll_msecs(const char *val, mutex_lock(&disk_events_mutex); list_for_each_entry(ev, &disk_events, node) - disk_check_events(ev->disk); + disk_flush_events(ev->disk, 0); mutex_unlock(&disk_events_mutex); -- cgit v1.1 From 390192b300570b2bc721d77067ca133f58015ae8 Mon Sep 17 00:00:00 2001 From: Johannes Stezenbach Date: Fri, 1 Jul 2011 22:32:26 +0200 Subject: compat_ioctl: fix warning caused by qemu On Linux x86_64 host with 32bit userspace, running qemu or even just "qemu-img create -f qcow2 some.img 1G" causes a kernel warning: ioctl32(qemu-img:5296): Unknown cmd fd(3) cmd(00005326){t:'S';sz:0} arg(7fffffff) on some.img ioctl32(qemu-img:5296): Unknown cmd fd(3) cmd(801c0204){t:02;sz:28} arg(fff77350) on some.img ioctl 00005326 is CDROM_DRIVE_STATUS, ioctl 801c0204 is FDGETPRM. The warning appears because the Linux compat-ioctl handler for these ioctls only applies to block devices, while qemu also uses the ioctls on plain files. Signed-off-by: Johannes Stezenbach Acked-by: Arnd Bergmann Signed-off-by: Jens Axboe --- block/compat_ioctl.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'block') diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index cc3eb78..7b72502 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -208,19 +208,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode, #define BLKBSZSET_32 _IOW(0x12, 113, int) #define BLKGETSIZE64_32 _IOR(0x12, 114, int) -struct compat_floppy_struct { - compat_uint_t size; - compat_uint_t sect; - compat_uint_t head; - compat_uint_t track; - compat_uint_t stretch; - unsigned char gap; - unsigned char rate; - unsigned char spec1; - unsigned char fmt_gap; - const compat_caddr_t name; -}; - struct compat_floppy_drive_params { char cmos; compat_ulong_t max_dtr; @@ -288,7 +275,6 @@ struct compat_floppy_write_errors { #define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct) #define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct) -#define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct) #define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params) #define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params) #define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct) -- cgit v1.1 From 0f79960391a5a1e3679956024e18aeeb0369ac44 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 6 Jul 2011 21:30:50 +0200 Subject: block: eliminate potential for infinite loop in blkdev_issue_discard Due to the recently identified overflow in read_capacity_16() it was possible for max_discard_sectors to be zero but still have discards enabled on the associated device's queue. Eliminate the possibility for blkdev_issue_discard to infinitely loop. Interestingly this issue wasn't identified until a device, whose discard_granularity was 0 due to read_capacity_16 overflow, was consumed by blk_stack_limits() to construct limits for a higher-level DM multipath device. The multipath device's resulting limits never had the discard limits stacked because blk_stack_limits() will only do so if the bottom device's discard_granularity != 0. This resulted in the multipath device's limits.max_discard_sectors being 0. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-lib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 78e627e..64974b1 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -59,7 +59,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, * granularity */ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - if (q->limits.discard_granularity) { + if (!unlikely(!max_discard_sectors)) { + /* Avoid infinite loop below. Being cautious never hurts. */ + return -EOPNOTSUPP; + } else if (q->limits.discard_granularity) { unsigned int disc_sects = q->limits.discard_granularity >> 9; max_discard_sectors &= ~(disc_sects - 1); -- cgit v1.1 From 55c022bbddb2c056b5dff1bd1b1758d31b6d64c9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 Jul 2011 08:19:20 +0200 Subject: block: avoid building too big plug list When I test fio script with big I/O depth, I found the total throughput drops compared to some relative small I/O depth. The reason is the thread accumulates big requests in its plug list and causes some delays (surely this depends on CPU speed). I thought we'd better have a threshold for requests. When a threshold reaches, this means there is no request merge and queue lock contention isn't severe when pushing per-task requests to queue, so the main advantages of blk plug don't exist. We can force a plug list flush in this case. With this, my test throughput actually increases and almost equals to small I/O depth. Another side effect is irq off time decreases in blk_flush_plug_list() for big I/O depth. The BLK_MAX_REQUEST_COUNT is choosen arbitarily, but 16 is efficiently to reduce lock contention to me. But I'm open here, 32 is ok in my test too. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d2f8f40..a564852 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1302,7 +1302,10 @@ get_rq: plug->should_sort = 1; } list_add_tail(&req->queuelist, &plug->list); + plug->count++; drive_stat_acct(req, 1); + if (plug->count >= BLK_MAX_REQUEST_COUNT) + blk_flush_plug_list(plug, false); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); @@ -2626,6 +2629,7 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->cb_list); plug->should_sort = 0; + plug->count = 0; /* * If this is a nested plug, don't actually assign it. It will be @@ -2709,6 +2713,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; list_splice_init(&plug->list, &list); + plug->count = 0; if (plug->should_sort) { list_sort(NULL, &list, plug_rq_cmp); -- cgit v1.1 From a07405b7802691d29ab3b23bdc76ee6d006aad0b Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Sun, 10 Jul 2011 22:09:19 +0200 Subject: cfq: Remove special treatment for metadata rqs. There is no consistency among filesystems from what bios (or requests) are marked as being metadata. It's interesting to expose this in traces, but we shouldn't schedule the requests differently based on whether or not they're marked as being metadata. Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b2e1c75..762bd50 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -129,8 +129,6 @@ struct cfq_queue { unsigned long slice_end; long slice_resid; - /* pending metadata requests */ - int meta_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -670,9 +668,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq_is_sync(rq1) != rq_is_sync(rq2)) return rq_is_sync(rq1) ? rq1 : rq2; - if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) - return rq1->cmd_flags & REQ_META ? rq1 : rq2; - s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); @@ -1593,10 +1588,6 @@ static void cfq_remove_request(struct request *rq) cfqq->cfqd->rq_queued--; cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), rq_is_sync(rq)); - if (rq->cmd_flags & REQ_META) { - WARN_ON(!cfqq->meta_pending); - cfqq->meta_pending--; - } } static int cfq_merge(struct request_queue *q, struct request **req, @@ -3335,13 +3326,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, return true; /* - * So both queues are sync. Let the new request get disk time if - * it's a metadata request and the current queue is doing regular IO. - */ - if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) - return true; - - /* * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. */ if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) @@ -3405,8 +3389,6 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic = RQ_CIC(rq); cfqd->rq_queued++; - if (rq->cmd_flags & REQ_META) - cfqq->meta_pending++; cfq_update_io_thinktime(cfqd, cic); cfq_update_io_seektime(cfqd, cfqq, rq); -- cgit v1.1 From 4aede84b33d6beb401136a3deca0651ae07c5e99 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Tue, 12 Jul 2011 08:31:45 +0200 Subject: fixlet: Remove fs_excl from struct task. fs_excl is a poor man's priority inheritance for filesystems to hint to the block layer that an operation is important. It was never clearly specified, not widely adopted, and will not prevent starvation in many cases (like across cgroups). fs_excl was introduced with the time sliced CFQ IO scheduler, to indicate when a process held FS exclusive resources and thus needed a boost. It doesn't cover all file systems, and it was never fully complete. Lets kill it. Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 762bd50..d8b1087 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -134,7 +134,7 @@ struct cfq_queue { /* io prio of this group */ unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; + unsigned short ioprio_class; pid_t pid; @@ -2869,7 +2869,6 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) * elevate the priority of this queue */ cfqq->org_ioprio = cfqq->ioprio; - cfqq->org_ioprio_class = cfqq->ioprio_class; cfq_clear_cfqq_prio_changed(cfqq); } @@ -3593,30 +3592,6 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -/* - * we temporarily boost lower priority queues if they are holding fs exclusive - * resources. they are boosted to normal prio (CLASS_BE/4) - */ -static void cfq_prio_boost(struct cfq_queue *cfqq) -{ - if (has_fs_excl()) { - /* - * boost idle prio on transactions that would lock out other - * users of the filesystem - */ - if (cfq_class_idle(cfqq)) - cfqq->ioprio_class = IOPRIO_CLASS_BE; - if (cfqq->ioprio > IOPRIO_NORM) - cfqq->ioprio = IOPRIO_NORM; - } else { - /* - * unboost the queue (if needed) - */ - cfqq->ioprio_class = cfqq->org_ioprio_class; - cfqq->ioprio = cfqq->org_ioprio; - } -} - static inline int __cfq_may_queue(struct cfq_queue *cfqq) { if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { @@ -3647,7 +3622,6 @@ static int cfq_may_queue(struct request_queue *q, int rw) cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); if (cfqq) { cfq_init_prio_data(cfqq, cic->ioc); - cfq_prio_boost(cfqq); return __cfq_may_queue(cfqq); } -- cgit v1.1 From 383cd7213f95a2784ab5038fe292844178768b82 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 12 Jul 2011 14:24:35 +0200 Subject: CFQ: move think time check variables to a separate struct Move the variables to do think time check to a sepatate struct. This is to prepare adding think time check for service tree and group. No functional change. Signed-off-by: Shaohua Li Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d8b1087..5625559 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2008,10 +2008,10 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * slice, then don't idle. This avoids overrunning the allotted * time slice. */ - if (sample_valid(cic->ttime_samples) && - (cfqq->slice_end - jiffies < cic->ttime_mean)) { + if (sample_valid(cic->ttime.ttime_samples) && + (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) { cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", - cic->ttime_mean); + cic->ttime.ttime_mean); return; } @@ -2819,7 +2819,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, cfqd->queue->node); if (cic) { - cic->last_end_request = jiffies; + cic->ttime.last_end_request = jiffies; INIT_LIST_HEAD(&cic->queue_list); INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; @@ -3206,14 +3206,22 @@ err: } static void -cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) +__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) { - unsigned long elapsed = jiffies - cic->last_end_request; - unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); + unsigned long elapsed = jiffies - ttime->last_end_request; + elapsed = min(elapsed, 2UL * slice_idle); - cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; - cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; - cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; + ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; + ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8; + ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples; +} + +static void +cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_io_context *cic) +{ + if (cfq_cfqq_sync(cfqq)) + __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); } static void @@ -3262,8 +3270,8 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; - else if (sample_valid(cic->ttime_samples)) { - if (cic->ttime_mean > cfqd->cfq_slice_idle) + else if (sample_valid(cic->ttime.ttime_samples)) { + if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) enable_idle = 0; else enable_idle = 1; @@ -3389,7 +3397,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->rq_queued++; - cfq_update_io_thinktime(cfqd, cic); + cfq_update_io_thinktime(cfqd, cfqq, cic); cfq_update_io_seektime(cfqd, cfqq, rq); cfq_update_idle_window(cfqd, cfqq, cic); @@ -3500,8 +3508,8 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) return true; /* if slice left is less than think time, wait busy */ - if (cic && sample_valid(cic->ttime_samples) - && (cfqq->slice_end - jiffies < cic->ttime_mean)) + if (cic && sample_valid(cic->ttime.ttime_samples) + && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) return true; /* @@ -3542,7 +3550,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; if (sync) { - RQ_CIC(rq)->last_end_request = now; + RQ_CIC(rq)->ttime.last_end_request = now; if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) cfqd->last_delayed_sync = now; } -- cgit v1.1 From f5f2b6ceb23e02ff35c6dbc6a39aa776ace99cda Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 12 Jul 2011 14:24:55 +0200 Subject: CFQ: add think time check for service tree Currently when the last queue of a service tree has no request, we don't expire the queue to hope request from the service tree comes soon, so the service tree doesn't miss its share. But if the think time is big, the assumption isn't correct and we just waste bandwidth. In such case, we don't do idle. [global] runtime=10 direct=1 [test1] rw=randread ioengine=libaio size=500m directory=/mnt filename=file1 thinktime=9000 [test2] rw=read ioengine=libaio size=1G directory=/mnt filename=file2 patched base test1 41k/s 33k/s test2 15868k/s 15789k/s total 15902k/s 15817k/s A slightly better To check if the patch changes behavior of queue without think time. I also tried to give test1 2ms think time or no think time. The test has variation even without the patch, but the average throughput doesn't change with/without the patch. Signed-off-by: Shaohua Li Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5625559..baa9060 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -87,9 +87,10 @@ struct cfq_rb_root { unsigned count; unsigned total_weight; u64 min_vdisktime; + struct cfq_ttime ttime; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ - .count = 0, .min_vdisktime = 0, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \ + .ttime = {.last_end_request = jiffies,},} /* * Per process-grouping structure @@ -391,6 +392,18 @@ CFQ_CFQQ_FNS(wait_busy); j++, st = i < IDLE_WORKLOAD ? \ &cfqg->service_trees[i][j]: NULL) \ +static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, + struct cfq_ttime *ttime, bool group_idle) +{ + unsigned long slice; + if (!sample_valid(ttime->ttime_samples)) + return false; + if (group_idle) + slice = cfqd->cfq_group_idle; + else + slice = cfqd->cfq_slice_idle; + return ttime->ttime_mean > slice; +} static inline bool iops_mode(struct cfq_data *cfqd) { @@ -1955,7 +1968,8 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) + if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) && + !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false)) return true; cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", service_tree->count); @@ -3220,8 +3234,11 @@ static void cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic) { - if (cfq_cfqq_sync(cfqq)) + if (cfq_cfqq_sync(cfqq)) { __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); + __cfq_update_io_thinktime(&cfqq->service_tree->ttime, + cfqd->cfq_slice_idle); + } } static void @@ -3550,7 +3567,16 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; if (sync) { + struct cfq_rb_root *service_tree; + RQ_CIC(rq)->ttime.last_end_request = now; + + if (cfq_cfqq_on_rr(cfqq)) + service_tree = cfqq->service_tree; + else + service_tree = service_tree_for(cfqq->cfqg, + cfqq_prio(cfqq), cfqq_type(cfqq)); + service_tree->ttime.last_end_request = now; if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) cfqd->last_delayed_sync = now; } -- cgit v1.1 From 7700fc4f675fa38094e78e345b594363a2fd895b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 12 Jul 2011 14:24:56 +0200 Subject: CFQ: add think time check for group Currently when the last queue of a group has no request, we don't expire the queue to hope request from the group comes soon, so the group doesn't miss its share. But if the think time is big, the assumption isn't correct and we just waste bandwidth. In such case, we don't do idle. [global] runtime=30 direct=1 [test1] cgroup=test1 cgroup_weight=1000 rw=randread ioengine=libaio size=500m runtime=30 directory=/mnt filename=file1 thinktime=9000 [test2] cgroup=test2 cgroup_weight=1000 rw=randread ioengine=libaio size=500m runtime=30 directory=/mnt filename=file2 patched base test1 64k 39k test2 548k 540k total 604k 578k group1 gets much better throughput because it waits less time. To check if the patch changes behavior of queue without think time. I also tried to give test1 2ms think time or no think time. The test result is stable. The thoughput doesn't change with/without the patch. Signed-off-by: Shaohua Li Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index baa9060..1f96ad6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -211,6 +211,7 @@ struct cfq_group { #endif /* number of requests that are on the dispatch list or inside driver */ int dispatched; + struct cfq_ttime ttime; }; /* @@ -1067,6 +1068,8 @@ static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); + cfqg->ttime.last_end_request = jiffies; + /* * Take the initial reference that will be released on destroy * This can be thought of a joint reference by cgroup and @@ -2381,8 +2384,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * this group, wait for requests to complete. */ check_group_idle: - if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 - && cfqq->cfqg->dispatched) { + if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && + cfqq->cfqg->dispatched && + !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { cfqq = NULL; goto keep_queue; } @@ -3239,6 +3243,9 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, __cfq_update_io_thinktime(&cfqq->service_tree->ttime, cfqd->cfq_slice_idle); } +#ifdef CONFIG_CFQ_GROUP_IOSCHED + __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); +#endif } static void @@ -3521,6 +3528,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) if (cfqq->cfqg->nr_cfqq > 1) return false; + /* the only queue in the group, but think time is big */ + if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) + return false; + if (cfq_slice_used(cfqq)) return true; @@ -3581,6 +3592,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqd->last_delayed_sync = now; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED + cfqq->cfqg->ttime.last_end_request = now; +#endif + /* * If this is the active queue, check if it needs to be expired, * or if we want to idle in case it has no pending requests. -- cgit v1.1 From 57bdfbf9ee2b0856d8b62180c3b3f8fa1533b8d1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:42:58 +0800 Subject: block,rcu: Convert call_rcu(disk_free_ptbl_rcu_cb) to kfree_rcu() The rcu callback disk_free_ptbl_rcu_cb() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(disk_free_ptbl_rcu_cb). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Cc: Jens Axboe Reviewed-by: Josh Triplett --- block/genhd.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 3608289..6024b82 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1018,14 +1018,6 @@ static const struct attribute_group *disk_attr_groups[] = { NULL }; -static void disk_free_ptbl_rcu_cb(struct rcu_head *head) -{ - struct disk_part_tbl *ptbl = - container_of(head, struct disk_part_tbl, rcu_head); - - kfree(ptbl); -} - /** * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way * @disk: disk to replace part_tbl for @@ -1046,7 +1038,7 @@ static void disk_replace_part_tbl(struct gendisk *disk, if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); - call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); + kfree_rcu(old_ptbl, rcu_head); } } -- cgit v1.1 From bfe159a51203c15d23cb3158fffdc25ec4b4dda1 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 7 Jul 2011 15:45:40 -0500 Subject: [SCSI] fix crash in scsi_dispatch_cmd() USB surprise removal of sr is triggering an oops in scsi_dispatch_command(). What seems to be happening is that USB is hanging on to a queue reference until the last close of the upper device, so the crash is caused by surprise remove of a mounted CD followed by attempted unmount. The problem is that USB doesn't issue its final commands as part of the SCSI teardown path, but on last close when the block queue is long gone. The long term fix is probably to make sr do the teardown in the same way as sd (so remove all the lower bits on ejection, but keep the upper disk alive until last close of user space). However, the current oops can be simply fixed by not allowing any commands to be sent to a dead queue. Cc: stable@kernel.org Signed-off-by: James Bottomley --- block/blk-core.c | 3 +++ block/blk-exec.c | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d2f8f40..1d49e1c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -839,6 +839,9 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { struct request *rq; + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + return NULL; + BUG_ON(rw != READ && rw != WRITE); spin_lock_irq(q->queue_lock); diff --git a/block/blk-exec.c b/block/blk-exec.c index 8a0e7ec..a1ebceb 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -50,6 +50,13 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + rq->errors = -ENXIO; + if (rq->end_io) + rq->end_io(rq, rq->errors); + return; + } + rq->rq_disk = bd_disk; rq->end_io = done; WARN_ON(irqs_disabled()); -- cgit v1.1 From 4c64500eada358165d0bb9a20d6c7d30821995b4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 23 Jul 2011 20:34:59 +0200 Subject: block: fix patch import error in max_discard_sectors check A '!' snuck in before the unlikely, rendering it useless. Reported-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 64974b1..2b461b4 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -59,7 +59,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, * granularity */ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - if (!unlikely(!max_discard_sectors)) { + if (unlikely(!max_discard_sectors)) { /* Avoid infinite loop below. Being cautious never hurts. */ return -EOPNOTSUPP; } else if (q->limits.discard_granularity) { -- cgit v1.1 From 5757a6d76cdf6dda2a492c09b985c015e86779b1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 23 Jul 2011 20:44:25 +0200 Subject: block: strict rq_affinity Some systems benefit from completions always being steered to the strict requester cpu rather than the looser "per-socket" steering that blk_cpu_to_group() attempts by default. This is because the first CPU in the group mask ends up being completely overloaded with work, while the others (including the original submitter) has power left to spare. Allow the strict mode to be set by writing '2' to the sysfs control file. This is identical to the scheme used for the nomerges file, where '2' is a more aggressive setting than just being turned on. echo 2 > /sys/block//queue/rq_affinity Cc: Christoph Hellwig Cc: Roland Dreier Tested-by: Dave Jiang Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++---- block/blk-softirq.c | 11 +++++++---- block/blk-sysfs.c | 13 +++++++++---- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a564852..b322825 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1279,10 +1279,8 @@ get_rq: init_request_from_bio(req, bio); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) { - req->cpu = blk_cpu_to_group(get_cpu()); - put_cpu(); - } + bio_flagged(bio, BIO_CPU_AFFINE)) + req->cpu = smp_processor_id(); plug = current->plug; if (plug) { diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ee9c216..475fab8 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -103,22 +103,25 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = { void __blk_complete_request(struct request *req) { + int ccpu, cpu, group_cpu = NR_CPUS; struct request_queue *q = req->q; unsigned long flags; - int ccpu, cpu, group_cpu; BUG_ON(!q->softirq_done_fn); local_irq_save(flags); cpu = smp_processor_id(); - group_cpu = blk_cpu_to_group(cpu); /* * Select completion CPU */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) { ccpu = req->cpu; - else + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { + ccpu = blk_cpu_to_group(ccpu); + group_cpu = blk_cpu_to_group(cpu); + } + } else ccpu = cpu; if (ccpu == cpu || ccpu == group_cpu) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d935bd8..0ee17b5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) { bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); - return queue_var_show(set, page); + return queue_var_show(set << force, page); } static ssize_t @@ -257,10 +258,14 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) ret = queue_var_store(&val, page, count); spin_lock_irq(q->queue_lock); - if (val) + if (val) { queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - else - queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + if (val == 2) + queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + } else { + queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } spin_unlock_irq(q->queue_lock); #endif return ret; -- cgit v1.1 From 11ccf116d0d756d06989775288e41f737d98e0c5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 Jul 2011 15:01:15 +0200 Subject: block: fix warning with calling smp_processor_id() in preemptible section After commit 5757a6d7 introduced an unsafe calling of smp_processor_id(), with preempt debuggin turned on we spew a lot of: BUG: using smp_processor_id() in preemptible [00000000] code: kjournald/514 caller is __make_request+0x1b8/0x308 [] (unwind_backtrace+0x0/0xe8) from [] (debug_smp_processor_id+0xbc/0xf0) [] (debug_smp_processor_id+0xbc/0xf0) from [] (__make_request+0x1b8/0x308) [] (__make_request+0x1b8/0x308) from [] (generic_make_request+0x4dc/0x558) [] (generic_make_request+0x4dc/0x558) from [] (submit_bio+0x114/0x138) [] (submit_bio+0x114/0x138) from [] (submit_bh+0x148/0x16c) [] (submit_bh+0x148/0x16c) from [] (__sync_dirty_buffer+0x88/0xd8) [] (__sync_dirty_buffer+0x88/0xd8) from [] (journal_commit_transaction+0x1198/0x1688) [] (journal_commit_transaction+0x1198/0x1688) from [] (kjournald+0xb4/0x224) [] (kjournald+0xb4/0x224) from [] (kthread+0x8c/0x94) [] (kthread+0x8c/0x94) from [] (kernel_thread_exit+0x0/0x8) Fix this by just using raw_smp_processor_id(), it's just a hint after all. There's no pinning of the CPU or accessing per-cpu structures involved. Reported-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f8cb099..f925581 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1283,7 +1283,7 @@ get_rq: if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || bio_flagged(bio, BIO_CPU_AFFINE)) - req->cpu = smp_processor_id(); + req->cpu = raw_smp_processor_id(); plug = current->plug; if (plug) { -- cgit v1.1 From b2c9cd3793e5878e301ec2219785a7b8ca402ef1 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 26 Jul 2011 16:09:03 -0700 Subject: fail_make_request: cleanup should_fail_request This changes should_fail_request() to more usable wrapper function of should_fail(). It can avoid putting #ifdef CONFIG_FAIL_MAKE_REQUEST in the middle of a function. Signed-off-by: Akinobu Mita Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-core.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f925581..b850bed 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1361,14 +1361,9 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static int should_fail_request(struct bio *bio) +static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { - struct hd_struct *part = bio->bi_bdev->bd_part; - - if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) - return should_fail(&fail_make_request, bio->bi_size); - - return 0; + return part->make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) @@ -1381,9 +1376,10 @@ late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ -static inline int should_fail_request(struct bio *bio) +static inline bool should_fail_request(struct hd_struct *part, + unsigned int bytes) { - return 0; + return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ @@ -1466,6 +1462,7 @@ static inline void __generic_make_request(struct bio *bio) old_dev = 0; do { char b[BDEVNAME_SIZE]; + struct hd_struct *part; q = bdev_get_queue(bio->bi_bdev); if (unlikely(!q)) { @@ -1489,7 +1486,10 @@ static inline void __generic_make_request(struct bio *bio) if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) goto end_io; - if (should_fail_request(bio)) + part = bio->bi_bdev->bd_part; + if (should_fail_request(part, bio->bi_size) || + should_fail_request(&part_to_disk(part)->part0, + bio->bi_size)) goto end_io; /* @@ -1704,11 +1704,9 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) if (blk_rq_check_limits(q, rq)) return -EIO; -#ifdef CONFIG_FAIL_MAKE_REQUEST - if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && - should_fail(&fail_make_request, blk_rq_bytes(rq))) + if (rq->rq_disk && + should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return -EIO; -#endif spin_lock_irqsave(q->queue_lock, flags); -- cgit v1.1 From aa387cc895672b00f807ad7c734a2defaf677712 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Sun, 31 Jul 2011 22:05:09 +0200 Subject: block: add bsg helper library This moves the FC classes bsg code to the block layer and makes it a lib so that other classes like iscsi and SAS can use it. It is helpful because working with the request queue, bios, creating scatterlists, etc are a pain that the LLD does not have to worry about with normal IOs and should not have to worry about for bsg requests. Signed-off-by: Mike Christie Signed-off-by: Jens Axboe --- block/Kconfig | 10 ++ block/Makefile | 1 + block/bsg-lib.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 308 insertions(+) create mode 100644 block/bsg-lib.c (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 60be1e0..e97934e 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -65,6 +65,16 @@ config BLK_DEV_BSG If unsure, say Y. +config BLK_DEV_BSGLIB + bool "Block layer SG support v4 helper lib" + default n + select BLK_DEV_BSG + help + Subsystems will normally enable this if needed. Users will not + normally need to manually enable this. + + If unsure, say N. + config BLK_DEV_INTEGRITY bool "Block layer data integrity support" ---help--- diff --git a/block/Makefile b/block/Makefile index 0fec4b3..514c6e4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/bsg-lib.c b/block/bsg-lib.c new file mode 100644 index 0000000..f8c0a61 --- /dev/null +++ b/block/bsg-lib.c @@ -0,0 +1,297 @@ +/* + * BSG helper library + * + * Copyright (C) 2008 James Smart, Emulex Corporation + * Copyright (C) 2011 Red Hat, Inc. All rights reserved. + * Copyright (C) 2011 Mike Christie + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#include +#include +#include +#include +#include +#include + +/** + * bsg_destroy_job - routine to teardown/delete a bsg job + * @job: bsg_job that is to be torn down + */ +static void bsg_destroy_job(struct bsg_job *job) +{ + put_device(job->dev); /* release reference for the request */ + + kfree(job->request_payload.sg_list); + kfree(job->reply_payload.sg_list); + kfree(job); +} + +/** + * bsg_job_done - completion routine for bsg requests + * @job: bsg_job that is complete + * @result: job reply result + * @reply_payload_rcv_len: length of payload recvd + * + * The LLD should call this when the bsg job has completed. + */ +void bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len) +{ + struct request *req = job->req; + struct request *rsp = req->next_rq; + int err; + + err = job->req->errors = result; + if (err < 0) + /* we're only returning the result field in the reply */ + job->req->sense_len = sizeof(u32); + else + job->req->sense_len = job->reply_len; + /* we assume all request payload was transferred, residual == 0 */ + req->resid_len = 0; + + if (rsp) { + WARN_ON(reply_payload_rcv_len > rsp->resid_len); + + /* set reply (bidi) residual */ + rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len); + } + blk_complete_request(req); +} +EXPORT_SYMBOL_GPL(bsg_job_done); + +/** + * bsg_softirq_done - softirq done routine for destroying the bsg requests + * @rq: BSG request that holds the job to be destroyed + */ +static void bsg_softirq_done(struct request *rq) +{ + struct bsg_job *job = rq->special; + + blk_end_request_all(rq, rq->errors); + bsg_destroy_job(job); +} + +static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) +{ + size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments); + + BUG_ON(!req->nr_phys_segments); + + buf->sg_list = kzalloc(sz, GFP_KERNEL); + if (!buf->sg_list) + return -ENOMEM; + sg_init_table(buf->sg_list, req->nr_phys_segments); + buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->payload_len = blk_rq_bytes(req); + return 0; +} + +/** + * bsg_create_job - create the bsg_job structure for the bsg request + * @dev: device that is being sent the bsg request + * @req: BSG request that needs a job structure + */ +static int bsg_create_job(struct device *dev, struct request *req) +{ + struct request *rsp = req->next_rq; + struct request_queue *q = req->q; + struct bsg_job *job; + int ret; + + BUG_ON(req->special); + + job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL); + if (!job) + return -ENOMEM; + + req->special = job; + job->req = req; + if (q->bsg_job_size) + job->dd_data = (void *)&job[1]; + job->request = req->cmd; + job->request_len = req->cmd_len; + job->reply = req->sense; + job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer + * allocated */ + if (req->bio) { + ret = bsg_map_buffer(&job->request_payload, req); + if (ret) + goto failjob_rls_job; + } + if (rsp && rsp->bio) { + ret = bsg_map_buffer(&job->reply_payload, rsp); + if (ret) + goto failjob_rls_rqst_payload; + } + job->dev = dev; + /* take a reference for the request */ + get_device(job->dev); + return 0; + +failjob_rls_rqst_payload: + kfree(job->request_payload.sg_list); +failjob_rls_job: + kfree(job); + return -ENOMEM; +} + +/* + * bsg_goose_queue - restart queue in case it was stopped + * @q: request q to be restarted + */ +void bsg_goose_queue(struct request_queue *q) +{ + if (!q) + return; + + blk_run_queue_async(q); +} +EXPORT_SYMBOL_GPL(bsg_goose_queue); + +/** + * bsg_request_fn - generic handler for bsg requests + * @q: request queue to manage + * + * On error the create_bsg_job function should return a -Exyz error value + * that will be set to the req->errors. + * + * Drivers/subsys should pass this to the queue init function. + */ +void bsg_request_fn(struct request_queue *q) +{ + struct device *dev = q->queuedata; + struct request *req; + struct bsg_job *job; + int ret; + + if (!get_device(dev)) + return; + + while (1) { + req = blk_fetch_request(q); + if (!req) + break; + spin_unlock_irq(q->queue_lock); + + ret = bsg_create_job(dev, req); + if (ret) { + req->errors = ret; + blk_end_request_all(req, ret); + spin_lock_irq(q->queue_lock); + continue; + } + + job = req->special; + ret = q->bsg_job_fn(job); + spin_lock_irq(q->queue_lock); + if (ret) + break; + } + + spin_unlock_irq(q->queue_lock); + put_device(dev); + spin_lock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(bsg_request_fn); + +/** + * bsg_setup_queue - Create and add the bsg hooks so we can receive requests + * @dev: device to attach bsg device to + * @q: request queue setup by caller + * @name: device to give bsg device + * @job_fn: bsg job handler + * @dd_job_size: size of LLD data needed for each job + * + * The caller should have setup the reuqest queue with bsg_request_fn + * as the request_fn. + */ +int bsg_setup_queue(struct device *dev, struct request_queue *q, + char *name, bsg_job_fn *job_fn, int dd_job_size) +{ + int ret; + + q->queuedata = dev; + q->bsg_job_size = dd_job_size; + q->bsg_job_fn = job_fn; + queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); + blk_queue_softirq_done(q, bsg_softirq_done); + blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); + + ret = bsg_register_queue(q, dev, name, NULL); + if (ret) { + printk(KERN_ERR "%s: bsg interface failed to " + "initialize - register queue\n", dev->kobj.name); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(bsg_setup_queue); + +/** + * bsg_remove_queue - Deletes the bsg dev from the q + * @q: the request_queue that is to be torn down. + * + * Notes: + * Before unregistering the queue empty any requests that are blocked + */ +void bsg_remove_queue(struct request_queue *q) +{ + struct request *req; /* block request */ + int counts; /* totals for request_list count and starved */ + + if (!q) + return; + + /* Stop taking in new requests */ + spin_lock_irq(q->queue_lock); + blk_stop_queue(q); + + /* drain all requests in the queue */ + while (1) { + /* need the lock to fetch a request + * this may fetch the same reqeust as the previous pass + */ + req = blk_fetch_request(q); + /* save requests in use and starved */ + counts = q->rq.count[0] + q->rq.count[1] + + q->rq.starved[0] + q->rq.starved[1]; + spin_unlock_irq(q->queue_lock); + /* any requests still outstanding? */ + if (counts == 0) + break; + + /* This may be the same req as the previous iteration, + * always send the blk_end_request_all after a prefetch. + * It is not okay to not end the request because the + * prefetch started the request. + */ + if (req) { + /* return -ENXIO to indicate that this queue is + * going away + */ + req->errors = -ENXIO; + blk_end_request_all(req, -ENXIO); + } + + msleep(200); /* allow bsg to possibly finish */ + spin_lock_irq(q->queue_lock); + } + bsg_unregister_queue(q); +} +EXPORT_SYMBOL_GPL(bsg_remove_queue); -- cgit v1.1 From e5a94f56845bb4b272d82e84b5a1e2080b07ba82 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 1 Aug 2011 10:31:06 +0200 Subject: blk-throttle: correctly determine sync bio read request is always sync. Using rw_is_sync() to determine if a bio is sync. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f6a7941..a19f58c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -746,7 +746,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); - bool sync = bio->bi_rw & REQ_SYNC; + bool sync = rw_is_sync(bio->bi_rw); /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_size; @@ -1150,7 +1150,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) if (tg_no_rule_group(tg, rw)) { blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, - rw, bio->bi_rw & REQ_SYNC); + rw, rw_is_sync(bio->bi_rw)); rcu_read_unlock(); return 0; } -- cgit v1.1 From a5395b83b78f62ccf5e3af854aacd025c2a6e7b5 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 2 Aug 2011 09:24:09 +0200 Subject: cfq-iosched: Reduce linked group count upon group destruction FQ keeps track of number of groups which are linked on blkcg->blkg_list. This is useful to avoid races between queue exit and cgroup exit code paths. So if at the request queue exit time linked group count is not zero, that means there are some group out there which is yet to be deleted under rcu read period and queue exit code should wait for on rcu period. In my previous patch I forgot to decrease the number of group count. So in current form, we nr_blkcg_linked_grps is always non-zero and we will always wait one rcu period (if BLK_CGROUP=y). The side effect of this is that it can increase boot time. I am surprised, nobody complained so far. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1f96ad6..6508345 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1209,6 +1209,9 @@ static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) hlist_del_init(&cfqg->cfqd_node); + BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); + cfqd->nr_blkcg_linked_grps--; + /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. -- cgit v1.1 From e2a5429ff7947ad251310376384f449297b7492a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Aug 2011 10:43:35 +0200 Subject: bsg-lib: add module.h include Due to conflicts with the moduleh tree in linux-next, we run into an include file mess. We really need export.h in that tree, but if we add module.h locally then the issue is easier to resolve. Reported-by: Stephen Rothwell Signed-off-by: Jens Axboe --- block/bsg-lib.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/bsg-lib.c b/block/bsg-lib.c index f8c0a61..6690e6e 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /** -- cgit v1.1 From f95fe9cfb49f6e625fbb5888cae2ed6f3a276b89 Mon Sep 17 00:00:00 2001 From: Herbert Poetzl Date: Tue, 2 Aug 2011 12:43:50 +0200 Subject: block/genhd.c: remove useless cast in diskstats_show() Remove the (unsigned long long) cast in diskstats_show() and adjusts the seq_printf() format string to 'unsigned long' diskstats_show() uses part_stat_read() to get the stats, which either accesses the specified field in the struct disk_stats directly (non SMP) or sums up the per CPU values in a variable of the same type as the field, so in any case the result will have the same type and range as the specified field which for all disk_stats entries is unsigned long Also, for unsigned long ranges the output of %lu should be identical to the one of %llu, so no change in the actual proc entry contents. Signed-off-by: Herbert Poetzl Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5cb51c5..e2f6790 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1146,17 +1146,17 @@ static int diskstats_show(struct seq_file *seqf, void *v) cpu = part_stat_lock(); part_round_stats(cpu, hd); part_stat_unlock(); - seq_printf(seqf, "%4d %7d %s %lu %lu %llu " - "%u %lu %lu %llu %u %u %u %u\n", + seq_printf(seqf, "%4d %7d %s %lu %lu %lu " + "%u %lu %lu %lu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), part_stat_read(hd, ios[READ]), part_stat_read(hd, merges[READ]), - (unsigned long long)part_stat_read(hd, sectors[READ]), + part_stat_read(hd, sectors[READ]), jiffies_to_msecs(part_stat_read(hd, ticks[READ])), part_stat_read(hd, ios[WRITE]), part_stat_read(hd, merges[WRITE]), - (unsigned long long)part_stat_read(hd, sectors[WRITE]), + part_stat_read(hd, sectors[WRITE]), jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), -- cgit v1.1 From dd48c085c1cdf9446f92826f1fd451167fb6c2fd Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 3 Aug 2011 16:21:01 -0700 Subject: fault-injection: add ability to export fault_attr in arbitrary directory init_fault_attr_dentries() is used to export fault_attr via debugfs. But it can only export it in debugfs root directory. Per Forlin is working on mmc_fail_request which adds support to inject data errors after a completed host transfer in MMC subsystem. The fault_attr for mmc_fail_request should be defined per mmc host and export it in debugfs directory per mmc host like /sys/kernel/debug/mmc0/mmc_fail_request. init_fault_attr_dentries() doesn't help for mmc_fail_request. So this introduces fault_create_debugfs_attr() which is able to create a directory in the arbitrary directory and replace init_fault_attr_dentries(). [akpm@linux-foundation.org: extraneous semicolon, per Randy] Signed-off-by: Akinobu Mita Tested-by: Per Forlin Cc: Jens Axboe Cc: Christoph Lameter Cc: Pekka Enberg Cc: Matt Mackall Cc: Randy Dunlap Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-core.c | 6 ++++-- block/blk-timeout.c | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b850bed..b627558 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1368,8 +1368,10 @@ static bool should_fail_request(struct hd_struct *part, unsigned int bytes) static int __init fail_make_request_debugfs(void) { - return init_fault_attr_dentries(&fail_make_request, - "fail_make_request"); + struct dentry *dir = fault_create_debugfs_attr("fail_make_request", + NULL, &fail_make_request); + + return IS_ERR(dir) ? PTR_ERR(dir) : 0; } late_initcall(fail_make_request_debugfs); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 4f0c06c..7803548 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -28,7 +28,10 @@ int blk_should_fake_timeout(struct request_queue *q) static int __init fail_io_timeout_debugfs(void) { - return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout"); + struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", + NULL, &fail_io_timeout); + + return IS_ERR(dir) ? PTR_ERR(dir) : 0; } late_initcall(fail_io_timeout_debugfs); -- cgit v1.1 From 35ae66e0a09ab70ed588e65f26b4c725cd1656b6 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 5 Aug 2011 09:37:10 +0200 Subject: block: Make rq_affinity = 1 work as expected Commit 5757a6d76c introduced a new rq_affinity = 2 so as to make the request completed in the __make_request cpu. But it makes the old rq_affinity = 1 not work any more. The root cause is that if the 'cpu' and 'req->cpu' is in the same group and cpu != req->cpu, ccpu will be the same as group_cpu, so the completion will be excuted in the 'cpu' not 'group_cpu'. This patch fix problem by simpling removing group_cpu and the codes are more explicit now. If ccpu == cpu, we complete in cpu, otherwise we raise_blk_irq to ccpu. Cc: Christoph Hellwig Cc: Roland Dreier Cc: Dan Williams Cc: Jens Axboe Signed-off-by: Tao Ma Reviewed-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-softirq.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 475fab8..487addc 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -103,7 +103,7 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = { void __blk_complete_request(struct request *req) { - int ccpu, cpu, group_cpu = NR_CPUS; + int ccpu, cpu; struct request_queue *q = req->q; unsigned long flags; @@ -117,14 +117,12 @@ void __blk_complete_request(struct request *req) */ if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) { ccpu = req->cpu; - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) ccpu = blk_cpu_to_group(ccpu); - group_cpu = blk_cpu_to_group(cpu); - } } else ccpu = cpu; - if (ccpu == cpu || ccpu == group_cpu) { + if (ccpu == cpu) { struct list_head *list; do_local: list = &__get_cpu_var(blk_cpu_done); -- cgit v1.1 From fa1bf42ff9296ac4cf211b0a1b450a6071d26a95 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 9 Aug 2011 20:32:09 +0200 Subject: allow blk_flush_policy to return REQ_FSEQ_DATA independent of *FLUSH blk_insert_flush has the following check: /* * If there's data but flush is not necessary, the request can be * processed directly without going through flush machinery. Queue * for normal execution. */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { list_add_tail(&rq->queuelist, &q->queue_head); return; } However, blk_flush_policy will not return with policy set to only REQ_FSEQ_DATA: static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) { unsigned int policy = 0; if (fflags & REQ_FLUSH) { if (rq->cmd_flags & REQ_FLUSH) policy |= REQ_FSEQ_PREFLUSH; if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) policy |= REQ_FSEQ_POSTFLUSH; } return policy; } Notice that REQ_FSEQ_DATA is only set if REQ_FLUSH is set. Fix this mismatch by moving the setting of REQ_FSEQ_DATA outside of the REQ_FLUSH check. Tejun notes: Hmmm... yes, this can become a correctness issue if (and only if) blk_queue_flush() is called to change q->flush_flags while requests are in-flight; otherwise, requests wouldn't reach the function at all. Also, I think it would be a generally good idea to always set FSEQ_DATA if the request has data. Cheers, Jeff Signed-off-by: Jeff Moyer Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-flush.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index bb21e4c..2d162bd 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,11 +95,12 @@ static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) { unsigned int policy = 0; + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + if (fflags & REQ_FLUSH) { if (rq->cmd_flags & REQ_FLUSH) policy |= REQ_FSEQ_PREFLUSH; - if (blk_rq_sectors(rq)) - policy |= REQ_FSEQ_DATA; if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) policy |= REQ_FSEQ_POSTFLUSH; } -- cgit v1.1 From bcf30e75b773b60379338768677a1301ef602ff9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Aug 2011 10:39:04 +0200 Subject: block: improve rq_affinity placement This patch reverts commit 35ae66e0a09ab70ed(block: Make rq_affinity = 1 work as expected). The purpose is to avoid an unnecessary IPI. Let's take an example. My test box has cpu 0-7, one socket. Say request is added from CPU 1, blk_complete_request() occurs at CPU 7. Without the reverted patch, softirq will be done at CPU 7. With it, an IPI will be directed to CPU 0, and softirq will be done at CPU 0. In this case, doing softirq at CPU 0 and CPU 7 have no difference from cache sharing point view and we can avoid an ipi if doing it in CPU 7. An immediate concern is this is just like QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is running in interrupt handler, and currently I/O controller doesn't support multiple interrupts (I checked several LSI cards and AHCI), so only one CPU can run blk_complete_request(). This is still quite different as QUEUE_FLAG_SAME_FORCE. Since only one CPU runs softirq, the only difference with below patch is softirq not always runs at the first CPU of a group. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-softirq.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 487addc..58340d0 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -103,7 +103,7 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = { void __blk_complete_request(struct request *req) { - int ccpu, cpu; + int ccpu, cpu, group_cpu = NR_CPUS; struct request_queue *q = req->q; unsigned long flags; @@ -117,12 +117,22 @@ void __blk_complete_request(struct request *req) */ if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) { ccpu = req->cpu; - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { ccpu = blk_cpu_to_group(ccpu); + group_cpu = blk_cpu_to_group(cpu); + } } else ccpu = cpu; - if (ccpu == cpu) { + /* + * If current CPU and requested CPU are in the same group, running + * softirq in current CPU. One might concern this is just like + * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is + * running in interrupt handler, and currently I/O controller doesn't + * support multiple interrupts, so current CPU is unique actually. This + * avoids IPI sending from current CPU to the first CPU of a group. + */ + if (ccpu == cpu || ccpu == group_cpu) { struct list_head *list; do_local: list = &__get_cpu_var(blk_cpu_done); -- cgit v1.1 From 4853abaae7e4a2af938115ce9071ef8684fb7af4 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Mon, 15 Aug 2011 21:37:25 +0200 Subject: block: fix flush machinery for stacking drivers with differring flush flags Commit ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae, block: reimplement FLUSH/FUA to support merge, introduced a performance regression when running any sort of fsyncing workload using dm-multipath and certain storage (in our case, an HP EVA). The test I ran was fs_mark, and it dropped from ~800 files/sec on ext4 to ~100 files/sec. It turns out that dm-multipath always advertised flush+fua support, and passed commands on down the stack, where those flags used to get stripped off. The above commit changed that behavior: static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; while (1) { - while (!list_empty(&q->queue_head)) { + if (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || - (rq->cmd_flags & REQ_FLUSH_SEQ)) - return rq; - rq = blk_do_flush(q, rq); - if (rq) - return rq; + return rq; } Note that previously, a command would come in here, have REQ_FLUSH|REQ_FUA set, and then get handed off to blk_do_flush: struct request *blk_do_flush(struct request_queue *q, struct request *rq) { unsigned int fflags = q->flush_flags; /* may change, cache it */ bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH); bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA); unsigned skip = 0; ... if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { rq->cmd_flags &= ~REQ_FLUSH; if (!has_fua) rq->cmd_flags &= ~REQ_FUA; return rq; } So, the flush machinery was bypassed in such cases (q->flush_flags == 0 && rq->cmd_flags & (REQ_FLUSH|REQ_FUA)). Now, however, we don't get into the flush machinery at all. Instead, __elv_next_request just hands a request with flush and fua bits set to the scsi_request_fn, even if the underlying request_queue does not support flush or fua. The agreed upon approach is to fix the flush machinery to allow stacking. While this isn't used in practice (since there is only one request-based dm target, and that target will now reflect the flush flags of the underlying device), it does future-proof the solution, and make it function as designed. In order to make this work, I had to add a field to the struct request, inside the flush structure (to store the original req->end_io). Shaohua had suggested overloading the union with rb_node and completion_data, but the completion data is used by device mapper and can also be used by other drivers. So, I didn't see a way around the additional field. I tested this patch on an HP EVA with both ext4 and xfs, and it recovers the lost performance. Comments and other testers, as always, are appreciated. Cheers, Jeff Signed-off-by: Jeff Moyer Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++-- block/blk-flush.c | 20 ++++++++++++++++---- block/blk.h | 2 ++ 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b850bed..7c59b0f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1700,6 +1700,7 @@ EXPORT_SYMBOL_GPL(blk_rq_check_limits); int blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned long flags; + int where = ELEVATOR_INSERT_BACK; if (blk_rq_check_limits(q, rq)) return -EIO; @@ -1716,7 +1717,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - add_acct_request(q, rq, ELEVATOR_INSERT_BACK); + if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA)) + where = ELEVATOR_INSERT_FLUSH; + + add_acct_request(q, rq, where); spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -2273,7 +2277,7 @@ static bool blk_end_bidi_request(struct request *rq, int error, * %false - we are done with this request * %true - still buffers pending for this request **/ -static bool __blk_end_bidi_request(struct request *rq, int error, +bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) diff --git a/block/blk-flush.c b/block/blk-flush.c index 2d162bd..491eb30 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -123,7 +123,7 @@ static void blk_flush_restore_request(struct request *rq) /* make @rq a normal request */ rq->cmd_flags &= ~REQ_FLUSH_SEQ; - rq->end_io = NULL; + rq->end_io = rq->flush.saved_end_io; } /** @@ -301,9 +301,6 @@ void blk_insert_flush(struct request *rq) unsigned int fflags = q->flush_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); - BUG_ON(rq->end_io); - BUG_ON(!rq->bio || rq->bio != rq->biotail); - /* * @policy now records what operations need to be done. Adjust * REQ_FLUSH and FUA for the driver. @@ -313,6 +310,19 @@ void blk_insert_flush(struct request *rq) rq->cmd_flags &= ~REQ_FUA; /* + * An empty flush handed down from a stacking driver may + * translate into nothing if the underlying device does not + * advertise a write-back cache. In this case, simply + * complete the request. + */ + if (!policy) { + __blk_end_bidi_request(rq, 0, 0, 0); + return; + } + + BUG_ON(!rq->bio || rq->bio != rq->biotail); + + /* * If there's data but flush is not necessary, the request can be * processed directly without going through flush machinery. Queue * for normal execution. @@ -320,6 +330,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { list_add_tail(&rq->queuelist, &q->queue_head); + blk_run_queue_async(q); return; } @@ -330,6 +341,7 @@ void blk_insert_flush(struct request *rq) memset(&rq->flush, 0, sizeof(rq->flush)); INIT_LIST_HEAD(&rq->flush.list); rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = flush_data_end_io; blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); diff --git a/block/blk.h b/block/blk.h index d658628..20b900a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -17,6 +17,8 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); +bool __blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes); void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); -- cgit v1.1 From b53d1ed734a2b9af8da115b836b658daa7d47a48 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Aug 2011 08:34:48 +0200 Subject: Revert "cfq: Remove special treatment for metadata rqs." We have a kernel build regression since 3.1-rc1, which is about 10% regression. The kernel source is in an ext3 filesystem. Alex Shi bisect it to commit: commit a07405b7802691d29ab3b23bdc76ee6d006aad0b Author: Justin TerAvest Date: Sun Jul 10 22:09:19 2011 +0200 cfq: Remove special treatment for metadata rqs. Apparently this is caused by lack metadata preemption, where ext3/ext4 do use READ_META. I didn't see a way to fix the issue, so suggest reverting the patch. This reverts commit a07405b7802691d29ab3b23bdc76ee6d006aad0b. Reported-by: Alex Shi Reported-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6508345..a33bd43 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -130,6 +130,8 @@ struct cfq_queue { unsigned long slice_end; long slice_resid; + /* pending metadata requests */ + int meta_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -682,6 +684,9 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq_is_sync(rq1) != rq_is_sync(rq2)) return rq_is_sync(rq1) ? rq1 : rq2; + if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) + return rq1->cmd_flags & REQ_META ? rq1 : rq2; + s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); @@ -1607,6 +1612,10 @@ static void cfq_remove_request(struct request *rq) cfqq->cfqd->rq_queued--; cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), rq_is_sync(rq)); + if (rq->cmd_flags & REQ_META) { + WARN_ON(!cfqq->meta_pending); + cfqq->meta_pending--; + } } static int cfq_merge(struct request_queue *q, struct request **req, @@ -3360,6 +3369,13 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, return true; /* + * So both queues are sync. Let the new request get disk time if + * it's a metadata request and the current queue is doing regular IO. + */ + if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) + return true; + + /* * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. */ if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) @@ -3423,6 +3439,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic = RQ_CIC(rq); cfqd->rq_queued++; + if (rq->cmd_flags & REQ_META) + cfqq->meta_pending++; cfq_update_io_thinktime(cfqd, cfqq, cic); cfq_update_io_seektime(cfqd, cfqq, rq); -- cgit v1.1 From 65299a3b788bd274bed92f9fa3232082c9f3ea70 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 14:50:29 +0200 Subject: block: separate priority boosting from REQ_META Add a new REQ_PRIO to let requests preempt others in the cfq I/O schedule, and lave REQ_META purely for marking requests as metadata in blktrace. All existing callers of REQ_META except for XFS are updated to also set REQ_PRIO for now. Signed-off-by: Christoph Hellwig Reviewed-by: Namhyung Kim Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a33bd43..16ace89 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -130,8 +130,8 @@ struct cfq_queue { unsigned long slice_end; long slice_resid; - /* pending metadata requests */ - int meta_pending; + /* pending priority requests */ + int prio_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -684,8 +684,8 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq_is_sync(rq1) != rq_is_sync(rq2)) return rq_is_sync(rq1) ? rq1 : rq2; - if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) - return rq1->cmd_flags & REQ_META ? rq1 : rq2; + if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) + return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); @@ -1612,9 +1612,9 @@ static void cfq_remove_request(struct request *rq) cfqq->cfqd->rq_queued--; cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), rq_is_sync(rq)); - if (rq->cmd_flags & REQ_META) { - WARN_ON(!cfqq->meta_pending); - cfqq->meta_pending--; + if (rq->cmd_flags & REQ_PRIO) { + WARN_ON(!cfqq->prio_pending); + cfqq->prio_pending--; } } @@ -3372,7 +3372,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. */ - if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) + if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) return true; /* @@ -3439,8 +3439,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic = RQ_CIC(rq); cfqd->rq_queued++; - if (rq->cmd_flags & REQ_META) - cfqq->meta_pending++; + if (rq->cmd_flags & REQ_PRIO) + cfqq->prio_pending++; cfq_update_io_thinktime(cfqd, cfqq, cic); cfq_update_io_seektime(cfqd, cfqq, rq); -- cgit v1.1 From d27769ec3df1a8de9ca450d2dcd72d1ab259ba32 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Aug 2011 20:01:04 +0200 Subject: block: add GENHD_FL_NO_PART_SCAN There are cases where suppressing partition scan is useful - e.g. for lo devices and pseudo SATA devices which advertise to be a disk but get upset on partition scan (some port multiplier control devices show such behavior). This patch adds GENHD_FL_NO_PART_SCAN which suppresses partition scan regardless of the number of possible partitions. disk_partitionable() is renamed to disk_part_scan_enabled() as suppressing partition scan doesn't imply the device can't be partitioned using BLKPG_ADD/DEL_PARTITION calls from userland. show_partition() now directly tests disk_max_parts() to maintain backward-compatibility. -v2: Updated to make it clear that only partition scan is suppressed not partitioning itself as suggested by Kay Sievers. Signed-off-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- block/ioctl.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5cb51c5..2429ecb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -536,7 +536,7 @@ void register_disk(struct gendisk *disk) disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); /* No minors to use for partitions */ - if (!disk_partitionable(disk)) + if (!disk_part_scan_enabled(disk)) goto exit; /* No such device (e.g., media were just removed) */ @@ -841,7 +841,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_partitionable(sgp) && + if (!get_capacity(sgp) || (!disk_max_parts(sgp) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/block/ioctl.c b/block/ioctl.c index 1124cd2..5c74efc 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -101,7 +101,7 @@ static int blkdev_reread_part(struct block_device *bdev) struct gendisk *disk = bdev->bd_disk; int res; - if (!disk_partitionable(disk) || bdev != bdev->bd_contains) + if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; -- cgit v1.1 From e8037d49835482c9534a9a07bed0d0ea831135ae Mon Sep 17 00:00:00 2001 From: Eric Seppanen Date: Tue, 23 Aug 2011 21:25:12 +0200 Subject: block: Fix queue_flag update when rq_affinity goes from 2 to 1 Commit 5757a6d76cdf added the QUEUE_FLAG_SAME_FORCE flag, but fails to clear that flag when the current state is '2' (SAME_COMP + SAME_FORCE) and the new state is '1' (SAME_COMP). Acked-by: Dan Williams Reviewed-by: Roland Dreier Signed-off-by: Eric Seppanen Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0ee17b5..e681805 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -258,11 +258,13 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) ret = queue_var_store(&val, page, count); spin_lock_irq(q->queue_lock); - if (val) { + if (val == 2) { queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - if (val == 2) - queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); - } else { + queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 1) { + queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 0) { queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } -- cgit v1.1 From a63271627521b825b0dd0a564e9a9c62b4c1ca89 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 24 Aug 2011 16:04:32 +0200 Subject: block: change force plug flush call order Do blk_flush_plug_list() first and then add new request aDo blk_flush_plug_list() first and then add new request aDo blk_flush_plug_list() first and then add new request at the tail. New request can't be merged to existing requests, but later new requests might be merged with this new one. If blk_flush_plug_list() is done later, the merge doesn't happen. Believe it or not, this fixes a 10% regression running sysbench workload. Signed-off-by: Shaohua Li Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 90e1ffd..67dba69 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1302,11 +1302,11 @@ get_rq: if (__rq->q != q) plug->should_sort = 1; } - list_add_tail(&req->queuelist, &plug->list); - plug->count++; - drive_stat_acct(req, 1); if (plug->count >= BLK_MAX_REQUEST_COUNT) blk_flush_plug_list(plug, false); + plug->count++; + list_add_tail(&req->queuelist, &plug->list); + drive_stat_acct(req, 1); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); -- cgit v1.1 From 56ebdaf2fa3c5276be201c5d1aff1490b682ecf2 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 24 Aug 2011 16:04:34 +0200 Subject: block: simplify force plug flush code a little bit Cleaning up the code a little bit. attempt_plug_merge() traverses the plug list anyway, we can do the request counting there, so stack size is reduced a little bit. The motivation here is I suspect if we should count the requests for each queue (task could handle multiple disks in the meantime), but my test doesn't show it's worthy doing. If somebody proves we should do it, below change will make that more easier. Signed-off-by: Shaohua Li Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 67dba69..b2ed78a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1167,7 +1167,7 @@ static bool bio_attempt_front_merge(struct request_queue *q, * true if merge was successful, otherwise false. */ static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, - struct bio *bio) + struct bio *bio, unsigned int *request_count) { struct blk_plug *plug; struct request *rq; @@ -1176,10 +1176,13 @@ static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, plug = tsk->plug; if (!plug) goto out; + *request_count = 0; list_for_each_entry_reverse(rq, &plug->list, queuelist) { int el_ret; + (*request_count)++; + if (rq->q != q) continue; @@ -1219,6 +1222,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; struct request *req; + unsigned int request_count = 0; /* * low level driver can indicate that it wants pages above a @@ -1237,7 +1241,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (attempt_plug_merge(current, q, bio)) + if (attempt_plug_merge(current, q, bio, &request_count)) goto out; spin_lock_irq(q->queue_lock); @@ -1302,9 +1306,8 @@ get_rq: if (__rq->q != q) plug->should_sort = 1; } - if (plug->count >= BLK_MAX_REQUEST_COUNT) + if (request_count >= BLK_MAX_REQUEST_COUNT) blk_flush_plug_list(plug, false); - plug->count++; list_add_tail(&req->queuelist, &plug->list); drive_stat_acct(req, 1); } else { @@ -2634,7 +2637,6 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->cb_list); plug->should_sort = 0; - plug->count = 0; /* * If this is a nested plug, don't actually assign it. It will be @@ -2718,7 +2720,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; list_splice_init(&plug->list, &list); - plug->count = 0; if (plug->should_sort) { list_sort(NULL, &list, plug_rq_cmp); -- cgit v1.1 From a72c5e5eb738033938ab30d6a634b74d1d060f10 Mon Sep 17 00:00:00 2001 From: Nao Nishijima Date: Thu, 25 Aug 2011 18:04:06 +0900 Subject: [SCSI] genhd: add a new attribute "alias" in gendisk This patch allows the user to set an "alias" of the disk via sysfs interface. This patch only adds a new attribute "alias" in gendisk structure. To show the alias instead of the device name in kernel messages, we need to revise printk messages and use alias_name() in them. Example: (current) printk("disk name is %s\n", disk->disk_name); (new) printk("disk name is %s\n", alias_name(disk)); Users can use alphabets, numbers, '-' and '_' in "alias" attribute. A disk can have an "alias" which length is up to 255 bytes. This attribute is write-once. Suggested-by: James Bottomley Suggested-by: Jon Masters Signed-off-by: Nao Nishijima Signed-off-by: James Bottomley --- block/genhd.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index e2f6790..94855a9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "blk.h" @@ -909,6 +910,74 @@ static int __init genhd_device_init(void) subsys_initcall(genhd_device_init); +static ssize_t alias_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + ssize_t ret = 0; + + if (disk->alias) + ret = snprintf(buf, ALIAS_LEN, "%s\n", disk->alias); + return ret; +} + +static ssize_t alias_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + char *alias; + char *envp[] = { NULL, NULL }; + unsigned char c; + int i; + ssize_t ret = count; + + if (!count) + return -EINVAL; + + if (count >= ALIAS_LEN) { + printk(KERN_ERR "alias: alias is too long\n"); + return -EINVAL; + } + + /* Validation check */ + for (i = 0; i < count; i++) { + c = buf[i]; + if (i == count - 1 && c == '\n') + break; + if (!isalnum(c) && c != '_' && c != '-') { + printk(KERN_ERR "alias: invalid alias\n"); + return -EINVAL; + } + } + + if (disk->alias) { + printk(KERN_INFO "alias: %s is already assigned (%s)\n", + disk->disk_name, disk->alias); + return -EINVAL; + } + + alias = kasprintf(GFP_KERNEL, "%s", buf); + if (!alias) + return -ENOMEM; + + if (alias[count - 1] == '\n') + alias[count - 1] = '\0'; + + envp[0] = kasprintf(GFP_KERNEL, "ALIAS=%s", alias); + if (!envp[0]) { + kfree(alias); + return -ENOMEM; + } + + disk->alias = alias; + printk(KERN_INFO "alias: assigned %s to %s\n", alias, disk->disk_name); + + kobject_uevent_env(&dev->kobj, KOBJ_ADD, envp); + + kfree(envp[0]); + return ret; +} + static ssize_t disk_range_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -968,6 +1037,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } +static DEVICE_ATTR(alias, S_IRUGO|S_IWUSR, alias_show, alias_store); static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); @@ -990,6 +1060,7 @@ static struct device_attribute dev_attr_fail_timeout = #endif static struct attribute *disk_attrs[] = { + &dev_attr_alias.attr, &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, -- cgit v1.1 From 484fc254b88257a2d8b3759aa062e8e8b35e0988 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Thu, 8 Sep 2011 12:32:14 +0200 Subject: elevator: use ELV_NAME_MAX instead of magic number 16 for chosen_elevator We have ELV_NAME_MAX defined to 16, and hence we should use it instead of the magic nubmer 16 for elevator's name string. Signed-off-by: Wang Sheng-Hui Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index a3b64bc..cb332cb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -182,7 +182,7 @@ static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, eq->elevator_data = data; } -static char chosen_elevator[16]; +static char chosen_elevator[ELV_NAME_MAX]; static int __init elevator_setup(char *str) { -- cgit v1.1 From 166e1f901b01872e8b70733a3f2e2c6980389cf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Sep 2011 12:08:27 +0200 Subject: block: export __make_request Avoid the hacks need for request based device mappers currently by simply exporting the symbol instead of trying to get it through the back door. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b627558..56ef387 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -38,8 +38,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); -static int __make_request(struct request_queue *q, struct bio *bio); - /* * For the allocated request tables */ @@ -1213,7 +1211,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -static int __make_request(struct request_queue *q, struct bio *bio) +int __make_request(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1317,6 +1315,7 @@ out_unlock: out: return 0; } +EXPORT_SYMBOL_GPL(__make_request); /* for device mapper only */ /* * If bio->bi_dev is a partition, remap the location -- cgit v1.1 From c20e8de27fef9f59869c81c288ad6cf28200e00c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Sep 2011 12:03:37 +0200 Subject: block: rename __make_request() to blk_queue_bio() Now that it's exported, lets put it in a more sane namespace. Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 56ef387..ab673f0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -540,7 +540,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, /* * This also sets hw/phys segments, boundary and size */ - blk_queue_make_request(q, __make_request); + blk_queue_make_request(q, blk_queue_bio); q->sg_reserved_size = INT_MAX; @@ -1211,7 +1211,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -int __make_request(struct request_queue *q, struct bio *bio) +int blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1315,7 +1315,7 @@ out_unlock: out: return 0; } -EXPORT_SYMBOL_GPL(__make_request); /* for device mapper only */ +EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ /* * If bio->bi_dev is a partition, remap the location -- cgit v1.1 From 5a7bbad27a410350e64a2d7f5ec18fc73836c14f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Sep 2011 12:12:01 +0200 Subject: block: remove support for bio remapping from ->make_request There is very little benefit in allowing to let a ->make_request instance update the bios device and sector and loop around it in __generic_make_request when we can archive the same through calling generic_make_request from the driver and letting the loop in generic_make_request handle it. Note that various drivers got the return value from ->make_request and returned non-zero values for errors. Signed-off-by: Christoph Hellwig Acked-by: NeilBrown Signed-off-by: Jens Axboe --- block/blk-core.c | 153 ++++++++++++++++++++++--------------------------------- 1 file changed, 62 insertions(+), 91 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ab673f0..f58e019 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1211,7 +1211,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -int blk_queue_bio(struct request_queue *q, struct bio *bio) +void blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1236,7 +1236,7 @@ int blk_queue_bio(struct request_queue *q, struct bio *bio) * any locks. */ if (attempt_plug_merge(current, q, bio)) - goto out; + return; spin_lock_irq(q->queue_lock); @@ -1312,8 +1312,6 @@ get_rq: out_unlock: spin_unlock_irq(q->queue_lock); } -out: - return 0; } EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ @@ -1441,112 +1439,85 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) static inline void __generic_make_request(struct bio *bio) { struct request_queue *q; - sector_t old_sector; - int ret, nr_sectors = bio_sectors(bio); - dev_t old_dev; + int nr_sectors = bio_sectors(bio); int err = -EIO; + char b[BDEVNAME_SIZE]; + struct hd_struct *part; might_sleep(); if (bio_check_eod(bio, nr_sectors)) goto end_io; - /* - * Resolve the mapping until finished. (drivers are - * still free to implement/resolve their own stacking - * by explicitly returning 0) - * - * NOTE: we don't repeat the blk_size check for each new device. - * Stacking drivers are expected to know what they are doing. - */ - old_sector = -1; - old_dev = 0; - do { - char b[BDEVNAME_SIZE]; - struct hd_struct *part; - - q = bdev_get_queue(bio->bi_bdev); - if (unlikely(!q)) { - printk(KERN_ERR - "generic_make_request: Trying to access " - "nonexistent block-device %s (%Lu)\n", - bdevname(bio->bi_bdev, b), - (long long) bio->bi_sector); - goto end_io; - } - - if (unlikely(!(bio->bi_rw & REQ_DISCARD) && - nr_sectors > queue_max_hw_sectors(q))) { - printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) - goto end_io; - - part = bio->bi_bdev->bd_part; - if (should_fail_request(part, bio->bi_size) || - should_fail_request(&part_to_disk(part)->part0, - bio->bi_size)) - goto end_io; + q = bdev_get_queue(bio->bi_bdev); + if (unlikely(!q)) { + printk(KERN_ERR + "generic_make_request: Trying to access " + "nonexistent block-device %s (%Lu)\n", + bdevname(bio->bi_bdev, b), + (long long) bio->bi_sector); + goto end_io; + } - /* - * If this device has partitions, remap block n - * of partition p to block n+start(p) of the disk. - */ - blk_partition_remap(bio); + if (unlikely(!(bio->bi_rw & REQ_DISCARD) && + nr_sectors > queue_max_hw_sectors(q))) { + printk(KERN_ERR "bio too big device %s (%u > %u)\n", + bdevname(bio->bi_bdev, b), + bio_sectors(bio), + queue_max_hw_sectors(q)); + goto end_io; + } - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) - goto end_io; + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + goto end_io; - if (old_sector != -1) - trace_block_bio_remap(q, bio, old_dev, old_sector); + part = bio->bi_bdev->bd_part; + if (should_fail_request(part, bio->bi_size) || + should_fail_request(&part_to_disk(part)->part0, + bio->bi_size)) + goto end_io; - old_sector = bio->bi_sector; - old_dev = bio->bi_bdev->bd_dev; + /* + * If this device has partitions, remap block n + * of partition p to block n+start(p) of the disk. + */ + blk_partition_remap(bio); - if (bio_check_eod(bio, nr_sectors)) - goto end_io; + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) + goto end_io; - /* - * Filter flush bio's early so that make_request based - * drivers without flush support don't have to worry - * about them. - */ - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { - bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); - if (!nr_sectors) { - err = 0; - goto end_io; - } - } + if (bio_check_eod(bio, nr_sectors)) + goto end_io; - if ((bio->bi_rw & REQ_DISCARD) && - (!blk_queue_discard(q) || - ((bio->bi_rw & REQ_SECURE) && - !blk_queue_secdiscard(q)))) { - err = -EOPNOTSUPP; + /* + * Filter flush bio's early so that make_request based + * drivers without flush support don't have to worry + * about them. + */ + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); + if (!nr_sectors) { + err = 0; goto end_io; } + } - if (blk_throtl_bio(q, &bio)) - goto end_io; - - /* - * If bio = NULL, bio has been throttled and will be submitted - * later. - */ - if (!bio) - break; - - trace_block_bio_queue(q, bio); + if ((bio->bi_rw & REQ_DISCARD) && + (!blk_queue_discard(q) || + ((bio->bi_rw & REQ_SECURE) && + !blk_queue_secdiscard(q)))) { + err = -EOPNOTSUPP; + goto end_io; + } - ret = q->make_request_fn(q, bio); - } while (ret); + if (blk_throtl_bio(q, &bio)) + goto end_io; + /* if bio = NULL, bio has been throttled and will be submitted later. */ + if (!bio) + return; + trace_block_bio_queue(q, bio); + q->make_request_fn(q, bio); return; end_io: -- cgit v1.1 From 8ad6a56f5679a987bfeacad1bd818a2a381aa98e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 14 Sep 2011 09:31:01 +0200 Subject: block: Don't check QUEUE_FLAG_SAME_COMP in __blk_complete_request In __blk_complete_request, we check both QUEUE_FLAG_SAME_COMP and req->cpu to decide whether we should use req->cpu. Actually the user can also select the complete cpu by either setting BIO_CPU_AFFINE or by calling bio_set_completion_cpu. Current solution makes these 2 ways don't work any more. So we'd better just check req->cpu. Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 58340d0..1366a89 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -115,7 +115,7 @@ void __blk_complete_request(struct request *req) /* * Select completion CPU */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) { + if (req->cpu != -1) { ccpu = req->cpu; if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { ccpu = blk_cpu_to_group(ccpu); -- cgit v1.1 From 27a84d54c02591e815d291ae0ee4bfb9cfd21065 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2011 14:01:40 +0200 Subject: block: refactor generic_make_request Move all the checks performed on a bio into a new helper, and call it as soon as bio is submitted even if it is a re-submission from ->make_request. We explicitly mark the new helper as beeing non-inlined as the stack usage for printing the block device name in the failure case is quite high and this a patch where we have to be extremely conservative about stack usage. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 95 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 46 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f58e019..684d7eb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1412,31 +1412,8 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; } -/** - * generic_make_request - hand a buffer to its device driver for I/O - * @bio: The bio describing the location in memory and on the device. - * - * generic_make_request() is used to make I/O requests of block - * devices. It is passed a &struct bio, which describes the I/O that needs - * to be done. - * - * generic_make_request() does not return any status. The - * success/failure status of the request, along with notification of - * completion, is delivered asynchronously through the bio->bi_end_io - * function described (one day) else where. - * - * The caller of generic_make_request must make sure that bi_io_vec - * are set to describe the memory buffer, and that bi_dev and bi_sector are - * set to describe the device address, and the - * bi_end_io and optionally bi_private are set to describe how - * completion notification should be signaled. - * - * generic_make_request and the drivers it calls may use bi_next if this - * bio happens to be merged with someone else, and may change bi_dev and - * bi_sector for remaps as it sees fit. So the values of these fields - * should NOT be depended on after the call to generic_make_request. - */ -static inline void __generic_make_request(struct bio *bio) +static noinline_for_stack bool +generic_make_request_checks(struct bio *bio) { struct request_queue *q; int nr_sectors = bio_sectors(bio); @@ -1515,35 +1492,62 @@ static inline void __generic_make_request(struct bio *bio) /* if bio = NULL, bio has been throttled and will be submitted later. */ if (!bio) - return; + return false; + trace_block_bio_queue(q, bio); - q->make_request_fn(q, bio); - return; + return true; end_io: bio_endio(bio, err); + return false; } -/* - * We only want one ->make_request_fn to be active at a time, - * else stack usage with stacked devices could be a problem. - * So use current->bio_list to keep a list of requests - * submited by a make_request_fn function. - * current->bio_list is also used as a flag to say if - * generic_make_request is currently active in this task or not. - * If it is NULL, then no make_request is active. If it is non-NULL, - * then a make_request is active, and new requests should be added - * at the tail +/** + * generic_make_request - hand a buffer to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. + * + * generic_make_request() is used to make I/O requests of block + * devices. It is passed a &struct bio, which describes the I/O that needs + * to be done. + * + * generic_make_request() does not return any status. The + * success/failure status of the request, along with notification of + * completion, is delivered asynchronously through the bio->bi_end_io + * function described (one day) else where. + * + * The caller of generic_make_request must make sure that bi_io_vec + * are set to describe the memory buffer, and that bi_dev and bi_sector are + * set to describe the device address, and the + * bi_end_io and optionally bi_private are set to describe how + * completion notification should be signaled. + * + * generic_make_request and the drivers it calls may use bi_next if this + * bio happens to be merged with someone else, and may resubmit the bio to + * a lower device by calling into generic_make_request recursively, which + * means the bio should NOT be touched after the call to ->make_request_fn. */ void generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + if (!generic_make_request_checks(bio)) + return; + + /* + * We only want one ->make_request_fn to be active at a time, else + * stack usage with stacked devices could be a problem. So use + * current->bio_list to keep a list of requests submited by a + * make_request_fn function. current->bio_list is also used as a + * flag to say if generic_make_request is currently active in this + * task or not. If it is NULL, then no make_request is active. If + * it is non-NULL, then a make_request is active, and new requests + * should be added at the tail + */ if (current->bio_list) { - /* make_request is active */ bio_list_add(current->bio_list, bio); return; } + /* following loop may be a bit non-obvious, and so deserves some * explanation. * Before entering the loop, bio->bi_next is NULL (as all callers @@ -1551,22 +1555,21 @@ void generic_make_request(struct bio *bio) * We pretend that we have just taken it off a longer list, so * we assign bio_list to a pointer to the bio_list_on_stack, * thus initialising the bio_list of new bios to be - * added. __generic_make_request may indeed add some more bios + * added. ->make_request() may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio * of the top of the list (no pretending) and so remove it from - * bio_list, and call into __generic_make_request again. - * - * The loop was structured like this to make only one call to - * __generic_make_request (which is important as it is large and - * inlined) and to keep the structure simple. + * bio_list, and call into ->make_request() again. */ BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack); current->bio_list = &bio_list_on_stack; do { - __generic_make_request(bio); + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + q->make_request_fn(q, bio); + bio = bio_list_pop(current->bio_list); } while (bio); current->bio_list = NULL; /* deactivate */ -- cgit v1.1 From 75df713627f28f88b901b329c8857747545fd4ab Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Wed, 21 Sep 2011 10:00:16 +0200 Subject: block: document blk-plug Thus spake Andrew Morton: "And I have the usual maintainability whine. If someone comes up to vmscan.c and sees it calling blk_start_plug(), how are they supposed to work out why that call is there? They go look at the blk_start_plug() definition and it is undocumented. I think we can do better than this?" Adapted from the LWN article - http://lwn.net/Articles/438256/ by Jens Axboe and from an earlier attempt by Shaohua Li to document blk-plug. [akpm@linux-foundation.org: grammatical and spelling tweaks] Signed-off-by: Suresh Jayaraman Cc: Shaohua Li Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 684d7eb..97e9e54 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2595,6 +2595,20 @@ EXPORT_SYMBOL(kblockd_schedule_delayed_work); #define PLUG_MAGIC 0x91827364 +/** + * blk_start_plug - initialize blk_plug and track it inside the task_struct + * @plug: The &struct blk_plug that needs to be initialized + * + * Description: + * Tracking blk_plug inside the task_struct will help with auto-flushing the + * pending I/O should the task end up blocking between blk_start_plug() and + * blk_finish_plug(). This is important from a performance perspective, but + * also ensures that we don't deadlock. For instance, if the task is blocking + * for a memory allocation, memory reclaim could end up wanting to free a + * page belonging to that request that is currently residing in our private + * plug. By flushing the pending I/O when the process goes to sleep, we avoid + * this kind of deadlock. + */ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; -- cgit v1.1 From 499337bb6511e665a236a6a947f819d98ea340c6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 21 Sep 2011 10:01:22 +0200 Subject: block/blk-sysfs.c: fix kerneldoc references The kerneldoc for blk_release_queue() is referring to blk_cleanup_queue(). Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0ee17b5..adc923e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -455,11 +455,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, } /** - * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed - * @kobj: the kobj belonging of the request queue to be released + * blk_release_queue: - release a &struct request_queue when it is no longer needed + * @kobj: the kobj belonging to the request queue to be released * * Description: - * blk_cleanup_queue is the pair to blk_init_queue() or + * blk_release_queue is the pair to blk_init_queue() or * blk_queue_make_request(). It should be called when a request queue is * being released; typically when a block device is being de-registered. * Currently, its primary task it to free all the &struct request -- cgit v1.1 From d11bb4462c4cc6ddd45c6927c617ad79fa6fb8fc Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Wed, 21 Sep 2011 10:22:10 +0200 Subject: blk-cgroup: be able to remove the record of unplugged device The bug is we're not able to remove the device from blkio cgroup's per-device control files if it gets unplugged. To reproduce the bug: # mount -t cgroup -o blkio xxx /cgroup # cd /cgroup # echo "8:0 1000" > blkio.throttle.read_bps_device # unplug the device # cat blkio.throttle.read_bps_device 8:0 1000 # echo "8:0 0" > blkio.throttle.read_bps_device -bash: echo: write error: No such device After patching, the device removal will succeed. Thanks for the comments of Paul, Zefan, and Vivek. Signed-off-by: Wanlong Gao Cc: Li Zefan Cc: Paul Menage Acked-by: Vivek Goyal Cc: Jens Axboe Cc: Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bcaf16e..b596e54 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -785,10 +785,10 @@ static int blkio_policy_parse_and_set(char *buf, { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; - unsigned long major, minor, temp; + unsigned long major, minor; int i = 0; dev_t dev; - u64 bps, iops; + u64 temp; memset(s, 0, sizeof(s)); @@ -826,20 +826,23 @@ static int blkio_policy_parse_and_set(char *buf, dev = MKDEV(major, minor); - ret = blkio_check_dev_num(dev); + ret = strict_strtoull(s[1], 10, &temp); if (ret) - return ret; + return -EINVAL; - newpn->dev = dev; + /* For rule removal, do not check for device presence. */ + if (temp) { + ret = blkio_check_dev_num(dev); + if (ret) + return ret; + } - if (s[1] == NULL) - return -EINVAL; + newpn->dev = dev; switch (plid) { case BLKIO_POLICY_PROP: - ret = strict_strtoul(s[1], 10, &temp); - if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || - temp > BLKIO_WEIGHT_MAX) + if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) return -EINVAL; newpn->plid = plid; @@ -850,26 +853,18 @@ static int blkio_policy_parse_and_set(char *buf, switch(fileid) { case BLKIO_THROTL_read_bps_device: case BLKIO_THROTL_write_bps_device: - ret = strict_strtoull(s[1], 10, &bps); - if (ret) - return -EINVAL; - newpn->plid = plid; newpn->fileid = fileid; - newpn->val.bps = bps; + newpn->val.bps = temp; break; case BLKIO_THROTL_read_iops_device: case BLKIO_THROTL_write_iops_device: - ret = strict_strtoull(s[1], 10, &iops); - if (ret) - return -EINVAL; - - if (iops > THROTL_IOPS_MAX) + if (temp > THROTL_IOPS_MAX) return -EINVAL; newpn->plid = plid; newpn->fileid = fileid; - newpn->val.iops = (unsigned int)iops; + newpn->val.iops = (unsigned int)temp; break; } break; -- cgit v1.1 From 777eb1bf15b8532c396821774bf6451e563438f5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 28 Sep 2011 08:07:01 -0600 Subject: block: Free queue resources at blk_release_queue() A kernel crash is observed when a mounted ext3/ext4 filesystem is physically removed. The problem is that blk_cleanup_queue() frees up some resources eg by calling elevator_exit(), which are not checked for in normal operation. So we should rather move these calls to the destructor function blk_release_queue() as at that point all remaining references are gone. However, in doing so we have to ensure that any externally supplied queue_lock is disconnected as the driver might free up the lock after the call of blk_cleanup_queue(), Signed-off-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 13 ++++++------- block/blk-sysfs.c | 5 +++++ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b2ed78a..d34433a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -348,9 +348,10 @@ void blk_put_queue(struct request_queue *q) EXPORT_SYMBOL(blk_put_queue); /* - * Note: If a driver supplied the queue lock, it should not zap that lock - * unexpectedly as some queue cleanup components like elevator_exit() and - * blk_throtl_exit() need queue lock. + * Note: If a driver supplied the queue lock, it is disconnected + * by this function. The actual state of the lock doesn't matter + * here as the request_queue isn't accessible after this point + * (QUEUE_FLAG_DEAD is set) and no other requests will be queued. */ void blk_cleanup_queue(struct request_queue *q) { @@ -367,10 +368,8 @@ void blk_cleanup_queue(struct request_queue *q) queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); mutex_unlock(&q->sysfs_lock); - if (q->elevator) - elevator_exit(q->elevator); - - blk_throtl_exit(q); + if (q->queue_lock != &q->__queue_lock) + q->queue_lock = &q->__queue_lock; blk_put_queue(q); } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e681805..60fda88 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -479,6 +479,11 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); + if (q->elevator) + elevator_exit(q->elevator); + + blk_throtl_exit(q); + if (rl->rq_pool) mempool_destroy(rl->rq_pool); -- cgit v1.1 From 523e1d399ce0e23bec562abe2b2f8d297af81161 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:31:07 +0200 Subject: block: make gendisk hold a reference to its queue The following command sequence triggers an oops. # mount /dev/sdb1 /mnt # echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete # umount /mnt general protection fault: 0000 [#1] PREEMPT SMP CPU 2 Modules linked in: Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs RIP: 0010:[] [] __lock_acquire+0x389/0x1d60 ... Call Trace: [] lock_acquire+0x95/0x140 [] _raw_spin_lock+0x3b/0x50 [] bdi_lock_two+0x5c/0x70 [] bdev_inode_switch_bdi+0x4c/0xf0 [] __blkdev_put+0x11b/0x1d0 [] __blkdev_put+0x160/0x1d0 [] blkdev_put+0x5f/0x190 [] kill_block_super+0x4d/0x80 [] deactivate_locked_super+0x45/0x70 [] deactivate_super+0x4a/0x70 [] mntput_no_expire+0xed/0x130 [] sys_umount+0x7e/0x3a0 [] system_call_fastpath+0x16/0x1b This is because bdev holds on to disk but disk doesn't pin the associated queue. If a SCSI device is removed while the device is still open, the sdev puts the base reference to the queue on release. When the bdev is finally released, the associated queue is already gone along with the bdi and bdev_inode_switch_bdi() ends up dereferencing already freed bdi. Even if it were not for this bug, disk not holding onto the associated queue is very unusual and error-prone. Fix it by making add_disk() take an extra reference to its queue and put it on disk_release() and ensuring that disk and its fops owner are put in that order after all accesses to the disk and queue are complete. Signed-off-by: Tejun Heo Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index e2f6790..d261b73 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk) register_disk(disk); blk_register_queue(disk); + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. + */ + WARN_ON_ONCE(blk_get_queue(disk->queue)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -1095,6 +1101,8 @@ static void disk_release(struct device *dev) disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); free_part_info(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); kfree(disk); } struct class block_class = { -- cgit v1.1 From ece84241b93c4693bfe0a8ec9a043a16d216d0cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:31:15 +0200 Subject: block: fix genhd refcounting in blkio_policy_parse_and_set() blkio_policy_parse_and_set() calls blkio_check_dev_num() to check whether the given dev_t is valid. blkio_check_dev_num() uses get_gendisk() for verification but never puts the returned genhd leaking the reference. This patch collapses blkio_check_dev_num() into its caller and updates it such that the genhd is put before returning. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 56 ++++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b596e54..d61ec56 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -768,25 +768,14 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, return disk_total; } -static int blkio_check_dev_num(dev_t dev) -{ - int part = 0; - struct gendisk *disk; - - disk = get_gendisk(dev, &part); - if (!disk || part) - return -ENODEV; - - return 0; -} - static int blkio_policy_parse_and_set(char *buf, struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) { + struct gendisk *disk = NULL; char *s[4], *p, *major_s = NULL, *minor_s = NULL; - int ret; unsigned long major, minor; - int i = 0; + int i = 0, ret = -EINVAL; + int part; dev_t dev; u64 temp; @@ -804,37 +793,36 @@ static int blkio_policy_parse_and_set(char *buf, } if (i != 2) - return -EINVAL; + goto out; p = strsep(&s[0], ":"); if (p != NULL) major_s = p; else - return -EINVAL; + goto out; minor_s = s[0]; if (!minor_s) - return -EINVAL; + goto out; - ret = strict_strtoul(major_s, 10, &major); - if (ret) - return -EINVAL; + if (strict_strtoul(major_s, 10, &major)) + goto out; - ret = strict_strtoul(minor_s, 10, &minor); - if (ret) - return -EINVAL; + if (strict_strtoul(minor_s, 10, &minor)) + goto out; dev = MKDEV(major, minor); - ret = strict_strtoull(s[1], 10, &temp); - if (ret) - return -EINVAL; + if (strict_strtoull(s[1], 10, &temp)) + goto out; /* For rule removal, do not check for device presence. */ if (temp) { - ret = blkio_check_dev_num(dev); - if (ret) - return ret; + disk = get_gendisk(dev, &part); + if (!disk || part) { + ret = -ENODEV; + goto out; + } } newpn->dev = dev; @@ -843,7 +831,7 @@ static int blkio_policy_parse_and_set(char *buf, case BLKIO_POLICY_PROP: if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || temp > BLKIO_WEIGHT_MAX) - return -EINVAL; + goto out; newpn->plid = plid; newpn->fileid = fileid; @@ -860,7 +848,7 @@ static int blkio_policy_parse_and_set(char *buf, case BLKIO_THROTL_read_iops_device: case BLKIO_THROTL_write_iops_device: if (temp > THROTL_IOPS_MAX) - return -EINVAL; + goto out; newpn->plid = plid; newpn->fileid = fileid; @@ -871,8 +859,10 @@ static int blkio_policy_parse_and_set(char *buf, default: BUG(); } - - return 0; + ret = 0; +out: + put_disk(disk); + return ret; } unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, -- cgit v1.1 From bc9fcbf9cb8ec76d340da16fbf48a9a316e14c52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:31:18 +0200 Subject: block: move blk_throtl prototypes to block/blk.h blk_throtl interface is block internal and there's no reason to have them in linux/blkdev.h. Move them to block/blk.h. This patch doesn't introduce any functional change. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 1 + block/blk.h | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a19f58c..f3f495e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -10,6 +10,7 @@ #include #include #include "blk-cgroup.h" +#include "blk.h" /* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; diff --git a/block/blk.h b/block/blk.h index 20b900a..da247ba 100644 --- a/block/blk.h +++ b/block/blk.h @@ -188,4 +188,17 @@ static inline int blk_do_io_stat(struct request *rq) (rq->cmd_flags & REQ_DISCARD)); } -#endif +#ifdef CONFIG_BLK_DEV_THROTTLING +extern int blk_throtl_bio(struct request_queue *q, struct bio **bio); +extern int blk_throtl_init(struct request_queue *q); +extern void blk_throtl_exit(struct request_queue *q); +#else /* CONFIG_BLK_DEV_THROTTLING */ +static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) +{ + return 0; +} +static inline int blk_throtl_init(struct request_queue *q) { return 0; } +static inline void blk_throtl_exit(struct request_queue *q) { } +#endif /* CONFIG_BLK_DEV_THROTTLING */ + +#endif /* BLK_INTERNAL_H */ -- cgit v1.1 From 75eb6c372d41d6d140b893873f6687d78c987a44 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:31:22 +0200 Subject: block: pass around REQ_* flags instead of broken down booleans during request alloc/free blk_alloc_request() and freed_request() take different combinations of REQ_* @flags, @priv and @is_sync when @flags is superset of the latter two. Make them take @flags only. This cleans up the code a bit and will ease updating allocation related REQ_* flags. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 79e41a7..a3d2fdc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -574,7 +574,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) } static struct request * -blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) +blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -585,12 +585,10 @@ blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) rq->cmd_flags = flags | REQ_ALLOCED; - if (priv) { - if (unlikely(elv_set_request(q, rq, gfp_mask))) { - mempool_free(rq, q->rq.rq_pool); - return NULL; - } - rq->cmd_flags |= REQ_ELVPRIV; + if ((flags & REQ_ELVPRIV) && + unlikely(elv_set_request(q, rq, gfp_mask))) { + mempool_free(rq, q->rq.rq_pool); + return NULL; } return rq; @@ -649,12 +647,13 @@ static void __freed_request(struct request_queue *q, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_queue *q, int sync, int priv) +static void freed_request(struct request_queue *q, unsigned int flags) { struct request_list *rl = &q->rq; + int sync = rw_is_sync(flags); rl->count[sync]--; - if (priv) + if (flags & REQ_ELVPRIV) rl->elvpriv--; __freed_request(q, sync); @@ -694,7 +693,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, struct request_list *rl = &q->rq; struct io_context *ioc = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; - int may_queue, priv = 0; + int may_queue; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -738,17 +737,17 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->count[is_sync]++; rl->starved[is_sync] = 0; - if (blk_rq_should_init_elevator(bio)) { - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) - rl->elvpriv++; + if (blk_rq_should_init_elevator(bio) && + !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { + rw_flags |= REQ_ELVPRIV; + rl->elvpriv++; } if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); + rq = blk_alloc_request(q, rw_flags, gfp_mask); if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. Undo anything @@ -758,7 +757,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * wait queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(q, is_sync, priv); + freed_request(q, rw_flags); /* * in the very unlikely event that allocation failed and no @@ -1050,14 +1049,13 @@ void __blk_put_request(struct request_queue *q, struct request *req) * it didn't come out of our reserved rq pools */ if (req->cmd_flags & REQ_ALLOCED) { - int is_sync = rq_is_sync(req) != 0; - int priv = req->cmd_flags & REQ_ELVPRIV; + unsigned int flags = req->cmd_flags; BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); blk_free_request(q, req); - freed_request(q, is_sync, priv); + freed_request(q, flags); } } EXPORT_SYMBOL_GPL(__blk_put_request); -- cgit v1.1 From 315fceee81155ef2aeed9316ca72aeea9347db5c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:31:25 +0200 Subject: block: drop unnecessary blk_get/put_queue() in scsi_cmd_ioctl() and blk_get_tg() blk_get/put_queue() in scsi_cmd_ioctl() and throtl_get_tg() are completely bogus. The caller must have a reference to the queue on entry and taking an extra reference doesn't change anything. For scsi_cmd_ioctl(), the only effect is that it ends up checking QUEUE_FLAG_DEAD on entry; however, this is bogus as queue can die right after blk_get_queue(). Dead queue should be and is handled in request issue path (it's somewhat broken now but that's a separate problem and doesn't affect this one much). throtl_get_tg() incorrectly assumes that q is rcu freed. Also, it doesn't check return value of blk_get_queue(). If the queue is already dead, it ends up doing an extra put. Drop them. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 8 +------- block/scsi_ioctl.c | 3 +-- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f3f495e..ecba5fc 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -324,12 +324,8 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) /* * Need to allocate a group. Allocation of group also needs allocation * of per cpu stats which in-turn takes a mutex() and can block. Hence - * we need to drop rcu lock and queue_lock before we call alloc - * - * Take the request queue reference to make sure queue does not - * go away once we return from allocation. + * we need to drop rcu lock and queue_lock before we call alloc. */ - blk_get_queue(q); rcu_read_unlock(); spin_unlock_irq(q->queue_lock); @@ -339,13 +335,11 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) * dead */ if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { - blk_put_queue(q); if (tg) kfree(tg); return ERR_PTR(-ENODEV); } - blk_put_queue(q); /* Group allocated and queue is still alive. take the lock */ spin_lock_irq(q->queue_lock); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 4f4230b..fbdf0d8 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -565,7 +565,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod { int err; - if (!q || blk_get_queue(q)) + if (!q) return -ENXIO; switch (cmd) { @@ -686,7 +686,6 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod err = -ENOTTY; } - blk_put_queue(q); return err; } EXPORT_SYMBOL(scsi_cmd_ioctl); -- cgit v1.1 From e3c78ca524d230bc145e902625e88c392a58ddf3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:32:38 +0200 Subject: block: reorganize queue draining Reorganize queue draining related code in preparation of queue exit changes. * Factor out actual draining from elv_quiesce_start() to blk_drain_queue(). * Make elv_quiesce_start/end() responsible for their own locking. * Replace open-coded ELVSWITCH clearing in elevator_switch() with elv_quiesce_end(). This patch doesn't cause any visible functional difference. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 28 ++++++++++++++++++++++++++++ block/blk.h | 1 + block/elevator.c | 37 +++++++++++-------------------------- 3 files changed, 40 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a3d2fdc..149149d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -28,6 +28,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -345,6 +346,33 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); +/** + * blk_drain_queue - drain requests from request_queue + * @q: queue to drain + * + * Drain ELV_PRIV requests from @q. The caller is responsible for ensuring + * that no new requests which need to be drained are queued. + */ +void blk_drain_queue(struct request_queue *q) +{ + while (true) { + int nr_rqs; + + spin_lock_irq(q->queue_lock); + + elv_drain_elevator(q); + + __blk_run_queue(q); + nr_rqs = q->rq.elvpriv; + + spin_unlock_irq(q->queue_lock); + + if (!nr_rqs) + break; + msleep(10); + } +} + /* * Note: If a driver supplied the queue lock, it is disconnected * by this function. The actual state of the lock doesn't matter diff --git a/block/blk.h b/block/blk.h index da247ba..2b66dc2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -15,6 +15,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio); +void blk_drain_queue(struct request_queue *q); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, diff --git a/block/elevator.c b/block/elevator.c index cb332cb..74a277f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -606,43 +605,35 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) void elv_drain_elevator(struct request_queue *q) { static int printed; + + lockdep_assert_held(q->queue_lock); + while (q->elevator->ops->elevator_dispatch_fn(q, 1)) ; - if (q->nr_sorted == 0) - return; - if (printed++ < 10) { + if (q->nr_sorted && printed++ < 10) { printk(KERN_ERR "%s: forced dispatching is broken " "(nr_sorted=%u), please report this\n", q->elevator->elevator_type->elevator_name, q->nr_sorted); } } -/* - * Call with queue lock held, interrupts disabled - */ void elv_quiesce_start(struct request_queue *q) { if (!q->elevator) return; + spin_lock_irq(q->queue_lock); queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); + spin_unlock_irq(q->queue_lock); - /* - * make sure we don't have any requests in flight - */ - elv_drain_elevator(q); - while (q->rq.elvpriv) { - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); - msleep(10); - spin_lock_irq(q->queue_lock); - elv_drain_elevator(q); - } + blk_drain_queue(q); } void elv_quiesce_end(struct request_queue *q) { + spin_lock_irq(q->queue_lock); queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); + spin_unlock_irq(q->queue_lock); } void __elv_add_request(struct request_queue *q, struct request *rq, int where) @@ -972,7 +963,6 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) /* * Turn on BYPASS and drain all requests w/ elevator private data */ - spin_lock_irq(q->queue_lock); elv_quiesce_start(q); /* @@ -983,8 +973,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) /* * attach and start new elevator */ + spin_lock_irq(q->queue_lock); elevator_attach(q, e, data); - spin_unlock_irq(q->queue_lock); if (old_elevator->registered) { @@ -999,9 +989,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) * finally exit old elevator and turn off BYPASS. */ elevator_exit(old_elevator); - spin_lock_irq(q->queue_lock); elv_quiesce_end(q); - spin_unlock_irq(q->queue_lock); blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); @@ -1015,10 +1003,7 @@ fail_register: elevator_exit(e); q->elevator = old_elevator; elv_register_queue(q); - - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); + elv_quiesce_end(q); return err; } -- cgit v1.1 From bc16a4f933bc5ed50826b20561e4c3515061998b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:33:01 +0200 Subject: block: reorganize throtl_get_tg() and blk_throtl_bio() blk_throtl_bio() and throtl_get_tg() have rather unusual interface. * throtl_get_tg() returns pointer to a valid tg or ERR_PTR(-ENODEV), and drops queue_lock in the latter case. Different locking context depending on return value is error-prone and DEAD state is scheduled to be protected by queue_lock anyway. Move DEAD check inside queue_lock and return valid tg or NULL. * blk_throtl_bio() indicates return status both with its return value and in/out param **@bio. The former is used to indicate whether queue is found to be dead during throtl processing. The latter whether the bio is throttled. There's no point in returning DEAD check result from blk_throtl_bio(). The queue can die after blk_throtl_bio() is finished but before make_request_fn() grabs queue lock. Make it take *@bio instead and return boolean result indicating whether the request is throttled or not. This patch doesn't cause any visible functional difference. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++------ block/blk-throttle.c | 49 +++++++++++++++++-------------------------------- block/blk.h | 6 +++--- 3 files changed, 22 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 149149d..6c491f2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1515,12 +1515,8 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (blk_throtl_bio(q, &bio)) - goto end_io; - - /* if bio = NULL, bio has been throttled and will be submitted later. */ - if (!bio) - return false; + if (blk_throtl_bio(q, bio)) + return false; /* throttled, will be resubmitted later */ trace_block_bio_queue(q, bio); return true; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index ecba5fc..900a0c9 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -303,10 +303,6 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) return tg; } -/* - * This function returns with queue lock unlocked in case of error, like - * request queue is no more - */ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) { struct throtl_grp *tg = NULL, *__tg = NULL; @@ -330,20 +326,16 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) spin_unlock_irq(q->queue_lock); tg = throtl_alloc_tg(td); - /* - * We might have slept in group allocation. Make sure queue is not - * dead - */ - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { - if (tg) - kfree(tg); - - return ERR_PTR(-ENODEV); - } /* Group allocated and queue is still alive. take the lock */ spin_lock_irq(q->queue_lock); + /* Make sure @q is still alive */ + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + kfree(tg); + return NULL; + } + /* * Initialize the new group. After sleeping, read the blkcg again. */ @@ -1118,17 +1110,17 @@ static struct blkio_policy_type blkio_policy_throtl = { .plid = BLKIO_POLICY_THROTL, }; -int blk_throtl_bio(struct request_queue *q, struct bio **biop) +bool blk_throtl_bio(struct request_queue *q, struct bio *bio) { struct throtl_data *td = q->td; struct throtl_grp *tg; - struct bio *bio = *biop; bool rw = bio_data_dir(bio), update_disptime = true; struct blkio_cgroup *blkcg; + bool throttled = false; if (bio->bi_rw & REQ_THROTTLED) { bio->bi_rw &= ~REQ_THROTTLED; - return 0; + goto out; } /* @@ -1147,7 +1139,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, rw_is_sync(bio->bi_rw)); rcu_read_unlock(); - return 0; + goto out; } } rcu_read_unlock(); @@ -1156,18 +1148,10 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) * Either group has not been allocated yet or it is not an unlimited * IO group */ - spin_lock_irq(q->queue_lock); tg = throtl_get_tg(td); - - if (IS_ERR(tg)) { - if (PTR_ERR(tg) == -ENODEV) { - /* - * Queue is gone. No queue lock held here. - */ - return -ENODEV; - } - } + if (unlikely(!tg)) + goto out_unlock; if (tg->nr_queued[rw]) { /* @@ -1195,7 +1179,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) * So keep on trimming slice even if bio is not queued. */ throtl_trim_slice(td, tg, rw); - goto out; + goto out_unlock; } queue_bio: @@ -1207,16 +1191,17 @@ queue_bio: tg->nr_queued[READ], tg->nr_queued[WRITE]); throtl_add_bio_tg(q->td, tg, bio); - *biop = NULL; + throttled = true; if (update_disptime) { tg_update_disptime(td, tg); throtl_schedule_next_dispatch(td); } -out: +out_unlock: spin_unlock_irq(q->queue_lock); - return 0; +out: + return throttled; } int blk_throtl_init(struct request_queue *q) diff --git a/block/blk.h b/block/blk.h index 2b66dc2..c018dba 100644 --- a/block/blk.h +++ b/block/blk.h @@ -190,13 +190,13 @@ static inline int blk_do_io_stat(struct request *rq) } #ifdef CONFIG_BLK_DEV_THROTTLING -extern int blk_throtl_bio(struct request_queue *q, struct bio **bio); +extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) +static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) { - return 0; + return false; } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } -- cgit v1.1 From da8303c63b8de73619884382d6e573d44aae0810 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:33:05 +0200 Subject: block: make get_request[_wait]() fail if queue is dead Currently get_request[_wait]() allocates request whether queue is dead or not. This patch makes get_request[_wait]() return NULL if @q is dead. blk_queue_bio() is updated to fail the submitted bio if request allocation fails. While at it, add docbook comments for get_request[_wait](). Note that the current code has rather unclear (there are spurious DEAD tests scattered around) assumption that the owner of a queue guarantees that no request travels block layer if the queue is dead and this patch in itself doesn't change much; however, this will allow fixing the broken assumption in the next patch. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 54 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 6c491f2..3508751 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -709,10 +709,19 @@ static bool blk_rq_should_init_elevator(struct bio *bio) return true; } -/* - * Get a free request, queue_lock must be held. - * Returns NULL on failure, with queue_lock held. - * Returns !NULL on success, with queue_lock *not held*. +/** + * get_request - get a free request + * @q: request_queue to allocate request from + * @rw_flags: RW and SYNC flags + * @bio: bio to allocate request for (can be %NULL) + * @gfp_mask: allocation mask + * + * Get a free request from @q. This function may fail under memory + * pressure or if @q is dead. + * + * Must be callled with @q->queue_lock held and, + * Returns %NULL on failure, with @q->queue_lock held. + * Returns !%NULL on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) @@ -723,6 +732,9 @@ static struct request *get_request(struct request_queue *q, int rw_flags, const bool is_sync = rw_is_sync(rw_flags) != 0; int may_queue; + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + return NULL; + may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; @@ -815,11 +827,18 @@ out: return rq; } -/* - * No available requests for this queue, wait for some requests to become - * available. +/** + * get_request_wait - get a free request with retry + * @q: request_queue to allocate request from + * @rw_flags: RW and SYNC flags + * @bio: bio to allocate request for (can be %NULL) * - * Called with q->queue_lock held, and returns with it unlocked. + * Get a free request from @q. This function keeps retrying under memory + * pressure and fails iff @q is dead. + * + * Must be callled with @q->queue_lock held and, + * Returns %NULL on failure, with @q->queue_lock held. + * Returns !%NULL on success, with @q->queue_lock *not held*. */ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct bio *bio) @@ -833,6 +852,9 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct io_context *ioc; struct request_list *rl = &q->rq; + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + return NULL; + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, TASK_UNINTERRUPTIBLE); @@ -863,19 +885,15 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { struct request *rq; - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) - return NULL; - BUG_ON(rw != READ && rw != WRITE); spin_lock_irq(q->queue_lock); - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_WAIT) rq = get_request_wait(q, rw, NULL); - } else { + else rq = get_request(q, rw, NULL, gfp_mask); - if (!rq) - spin_unlock_irq(q->queue_lock); - } + if (!rq) + spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ return rq; @@ -1299,6 +1317,10 @@ get_rq: * Returns with the queue unlocked. */ req = get_request_wait(q, rw_flags, bio); + if (unlikely(!req)) { + bio_endio(bio, -ENODEV); /* @q is dead */ + goto out_unlock; + } /* * After dropping the lock and possibly sleeping here, our request -- cgit v1.1 From bd87b5898a72b1aef6acf3705c61c9f6372adf0c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:33:08 +0200 Subject: block: drop @tsk from attempt_plug_merge() and explain sync rules attempt_plug_merge() accesses elevator without holding queue_lock and may call into ->elevator_bio_merge_fn(). The elvator is guaranteed to be valid because it's accessed iff the plugged list has requests and elevator is never exited with live requests, so as long as the elevator method can deal with unlocked access, this is safe. Explain the sync rules around attempt_plug_merge() and drop the unnecessary @tsk parameter. This patch doesn't introduce any functional change. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3508751..034cbb2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1203,18 +1203,32 @@ static bool bio_attempt_front_merge(struct request_queue *q, return true; } -/* - * Attempts to merge with the plugged list in the current process. Returns - * true if merge was successful, otherwise false. +/** + * attempt_plug_merge - try to merge with %current's plugged list + * @q: request_queue new bio is being queued at + * @bio: new bio being queued + * @request_count: out parameter for number of traversed plugged requests + * + * Determine whether @bio being queued on @q can be merged with a request + * on %current's plugged list. Returns %true if merge was successful, + * otherwise %false. + * + * This function is called without @q->queue_lock; however, elevator is + * accessed iff there already are requests on the plugged list which in + * turn guarantees validity of the elevator. + * + * Note that, on successful merge, elevator operation + * elevator_bio_merged_fn() will be called without queue lock. Elevator + * must be ready for this. */ -static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, - struct bio *bio, unsigned int *request_count) +static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count) { struct blk_plug *plug; struct request *rq; bool ret = false; - plug = tsk->plug; + plug = current->plug; if (!plug) goto out; *request_count = 0; @@ -1282,7 +1296,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (attempt_plug_merge(current, q, bio, &request_count)) + if (attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); -- cgit v1.1 From c9a929dde3913780b5c416f4bb9d9ed804f509ce Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:42:16 +0200 Subject: block: fix request_queue lifetime handling by making blk_queue_cleanup() properly shutdown request_queue is refcounted but actually depdends on lifetime management from the queue owner - on blk_cleanup_queue(), block layer expects that there's no request passing through request_queue and no new one will. This is fundamentally broken. The queue owner (e.g. SCSI layer) doesn't have a way to know whether there are other active users before calling blk_cleanup_queue() and other users (e.g. bsg) don't have any guarantee that the queue is and would stay valid while it's holding a reference. With delay added in blk_queue_bio() before queue_lock is grabbed, the following oops can be easily triggered when a device is removed with in-flight IOs. sd 0:0:1:0: [sdb] Stopping disk ata1.01: disabled general protection fault: 0000 [#1] PREEMPT SMP CPU 2 Modules linked in: Pid: 648, comm: test_rawio Not tainted 3.1.0-rc3-work+ #56 Bochs Bochs RIP: 0010:[] [] elv_rqhash_find+0x61/0x100 ... Process test_rawio (pid: 648, threadinfo ffff880019efa000, task ffff880019ef8a80) ... Call Trace: [] elv_merge+0x84/0xe0 [] blk_queue_bio+0xf4/0x400 [] generic_make_request+0xca/0x100 [] submit_bio+0x74/0x100 [] dio_bio_submit+0xbc/0xc0 [] __blockdev_direct_IO+0x92e/0xb40 [] blkdev_direct_IO+0x57/0x60 [] generic_file_aio_read+0x6d5/0x760 [] do_sync_read+0xda/0x120 [] vfs_read+0xc5/0x180 [] sys_pread64+0x9a/0xb0 [] system_call_fastpath+0x16/0x1b This happens because blk_queue_cleanup() destroys the queue and elevator whether IOs are in progress or not and DEAD tests are sprinkled in the request processing path without proper synchronization. Similar problem exists for blk-throtl. On queue cleanup, blk-throtl is shutdown whether it has requests in it or not. Depending on timing, it either oopses or throttled bios are lost putting tasks which are waiting for bio completion into eternal D state. The way it should work is having the usual clear distinction between shutdown and release. Shutdown drains all currently pending requests, marks the queue dead, and performs partial teardown of the now unnecessary part of the queue. Even after shutdown is complete, reference holders are still allowed to issue requests to the queue although they will be immmediately failed. The rest of teardown happens on release. This patch makes the following changes to make blk_queue_cleanup() behave as proper shutdown. * QUEUE_FLAG_DEAD is now set while holding both q->exit_mutex and queue_lock. * Unsynchronized DEAD check in generic_make_request_checks() removed. This couldn't make any meaningful difference as the queue could die after the check. * blk_drain_queue() updated such that it can drain all requests and is now called during cleanup. * blk_throtl updated such that it checks DEAD on grabbing queue_lock, drains all throttled bios during cleanup and free td when queue is released. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-core.c | 57 +++++++++++++++++++++++++++++++++------------------- block/blk-sysfs.c | 1 + block/blk-throttle.c | 50 +++++++++++++++++++++++++++++++++++++++------ block/blk.h | 6 +++++- block/elevator.c | 2 +- 5 files changed, 87 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 034cbb2..7e15235 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -349,11 +349,13 @@ EXPORT_SYMBOL(blk_put_queue); /** * blk_drain_queue - drain requests from request_queue * @q: queue to drain + * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV * - * Drain ELV_PRIV requests from @q. The caller is responsible for ensuring - * that no new requests which need to be drained are queued. + * Drain requests from @q. If @drain_all is set, all requests are drained. + * If not, only ELVPRIV requests are drained. The caller is responsible + * for ensuring that no new requests which need to be drained are queued. */ -void blk_drain_queue(struct request_queue *q) +void blk_drain_queue(struct request_queue *q, bool drain_all) { while (true) { int nr_rqs; @@ -361,9 +363,15 @@ void blk_drain_queue(struct request_queue *q) spin_lock_irq(q->queue_lock); elv_drain_elevator(q); + if (drain_all) + blk_throtl_drain(q); __blk_run_queue(q); - nr_rqs = q->rq.elvpriv; + + if (drain_all) + nr_rqs = q->rq.count[0] + q->rq.count[1]; + else + nr_rqs = q->rq.elvpriv; spin_unlock_irq(q->queue_lock); @@ -373,30 +381,40 @@ void blk_drain_queue(struct request_queue *q) } } -/* - * Note: If a driver supplied the queue lock, it is disconnected - * by this function. The actual state of the lock doesn't matter - * here as the request_queue isn't accessible after this point - * (QUEUE_FLAG_DEAD is set) and no other requests will be queued. +/** + * blk_cleanup_queue - shutdown a request queue + * @q: request queue to shutdown + * + * Mark @q DEAD, drain all pending requests, destroy and put it. All + * future requests will be failed immediately with -ENODEV. */ void blk_cleanup_queue(struct request_queue *q) { - /* - * We know we have process context here, so we can be a little - * cautious and ensure that pending block actions on this device - * are done before moving on. Going into this function, we should - * not have processes doing IO to this device. - */ - blk_sync_queue(q); + spinlock_t *lock = q->queue_lock; - del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); + /* mark @q DEAD, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); - mutex_unlock(&q->sysfs_lock); + + spin_lock_irq(lock); + queue_flag_set(QUEUE_FLAG_NOMERGES, q); + queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + queue_flag_set(QUEUE_FLAG_DEAD, q); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; + spin_unlock_irq(lock); + mutex_unlock(&q->sysfs_lock); + + /* drain all requests queued before DEAD marking */ + blk_drain_queue(q, true); + + /* @q won't process any more request, flush async actions */ + del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); + blk_sync_queue(q); + + /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -1509,9 +1527,6 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) - goto end_io; - part = bio->bi_bdev->bd_part; if (should_fail_request(part, bio->bi_size) || should_fail_request(&part_to_disk(part)->part0, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a8eff5f..e7f9f65 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -490,6 +490,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); + blk_throtl_release(q); blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 900a0c9..8edb949 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -309,6 +309,10 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) struct blkio_cgroup *blkcg; struct request_queue *q = td->queue; + /* no throttling for dead queue */ + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + return NULL; + rcu_read_lock(); blkcg = task_blkio_cgroup(current); tg = throtl_find_tg(td, blkcg); @@ -1001,11 +1005,6 @@ static void throtl_release_tgs(struct throtl_data *td) } } -static void throtl_td_free(struct throtl_data *td) -{ - kfree(td); -} - /* * Blk cgroup controller notification saying that blkio_group object is being * delinked as associated cgroup object is going away. That also means that @@ -1204,6 +1203,41 @@ out: return throttled; } +/** + * blk_throtl_drain - drain throttled bios + * @q: request_queue to drain throttled bios for + * + * Dispatch all currently throttled bios on @q through ->make_request_fn(). + */ +void blk_throtl_drain(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) +{ + struct throtl_data *td = q->td; + struct throtl_rb_root *st = &td->tg_service_tree; + struct throtl_grp *tg; + struct bio_list bl; + struct bio *bio; + + lockdep_is_held(q->queue_lock); + + bio_list_init(&bl); + + while ((tg = throtl_rb_first(st))) { + throtl_dequeue_tg(td, tg); + + while ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); + while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); + } + spin_unlock_irq(q->queue_lock); + + while ((bio = bio_list_pop(&bl))) + generic_make_request(bio); + + spin_lock_irq(q->queue_lock); +} + int blk_throtl_init(struct request_queue *q) { struct throtl_data *td; @@ -1276,7 +1310,11 @@ void blk_throtl_exit(struct request_queue *q) * it. */ throtl_shutdown_wq(q); - throtl_td_free(td); +} + +void blk_throtl_release(struct request_queue *q) +{ + kfree(q->td); } static int __init throtl_init(void) diff --git a/block/blk.h b/block/blk.h index c018dba..3f6551b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -15,7 +15,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio); -void blk_drain_queue(struct request_queue *q); +void blk_drain_queue(struct request_queue *q, bool drain_all); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, @@ -191,15 +191,19 @@ static inline int blk_do_io_stat(struct request *rq) #ifdef CONFIG_BLK_DEV_THROTTLING extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); +extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); +extern void blk_throtl_release(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) { return false; } +static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } +static inline void blk_throtl_release(struct request_queue *q) { } #endif /* CONFIG_BLK_DEV_THROTTLING */ #endif /* BLK_INTERNAL_H */ diff --git a/block/elevator.c b/block/elevator.c index 74a277f..66343d6 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -626,7 +626,7 @@ void elv_quiesce_start(struct request_queue *q) queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); spin_unlock_irq(q->queue_lock); - blk_drain_queue(q); + blk_drain_queue(q, false); } void elv_quiesce_end(struct request_queue *q) -- cgit v1.1 From e890413af4c2dfebf5432ef30cc70cb11dad3213 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Mon, 24 Oct 2011 16:08:38 +0200 Subject: block: fix a typo in the blk-cgroup.h file byptes -> bytes. Signed-off-by: Jie Liu Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index a71d290..6f3ace7 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -188,7 +188,7 @@ struct blkio_policy_node { union { unsigned int weight; /* - * Rate read/write in terms of byptes per second + * Rate read/write in terms of bytes per second * Whether this rate represents read or write is determined * by file type "fileid". */ -- cgit v1.1 From 9562ad9ab36df7ccef920d119f3b5100025db95f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 24 Oct 2011 16:11:30 +0200 Subject: block: Remove the control of complete cpu from bio. bio originally has the functionality to set the complete cpu, but it is broken. Chirstoph said that "This code is unused, and from the all the discussions lately pretty obviously broken. The only thing keeping it serves is creating more confusion and possibly more bugs." And Jens replied with "We can kill bio_set_completion_cpu(). I'm fine with leaving cpu control to the request based drivers, they are the only ones that can toggle the setting anyway". So this patch tries to remove all the work of controling complete cpu from a bio. Cc: Shaohua Li Cc: Christoph Hellwig Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 7e15235..da69793 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1276,7 +1276,6 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { - req->cpu = bio->bi_comp_cpu; req->cmd_type = REQ_TYPE_FS; req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; @@ -1362,8 +1361,7 @@ get_rq: */ init_request_from_bio(req, bio); - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) req->cpu = raw_smp_processor_id(); plug = current->plug; -- cgit v1.1 From 834f9f61a525d2f6d3d0c93894e26326c8d3ceed Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Mon, 17 Oct 2011 12:57:22 +0200 Subject: blk-flush: fix invalid BUG_ON in blk_insert_flush A user reported a regression due to commit 4853abaae7e4a2af938115ce9071ef8684fb7af4 (block: fix flush machinery for stacking drivers with differring flush flags). Part of the problem is that blk_insert_flush required a single bio be attached to the request. In reality, having no attached bio is also a valid case, as can be observed with an empty flush. [1] http://www.redhat.com/archives/dm-devel/2011-September/msg00154.html Reported-by: Christophe Saout Signed-off-by: Jeff Moyer Stable note: 3.1 Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 491eb30..89ae3b9 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -320,7 +320,7 @@ void blk_insert_flush(struct request *rq) return; } - BUG_ON(!rq->bio || rq->bio != rq->biotail); + BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ /* * If there's data but flush is not necessary, the request can be -- cgit v1.1 From e67b77c791ca2778198c9e7088f3266ed2da7a55 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Mon, 17 Oct 2011 12:57:23 +0200 Subject: blk-flush: move the queue kick into A dm-multipath user reported[1] a problem when trying to boot a kernel with commit 4853abaae7e4a2af938115ce9071ef8684fb7af4 (block: fix flush machinery for stacking drivers with differring flush flags) applied. It turns out that an empty flush request can be sent into blk_insert_flush. When the BUG_ON was fixed to allow for this, I/O on the underlying device would stall. The reason is that blk_insert_cloned_request does not kick the queue. In the aforementioned commit, I had added a special case to kick the queue if data was sent down but the queue flags did not require a flush. A better solution is to push the queue kick up into blk_insert_cloned_request. This patch, along with a follow-on which fixes the BUG_ON, fixes the issue reported. [1] http://www.redhat.com/archives/dm-devel/2011-September/msg00154.html Reported-by: Christophe Saout Signed-off-by: Jeff Moyer Acked-by: Tejun Heo Stable note: 3.1 Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-flush.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d34433a..795154e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1725,6 +1725,8 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) where = ELEVATOR_INSERT_FLUSH; add_acct_request(q, rq, where); + if (where == ELEVATOR_INSERT_FLUSH) + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); return 0; diff --git a/block/blk-flush.c b/block/blk-flush.c index 89ae3b9..720ad60 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -330,7 +330,6 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { list_add_tail(&rq->queuelist, &q->queue_head); - blk_run_queue_async(q); return; } -- cgit v1.1 From f992ae801a7dec34a4ed99a6598bbbbfb82af4fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 17 Oct 2011 13:42:43 +0200 Subject: block: make gendisk hold a reference to its queue The following command sequence triggers an oops. # mount /dev/sdb1 /mnt # echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete # umount /mnt general protection fault: 0000 [#1] PREEMPT SMP CPU 2 Modules linked in: Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs RIP: 0010:[] [] __lock_acquire+0x389/0x1d60 ... Call Trace: [] lock_acquire+0x95/0x140 [] _raw_spin_lock+0x3b/0x50 [] bdi_lock_two+0x5c/0x70 [] bdev_inode_switch_bdi+0x4c/0xf0 [] __blkdev_put+0x11b/0x1d0 [] __blkdev_put+0x160/0x1d0 [] blkdev_put+0x5f/0x190 [] kill_block_super+0x4d/0x80 [] deactivate_locked_super+0x45/0x70 [] deactivate_super+0x4a/0x70 [] mntput_no_expire+0xed/0x130 [] sys_umount+0x7e/0x3a0 [] system_call_fastpath+0x16/0x1b This is because bdev holds on to disk but disk doesn't pin the associated queue. If a SCSI device is removed while the device is still open, the sdev puts the base reference to the queue on release. When the bdev is finally released, the associated queue is already gone along with the bdi and bdev_inode_switch_bdi() ends up dereferencing already freed bdi. Even if it were not for this bug, disk not holding onto the associated queue is very unusual and error-prone. Fix it by making add_disk() take an extra reference to its queue and put it on disk_release() and ensuring that disk and its fops owner are put in that order after all accesses to the disk and queue are complete. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index e2f6790..d261b73 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk) register_disk(disk); blk_register_queue(disk); + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. + */ + WARN_ON_ONCE(blk_get_queue(disk->queue)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -1095,6 +1101,8 @@ static void disk_release(struct device *dev) disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); free_part_info(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); kfree(disk); } struct class block_class = { -- cgit v1.1 From 5e08159197b5b98a6648a172008de23f420e6c11 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 25 Oct 2011 10:20:05 +0200 Subject: block: warn if tag is greater than real_max_depth. In case tag depth is reduced, it is max_depth not real_max_depth. So we should allow a request with tag >= max_depth, but for a tag >= real_max_depth, there really should be some problem. Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-tag.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-tag.c b/block/blk-tag.c index ece65fc..e74d6d1 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -286,12 +286,14 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) BUG_ON(tag == -1); - if (unlikely(tag >= bqt->real_max_depth)) + if (unlikely(tag >= bqt->max_depth)) { /* * This can happen after tag depth has been reduced. - * FIXME: how about a warning or info message here? + * But tag shouldn't be larger than real_max_depth. */ + WARN_ON(tag >= bqt->real_max_depth); return; + } list_del_init(&rq->queuelist); rq->cmd_flags &= ~REQ_QUEUED; -- cgit v1.1 From e060f00beee23568fe2c4faf1e88ff22edefd7b2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 25 Oct 2011 15:48:12 +0200 Subject: blk-throttle: Free up policy node associated with deleted rule If a rule is being deleted, free up associated policy node. Otherwise that memory is leaked. Signed-off-by: Vivek Goyal Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d61ec56..6155367 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1075,6 +1075,7 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, if (blkio_delete_rule_command(newpn)) { blkio_policy_delete_node(pn); + kfree(pn); spin_unlock_irq(&blkcg->lock); goto update_io_group; } -- cgit v1.1 From a38eb630fa224d6fba8c14a4063174bc5e0f63bb Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 25 Oct 2011 15:48:12 +0200 Subject: blk-throttle: Take blkcg->lock while traversing blkcg->policy_list blkcg->policy_list is protected by blkcg->lock. Its not rcu protected list. So even for readers, they need to take blkcg->lock. There are few functions which were reading the list without taking lock. Fix it. Signed-off-by: Vivek Goyal Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 54 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6155367..8f630ce 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -869,60 +869,86 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + unsigned int weight; + + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); if (pn) - return pn->val.weight; + weight = pn->val.weight; else - return blkcg->weight; + weight = blkcg->weight; + + spin_unlock_irqrestore(&blkcg->lock, flags); + + return weight; } EXPORT_SYMBOL_GPL(blkcg_get_weight); uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + uint64_t bps = -1; + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, BLKIO_THROTL_read_bps_device); if (pn) - return pn->val.bps; - else - return -1; + bps = pn->val.bps; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return bps; } uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + uint64_t bps = -1; + + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, BLKIO_THROTL_write_bps_device); if (pn) - return pn->val.bps; - else - return -1; + bps = pn->val.bps; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return bps; } unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + unsigned int iops = -1; + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, BLKIO_THROTL_read_iops_device); if (pn) - return pn->val.iops; - else - return -1; + iops = pn->val.iops; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return iops; } unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) { struct blkio_policy_node *pn; + unsigned long flags; + unsigned int iops = -1; + + spin_lock_irqsave(&blkcg->lock, flags); pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, BLKIO_THROTL_write_iops_device); if (pn) - return pn->val.iops; - else - return -1; + iops = pn->val.iops; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return iops; } /* Checks whether user asked for deleting a policy rule */ -- cgit v1.1 From 334c2b0b8b2ab186fa198413386cba41fffcb4f2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Oct 2011 15:51:48 +0200 Subject: blk-throttle: use queue_is_locked() instead of lockdep_is_held() We can't use the latter if !CONFIG_LOCKDEP. Reported-by: Sedat Dilek Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8edb949..4553245 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1218,7 +1218,7 @@ void blk_throtl_drain(struct request_queue *q) struct bio_list bl; struct bio *bio; - lockdep_is_held(q->queue_lock); + WARN_ON_ONCE(!queue_is_locked(q)); bio_list_init(&bl); -- cgit v1.1 From d5decd3b9512e35c87492312a72443192eebdda9 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 26 May 2011 16:00:52 -0400 Subject: block: add export.h to files using EXPORT_SYMBOL/THIS_MODULE macros These files were getting via an implicit include path, but we want to crush those out of existence since they cost time during compiles of processing thousands of lines of headers for no reason. Give them the lightweight header that just contains the EXPORT_SYMBOL infrastructure. Signed-off-by: Paul Gortmaker --- block/blk-integrity.c | 1 + block/ioctl.c | 1 + 2 files changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 129b9e2..da2a818 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "blk.h" diff --git a/block/ioctl.c b/block/ioctl.c index 1124cd2..5554403 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include -- cgit v1.1 From 6adb1236b5c1220987209aa68192e0cbad73e9fc Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 28 Sep 2011 18:26:05 -0400 Subject: block: Change module.h -> export.h in bsg-lib.c This file isn't using full modular functionality, and hence can be "downgraded" to just using the export.h header. Reported-by: Stephen Rothwell Signed-off-by: Paul Gortmaker --- block/bsg-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 6690e6e..7ad49c8 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include /** -- cgit v1.1 From 6dd9ad7df2019b1e33a372a501907db293ebcd0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Nov 2011 18:52:11 +0100 Subject: block: don't call blk_drain_queue() if elevator is not up blk_cleanup_queue() may be called before elevator is set up on a queue which triggers the following oops. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] elv_drain_elevator+0x1c/0x70 ... Pid: 830, comm: kworker/0:2 Not tainted 3.1.0-next-20111025_64+ #1590 Bochs Bochs RIP: 0010:[] [] elv_drain_elevator+0x1c/0x70 ... Call Trace: [] blk_drain_queue+0x42/0x70 [] blk_cleanup_queue+0xd0/0x1c0 [] md_free+0x50/0x70 [] kobject_release+0x8b/0x1d0 [] kref_put+0x36/0xa0 [] kobject_put+0x27/0x60 [] mddev_delayed_delete+0x2f/0x40 [] process_one_work+0x100/0x3b0 [] worker_thread+0x15f/0x3a0 [] kthread+0x87/0x90 [] kernel_thread_helper+0x4/0x10 Fix it by making blk_cleanup_queue() check whether q->elevator is set up before invoking blk_drain_queue. Signed-off-by: Tejun Heo Reported-and-tested-by: Jiri Slaby Signed-off-by: Jens Axboe --- block/blk-core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f658711..f43c8a5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -407,8 +407,13 @@ void blk_cleanup_queue(struct request_queue *q) spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); - /* drain all requests queued before DEAD marking */ - blk_drain_queue(q, true); + /* + * Drain all requests queued before DEAD marking. The caller might + * be trying to tear down @q before its elevator is initialized, in + * which case we don't want to call into draining. + */ + if (q->elevator) + blk_drain_queue(q, true); /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); -- cgit v1.1 From d0985394e7fee6b25a7cc8335d45bc1c1a8ab2d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Nov 2011 09:03:55 +0100 Subject: block: Revert "[SCSI] genhd: add a new attribute "alias" in gendisk" This reverts commit a72c5e5eb738033938ab30d6a634b74d1d060f10. The commit introduced alias for block devices which is intended to be used during logging although actual usage hasn't been committed yet. This approach adds very limited benefit (raw log might be easier to follow) which can be trivially implemented in userland but has a lot of problems. It is much worse than netif renames because it doesn't rename the actual device but just adds conveninence name which isn't used universally or enforced. Everything internal including device lookup and sysfs still uses the internal name and nothing prevents two devices from using conflicting alias - ie. sda can have sdb as its alias. This has been nacked by people working on device driver core, block layer and kernel-userland interface and shouldn't have been upstreamed. Revert it. http://thread.gmane.org/gmane.linux.kernel/1155104 http://thread.gmane.org/gmane.linux.scsi/68632 http://thread.gmane.org/gmane.linux.scsi/69776 Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Acked-by: Kay Sievers Cc: "James E.J. Bottomley" Cc: Nao Nishijima Cc: Alan Cox Cc: Al Viro Signed-off-by: Jens Axboe --- block/genhd.c | 71 ----------------------------------------------------------- 1 file changed, 71 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 9253839..02e9fca 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -19,7 +19,6 @@ #include #include #include -#include #include "blk.h" @@ -916,74 +915,6 @@ static int __init genhd_device_init(void) subsys_initcall(genhd_device_init); -static ssize_t alias_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - ssize_t ret = 0; - - if (disk->alias) - ret = snprintf(buf, ALIAS_LEN, "%s\n", disk->alias); - return ret; -} - -static ssize_t alias_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct gendisk *disk = dev_to_disk(dev); - char *alias; - char *envp[] = { NULL, NULL }; - unsigned char c; - int i; - ssize_t ret = count; - - if (!count) - return -EINVAL; - - if (count >= ALIAS_LEN) { - printk(KERN_ERR "alias: alias is too long\n"); - return -EINVAL; - } - - /* Validation check */ - for (i = 0; i < count; i++) { - c = buf[i]; - if (i == count - 1 && c == '\n') - break; - if (!isalnum(c) && c != '_' && c != '-') { - printk(KERN_ERR "alias: invalid alias\n"); - return -EINVAL; - } - } - - if (disk->alias) { - printk(KERN_INFO "alias: %s is already assigned (%s)\n", - disk->disk_name, disk->alias); - return -EINVAL; - } - - alias = kasprintf(GFP_KERNEL, "%s", buf); - if (!alias) - return -ENOMEM; - - if (alias[count - 1] == '\n') - alias[count - 1] = '\0'; - - envp[0] = kasprintf(GFP_KERNEL, "ALIAS=%s", alias); - if (!envp[0]) { - kfree(alias); - return -ENOMEM; - } - - disk->alias = alias; - printk(KERN_INFO "alias: assigned %s to %s\n", alias, disk->disk_name); - - kobject_uevent_env(&dev->kobj, KOBJ_ADD, envp); - - kfree(envp[0]); - return ret; -} - static ssize_t disk_range_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1043,7 +974,6 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } -static DEVICE_ATTR(alias, S_IRUGO|S_IWUSR, alias_show, alias_store); static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); @@ -1066,7 +996,6 @@ static struct device_attribute dev_attr_fail_timeout = #endif static struct attribute *disk_attrs[] = { - &dev_attr_alias.attr, &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, -- cgit v1.1 From 6b76106d8ef31111d6fc469564b83b5f5542794f Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 13 Nov 2011 19:58:09 +0100 Subject: block: Always check length of all iov entries in blk_rq_map_user_iov() Even after commit 5478755616ae2ef1ce144dded589b62b2a50d575 ("block: check for proper length of iov entries earlier ...") we still won't check for zero-length entries after an unaligned entry. Remove the break-statement, so all entries are checked. Signed-off-by: Ben Hutchings Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-map.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index e663ac2..164cd00 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -204,10 +204,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (!iov[i].iov_len) return -EINVAL; - if (uaddr & queue_dma_alignment(q)) { + /* + * Keep going so we check length of all segments + */ + if (uaddr & queue_dma_alignment(q)) unaligned = 1; - break; - } } if (unaligned || (q->dma_pad_mask & len) || map_data) -- cgit v1.1 From 3540d5e89b2ac268fcfc9b07a50a9ba4acc2e5e5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 16 Nov 2011 09:21:50 +0100 Subject: block: avoid unnecessary plug list flush get_request_wait() could sleep and flush the plug list. If the list is already flushed, don't flush again. Signed-off-by: Shaohua Li Reviewed-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f43c8a5..6403c12 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1379,15 +1379,17 @@ get_rq: */ if (list_empty(&plug->list)) trace_block_plug(q); - else if (!plug->should_sort) { - struct request *__rq; + else { + if (!plug->should_sort) { + struct request *__rq; - __rq = list_entry_rq(plug->list.prev); - if (__rq->q != q) - plug->should_sort = 1; + __rq = list_entry_rq(plug->list.prev); + if (__rq->q != q) + plug->should_sort = 1; + } + if (request_count >= BLK_MAX_REQUEST_COUNT) + blk_flush_plug_list(plug, false); } - if (request_count >= BLK_MAX_REQUEST_COUNT) - blk_flush_plug_list(plug, false); list_add_tail(&req->queuelist, &plug->list); drive_stat_acct(req, 1); } else { -- cgit v1.1 From 019ceb7d5d252ce71001a157cf29f4ac28501b72 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 16 Nov 2011 09:21:50 +0100 Subject: block: add missed trace_block_plug After flush plug list, the list has no request, so we need to add a trace_block_plug(). Signed-off-by: Shaohua Li Reviewed-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 6403c12..ea70e6c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1387,8 +1387,10 @@ get_rq: if (__rq->q != q) plug->should_sort = 1; } - if (request_count >= BLK_MAX_REQUEST_COUNT) + if (request_count >= BLK_MAX_REQUEST_COUNT) { blk_flush_plug_list(plug, false); + trace_block_plug(q); + } } list_add_tail(&req->queuelist, &plug->list); drive_stat_acct(req, 1); -- cgit v1.1 From 5151412dd4338b273afdb107c3772528e9e67d92 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 23 Nov 2011 10:59:13 +0100 Subject: block: initialize request_queue's numa node during struct request_queue is allocated with __GFP_ZERO so its "node" field is zero before initialization. This causes an oops if node 0 is offline in the page allocator because its zonelists are not initialized. From Dave Young's dmesg: SRAT: Node 1 PXM 2 0-d0000000 SRAT: Node 1 PXM 2 100000000-330000000 SRAT: Node 0 PXM 1 330000000-630000000 Initmem setup node 1 0000000000000000-000000000affb000 ... Built 1 zonelists in Node order, mobility grouping on. ... BUG: unable to handle kernel paging request at 0000000000001c08 IP: [] __alloc_pages_nodemask+0xb5/0x870 and __alloc_pages_nodemask+0xb5 translates to a NULL pointer on zonelist->_zonerefs. The fix is to initialize q->node at the time of allocation so the correct node is passed to the slab allocator later. Since blk_init_allocated_queue_node() is no longer needed, merge it with blk_init_allocated_queue(). [rientjes@google.com: changelog, initializing q->node] Cc: stable@vger.kernel.org [2.6.37+] Reported-by: Dave Young Signed-off-by: Mike Snitzer Signed-off-by: David Rientjes Tested-by: Dave Young Signed-off-by: Jens Axboe --- block/blk-core.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ea70e6c..20d69f6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -467,6 +467,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; q->backing_dev_info.name = "block"; + q->node = node_id; err = bdi_init(&q->backing_dev_info); if (err) { @@ -551,7 +552,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) if (!uninit_q) return NULL; - q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); + q = blk_init_allocated_queue(uninit_q, rfn, lock); if (!q) blk_cleanup_queue(uninit_q); @@ -563,18 +564,9 @@ struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, spinlock_t *lock) { - return blk_init_allocated_queue_node(q, rfn, lock, -1); -} -EXPORT_SYMBOL(blk_init_allocated_queue); - -struct request_queue * -blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, - spinlock_t *lock, int node_id) -{ if (!q) return NULL; - q->node = node_id; if (blk_init_free_list(q)) return NULL; @@ -604,7 +596,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, return NULL; } -EXPORT_SYMBOL(blk_init_allocated_queue_node); +EXPORT_SYMBOL(blk_init_allocated_queue); int blk_get_queue(struct request_queue *q) { -- cgit v1.1 From 2984ff38ccf6cbc02a7a996a36c7d6f69f3c6146 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 30 Nov 2011 15:47:48 +0100 Subject: cfq-iosched: free cic_index if blkio_alloc_blkg_stats fails If we fail allocating the blkpg stats, we free cfqd and cfgq. But we need to free the IDA cfqd->cic_index as well. Signed-off-by: majianpeng Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 16ace89..3beed83 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4036,6 +4036,11 @@ static void *cfq_init_queue(struct request_queue *q) if (blkio_alloc_blkg_stats(&cfqg->blkg)) { kfree(cfqg); + + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, cfqd->cic_index); + spin_unlock(&cic_index_lock); + kfree(cfqd); return NULL; } -- cgit v1.1 From 5eb46851de3904cd1be9192fdacb8d34deadc1fc Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Fri, 2 Dec 2011 10:07:07 +0100 Subject: cfq-iosched: fix cfq_cic_link() race confition cfq_cic_link() has race condition. When some processes which shared ioc issue I/O to same block device simultaneously, cfq_cic_link() returns -EEXIST sometimes. The race condition might stop I/O by following steps: step 1: Process A: Issue an I/O to /dev/sda step 2: Process A: Get an ioc (iocA here) in get_io_context() which does not linked with a cic for the device step 3: Process A: Get a new cic for the device (cicA here) in cfq_alloc_io_context() step 4: Process B: Issue an I/O to /dev/sda step 5: Process B: Get iocA in get_io_context() since process A and B share the same ioc step 6: Process B: Get a new cic for the device (cicB here) in cfq_alloc_io_context() since iocA has not been linked with a cic for the device yet step 7: Process A: Link cicA to iocA in cfq_cic_link() step 8: Process A: Dispatch I/O to driver and finish it step 9: Process B: Try to link cicB to iocA in cfq_cic_link() But it fails with showing "cfq: cic link failed!" kernel message, since iocA has already linked with cicA at step 7. step 10: Process B: Wait for finishig I/O in get_request_wait() The function does not wake up, when there is no I/O to the device. When cfq_cic_link() returns -EEXIST, it means ioc has already linked with cic. So when cfq_cic_link() return -EEXIST, retry cfq_cic_lookup(). Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3beed83..4c12869 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3184,7 +3184,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, } } - if (ret) + if (ret && ret != -EEXIST) printk(KERN_ERR "cfq: cic link failed!\n"); return ret; @@ -3200,6 +3200,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { struct io_context *ioc = NULL; struct cfq_io_context *cic; + int ret; might_sleep_if(gfp_mask & __GFP_WAIT); @@ -3207,6 +3208,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) if (!ioc) return NULL; +retry: cic = cfq_cic_lookup(cfqd, ioc); if (cic) goto out; @@ -3215,7 +3217,12 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) if (cic == NULL) goto err; - if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) + ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask); + if (ret == -EEXIST) { + /* someone has linked cic to ioc already */ + cfq_cic_free(cic); + goto retry; + } else if (ret) goto err_free; out: -- cgit v1.1 From 4eabc941259f9d8c8fb71746d3f30c87e1d9e49b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Dec 2011 20:03:04 +0100 Subject: block: don't kick empty queue in blk_drain_queue() While probing, fd sets up queue, probes hardware and tears down the queue if probing fails. In the process, blk_drain_queue() kicks the queue which failed to finish initialization and fd is unhappy about that. floppy0: no floppy controllers found ------------[ cut here ]------------ WARNING: at drivers/block/floppy.c:2929 do_fd_request+0xbf/0xd0() Hardware name: To Be Filled By O.E.M. VFS: do_fd_request called on non-open device Modules linked in: Pid: 1, comm: swapper Not tainted 3.2.0-rc4-00077-g5983fe2 #2 Call Trace: [] warn_slowpath_common+0x7a/0xb0 [] warn_slowpath_fmt+0x41/0x50 [] do_fd_request+0xbf/0xd0 [] blk_drain_queue+0x65/0x80 [] blk_cleanup_queue+0xe3/0x1a0 [] floppy_init+0xdeb/0xe28 [] ? daring+0x6b/0x6b [] do_one_initcall+0x3f/0x170 [] kernel_init+0x9d/0x11e [] ? schedule_tail+0x22/0xa0 [] kernel_thread_helper+0x4/0x10 [] ? start_kernel+0x2be/0x2be [] ? gs_change+0xb/0xb Avoid it by making blk_drain_queue() kick queue iff dispatch queue has something on it. Signed-off-by: Tejun Heo Reported-by: Ralf Hildebrandt Reported-by: Wu Fengguang Tested-by: Sergei Trofimovich Signed-off-by: Jens Axboe --- block/blk-core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 20d69f6..15de223 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -366,7 +366,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) if (drain_all) blk_throtl_drain(q); - __blk_run_queue(q); + /* + * This function might be called on a queue which failed + * driver init after queue creation. Some drivers + * (e.g. fd) get unhappy in such cases. Kick queue iff + * dispatch queue has something on it. + */ + if (!list_empty(&q->queue_head)) + __blk_run_queue(q); if (drain_all) nr_rqs = q->rq.count[0] + q->rq.count[1]; -- cgit v1.1 From 6ae0516b8a50ece5d766be608a305707e0450060 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 16 Dec 2011 14:04:23 +0100 Subject: block, cfq: fix empty queue crash caused by request merge All requests of a queue could be merged to other requests of other queue. Such queue will not have request in it, but it's in service tree. This will cause kernel oops. I encounter a BUG_ON() in cfq_dispatch_request() with next patch, but the issue should exist without the patch. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4c12869..3548705 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1655,6 +1655,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { struct cfq_queue *cfqq = RQ_CFQQ(rq); + struct cfq_data *cfqd = q->elevator->elevator_data; + /* * reposition in fifo if next is older than rq */ @@ -1669,6 +1671,16 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, cfq_remove_request(next); cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next), rq_is_sync(next)); + + cfqq = RQ_CFQQ(next); + /* + * all requests of this queue are merged to other queues, delete it + * from the service tree. If it's the active_queue, + * cfq_dispatch_requests() will choose to expire it or do idle + */ + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && + cfqq != cfqd->active_queue) + cfq_del_cfqq_rr(cfqd, cfqq); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, -- cgit v1.1 From 609f6ea1c9cdfe0c43a927e13205a57d0c266d5a Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 21 Dec 2011 15:27:24 +0100 Subject: block: re-use existing 'reading' variable instead of checking direction again Signed-off-by: majianpeng Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 164cd00..623e1cd 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (IS_ERR(bio)) return PTR_ERR(bio); - if (rq_data_dir(rq) == WRITE) + if (!reading) bio->bi_rw |= REQ_WRITE; if (do_copy) -- cgit v1.1 From f2b20d436534f22ccc3f5ad172499fcb013bb315 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 29 Dec 2011 09:16:28 +0100 Subject: block: fix blk_queue_end_tag() Commit 5e081591 "block: warn if tag is greater than real_max_depth" cleaned up blk_queue_end_tag() to warn when the tag is truly invalid (greater than real_max_depth). However, it changed behavior in the tag < max_depth case to not end the request. Leading to triggering of BUG_ON(blk_queued_rq(rq)) in the request completion path: http://marc.info/?l=linux-kernel&m=132204370518629&w=2 In order to allow blk_queue_resize_tags() to shrink the tag space blk_queue_end_tag() must always complete tags with a value less than real_max_depth regardless of the current max_depth. The comment about "handling the shrink case" seems to be what prompted changes in this space, so remove it and BUG on all invalid tags (made even simpler by Matthew's suggestion to use an unsigned compare). Signed-off-by: Dan Williams Cc: Tao Ma Cc: Matthew Wilcox Reported-by: Meelis Roos Reported-by: Ed Nadolski Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-tag.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-tag.c b/block/blk-tag.c index e74d6d1..4af6f5c 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -282,18 +282,9 @@ EXPORT_SYMBOL(blk_queue_resize_tags); void blk_queue_end_tag(struct request_queue *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; - int tag = rq->tag; + unsigned tag = rq->tag; /* negative tags invalid */ - BUG_ON(tag == -1); - - if (unlikely(tag >= bqt->max_depth)) { - /* - * This can happen after tag depth has been reduced. - * But tag shouldn't be larger than real_max_depth. - */ - WARN_ON(tag >= bqt->real_max_depth); - return; - } + BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); rq->cmd_flags &= ~REQ_QUEUED; -- cgit v1.1 From 36a7ce632fb11cce578b7fdf4b4bbc8fbb99987a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 12 Jan 2012 16:01:27 +0100 Subject: block: add and use scsi_blk_cmd_ioctl commit 577ebb374c78314ac4617242f509e2f5e7156649 upstream. Introduce a wrapper around scsi_cmd_ioctl that takes a block device. The function will then be enhanced to detect partition block devices and, in that case, subject the ioctls to whitelisting. Cc: linux-scsi@vger.kernel.org Cc: Jens Axboe Cc: James Bottomley Signed-off-by: Paolo Bonzini Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- block/scsi_ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index fbdf0d8..a2c11f3 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -690,6 +690,13 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod } EXPORT_SYMBOL(scsi_cmd_ioctl); +int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, + unsigned int cmd, void __user *arg) +{ + return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); +} +EXPORT_SYMBOL(scsi_cmd_blk_ioctl); + static int __init blk_scsi_ioctl_init(void) { blk_set_cmd_filter_defaults(&blk_default_cmd_filter); -- cgit v1.1 From d33310de0d7914f2e27ed4fef67a1979f10037e1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 12 Jan 2012 16:01:28 +0100 Subject: block: fail SCSI passthrough ioctls on partition devices commit 0bfc96cb77224736dfa35c3c555d37b3646ef35e upstream. [ Changes with respect to 3.3: return -ENOTTY from scsi_verify_blk_ioctl and -ENOIOCTLCMD from sd_compat_ioctl. ] Linux allows executing the SG_IO ioctl on a partition or LVM volume, and will pass the command to the underlying block device. This is well-known, but it is also a large security problem when (via Unix permissions, ACLs, SELinux or a combination thereof) a program or user needs to be granted access only to part of the disk. This patch lets partitions forward a small set of harmless ioctls; others are logged with printk so that we can see which ioctls are actually sent. In my tests only CDROM_GET_CAPABILITY actually occurred. Of course it was being sent to a (partition on a) hard disk, so it would have failed with ENOTTY and the patch isn't changing anything in practice. Still, I'm treating it specially to avoid spamming the logs. In principle, this restriction should include programs running with CAP_SYS_RAWIO. If for example I let a program access /dev/sda2 and /dev/sdb, it still should not be able to read/write outside the boundaries of /dev/sda2 independent of the capabilities. However, for now programs with CAP_SYS_RAWIO will still be allowed to send the ioctls. Their actions will still be logged. This patch does not affect the non-libata IDE driver. That driver however already tests for bd != bd->bd_contains before issuing some ioctl; it could be restricted further to forbid these ioctls even for programs running with CAP_SYS_ADMIN/CAP_SYS_RAWIO. Cc: linux-scsi@vger.kernel.org Cc: Jens Axboe Cc: James Bottomley Signed-off-by: Paolo Bonzini [ Make it also print the command name when warning - Linus ] Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- block/scsi_ioctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a2c11f3..688be8a 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -690,9 +691,53 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod } EXPORT_SYMBOL(scsi_cmd_ioctl); +int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) +{ + if (bd && bd == bd->bd_contains) + return 0; + + /* Actually none of these is particularly useful on a partition, + * but they are safe. + */ + switch (cmd) { + case SCSI_IOCTL_GET_IDLUN: + case SCSI_IOCTL_GET_BUS_NUMBER: + case SCSI_IOCTL_GET_PCI: + case SCSI_IOCTL_PROBE_HOST: + case SG_GET_VERSION_NUM: + case SG_SET_TIMEOUT: + case SG_GET_TIMEOUT: + case SG_GET_RESERVED_SIZE: + case SG_SET_RESERVED_SIZE: + case SG_EMULATED_HOST: + return 0; + case CDROM_GET_CAPABILITY: + /* Keep this until we remove the printk below. udev sends it + * and we do not want to spam dmesg about it. CD-ROMs do + * not have partitions, so we get here only for disks. + */ + return -ENOTTY; + default: + break; + } + + /* In particular, rule out all resets and host-specific ioctls. */ + printk_ratelimited(KERN_WARNING + "%s: sending ioctl %x to a partition!\n", current->comm, cmd); + + return capable(CAP_SYS_RAWIO) ? 0 : -ENOTTY; +} +EXPORT_SYMBOL(scsi_verify_blk_ioctl); + int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, unsigned int cmd, void __user *arg) { + int ret; + + ret = scsi_verify_blk_ioctl(bd, cmd); + if (ret < 0) + return ret; + return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); } EXPORT_SYMBOL(scsi_cmd_blk_ioctl); -- cgit v1.1 From 6e118375b14e1cd89cab325c79faf6f48dc9c50b Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 8 Feb 2012 20:02:03 +0100 Subject: bsg: fix sysfs link remove warning commit 37b40adf2d1b4a5e51323be73ccf8ddcf3f15dd3 upstream. We create "bsg" link if q->kobj.sd is not NULL, so remove it only when the same condition is true. Fixes: WARNING: at fs/sysfs/inode.c:323 sysfs_hash_and_remove+0x2b/0x77() sysfs: can not remove 'bsg', no directory Call Trace: [] warn_slowpath_common+0x6a/0x7f [] ? sysfs_hash_and_remove+0x2b/0x77 [] warn_slowpath_fmt+0x2b/0x2f [] sysfs_hash_and_remove+0x2b/0x77 [] sysfs_remove_link+0x20/0x23 [] bsg_unregister_queue+0x40/0x6d [] __scsi_remove_device+0x31/0x9d [] scsi_forget_host+0x41/0x52 [] scsi_remove_host+0x71/0xe0 [] quiesce_and_remove_host+0x51/0x83 [usb_storage] [] usb_stor_disconnect+0x18/0x22 [usb_storage] [] usb_unbind_interface+0x4e/0x109 [] __device_release_driver+0x6b/0xa6 [] device_release_driver+0x17/0x22 [] bus_remove_device+0xd6/0xe6 [] device_del+0xf2/0x137 [] usb_disable_device+0x94/0x1a0 Signed-off-by: Stanislaw Gruszka Signed-off-by: Jens Axboe Signed-off-by: Tim Gardner Signed-off-by: Greg Kroah-Hartman --- block/bsg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index 702f131..c0ab25c 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -985,7 +985,8 @@ void bsg_unregister_queue(struct request_queue *q) mutex_lock(&bsg_mutex); idr_remove(&bsg_minor_idr, bcd->minor); - sysfs_remove_link(&q->kobj, "bsg"); + if (q->kobj.sd) + sysfs_remove_link(&q->kobj, "bsg"); device_unregister(bcd->class_dev); bcd->class_dev = NULL; kref_put(&bcd->ref, bsg_kref_release_function); -- cgit v1.1 From eee92c36399fe1c0d19de4982e9cfdf526b37922 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Fri, 2 Mar 2012 10:43:28 +0100 Subject: block: fix __blkdev_get and add_disk race condition commit 9f53d2fe815b4011ff930a7b6db98385d45faa68 upstream. The following situation might occur: __blkdev_get: add_disk: register_disk() get_gendisk() disk_block_events() disk->ev == NULL disk_add_events() __disk_unblock_events() disk->ev != NULL --ev->block Then we unblock events, when they are suppose to be blocked. This can trigger events related block/genhd.c warnings, but also can crash in sd_check_events() or other places. I'm able to reproduce crashes with the following scripts (with connected usb dongle as sdb disk). DEV=/dev/sdb ENABLE=/sys/bus/usb/devices/1-2/bConfigurationValue function stop_me() { for i in `jobs -p` ; do kill $i 2> /dev/null ; done exit } trap stop_me SIGHUP SIGINT SIGTERM for ((i = 0; i < 10; i++)) ; do while true; do fdisk -l $DEV 2>&1 > /dev/null ; done & done while true ; do echo 1 > $ENABLE sleep 1 echo 0 > $ENABLE done I use the script to verify patch fixing oops in sd_revalidate_disk http://marc.info/?l=linux-scsi&m=132935572512352&w=2 Without Jun'ichi Nomura patch titled "Fix NULL pointer dereference in sd_revalidate_disk" or this one, script easily crash kernel within a few seconds. With both patches applied I do not observe crash. Unfortunately after some time (dozen of minutes), script will hung in: [ 1563.906432] [] schedule_timeout_uninterruptible+0x15/0x20 [ 1563.906437] [] msleep+0x15/0x20 [ 1563.906443] [] blk_drain_queue+0x32/0xd0 [ 1563.906447] [] blk_cleanup_queue+0xd0/0x170 [ 1563.906454] [] scsi_free_queue+0x3f/0x60 [ 1563.906459] [] __scsi_remove_device+0x6e/0xb0 [ 1563.906463] [] scsi_forget_host+0x4f/0x60 [ 1563.906468] [] scsi_remove_host+0x5a/0xf0 [ 1563.906482] [] quiesce_and_remove_host+0x5b/0xa0 [usb_storage] [ 1563.906490] [] usb_stor_disconnect+0x13/0x20 [usb_storage] Anyway I think this patch is some step forward. As drawback, I do not teardown on sysfs file create error, because I do not know how to nullify disk->ev (since it can be used). However add_disk error handling practically does not exist too, and things will work without this sysfs file, except events will not be exported to user space. Signed-off-by: Stanislaw Gruszka Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 02e9fca..9db720d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -36,6 +36,7 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_alloc_events(struct gendisk *disk); static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); @@ -602,6 +603,8 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + disk_alloc_events(disk); + /* Register BDI before referencing it from bdev */ bdi = &disk->queue->backing_dev_info; bdi_register_dev(bdi, disk_devt(disk)); @@ -1734,9 +1737,9 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, &disk_events_dfl_poll_msecs, 0644); /* - * disk_{add|del|release}_events - initialize and destroy disk_events. + * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. */ -static void disk_add_events(struct gendisk *disk) +static void disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; @@ -1749,16 +1752,6 @@ static void disk_add_events(struct gendisk *disk) return; } - if (sysfs_create_files(&disk_to_dev(disk)->kobj, - disk_events_attrs) < 0) { - pr_warn("%s: failed to create sysfs files for events\n", - disk->disk_name); - kfree(ev); - return; - } - - disk->ev = ev; - INIT_LIST_HEAD(&ev->node); ev->disk = disk; spin_lock_init(&ev->lock); @@ -1767,8 +1760,21 @@ static void disk_add_events(struct gendisk *disk) ev->poll_msecs = -1; INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + disk->ev = ev; +} + +static void disk_add_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + /* FIXME: error handling */ + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + mutex_lock(&disk_events_mutex); - list_add_tail(&ev->node, &disk_events); + list_add_tail(&disk->ev->node, &disk_events); mutex_unlock(&disk_events_mutex); /* -- cgit v1.1 From 2053689f68e19b8c1bb38aa68049c57576eed6e0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 2 Mar 2012 10:51:00 +0100 Subject: Block: use a freezable workqueue for disk-event polling commit 62d3c5439c534b0e6c653fc63e6d8c67be3a57b1 upstream. This patch (as1519) fixes a bug in the block layer's disk-events polling. The polling is done by a work routine queued on the system_nrt_wq workqueue. Since that workqueue isn't freezable, the polling continues even in the middle of a system sleep transition. Obviously, polling a suspended drive for media changes and such isn't a good thing to do; in the case of USB mass-storage devices it can lead to real problems requiring device resets and even re-enumeration. The patch fixes things by creating a new system-wide, non-reentrant, freezable workqueue and using it for disk-events polling. Signed-off-by: Alan Stern Acked-by: Tejun Heo Acked-by: Rafael J. Wysocki Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 9db720d..997afd6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1479,9 +1479,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) intv = disk_events_poll_jiffies(disk); set_timer_slack(&ev->dwork.timer, intv / 4); if (check_now) - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); else if (intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } @@ -1525,7 +1525,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) ev->clearing |= mask; if (!ev->block) { cancel_delayed_work(&ev->dwork); - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); } spin_unlock_irq(&ev->lock); } @@ -1562,7 +1562,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) /* uncondtionally schedule event check and wait for it to finish */ disk_block_events(disk); - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); flush_delayed_work(&ev->dwork); __disk_unblock_events(disk, false); @@ -1599,7 +1599,7 @@ static void disk_events_workfn(struct work_struct *work) intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv); spin_unlock_irq(&ev->lock); -- cgit v1.1 From 9fd246e048bbd8569a84b412382cd299e0313a62 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 15 May 2012 08:22:04 +0200 Subject: block: fix buffer overflow when printing partition UUIDs commit 05c69d298c96703741cac9a5cbbf6c53bd55a6e2 upstream. 6d1d8050b4bc8 "block, partition: add partition_meta_info to hd_struct" added part_unpack_uuid() which assumes that the passed in buffer has enough space for sprintfing "%pU" - 37 characters including '\0'. Unfortunately, b5af921ec0233 "init: add support for root devices specified by partition UUID" supplied 33 bytes buffer to the function leading to the following panic with stackprotector enabled. Kernel panic - not syncing: stack-protector: Kernel stack corrupted in: ffffffff81b14c7e [] panic+0xba/0x1c6 [] ? printk_all_partitions+0x259/0x26xb [] __stack_chk_fail+0x1b/0x20 [] printk_all_paritions+0x259/0x26xb [] mount_block_root+0x1bc/0x27f [] mount_root+0x57/0x5b [] prepare_namespace+0x13d/0x176 [] ? release_tgcred.isra.4+0x330/0x30 [] kernel_init+0x155/0x15a [] ? schedule_tail+0x27/0xb0 [] kernel_thread_helper+0x5/0x10 [] ? start_kernel+0x3c5/0x3c5 [] ? gs_change+0x13/0x13 Increase the buffer size, remove the dangerous part_unpack_uuid() and use snprintf() directly from printk_all_partitions(). Signed-off-by: Tejun Heo Reported-by: Szymon Gruszczynski Cc: Will Drewry Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/genhd.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 997afd6..4927476 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -744,7 +744,7 @@ void __init printk_all_partitions(void) struct hd_struct *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; - u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; + char uuid_buf[PARTITION_META_INFO_UUIDLTH * 2 + 5]; /* * Don't show empty devices or things that have been @@ -763,14 +763,16 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; - uuid[0] = 0; + uuid_buf[0] = '\0'; if (part->info) - part_unpack_uuid(part->info->uuid, uuid); + snprintf(uuid_buf, sizeof(uuid_buf), "%pU", + part->info->uuid); printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf), uuid); + disk_name(disk, part->partno, name_buf), + uuid_buf); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) -- cgit v1.1 From f45c9a6eec20cd712421c442785e7a4e9215a230 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 15 Jun 2012 12:52:46 +0200 Subject: scsi: Silence unnecessary warnings about ioctl to partition commit 6d9359280753d2955f86d6411047516a9431eb51 upstream. Sometimes, warnings about ioctls to partition happen often enough that they form majority of the warnings in the kernel log and users complain. In some cases warnings are about ioctls such as SG_IO so it's not good to get rid of the warnings completely as they can ease debugging of userspace problems when ioctl is refused. Since I have seen warnings from lots of commands, including some proprietary userspace applications, I don't think disallowing the ioctls for processes with CAP_SYS_RAWIO will happen in the near future if ever. So lets just stop warning for processes with CAP_SYS_RAWIO for which ioctl is allowed. CC: Paolo Bonzini CC: James Bottomley CC: linux-scsi@vger.kernel.org Acked-by: Paolo Bonzini Signed-off-by: Jan Kara Signed-off-by: Jens Axboe [bwh: Backported to 3.2: use ENOTTY, not ENOIOCTLCMD] Signed-off-by: Ben Hutchings --- block/scsi_ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 688be8a..9e76a32 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -721,11 +721,14 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) break; } + if (capable(CAP_SYS_RAWIO)) + return 0; + /* In particular, rule out all resets and host-specific ioctls. */ printk_ratelimited(KERN_WARNING "%s: sending ioctl %x to a partition!\n", current->comm, cmd); - return capable(CAP_SYS_RAWIO) ? 0 : -ENOTTY; + return -ENOTTY; } EXPORT_SYMBOL(scsi_verify_blk_ioctl); -- cgit v1.1 From 68e9e9fee2bbec4853a993e98b0df8479292f572 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:37 +0100 Subject: block: add blk_queue_dead() commit 34f6055c80285e4efb3f602a9119db75239744dc upstream. There are a number of QUEUE_FLAG_DEAD tests. Add blk_queue_dead() macro and use it. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/blk-core.c | 6 +++--- block/blk-exec.c | 2 +- block/blk-sysfs.c | 4 ++-- block/blk-throttle.c | 4 ++-- block/blk.h | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 15de223..49d9e91 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -607,7 +607,7 @@ EXPORT_SYMBOL(blk_init_allocated_queue); int blk_get_queue(struct request_queue *q) { - if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + if (likely(!blk_queue_dead(q))) { kobject_get(&q->kobj); return 0; } @@ -754,7 +754,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, const bool is_sync = rw_is_sync(rw_flags) != 0; int may_queue; - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + if (unlikely(blk_queue_dead(q))) return NULL; may_queue = elv_may_queue(q, rw_flags); @@ -874,7 +874,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct io_context *ioc; struct request_list *rl = &q->rq; - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + if (unlikely(blk_queue_dead(q))) return NULL; prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, diff --git a/block/blk-exec.c b/block/blk-exec.c index a1ebceb..6053285 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -50,7 +50,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + if (unlikely(blk_queue_dead(q))) { rq->errors = -ENXIO; if (rq->end_io) rq->end_io(rq, rq->errors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e7f9f65..f0b2ca8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -425,7 +425,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; mutex_lock(&q->sysfs_lock); - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + if (blk_queue_dead(q)) { mutex_unlock(&q->sysfs_lock); return -ENOENT; } @@ -447,7 +447,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + if (blk_queue_dead(q)) { mutex_unlock(&q->sysfs_lock); return -ENOENT; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4553245..5eed6a7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -310,7 +310,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) struct request_queue *q = td->queue; /* no throttling for dead queue */ - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + if (unlikely(blk_queue_dead(q))) return NULL; rcu_read_lock(); @@ -335,7 +335,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) spin_lock_irq(q->queue_lock); /* Make sure @q is still alive */ - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + if (unlikely(blk_queue_dead(q))) { kfree(tg); return NULL; } diff --git a/block/blk.h b/block/blk.h index 3f6551b..e38691d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -85,7 +85,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) q->flush_queue_delayed = 1; return NULL; } - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || + if (unlikely(blk_queue_dead(q)) || !q->elevator->ops->elevator_dispatch_fn(q, 0)) return NULL; } -- cgit v1.1 From 32eb44e290d1151e9eb26f57564dae6966190685 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 22 Nov 2012 02:00:11 -0800 Subject: block: Don't access request after it might be freed commit 893d290f1d7496db97c9471bc352ad4a11dc8a25 upstream. After we've done __elv_add_request() and __blk_run_queue() in blk_execute_rq_nowait(), the request might finish and be freed immediately. Therefore checking if the type is REQ_TYPE_PM_RESUME isn't safe afterwards, because if it isn't, rq might be gone. Instead, check beforehand and stash the result in a temporary. This fixes crashes in blk_execute_rq_nowait() I get occasionally when running with lots of memory debugging options enabled -- I think this race is usually harmless because the window for rq to be reallocated is so small. Signed-off-by: Roland Dreier Signed-off-by: Jens Axboe [bwh: Backported to 3.2: adjust context] Signed-off-by: Ben Hutchings --- block/blk-exec.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-exec.c b/block/blk-exec.c index 6053285..ac2c6e7 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -49,6 +49,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq_end_io_fn *done) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; + bool is_pm_resume; if (unlikely(blk_queue_dead(q))) { rq->errors = -ENXIO; @@ -59,12 +60,18 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->end_io = done; + /* + * need to check this before __blk_run_queue(), because rq can + * be freed before that returns. + */ + is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; + WARN_ON(irqs_disabled()); spin_lock_irq(q->queue_lock); __elv_add_request(q, rq, where); __blk_run_queue(q); /* the queue is stopped so it won't be run */ - if (rq->cmd_type == REQ_TYPE_PM_RESUME) + if (is_pm_resume) q->request_fn(q); spin_unlock_irq(q->queue_lock); } -- cgit v1.1 From 2abb7d3a3cf05900a6d9cc6b98b1ba3aa432ce30 Mon Sep 17 00:00:00 2001 From: Tomas Henzl Date: Wed, 27 Feb 2013 17:03:32 -0800 Subject: block: fix ext_devt_idr handling commit 7b74e912785a11572da43292786ed07ada7e3e0c upstream. While adding and removing a lot of disks disks and partitions this sometimes shows up: WARNING: at fs/sysfs/dir.c:512 sysfs_add_one+0xc9/0x130() (Not tainted) Hardware name: sysfs: cannot create duplicate filename '/dev/block/259:751' Modules linked in: raid1 autofs4 bnx2fc cnic uio fcoe libfcoe libfc 8021q scsi_transport_fc scsi_tgt garp stp llc sunrpc cpufreq_ondemand powernow_k8 freq_table mperf ipv6 dm_mirror dm_region_hash dm_log power_meter microcode dcdbas serio_raw amd64_edac_mod edac_core edac_mce_amd i2c_piix4 i2c_core k10temp bnx2 sg ixgbe dca mdio ext4 mbcache jbd2 dm_round_robin sr_mod cdrom sd_mod crc_t10dif ata_generic pata_acpi pata_atiixp ahci mptsas mptscsih mptbase scsi_transport_sas dm_multipath dm_mod [last unloaded: scsi_wait_scan] Pid: 44103, comm: async/16 Not tainted 2.6.32-195.el6.x86_64 #1 Call Trace: warn_slowpath_common+0x87/0xc0 warn_slowpath_fmt+0x46/0x50 sysfs_add_one+0xc9/0x130 sysfs_do_create_link+0x12b/0x170 sysfs_create_link+0x13/0x20 device_add+0x317/0x650 idr_get_new+0x13/0x50 add_partition+0x21c/0x390 rescan_partitions+0x32b/0x470 sd_open+0x81/0x1f0 [sd_mod] __blkdev_get+0x1b6/0x3c0 blkdev_get+0x10/0x20 register_disk+0x155/0x170 add_disk+0xa6/0x160 sd_probe_async+0x13b/0x210 [sd_mod] add_wait_queue+0x46/0x60 async_thread+0x102/0x250 default_wake_function+0x0/0x20 async_thread+0x0/0x250 kthread+0x96/0xa0 child_rip+0xa/0x20 kthread+0x0/0xa0 child_rip+0x0/0x20 This most likely happens because dev_t is freed while the number is still used and idr_get_new() is not protected on every use. The fix adds a mutex where it wasn't before and moves the dev_t free function so it is called after device del. Signed-off-by: Tomas Henzl Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [bwh: Backported to 3.2: adjust filename] Signed-off-by: Ben Hutchings --- block/genhd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 4927476..ac9aeb4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -421,14 +421,18 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) do { if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL)) return -ENOMEM; + mutex_lock(&ext_devt_mutex); rc = idr_get_new(&ext_devt_idr, part, &idx); + mutex_unlock(&ext_devt_mutex); } while (rc == -EAGAIN); if (rc) return rc; if (idx > MAX_EXT_DEVT) { + mutex_lock(&ext_devt_mutex); idr_remove(&ext_devt_idr, idx); + mutex_unlock(&ext_devt_mutex); return -EBUSY; } @@ -645,7 +649,6 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_exit(&piter); invalidate_partition(disk, 0); - blk_free_devt(disk_to_dev(disk)->devt); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; @@ -663,6 +666,7 @@ void del_gendisk(struct gendisk *disk) if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); device_del(disk_to_dev(disk)); + blk_free_devt(disk_to_dev(disk)->devt); } EXPORT_SYMBOL(del_gendisk); -- cgit v1.1 From a51db52c68e3ad5c718c1915d28086b2c0ef8642 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 Feb 2013 17:03:56 -0800 Subject: block: fix synchronization and limit check in blk_alloc_devt() commit ce23bba842aee98092225d9576dba47c82352521 upstream. idr allocation in blk_alloc_devt() wasn't synchronized against lookup and removal, and its limit check was off by one - 1 << MINORBITS is the number of minors allowed, not the maximum allowed minor. Add locking and rename MAX_EXT_DEVT to NR_EXT_DEVT and fix limit checking. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ben Hutchings --- block/genhd.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ac9aeb4..6edf228 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -26,7 +26,7 @@ static DEFINE_MUTEX(block_class_lock); struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ -#define MAX_EXT_DEVT (1 << MINORBITS) +#define NR_EXT_DEVT (1 << MINORBITS) /* For extended devt allocation. ext_devt_mutex prevents look up * results from going away underneath its user. @@ -423,19 +423,16 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) return -ENOMEM; mutex_lock(&ext_devt_mutex); rc = idr_get_new(&ext_devt_idr, part, &idx); + if (!rc && idx >= NR_EXT_DEVT) { + idr_remove(&ext_devt_idr, idx); + rc = -EBUSY; + } mutex_unlock(&ext_devt_mutex); } while (rc == -EAGAIN); if (rc) return rc; - if (idx > MAX_EXT_DEVT) { - mutex_lock(&ext_devt_mutex); - idr_remove(&ext_devt_idr, idx); - mutex_unlock(&ext_devt_mutex); - return -EBUSY; - } - *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); return 0; } -- cgit v1.1 From 4316936730689af432346fa1d71c3f69f22ec3ac Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2013 21:53:57 +0200 Subject: block: avoid using uninitialized value in from queue_var_store commit c678ef5286ddb5cf70384ad5af286b0afc9b73e1 upstream. As found by gcc-4.8, the QUEUE_SYSFS_BIT_FNS macro creates functions that use a value generated by queue_var_store independent of whether that value was set or not. block/blk-sysfs.c: In function 'queue_store_nonrot': block/blk-sysfs.c:244:385: warning: 'val' may be used uninitialized in this function [-Wmaybe-uninitialized] Unlike most other such warnings, this one is not a false positive, writing any non-number string into the sysfs files indeed has an undefined result, rather than returning an error. Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/blk-sysfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f0b2ca8..1789e7a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -200,6 +200,8 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \ unsigned long val; \ ssize_t ret; \ ret = queue_var_store(&val, page, count); \ + if (ret < 0) \ + return ret; \ if (neg) \ val = !val; \ \ -- cgit v1.1 From b442223040adf969fd02124c29c856a06cf5649c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jul 2013 15:01:14 -0700 Subject: block: do not pass disk names as format strings commit ffc8b30866879ed9ba62bd0a86fecdbd51cd3d19 upstream. Disk names may contain arbitrary strings, so they must not be interpreted as format strings. It seems that only md allows arbitrary strings to be used for disk names, but this could allow for a local memory corruption from uid 0 into ring 0. CVE-2013-2851 Signed-off-by: Kees Cook Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [bwh: Backported to 3.2: adjust device pointer name in nbd.c] Signed-off-by: Ben Hutchings --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 6edf228..8bd4ef2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -519,7 +519,7 @@ void register_disk(struct gendisk *disk) ddev->parent = disk->driverfs_dev; - dev_set_name(ddev, disk->disk_name); + dev_set_name(ddev, "%s", disk->disk_name); /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); -- cgit v1.1 From c76cf3d0c317ebd020361ae737641bd53ab0d30b Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 8 Oct 2013 14:36:41 -0400 Subject: block: fix race between request completion and timeout handling commit 4912aa6c11e6a5d910264deedbec2075c6f1bb73 upstream. crocode i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support shpchp ioatdma dca be2net sg ses enclosure ext4 mbcache jbd2 sd_mod crc_t10dif ahci megaraid_sas(U) dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan] Pid: 491, comm: scsi_eh_0 Tainted: G W ---------------- 2.6.32-220.13.1.el6.x86_64 #1 IBM -[8722PAX]-/00D1461 RIP: 0010:[] [] blk_requeue_request+0x94/0xa0 RSP: 0018:ffff881057eefd60 EFLAGS: 00010012 RAX: ffff881d99e3e8a8 RBX: ffff881d99e3e780 RCX: ffff881d99e3e8a8 RDX: ffff881d99e3e8a8 RSI: ffff881d99e3e780 RDI: ffff881d99e3e780 RBP: ffff881057eefd80 R08: ffff881057eefe90 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff881057f92338 R13: 0000000000000000 R14: ffff881057f92338 R15: ffff883058188000 FS: 0000000000000000(0000) GS:ffff880040200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 00000000006d3ec0 CR3: 000000302cd7d000 CR4: 00000000000406b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process scsi_eh_0 (pid: 491, threadinfo ffff881057eee000, task ffff881057e29540) Stack: 0000000000001057 0000000000000286 ffff8810275efdc0 ffff881057f16000 <0> ffff881057eefdd0 ffffffff81362323 ffff881057eefe20 ffffffff8135f393 <0> ffff881057e29af8 ffff8810275efdc0 ffff881057eefe78 ffff881057eefe90 Call Trace: [] __scsi_queue_insert+0xa3/0x150 [] ? scsi_eh_ready_devs+0x5e3/0x850 [] scsi_queue_insert+0x13/0x20 [] scsi_eh_flush_done_q+0x104/0x160 [] scsi_error_handler+0x35b/0x660 [] ? scsi_error_handler+0x0/0x660 [] kthread+0x96/0xa0 [] child_rip+0xa/0x20 [] ? kthread+0x0/0xa0 [] ? child_rip+0x0/0x20 Code: 00 00 eb d1 4c 8b 2d 3c 8f 97 00 4d 85 ed 74 bf 49 8b 45 00 49 83 c5 08 48 89 de 4c 89 e7 ff d0 49 8b 45 00 48 85 c0 75 eb eb a4 <0f> 0b eb fe 0f 1f 84 00 00 00 00 00 55 48 89 e5 0f 1f 44 00 00 RIP [] blk_requeue_request+0x94/0xa0 RSP The RIP is this line: BUG_ON(blk_queued_rq(rq)); After digging through the code, I think there may be a race between the request completion and the timer handler running. A timer is started for each request put on the device's queue (see blk_start_request->blk_add_timer). If the request does not complete before the timer expires, the timer handler (blk_rq_timed_out_timer) will mark the request complete atomically: static inline int blk_mark_rq_complete(struct request *rq) { return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); } and then call blk_rq_timed_out. The latter function will call scsi_times_out, which will return one of BLK_EH_HANDLED, BLK_EH_RESET_TIMER or BLK_EH_NOT_HANDLED. If BLK_EH_RESET_TIMER is returned, blk_clear_rq_complete is called, and blk_add_timer is again called to simply wait longer for the request to complete. Now, if the request happens to complete while this is going on, what happens? Given that we know the completion handler will bail if it finds the REQ_ATOM_COMPLETE bit set, we need to focus on the completion handler running after that bit is cleared. So, from the above paragraph, after the call to blk_clear_rq_complete. If the completion sets REQ_ATOM_COMPLETE before the BUG_ON in blk_add_timer, we go boom there (I haven't seen this in the cores). Next, if we get the completion before the call to list_add_tail, then the timer will eventually fire for an old req, which may either be freed or reallocated (there is evidence that this might be the case). Finally, if the completion comes in *after* the addition to the timeout list, I think it's harmless. The request will be removed from the timeout list, req_atom_complete will be set, and all will be well. This will only actually explain the coredumps *IF* the request structure was freed, reallocated *and* queued before the error handler thread had a chance to process it. That is possible, but it may make sense to keep digging for another race. I think that if this is what was happening, we would see other instances of this problem showing up as null pointer or garbage pointer dereferences, for example when the request structure was not re-used. It looks like we actually do run into that situation in other reports. This patch moves the BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); from blk_add_timer to the only caller that could trip over it (blk_start_request). It then inverts the calls to blk_clear_rq_complete and blk_add_timer in blk_rq_timed_out to address the race. I've boot tested this patch, but nothing more. Signed-off-by: Jeff Moyer Acked-by: Hannes Reinecke Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/blk-core.c | 1 + block/blk-timeout.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 49d9e91..50c8305 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2015,6 +2015,7 @@ void blk_start_request(struct request *req) if (unlikely(blk_bidi_rq(req))) req->next_rq->resid_len = blk_rq_bytes(req->next_rq); + BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 7803548..b1182ea 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -90,8 +90,8 @@ static void blk_rq_timed_out(struct request *req) __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: - blk_clear_rq_complete(req); blk_add_timer(req); + blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: /* @@ -173,7 +173,6 @@ void blk_add_timer(struct request *req) return; BUG_ON(!list_empty(&req->timeout_list)); - BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); /* * Some LLDs, like scsi, peek at the timeout to prevent a -- cgit v1.1 From 9be9b3e8f63644f30360164ff0efa997366af67c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 14 Oct 2013 12:11:36 -0400 Subject: blk-core: Fix memory corruption if blkcg_init_queue fails commit fff4996b7db7955414ac74386efa5e07fd766b50 upstream. If blkcg_init_queue fails, blk_alloc_queue_node doesn't call bdi_destroy to clean up structures allocated by the backing dev. ------------[ cut here ]------------ WARNING: at lib/debugobjects.c:260 debug_print_object+0x85/0xa0() ODEBUG: free active (active state 0) object type: percpu_counter hint: (null) Modules linked in: dm_loop dm_mod ip6table_filter ip6_tables uvesafb cfbcopyarea cfbimgblt cfbfillrect fbcon font bitblit fbcon_rotate fbcon_cw fbcon_ud fbcon_ccw softcursor fb fbdev ipt_MASQUERADE iptable_nat nf_nat_ipv4 msr nf_conntrack_ipv4 nf_defrag_ipv4 xt_state ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc tun ipv6 cpufreq_userspace cpufreq_stats cpufreq_powersave cpufreq_ondemand cpufreq_conservative spadfs fuse hid_generic usbhid hid raid0 md_mod dmi_sysfs nf_nat_ftp nf_nat nf_conntrack_ftp nf_conntrack lm85 hwmon_vid snd_usb_audio snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd_page_alloc snd_hwdep snd_usbmidi_lib snd_rawmidi snd soundcore acpi_cpufreq freq_table mperf sata_svw serverworks kvm_amd ide_core ehci_pci ohci_hcd libata ehci_hcd kvm usbcore tg3 usb_common libphy k10temp pcspkr ptp i2c_piix4 i2c_core evdev microcode hwmon rtc_cmos pps_core e100 skge floppy mii processor button unix CPU: 0 PID: 2739 Comm: lvchange Tainted: G W 3.10.15-devel #14 Hardware name: empty empty/S3992-E, BIOS 'V1.06 ' 06/09/2009 0000000000000009 ffff88023c3c1ae8 ffffffff813c8fd4 ffff88023c3c1b20 ffffffff810399eb ffff88043d35cd58 ffffffff81651940 ffff88023c3c1bf8 ffffffff82479d90 0000000000000005 ffff88023c3c1b80 ffffffff81039a67 Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x6b/0xa0 [] warn_slowpath_fmt+0x47/0x50 [] ? debug_check_no_obj_freed+0xcf/0x250 [] debug_print_object+0x85/0xa0 [] debug_check_no_obj_freed+0x203/0x250 [] kmem_cache_free+0x20c/0x3a0 [] blk_alloc_queue_node+0x2a9/0x2c0 [] blk_alloc_queue+0xe/0x10 [] dm_create+0x1a3/0x530 [dm_mod] [] ? list_version_get_info+0xe0/0xe0 [dm_mod] [] dev_create+0x57/0x2b0 [dm_mod] [] ? list_version_get_info+0xe0/0xe0 [dm_mod] [] ? list_version_get_info+0xe0/0xe0 [dm_mod] [] ctl_ioctl+0x268/0x500 [dm_mod] [] ? get_lock_stats+0x22/0x70 [] dm_ctl_ioctl+0xe/0x20 [dm_mod] [] do_vfs_ioctl+0x2ed/0x520 [] ? fget_light+0x377/0x4e0 [] SyS_ioctl+0x4b/0x90 [] system_call_fastpath+0x1a/0x1f ---[ end trace 4b5ff0d55673d986 ]--- ------------[ cut here ]------------ This fix should be backported to stable kernels starting with 2.6.37. Note that in the kernels prior to 3.5 the affected code is different, but the bug is still there - bdi_init is called and bdi_destroy isn't. Signed-off-by: Mikulas Patocka Acked-by: Tejun Heo Signed-off-by: Jens Axboe [bwh: Backported to 3.2: add bdi_destroy() to the single error path after the call to bdi_init()] Signed-off-by: Ben Hutchings --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 50c8305..a219c89 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -483,6 +483,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) } if (blk_throtl_init(q)) { + bdi_destroy(&q->backing_dev_info); kmem_cache_free(blk_requestq_cachep, q); return NULL; } -- cgit v1.1 From e5b56dda4701918b565764e47a60aa252964411c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Feb 2014 09:34:01 -0700 Subject: block: add cond_resched() to potentially long running ioctl discard loop commit c8123f8c9cb517403b51aa41c3c46ff5e10b2c17 upstream. When mkfs issues a full device discard and the device only supports discards of a smallish size, we can loop in blkdev_issue_discard() for a long time. If preempt isn't enabled, this can turn into a softlock situation and the kernel will start complaining. Add an explicit cond_resched() at the end of the loop to avoid that. Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/blk-lib.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 2b461b4..36751e2 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -101,6 +101,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, atomic_inc(&bb.done); submit_bio(type, bio); + + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. + */ + cond_resched(); } /* Wait for bios in-flight */ -- cgit v1.1 From 5b85afa68e4f56c27f1d5c6f49e5257bce6448e6 Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Tue, 4 Mar 2014 23:13:10 +0900 Subject: blktrace: fix accounting of partially completed requests commit af5040da01ef980670b3741b3e10733ee3e33566 upstream. trace_block_rq_complete does not take into account that request can be partially completed, so we can get the following incorrect output of blkparser: C R 232 + 240 [0] C R 240 + 232 [0] C R 248 + 224 [0] C R 256 + 216 [0] but should be: C R 232 + 8 [0] C R 240 + 8 [0] C R 248 + 8 [0] C R 256 + 8 [0] Also, the whole output summary statistics of completed requests and final throughput will be incorrect. This patch takes into account real completion size of the request and fixes wrong completion accounting. Signed-off-by: Roman Pen CC: Steven Rostedt CC: Frederic Weisbecker CC: Ingo Molnar CC: linux-kernel@vger.kernel.org Signed-off-by: Jens Axboe [bwh: Backported to 3.2: drop change in blk-mq.c] Signed-off-by: Ben Hutchings --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a219c89..ec494ff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2077,7 +2077,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (!req->bio) return false; - trace_block_rq_complete(req->q, req); + trace_block_rq_complete(req->q, req, nr_bytes); /* * For fs requests, rq is just carrier of independent bio's -- cgit v1.1 From 60d940c6fd6252f6307ecacd49acac54cc6d8e8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Jul 2014 12:25:28 +0200 Subject: block: don't assume last put of shared tags is for the host commit d45b3279a5a2252cafcd665bbf2db8c9b31ef783 upstream. There is no inherent reason why the last put of a tag structure must be the one for the Scsi_Host, as device model objects can be held for arbitrary periods. Merge blk_free_tags and __blk_free_tags into a single funtion that just release a references and get rid of the BUG() when the host reference wasn't the last. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/blk-tag.c | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-tag.c b/block/blk-tag.c index 4af6f5c..f606487 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -27,18 +27,15 @@ struct request *blk_queue_find_tag(struct request_queue *q, int tag) EXPORT_SYMBOL(blk_queue_find_tag); /** - * __blk_free_tags - release a given set of tag maintenance info + * blk_free_tags - release a given set of tag maintenance info * @bqt: the tag map to free * - * Tries to free the specified @bqt. Returns true if it was - * actually freed and false if there are still references using it + * Drop the reference count on @bqt and frees it when the last reference + * is dropped. */ -static int __blk_free_tags(struct blk_queue_tag *bqt) +void blk_free_tags(struct blk_queue_tag *bqt) { - int retval; - - retval = atomic_dec_and_test(&bqt->refcnt); - if (retval) { + if (atomic_dec_and_test(&bqt->refcnt)) { BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) < bqt->max_depth); @@ -50,9 +47,8 @@ static int __blk_free_tags(struct blk_queue_tag *bqt) kfree(bqt); } - - return retval; } +EXPORT_SYMBOL(blk_free_tags); /** * __blk_queue_free_tags - release tag maintenance info @@ -69,28 +65,13 @@ void __blk_queue_free_tags(struct request_queue *q) if (!bqt) return; - __blk_free_tags(bqt); + blk_free_tags(bqt); q->queue_tags = NULL; queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); } /** - * blk_free_tags - release a given set of tag maintenance info - * @bqt: the tag map to free - * - * For externally managed @bqt frees the map. Callers of this - * function must guarantee to have released all the queues that - * might have been using this tag map. - */ -void blk_free_tags(struct blk_queue_tag *bqt) -{ - if (unlikely(!__blk_free_tags(bqt))) - BUG(); -} -EXPORT_SYMBOL(blk_free_tags); - -/** * blk_queue_free_tags - release tag maintenance info * @q: the request queue for the device * -- cgit v1.1 From a288cafeceb6e18ba4fd10ec8d3270593472cca6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 26 Aug 2014 09:05:36 -0600 Subject: block: Fix dev_t minor allocation lifetime commit 2da78092dda13f1efd26edbbf99a567776913750 upstream. Releases the dev_t minor when all references are closed to prevent another device from acquiring the same major/minor. Since the partition's release may be invoked from call_rcu's soft-irq context, the ext_dev_idr's mutex had to be replaced with a spinlock so as not so sleep. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe [bwh: Backported to 3.2: - Adjust filename - idr insertion API is different, and blk_alloc_devt() is preallocating a node in a different way] Signed-off-by: Ben Hutchings --- block/genhd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 8bd4ef2..35415810 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -28,10 +28,10 @@ struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) -/* For extended devt allocation. ext_devt_mutex prevents look up +/* For extended devt allocation. ext_devt_lock prevents look up * results from going away underneath its user. */ -static DEFINE_MUTEX(ext_devt_mutex); +static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; @@ -421,13 +421,13 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) do { if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL)) return -ENOMEM; - mutex_lock(&ext_devt_mutex); + spin_lock(&ext_devt_lock); rc = idr_get_new(&ext_devt_idr, part, &idx); if (!rc && idx >= NR_EXT_DEVT) { idr_remove(&ext_devt_idr, idx); rc = -EBUSY; } - mutex_unlock(&ext_devt_mutex); + spin_unlock(&ext_devt_lock); } while (rc == -EAGAIN); if (rc) @@ -454,9 +454,9 @@ void blk_free_devt(dev_t devt) return; if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - mutex_lock(&ext_devt_mutex); + spin_lock(&ext_devt_lock); idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - mutex_unlock(&ext_devt_mutex); + spin_unlock(&ext_devt_lock); } } @@ -663,7 +663,6 @@ void del_gendisk(struct gendisk *disk) if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); device_del(disk_to_dev(disk)); - blk_free_devt(disk_to_dev(disk)->devt); } EXPORT_SYMBOL(del_gendisk); @@ -688,13 +687,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } else { struct hd_struct *part; - mutex_lock(&ext_devt_mutex); + spin_lock(&ext_devt_lock); part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); if (part && get_disk(part_to_disk(part))) { *partno = part->partno; disk = part_to_disk(part); } - mutex_unlock(&ext_devt_mutex); + spin_unlock(&ext_devt_lock); } return disk; @@ -1102,6 +1101,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); -- cgit v1.1 From 607530bf21828c96c2880282c844088baf6ae3c4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Sep 2014 13:38:51 -0600 Subject: genhd: fix leftover might_sleep() in blk_free_devt() commit 46f341ffcfb5d8530f7d1e60f3be06cce6661b62 upstream. Commit 2da78092 changed the locking from a mutex to a spinlock, so we now longer sleep in this context. But there was a leftover might_sleep() in there, which now triggers since we do the final free from an RCU callback. Get rid of it. Reported-by: Pontus Fuchs Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/genhd.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 35415810..41b0435 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -448,8 +448,6 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) */ void blk_free_devt(dev_t devt) { - might_sleep(); - if (devt == MKDEV(0, 0)) return; -- cgit v1.1 From 5c250851e1f7850dfe765ff8f6d04ef4ce6e19a7 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 8 Oct 2014 18:26:13 -0400 Subject: block: fix alignment_offset math that assumes io_min is a power-of-2 commit b8839b8c55f3fdd60dc36abcda7e0266aff7985c upstream. The math in both blk_stack_limits() and queue_limit_alignment_offset() assume that a block device's io_min (aka minimum_io_size) is always a power-of-2. Fix the math such that it works for non-power-of-2 io_min. This issue (of alignment_offset != 0) became apparent when testing dm-thinp with a thinp blocksize that matches a RAID6 stripesize of 1280K. Commit fdfb4c8c1 ("dm thin: set minimum_io_size to pool's data block size") unlocked the potential for alignment_offset != 0 due to the dm-thin-pool's io_min possibly being a non-power-of-2. Signed-off-by: Mike Snitzer Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/blk-settings.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index fa1eb04..d55a3e4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -521,7 +521,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, bottom = max(b->physical_block_size, b->io_min) + alignment; /* Verify that top and bottom intervals line up */ - if (max(top, bottom) & (min(top, bottom) - 1)) { + if (max(top, bottom) % min(top, bottom)) { t->misaligned = 1; ret = -1; } @@ -562,7 +562,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Find lowest common alignment_offset */ t->alignment_offset = lcm(t->alignment_offset, alignment) - & (max(t->physical_block_size, t->io_min) - 1); + % max(t->physical_block_size, t->io_min); /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { -- cgit v1.1 From d73b032b63e8967462e1cf5763858ed89e97880f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Oct 2014 20:13:39 -0600 Subject: scsi: Fix error handling in SCSI_IOCTL_SEND_COMMAND commit 84ce0f0e94ac97217398b3b69c21c7a62ebeed05 upstream. When sg_scsi_ioctl() fails to prepare request to submit in blk_rq_map_kern() we jump to a label where we just end up copying (luckily zeroed-out) kernel buffer to userspace instead of reporting error. Fix the problem by jumping to the right label. CC: Jens Axboe CC: linux-scsi@vger.kernel.org Coverity-id: 1226871 Signed-off-by: Jan Kara Fixed up the, now unused, out label. Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/scsi_ioctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 9e76a32..f124268 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -505,7 +505,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { err = DRIVER_ERROR << 24; - goto out; + goto error; } memset(sense, 0, sizeof(sense)); @@ -515,7 +515,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, blk_execute_rq(q, disk, rq, 0); -out: err = rq->errors & 0xff; /* only 8 bit SCSI status */ if (err) { if (rq->sense_len && rq->sense) { -- cgit v1.1 From 44cee4aa677c9cc2a48d2bdde8928115e06cb9a0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Nov 2014 13:06:22 -0700 Subject: genhd: check for int overflow in disk_expand_part_tbl() commit 5fabcb4c33fe11c7e3afdf805fde26c1a54d0953 upstream. We can get here from blkdev_ioctl() -> blkpg_ioctl() -> add_partition() with a user passed in partno value. If we pass in 0x7fffffff, the new target in disk_expand_part_tbl() overflows the 'int' and we access beyond the end of ptbl->part[] and even write to it when we do the rcu_assign_pointer() to assign the new partition. Reported-by: David Ramos Signed-off-by: Jens Axboe Signed-off-by: Ben Hutchings --- block/genhd.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 41b0435..424d1fa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) struct disk_part_tbl *old_ptbl = disk->part_tbl; struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; - int target = partno + 1; + int i, target; size_t size; - int i; + + /* + * check for int overflow, since we can get here from blkpg_ioctl() + * with a user passed 'partno'. + */ + target = partno + 1; + if (target < 0) + return -EINVAL; /* disk_max_parts() is zero during initialization, ignore if so */ if (disk_max_parts(disk) && target > disk_max_parts(disk)) -- cgit v1.1