From 796d5116c407690b14fd5bda136aa67a39e7061a Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Thu, 2 Jun 2011 21:19:05 +0200
Subject: iosched: prevent aliased requests from starving other I/O

Hi, Jens,

If you recall, I posted an RFC patch for this back in July of last year:
http://lkml.org/lkml/2010/7/13/279

The basic problem is that a process can issue a never-ending stream of
async direct I/Os to the same sector on a device, thus starving out
other I/O in the system (due to the way the alias handling works in both
cfq and deadline).  The solution I proposed back then was to start
dispatching from the fifo after a certain number of aliases had been
dispatched.  Vivek asked why we had to treat aliases differently at all,
and I never had a good answer.  So, I put together a simple patch which
allows aliases to be added to the rb tree (it adds them to the right,
though that doesn't matter as the order isn't guaranteed anyway).  I
think this is the preferred solution, as it doesn't break up time slices
in CFQ or batches in deadline.  I've tested it, and it does solve the
starvation issue.  Let me know what you think.

Cheers,
Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c      | 9 ++-------
 block/deadline-iosched.c | 4 +---
 block/elevator.c         | 7 ++-----
 3 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7c52d68..a2fb14b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1501,16 +1501,11 @@ static void cfq_add_rq_rb(struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	struct cfq_data *cfqd = cfqq->cfqd;
-	struct request *__alias, *prev;
+	struct request *prev;
 
 	cfqq->queued[rq_is_sync(rq)]++;
 
-	/*
-	 * looks a little odd, but the first insert might return an alias.
-	 * if that happens, put the alias on the dispatch list
-	 */
-	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
-		cfq_dispatch_insert(cfqd->queue, __alias);
+	elv_rb_add(&cfqq->sort_list, rq);
 
 	if (!cfq_cfqq_on_rr(cfqq))
 		cfq_add_cfqq_rr(cfqd, cfqq);
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 5139c0e..c644137 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -77,10 +77,8 @@ static void
 deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
 {
 	struct rb_root *root = deadline_rb_root(dd, rq);
-	struct request *__alias;
 
-	while (unlikely(__alias = elv_rb_add(root, rq)))
-		deadline_move_request(dd, __alias);
+	elv_rb_add(root, rq);
 }
 
 static inline void
diff --git a/block/elevator.c b/block/elevator.c
index b0b38ce..a3b64bc 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -353,7 +353,7 @@ static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
  * RB-tree support functions for inserting/lookup/removal of requests
  * in a sorted RB tree.
  */
-struct request *elv_rb_add(struct rb_root *root, struct request *rq)
+void elv_rb_add(struct rb_root *root, struct request *rq)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
@@ -365,15 +365,12 @@ struct request *elv_rb_add(struct rb_root *root, struct request *rq)
 
 		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
 			p = &(*p)->rb_left;
-		else if (blk_rq_pos(rq) > blk_rq_pos(__rq))
+		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
 			p = &(*p)->rb_right;
-		else
-			return __rq;
 	}
 
 	rb_link_node(&rq->rb_node, parent, p);
 	rb_insert_color(&rq->rb_node, root);
-	return NULL;
 }
 EXPORT_SYMBOL(elv_rb_add);
 
-- 
cgit v1.1


From 9b50902db5eb8a220160fb89e95aa11967998d12 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Sun, 5 Jun 2011 06:01:13 +0200
Subject: cfq-iosched: fix locking around ioc->ioc_data assignment

Since we are modifying this RCU pointer, we need to hold
the lock protecting it around it.

This fixes a potential reuse and double free of a cfq
io_context structure. The bug has been in CFQ for a long
time, it hit very few people but those it did hit seemed
to see it a lot.

Tracked in RH bugzilla here:

https://bugzilla.redhat.com/show_bug.cgi?id=577968

Credit goes to Paul Bolle for figuring out that the issue
was around the one-hit ioc->ioc_data cache. Thanks to his
hard work the issue is now fixed.

Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a2fb14b..000719c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2767,8 +2767,11 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
 	smp_wmb();
 	cic->key = cfqd_dead_key(cfqd);
 
-	if (ioc->ioc_data == cic)
+	if (rcu_dereference(ioc->ioc_data) == cic) {
+		spin_lock(&ioc->lock);
 		rcu_assign_pointer(ioc->ioc_data, NULL);
+		spin_unlock(&ioc->lock);
+	}
 
 	if (cic->cfqq[BLK_RW_ASYNC]) {
 		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
-- 
cgit v1.1


From 8aea45451b252e4be09ee9974c5658bb47c81625 Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Mon, 6 Jun 2011 05:07:54 +0200
Subject: CFQ: make two functions static

Correctly suggested by sparse.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 000719c..2b2d7a9 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1004,8 +1004,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 	return NULL;
 }
 
-void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
-					unsigned int weight)
+static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
+					  unsigned int weight)
 {
 	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
 	cfqg->new_weight = weight;
@@ -1234,7 +1234,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd)
  * it should not be NULL as even if elevator was exiting, cgroup deltion
  * path got to it first.
  */
-void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
+static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
 {
 	unsigned long  flags;
 	struct cfq_data *cfqd = key;
-- 
cgit v1.1


From df4156569d4ace581bd1581e7aa4a9dd7d2f0775 Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Mon, 6 Jun 2011 05:11:34 +0200
Subject: block: rename the return of two functions

If we rename the return of alloc_io_context() and get_io_context() from
"ret" to "ioc" the code get's (a bit) more readable and (a lot) more
grepable.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-ioc.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'block')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index c898049..bfb0493 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -82,26 +82,26 @@ void exit_io_context(struct task_struct *task)
 
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 {
-	struct io_context *ret;
+	struct io_context *ioc;
 
-	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
-	if (ret) {
-		atomic_long_set(&ret->refcount, 1);
-		atomic_set(&ret->nr_tasks, 1);
-		spin_lock_init(&ret->lock);
-		ret->ioprio_changed = 0;
-		ret->ioprio = 0;
-		ret->last_waited = 0; /* doesn't matter... */
-		ret->nr_batch_requests = 0; /* because this is 0 */
-		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
-		INIT_HLIST_HEAD(&ret->cic_list);
-		ret->ioc_data = NULL;
+	ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
+	if (ioc) {
+		atomic_long_set(&ioc->refcount, 1);
+		atomic_set(&ioc->nr_tasks, 1);
+		spin_lock_init(&ioc->lock);
+		ioc->ioprio_changed = 0;
+		ioc->ioprio = 0;
+		ioc->last_waited = 0; /* doesn't matter... */
+		ioc->nr_batch_requests = 0; /* because this is 0 */
+		INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
+		INIT_HLIST_HEAD(&ioc->cic_list);
+		ioc->ioc_data = NULL;
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
-		ret->cgroup_changed = 0;
+		ioc->cgroup_changed = 0;
 #endif
 	}
 
-	return ret;
+	return ioc;
 }
 
 /*
@@ -139,19 +139,19 @@ struct io_context *current_io_context(gfp_t gfp_flags, int node)
  */
 struct io_context *get_io_context(gfp_t gfp_flags, int node)
 {
-	struct io_context *ret = NULL;
+	struct io_context *ioc = NULL;
 
 	/*
 	 * Check for unlikely race with exiting task. ioc ref count is
 	 * zero when ioc is being detached.
 	 */
 	do {
-		ret = current_io_context(gfp_flags, node);
-		if (unlikely(!ret))
+		ioc = current_io_context(gfp_flags, node);
+		if (unlikely(!ioc))
 			break;
-	} while (!atomic_long_inc_not_zero(&ret->refcount));
+	} while (!atomic_long_inc_not_zero(&ioc->refcount));
 
-	return ret;
+	return ioc;
 }
 EXPORT_SYMBOL(get_io_context);
 
-- 
cgit v1.1


From 9f5e486550456587348e2048eecda5bee778250a Mon Sep 17 00:00:00 2001
From: Wanlong Gao <wanlong.gao@gmail.com>
Date: Mon, 13 Jun 2011 10:45:43 +0200
Subject: block:remove some spare spaces in genhd.c

Remove the end-of-line spaces in genhd.c.

Signed-off-by: Wanlong Gao <wanlong.gao@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 95822ae..ed3fe82 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -602,7 +602,7 @@ void add_disk(struct gendisk *disk)
 	disk->major = MAJOR(devt);
 	disk->first_minor = MINOR(devt);
 
-	/* Register BDI before referencing it from bdev */ 
+	/* Register BDI before referencing it from bdev */
 	bdi = &disk->queue->backing_dev_info;
 	bdi_register_dev(bdi, disk_devt(disk));
 
@@ -1148,7 +1148,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 				"wsect wuse running use aveq"
 				"\n\n");
 	*/
- 
+
 	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
 	while ((hd = disk_part_iter_next(&piter))) {
 		cpu = part_stat_lock();
@@ -1172,7 +1172,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			);
 	}
 	disk_part_iter_exit(&piter);
- 
+
 	return 0;
 }
 
-- 
cgit v1.1


From fd16d263194aa6b50b215eb593a567b59d744d6e Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 13 Jun 2011 10:42:49 +0200
Subject: block: Add __attribute__((format(printf...) and fix fallout

Use the compiler to verify format strings and arguments.

Fix fallout.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c |  4 ++--
 block/cfq-iosched.c  | 11 ++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a62be8d..3689f83 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q)
 
 	bio_list_init(&bio_list_on_stack);
 
-	throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
+	throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
 			total_nr_queued(td), td->nr_queued[READ],
 			td->nr_queued[WRITE]);
 
@@ -1204,7 +1204,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 	}
 
 queue_bio:
-	throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
+	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
 			" iodisp=%u iops=%u queued=%d/%d",
 			rw == READ ? 'R' : 'W',
 			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 2b2d7a9..3d403a1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -988,9 +988,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
 					st->min_vdisktime);
-	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
-			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
-			iops_mode(cfqd), cfqq->nr_sectors);
+	cfq_log_cfqq(cfqq->cfqd, cfqq,
+		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
+		     used_sl, cfqq->slice_dispatch, charge,
+		     iops_mode(cfqd), cfqq->nr_sectors);
 	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
 					  unaccounted_sl);
 	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
@@ -2018,8 +2019,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 */
 	if (sample_valid(cic->ttime_samples) &&
 	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
-		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
-				cic->ttime_mean);
+		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
+			     cic->ttime_mean);
 		return;
 	}
 
-- 
cgit v1.1


From d2f31a5fd60d168b00fc4f7617b68a1287b21e90 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 13 Jun 2011 20:19:27 +0200
Subject: blk-throttle: Make total_nr_queued unsigned

The total of two unsigned values should also be unsigned.

Update throtl_log output to unsigned.
Update total_nr_queued test to non-zero to be the
same as the other total_nr_queued tests.

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3689f83..f6a7941 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -142,9 +142,9 @@ static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
 	return NULL;
 }
 
-static inline int total_nr_queued(struct throtl_data *td)
+static inline unsigned int total_nr_queued(struct throtl_data *td)
 {
-	return (td->nr_queued[0] + td->nr_queued[1]);
+	return td->nr_queued[0] + td->nr_queued[1];
 }
 
 static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
@@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q)
 
 	bio_list_init(&bio_list_on_stack);
 
-	throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
+	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
 			total_nr_queued(td), td->nr_queued[READ],
 			td->nr_queued[WRITE]);
 
@@ -970,7 +970,7 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 	struct delayed_work *dwork = &td->throtl_work;
 
 	/* schedule work if limits changed even if no bio is queued */
-	if (total_nr_queued(td) > 0 || td->limits_changed) {
+	if (total_nr_queued(td) || td->limits_changed) {
 		/*
 		 * We might have a work scheduled to be executed in future.
 		 * Cancel that and schedule a new one.
-- 
cgit v1.1


From 80ceb057135ad77f513277f3bcd04b885501877a Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 20 Jun 2011 13:27:44 +0200
Subject: bsg: fix bsg_poll() to return POLLOUT properly

POLLOUT should be returned only if bd->queued_cmds < bd->max_queue
so that bsg_alloc_command() can proceed.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/bsg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bsg.c b/block/bsg.c
index 0c8b64a..c4f49e2 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -878,7 +878,7 @@ static unsigned int bsg_poll(struct file *file, poll_table *wait)
 	spin_lock_irq(&bd->lock);
 	if (!list_empty(&bd->done_list))
 		mask |= POLLIN | POLLRDNORM;
-	if (bd->queued_cmds >= bd->max_queue)
+	if (bd->queued_cmds < bd->max_queue)
 		mask |= POLLOUT;
 	spin_unlock_irq(&bd->lock);
 
-- 
cgit v1.1


From 44194e3e88fcfe77a2a6e2333847d4f27816259a Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 20 Jun 2011 13:27:44 +0200
Subject: bsg: remove unnecessary conditional expressions

Second condition in OR always implies first condition is false
thus bytes_read in the second is not needed. The same goes to
bytes_written.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/bsg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/bsg.c b/block/bsg.c
index c4f49e2..b7e42ad 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -606,7 +606,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	ret = __bsg_read(buf, count, bd, NULL, &bytes_read);
 	*ppos = bytes_read;
 
-	if (!bytes_read || (bytes_read && err_block_err(ret)))
+	if (!bytes_read || err_block_err(ret))
 		bytes_read = ret;
 
 	return bytes_read;
@@ -686,7 +686,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 	/*
 	 * return bytes written on non-fatal errors
 	 */
-	if (!bytes_written || (bytes_written && err_block_err(ret)))
+	if (!bytes_written || err_block_err(ret))
 		bytes_written = ret;
 
 	dprintk("%s: returning %Zd\n", bd->name, bytes_written);
-- 
cgit v1.1


From 2b727c6300b49352f80f63704bb50c256949e95e Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 20 Jun 2011 13:27:45 +0200
Subject: bsg: fix address space warning from sparse

copy_from/to_user() and blk_rq_map_user() want __user pointer.
This patch fixes following warnings from sparse:

   CHECK   block/bsg.c
 block/bsg.c:185:38: warning: incorrect type in argument 2 (different address spaces)
 block/bsg.c:185:38:    expected void const [noderef] <asn:1>*from
 block/bsg.c:185:38:    got void *<noident>
 block/bsg.c:295:58: warning: incorrect type in argument 4 (different address spaces)
 block/bsg.c:295:58:    expected void [noderef] <asn:1>*<noident>
 block/bsg.c:295:58:    got void *[assigned] dxferp
 block/bsg.c:311:52: warning: incorrect type in argument 4 (different address spaces)
 block/bsg.c:311:52:    expected void [noderef] <asn:1>*<noident>
 block/bsg.c:311:52:    got void *[assigned] dxferp
 block/bsg.c:448:37: warning: incorrect type in argument 1 (different address spaces)
 block/bsg.c:448:37:    expected void [noderef] <asn:1>*dst
 block/bsg.c:448:37:    got void *<noident>

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/bsg.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/bsg.c b/block/bsg.c
index b7e42ad..702f131 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -182,7 +182,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 			return -ENOMEM;
 	}
 
-	if (copy_from_user(rq->cmd, (void *)(unsigned long)hdr->request,
+	if (copy_from_user(rq->cmd, (void __user *)(unsigned long)hdr->request,
 			   hdr->request_len))
 		return -EFAULT;
 
@@ -249,7 +249,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
 	struct request *rq, *next_rq = NULL;
 	int ret, rw;
 	unsigned int dxfer_len;
-	void *dxferp = NULL;
+	void __user *dxferp = NULL;
 	struct bsg_class_device *bcd = &q->bsg_dev;
 
 	/* if the LLD has been removed then the bsg_unregister_queue will
@@ -291,7 +291,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
 		rq->next_rq = next_rq;
 		next_rq->cmd_type = rq->cmd_type;
 
-		dxferp = (void*)(unsigned long)hdr->din_xferp;
+		dxferp = (void __user *)(unsigned long)hdr->din_xferp;
 		ret =  blk_rq_map_user(q, next_rq, NULL, dxferp,
 				       hdr->din_xfer_len, GFP_KERNEL);
 		if (ret)
@@ -300,10 +300,10 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
 
 	if (hdr->dout_xfer_len) {
 		dxfer_len = hdr->dout_xfer_len;
-		dxferp = (void*)(unsigned long)hdr->dout_xferp;
+		dxferp = (void __user *)(unsigned long)hdr->dout_xferp;
 	} else if (hdr->din_xfer_len) {
 		dxfer_len = hdr->din_xfer_len;
-		dxferp = (void*)(unsigned long)hdr->din_xferp;
+		dxferp = (void __user *)(unsigned long)hdr->din_xferp;
 	} else
 		dxfer_len = 0;
 
@@ -445,7 +445,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 		int len = min_t(unsigned int, hdr->max_response_len,
 					rq->sense_len);
 
-		ret = copy_to_user((void*)(unsigned long)hdr->response,
+		ret = copy_to_user((void __user *)(unsigned long)hdr->response,
 				   rq->sense, len);
 		if (!ret)
 			hdr->response_len = len;
-- 
cgit v1.1


From 85ef06d1d252f6a2e73b678591ab71caad4667bb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 1 Jul 2011 16:17:47 +0200
Subject: block: flush MEDIA_CHANGE from drivers on close(2)

Currently, only open(2) is defined as the 'clearing' point.  It has
two roles - first, it's an acknowledgement from userland indicating
that the event has been received and kernel can clear pending states
and proceed to generate more events.  Secondly, it's passed on to
device drivers as a hint indicating that a synchronization point has
been reached and it might want to take a deeper look at the device.

The latter currently is only used by sr which uses two different
mechanisms - GET_EVENT_MEDIA_STATUS_NOTIFICATION and TEST_UNIT_READY
to discover events, where the former is lighter weight and safe to be
used repeatedly but may not provide full coverage.  Among other
things, GET_EVENT can't detect media removal while TUR can.

This patch makes close(2) - blkdev_put() - indicate clearing hint for
MEDIA_CHANGE to drivers.  disk_check_events() is renamed to
disk_flush_events() and updated to take @mask for events to flush
which is or'd to ev->clearing and will be passed to the driver on the
next ->check_events() invocation.

This change makes sr generate MEDIA_CHANGE when media is ejected from
userland - e.g. with eject(1).

Note: Given the current usage, it seems @clearing hint is needlessly
complex.  disk_clear_events() can simply clear all events and the hint
can be boolean @flush.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 82d97c3..fbd7605 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1500,30 +1500,32 @@ void disk_unblock_events(struct gendisk *disk)
 }
 
 /**
- * disk_check_events - schedule immediate event checking
- * @disk: disk to check events for
+ * disk_flush_events - schedule immediate event checking and flushing
+ * @disk: disk to check and flush events for
+ * @mask: events to flush
  *
- * Schedule immediate event checking on @disk if not blocked.
+ * Schedule immediate event checking on @disk if not blocked.  Events in
+ * @mask are scheduled to be cleared from the driver.  Note that this
+ * doesn't clear the events from @disk->ev.
  *
  * CONTEXT:
- * Don't care.  Safe to call from irq context.
+ * If @mask is non-zero must be called with bdev->bd_mutex held.
  */
-void disk_check_events(struct gendisk *disk)
+void disk_flush_events(struct gendisk *disk, unsigned int mask)
 {
 	struct disk_events *ev = disk->ev;
-	unsigned long flags;
 
 	if (!ev)
 		return;
 
-	spin_lock_irqsave(&ev->lock, flags);
+	spin_lock_irq(&ev->lock);
+	ev->clearing |= mask;
 	if (!ev->block) {
 		cancel_delayed_work(&ev->dwork);
 		queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
 	}
-	spin_unlock_irqrestore(&ev->lock, flags);
+	spin_unlock_irq(&ev->lock);
 }
-EXPORT_SYMBOL_GPL(disk_check_events);
 
 /**
  * disk_clear_events - synchronously check, clear and return pending events
@@ -1713,7 +1715,7 @@ static int disk_events_set_dfl_poll_msecs(const char *val,
 	mutex_lock(&disk_events_mutex);
 
 	list_for_each_entry(ev, &disk_events, node)
-		disk_check_events(ev->disk);
+		disk_flush_events(ev->disk, 0);
 
 	mutex_unlock(&disk_events_mutex);
 
-- 
cgit v1.1


From 390192b300570b2bc721d77067ca133f58015ae8 Mon Sep 17 00:00:00 2001
From: Johannes Stezenbach <js@sig21.net>
Date: Fri, 1 Jul 2011 22:32:26 +0200
Subject: compat_ioctl: fix warning caused by qemu

On Linux x86_64 host with 32bit userspace, running
qemu or even just "qemu-img create -f qcow2 some.img 1G"
causes a kernel warning:

ioctl32(qemu-img:5296): Unknown cmd fd(3) cmd(00005326){t:'S';sz:0} arg(7fffffff) on some.img
ioctl32(qemu-img:5296): Unknown cmd fd(3) cmd(801c0204){t:02;sz:28} arg(fff77350) on some.img

ioctl 00005326 is CDROM_DRIVE_STATUS,
ioctl 801c0204 is FDGETPRM.

The warning appears because the Linux compat-ioctl handler for these
ioctls only applies to block devices, while qemu also uses the ioctls on
plain files.

Signed-off-by: Johannes Stezenbach <js@sig21.net>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/compat_ioctl.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'block')

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index cc3eb78..7b72502 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -208,19 +208,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode,
 #define BLKBSZSET_32		_IOW(0x12, 113, int)
 #define BLKGETSIZE64_32		_IOR(0x12, 114, int)
 
-struct compat_floppy_struct {
-	compat_uint_t	size;
-	compat_uint_t	sect;
-	compat_uint_t	head;
-	compat_uint_t	track;
-	compat_uint_t	stretch;
-	unsigned char	gap;
-	unsigned char	rate;
-	unsigned char	spec1;
-	unsigned char	fmt_gap;
-	const compat_caddr_t name;
-};
-
 struct compat_floppy_drive_params {
 	char		cmos;
 	compat_ulong_t	max_dtr;
@@ -288,7 +275,6 @@ struct compat_floppy_write_errors {
 
 #define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct)
 #define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct)
-#define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct)
 #define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params)
 #define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params)
 #define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct)
-- 
cgit v1.1


From 0f79960391a5a1e3679956024e18aeeb0369ac44 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 6 Jul 2011 21:30:50 +0200
Subject: block: eliminate potential for infinite loop in blkdev_issue_discard

Due to the recently identified overflow in read_capacity_16() it was
possible for max_discard_sectors to be zero but still have discards
enabled on the associated device's queue.

Eliminate the possibility for blkdev_issue_discard to infinitely loop.

Interestingly this issue wasn't identified until a device, whose
discard_granularity was 0 due to read_capacity_16 overflow, was consumed
by blk_stack_limits() to construct limits for a higher-level DM
multipath device.  The multipath device's resulting limits never had the
discard limits stacked because blk_stack_limits() will only do so if
the bottom device's discard_granularity != 0.  This resulted in the
multipath device's limits.max_discard_sectors being 0.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-lib.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 78e627e..64974b1 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -59,7 +59,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	 * granularity
 	 */
 	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-	if (q->limits.discard_granularity) {
+	if (!unlikely(!max_discard_sectors)) {
+		/* Avoid infinite loop below. Being cautious never hurts. */
+		return -EOPNOTSUPP;
+	} else if (q->limits.discard_granularity) {
 		unsigned int disc_sects = q->limits.discard_granularity >> 9;
 
 		max_discard_sectors &= ~(disc_sects - 1);
-- 
cgit v1.1


From 55c022bbddb2c056b5dff1bd1b1758d31b6d64c9 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 8 Jul 2011 08:19:20 +0200
Subject: block: avoid building too big plug list

When I test fio script with big I/O depth, I found the total throughput drops
compared to some relative small I/O depth. The reason is the thread accumulates
big requests in its plug list and causes some delays (surely this depends
on CPU speed).
I thought we'd better have a threshold for requests. When a threshold reaches,
this means there is no request merge and queue lock contention isn't severe
when pushing per-task requests to queue, so the main advantages of blk plug
don't exist. We can force a plug list flush in this case.
With this, my test throughput actually increases and almost equals to small
I/O depth. Another side effect is irq off time decreases in blk_flush_plug_list()
for big I/O depth.
The BLK_MAX_REQUEST_COUNT is choosen arbitarily, but 16 is efficiently to
reduce lock contention to me. But I'm open here, 32 is ok in my test too.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index d2f8f40..a564852 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1302,7 +1302,10 @@ get_rq:
 				plug->should_sort = 1;
 		}
 		list_add_tail(&req->queuelist, &plug->list);
+		plug->count++;
 		drive_stat_acct(req, 1);
+		if (plug->count >= BLK_MAX_REQUEST_COUNT)
+			blk_flush_plug_list(plug, false);
 	} else {
 		spin_lock_irq(q->queue_lock);
 		add_acct_request(q, req, where);
@@ -2626,6 +2629,7 @@ void blk_start_plug(struct blk_plug *plug)
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->cb_list);
 	plug->should_sort = 0;
+	plug->count = 0;
 
 	/*
 	 * If this is a nested plug, don't actually assign it. It will be
@@ -2709,6 +2713,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		return;
 
 	list_splice_init(&plug->list, &list);
+	plug->count = 0;
 
 	if (plug->should_sort) {
 		list_sort(NULL, &list, plug_rq_cmp);
-- 
cgit v1.1


From a07405b7802691d29ab3b23bdc76ee6d006aad0b Mon Sep 17 00:00:00 2001
From: Justin TerAvest <teravest@google.com>
Date: Sun, 10 Jul 2011 22:09:19 +0200
Subject: cfq: Remove special treatment for metadata rqs.

There is no consistency among filesystems from what bios (or requests)
are marked as being metadata. It's interesting to expose this in traces,
but we shouldn't schedule the requests differently based on whether or
not they're marked as being metadata.

Signed-off-by: Justin TerAvest <teravest@google.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b2e1c75..762bd50 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -129,8 +129,6 @@ struct cfq_queue {
 	unsigned long slice_end;
 	long slice_resid;
 
-	/* pending metadata requests */
-	int meta_pending;
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 
@@ -670,9 +668,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
 	if (rq_is_sync(rq1) != rq_is_sync(rq2))
 		return rq_is_sync(rq1) ? rq1 : rq2;
 
-	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
-		return rq1->cmd_flags & REQ_META ? rq1 : rq2;
-
 	s1 = blk_rq_pos(rq1);
 	s2 = blk_rq_pos(rq2);
 
@@ -1593,10 +1588,6 @@ static void cfq_remove_request(struct request *rq)
 	cfqq->cfqd->rq_queued--;
 	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
 					rq_data_dir(rq), rq_is_sync(rq));
-	if (rq->cmd_flags & REQ_META) {
-		WARN_ON(!cfqq->meta_pending);
-		cfqq->meta_pending--;
-	}
 }
 
 static int cfq_merge(struct request_queue *q, struct request **req,
@@ -3335,13 +3326,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 		return true;
 
 	/*
-	 * So both queues are sync. Let the new request get disk time if
-	 * it's a metadata request and the current queue is doing regular IO.
-	 */
-	if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
-		return true;
-
-	/*
 	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
 	 */
 	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
@@ -3405,8 +3389,6 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_io_context *cic = RQ_CIC(rq);
 
 	cfqd->rq_queued++;
-	if (rq->cmd_flags & REQ_META)
-		cfqq->meta_pending++;
 
 	cfq_update_io_thinktime(cfqd, cic);
 	cfq_update_io_seektime(cfqd, cfqq, rq);
-- 
cgit v1.1


From 4aede84b33d6beb401136a3deca0651ae07c5e99 Mon Sep 17 00:00:00 2001
From: Justin TerAvest <teravest@google.com>
Date: Tue, 12 Jul 2011 08:31:45 +0200
Subject: fixlet: Remove fs_excl from struct task.

fs_excl is a poor man's priority inheritance for filesystems to hint to
the block layer that an operation is important. It was never clearly
specified, not widely adopted, and will not prevent starvation in many
cases (like across cgroups).

fs_excl was introduced with the time sliced CFQ IO scheduler, to
indicate when a process held FS exclusive resources and thus needed
a boost.

It doesn't cover all file systems, and it was never fully complete.
Lets kill it.

Signed-off-by: Justin TerAvest <teravest@google.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 28 +---------------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 762bd50..d8b1087 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -134,7 +134,7 @@ struct cfq_queue {
 
 	/* io prio of this group */
 	unsigned short ioprio, org_ioprio;
-	unsigned short ioprio_class, org_ioprio_class;
+	unsigned short ioprio_class;
 
 	pid_t pid;
 
@@ -2869,7 +2869,6 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 	 * elevate the priority of this queue
 	 */
 	cfqq->org_ioprio = cfqq->ioprio;
-	cfqq->org_ioprio_class = cfqq->ioprio_class;
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
@@ -3593,30 +3592,6 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 		cfq_schedule_dispatch(cfqd);
 }
 
-/*
- * we temporarily boost lower priority queues if they are holding fs exclusive
- * resources. they are boosted to normal prio (CLASS_BE/4)
- */
-static void cfq_prio_boost(struct cfq_queue *cfqq)
-{
-	if (has_fs_excl()) {
-		/*
-		 * boost idle prio on transactions that would lock out other
-		 * users of the filesystem
-		 */
-		if (cfq_class_idle(cfqq))
-			cfqq->ioprio_class = IOPRIO_CLASS_BE;
-		if (cfqq->ioprio > IOPRIO_NORM)
-			cfqq->ioprio = IOPRIO_NORM;
-	} else {
-		/*
-		 * unboost the queue (if needed)
-		 */
-		cfqq->ioprio_class = cfqq->org_ioprio_class;
-		cfqq->ioprio = cfqq->org_ioprio;
-	}
-}
-
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
 {
 	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
@@ -3647,7 +3622,6 @@ static int cfq_may_queue(struct request_queue *q, int rw)
 	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
 	if (cfqq) {
 		cfq_init_prio_data(cfqq, cic->ioc);
-		cfq_prio_boost(cfqq);
 
 		return __cfq_may_queue(cfqq);
 	}
-- 
cgit v1.1


From 383cd7213f95a2784ab5038fe292844178768b82 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 12 Jul 2011 14:24:35 +0200
Subject: CFQ: move think time check variables to a separate struct

Move the variables to do think time check to a sepatate struct. This is
to prepare adding think time check for service tree and group. No
functional change.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d8b1087..5625559 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2008,10 +2008,10 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 * slice, then don't idle. This avoids overrunning the allotted
 	 * time slice.
 	 */
-	if (sample_valid(cic->ttime_samples) &&
-	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
+	if (sample_valid(cic->ttime.ttime_samples) &&
+	    (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
 		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
-			     cic->ttime_mean);
+			     cic->ttime.ttime_mean);
 		return;
 	}
 
@@ -2819,7 +2819,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
 							cfqd->queue->node);
 	if (cic) {
-		cic->last_end_request = jiffies;
+		cic->ttime.last_end_request = jiffies;
 		INIT_LIST_HEAD(&cic->queue_list);
 		INIT_HLIST_NODE(&cic->cic_list);
 		cic->dtor = cfq_free_io_context;
@@ -3206,14 +3206,22 @@ err:
 }
 
 static void
-cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
+__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
 {
-	unsigned long elapsed = jiffies - cic->last_end_request;
-	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
+	unsigned long elapsed = jiffies - ttime->last_end_request;
+	elapsed = min(elapsed, 2UL * slice_idle);
 
-	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
-	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
-	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
+	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
+	ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
+	ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
+}
+
+static void
+cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+	struct cfq_io_context *cic)
+{
+	if (cfq_cfqq_sync(cfqq))
+		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
 }
 
 static void
@@ -3262,8 +3270,8 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
 	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
-	else if (sample_valid(cic->ttime_samples)) {
-		if (cic->ttime_mean > cfqd->cfq_slice_idle)
+	else if (sample_valid(cic->ttime.ttime_samples)) {
+		if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
 			enable_idle = 0;
 		else
 			enable_idle = 1;
@@ -3389,7 +3397,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	cfqd->rq_queued++;
 
-	cfq_update_io_thinktime(cfqd, cic);
+	cfq_update_io_thinktime(cfqd, cfqq, cic);
 	cfq_update_io_seektime(cfqd, cfqq, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
@@ -3500,8 +3508,8 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		return true;
 
 	/* if slice left is less than think time, wait busy */
-	if (cic && sample_valid(cic->ttime_samples)
-	    && (cfqq->slice_end - jiffies < cic->ttime_mean))
+	if (cic && sample_valid(cic->ttime.ttime_samples)
+	    && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
 		return true;
 
 	/*
@@ -3542,7 +3550,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
 	if (sync) {
-		RQ_CIC(rq)->last_end_request = now;
+		RQ_CIC(rq)->ttime.last_end_request = now;
 		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
 			cfqd->last_delayed_sync = now;
 	}
-- 
cgit v1.1


From f5f2b6ceb23e02ff35c6dbc6a39aa776ace99cda Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 12 Jul 2011 14:24:55 +0200
Subject: CFQ: add think time check for service tree

Currently when the last queue of a service tree has no request, we don't
expire the queue to hope request from the service tree comes soon, so the
service tree doesn't miss its share. But if the think time is big, the
assumption isn't correct and we just waste bandwidth. In such case, we
don't do idle.

[global]
runtime=10
direct=1

[test1]
rw=randread
ioengine=libaio
size=500m
directory=/mnt
filename=file1
thinktime=9000

[test2]
rw=read
ioengine=libaio
size=1G
directory=/mnt
filename=file2

	patched		base
test1	41k/s		33k/s
test2	15868k/s	15789k/s
total	15902k/s	15817k/s

A slightly better

To check if the patch changes behavior of queue without think time. I also
tried to give test1 2ms think time or no think time. The test has variation
even without the patch, but the average throughput doesn't change with/without
the patch.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5625559..baa9060 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -87,9 +87,10 @@ struct cfq_rb_root {
 	unsigned count;
 	unsigned total_weight;
 	u64 min_vdisktime;
+	struct cfq_ttime ttime;
 };
-#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
-			.count = 0, .min_vdisktime = 0, }
+#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
+			.ttime = {.last_end_request = jiffies,},}
 
 /*
  * Per process-grouping structure
@@ -391,6 +392,18 @@ CFQ_CFQQ_FNS(wait_busy);
 			j++, st = i < IDLE_WORKLOAD ? \
 			&cfqg->service_trees[i][j]: NULL) \
 
+static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
+	struct cfq_ttime *ttime, bool group_idle)
+{
+	unsigned long slice;
+	if (!sample_valid(ttime->ttime_samples))
+		return false;
+	if (group_idle)
+		slice = cfqd->cfq_group_idle;
+	else
+		slice = cfqd->cfq_slice_idle;
+	return ttime->ttime_mean > slice;
+}
 
 static inline bool iops_mode(struct cfq_data *cfqd)
 {
@@ -1955,7 +1968,8 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * Otherwise, we do only if they are the last ones
 	 * in their service tree.
 	 */
-	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
+	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
+	   !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
 		return true;
 	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
 			service_tree->count);
@@ -3220,8 +3234,11 @@ static void
 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_io_context *cic)
 {
-	if (cfq_cfqq_sync(cfqq))
+	if (cfq_cfqq_sync(cfqq)) {
 		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
+		__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
+			cfqd->cfq_slice_idle);
+	}
 }
 
 static void
@@ -3550,7 +3567,16 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
 	if (sync) {
+		struct cfq_rb_root *service_tree;
+
 		RQ_CIC(rq)->ttime.last_end_request = now;
+
+		if (cfq_cfqq_on_rr(cfqq))
+			service_tree = cfqq->service_tree;
+		else
+			service_tree = service_tree_for(cfqq->cfqg,
+				cfqq_prio(cfqq), cfqq_type(cfqq));
+		service_tree->ttime.last_end_request = now;
 		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
 			cfqd->last_delayed_sync = now;
 	}
-- 
cgit v1.1


From 7700fc4f675fa38094e78e345b594363a2fd895b Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 12 Jul 2011 14:24:56 +0200
Subject: CFQ: add think time check for group

Currently when the last queue of a group has no request, we don't expire
the queue to hope request from the group comes soon, so the group doesn't
miss its share. But if the think time is big, the assumption isn't correct
and we just waste bandwidth. In such case, we don't do idle.

[global]
runtime=30
direct=1

[test1]
cgroup=test1
cgroup_weight=1000
rw=randread
ioengine=libaio
size=500m
runtime=30
directory=/mnt
filename=file1
thinktime=9000

[test2]
cgroup=test2
cgroup_weight=1000
rw=randread
ioengine=libaio
size=500m
runtime=30
directory=/mnt
filename=file2

	patched		base
test1	64k		39k
test2	548k		540k
total	604k		578k

group1 gets much better throughput because it waits less time.

To check if the patch changes behavior of queue without think time. I also
tried to give test1 2ms think time or no think time. The test result is stable.
The thoughput doesn't change with/without the patch.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index baa9060..1f96ad6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -211,6 +211,7 @@ struct cfq_group {
 #endif
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
+	struct cfq_ttime ttime;
 };
 
 /*
@@ -1067,6 +1068,8 @@ static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
 		*st = CFQ_RB_ROOT;
 	RB_CLEAR_NODE(&cfqg->rb_node);
 
+	cfqg->ttime.last_end_request = jiffies;
+
 	/*
 	 * Take the initial reference that will be released on destroy
 	 * This can be thought of a joint reference by cgroup and
@@ -2381,8 +2384,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	 * this group, wait for requests to complete.
 	 */
 check_group_idle:
-	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
-	    && cfqq->cfqg->dispatched) {
+	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
+	    cfqq->cfqg->dispatched &&
+	    !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
 		cfqq = NULL;
 		goto keep_queue;
 	}
@@ -3239,6 +3243,9 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
 			cfqd->cfq_slice_idle);
 	}
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
+#endif
 }
 
 static void
@@ -3521,6 +3528,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	if (cfqq->cfqg->nr_cfqq > 1)
 		return false;
 
+	/* the only queue in the group, but think time is big */
+	if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
+		return false;
+
 	if (cfq_slice_used(cfqq))
 		return true;
 
@@ -3581,6 +3592,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 			cfqd->last_delayed_sync = now;
 	}
 
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+	cfqq->cfqg->ttime.last_end_request = now;
+#endif
+
 	/*
 	 * If this is the active queue, check if it needs to be expired,
 	 * or if we want to idle in case it has no pending requests.
-- 
cgit v1.1


From 57bdfbf9ee2b0856d8b62180c3b3f8fa1533b8d1 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 18 Mar 2011 11:42:58 +0800
Subject: block,rcu: Convert call_rcu(disk_free_ptbl_rcu_cb) to kfree_rcu()

The rcu callback disk_free_ptbl_rcu_cb() just calls a kfree(),
so we use kfree_rcu() instead of the call_rcu(disk_free_ptbl_rcu_cb).

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 block/genhd.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 3608289..6024b82 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1018,14 +1018,6 @@ static const struct attribute_group *disk_attr_groups[] = {
 	NULL
 };
 
-static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
-{
-	struct disk_part_tbl *ptbl =
-		container_of(head, struct disk_part_tbl, rcu_head);
-
-	kfree(ptbl);
-}
-
 /**
  * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
  * @disk: disk to replace part_tbl for
@@ -1046,7 +1038,7 @@ static void disk_replace_part_tbl(struct gendisk *disk,
 
 	if (old_ptbl) {
 		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
-		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
+		kfree_rcu(old_ptbl, rcu_head);
 	}
 }
 
-- 
cgit v1.1


From bfe159a51203c15d23cb3158fffdc25ec4b4dda1 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Thu, 7 Jul 2011 15:45:40 -0500
Subject: [SCSI] fix crash in scsi_dispatch_cmd()

USB surprise removal of sr is triggering an oops in
scsi_dispatch_command().  What seems to be happening is that USB is
hanging on to a queue reference until the last close of the upper
device, so the crash is caused by surprise remove of a mounted CD
followed by attempted unmount.

The problem is that USB doesn't issue its final commands as part of
the SCSI teardown path, but on last close when the block queue is long
gone.  The long term fix is probably to make sr do the teardown in the
same way as sd (so remove all the lower bits on ejection, but keep the
upper disk alive until last close of user space).  However, the
current oops can be simply fixed by not allowing any commands to be
sent to a dead queue.

Cc: stable@kernel.org
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 block/blk-core.c | 3 +++
 block/blk-exec.c | 7 +++++++
 2 files changed, 10 insertions(+)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index d2f8f40..1d49e1c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -839,6 +839,9 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
 	struct request *rq;
 
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		return NULL;
+
 	BUG_ON(rw != READ && rw != WRITE);
 
 	spin_lock_irq(q->queue_lock);
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 8a0e7ec..a1ebceb 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -50,6 +50,13 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+		rq->errors = -ENXIO;
+		if (rq->end_io)
+			rq->end_io(rq, rq->errors);
+		return;
+	}
+
 	rq->rq_disk = bd_disk;
 	rq->end_io = done;
 	WARN_ON(irqs_disabled());
-- 
cgit v1.1


From 4c64500eada358165d0bb9a20d6c7d30821995b4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Sat, 23 Jul 2011 20:34:59 +0200
Subject: block: fix patch import error in max_discard_sectors check

A '!' snuck in before the unlikely, rendering it useless.

Reported-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-lib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 64974b1..2b461b4 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -59,7 +59,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	 * granularity
 	 */
 	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-	if (!unlikely(!max_discard_sectors)) {
+	if (unlikely(!max_discard_sectors)) {
 		/* Avoid infinite loop below. Being cautious never hurts. */
 		return -EOPNOTSUPP;
 	} else if (q->limits.discard_granularity) {
-- 
cgit v1.1


From 5757a6d76cdf6dda2a492c09b985c015e86779b1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 23 Jul 2011 20:44:25 +0200
Subject: block: strict rq_affinity

Some systems benefit from completions always being steered to the strict
requester cpu rather than the looser "per-socket" steering that
blk_cpu_to_group() attempts by default. This is because the first
CPU in the group mask ends up being completely overloaded with work,
while the others (including the original submitter) has power left
to spare.

Allow the strict mode to be set by writing '2' to the sysfs control
file. This is identical to the scheme used for the nomerges file,
where '2' is a more aggressive setting than just being turned on.

echo 2 > /sys/block/<bdev>/queue/rq_affinity

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Roland Dreier <roland@purestorage.com>
Tested-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c    |  6 ++----
 block/blk-softirq.c | 11 +++++++----
 block/blk-sysfs.c   | 13 +++++++++----
 3 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index a564852..b322825 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1279,10 +1279,8 @@ get_rq:
 	init_request_from_bio(req, bio);
 
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-	    bio_flagged(bio, BIO_CPU_AFFINE)) {
-		req->cpu = blk_cpu_to_group(get_cpu());
-		put_cpu();
-	}
+	    bio_flagged(bio, BIO_CPU_AFFINE))
+		req->cpu = smp_processor_id();
 
 	plug = current->plug;
 	if (plug) {
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ee9c216..475fab8 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -103,22 +103,25 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 
 void __blk_complete_request(struct request *req)
 {
+	int ccpu, cpu, group_cpu = NR_CPUS;
 	struct request_queue *q = req->q;
 	unsigned long flags;
-	int ccpu, cpu, group_cpu;
 
 	BUG_ON(!q->softirq_done_fn);
 
 	local_irq_save(flags);
 	cpu = smp_processor_id();
-	group_cpu = blk_cpu_to_group(cpu);
 
 	/*
 	 * Select completion CPU
 	 */
-	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
 		ccpu = req->cpu;
-	else
+		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
+			ccpu = blk_cpu_to_group(ccpu);
+			group_cpu = blk_cpu_to_group(cpu);
+		}
+	} else
 		ccpu = cpu;
 
 	if (ccpu == cpu || ccpu == group_cpu) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d935bd8..0ee17b5 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
 {
 	bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+	bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
 
-	return queue_var_show(set, page);
+	return queue_var_show(set << force, page);
 }
 
 static ssize_t
@@ -257,10 +258,14 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
 	ret = queue_var_store(&val, page, count);
 	spin_lock_irq(q->queue_lock);
-	if (val)
+	if (val) {
 		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-	else
-		queue_flag_clear(QUEUE_FLAG_SAME_COMP,  q);
+		if (val == 2)
+			queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+	} else {
+		queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+		queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+	}
 	spin_unlock_irq(q->queue_lock);
 #endif
 	return ret;
-- 
cgit v1.1


From 11ccf116d0d756d06989775288e41f737d98e0c5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Tue, 26 Jul 2011 15:01:15 +0200
Subject: block: fix warning with calling smp_processor_id() in preemptible
 section

After commit 5757a6d7 introduced an unsafe calling of
smp_processor_id(), with preempt debuggin turned on we spew a lot of:

BUG: using smp_processor_id() in preemptible [00000000] code: kjournald/514
caller is __make_request+0x1b8/0x308
[<c0019f44>] (unwind_backtrace+0x0/0xe8) from [<c024b4cc>] (debug_smp_processor_id+0xbc/0xf0)
[<c024b4cc>] (debug_smp_processor_id+0xbc/0xf0) from [<c0223d14>] (__make_request+0x1b8/0x308)
[<c0223d14>] (__make_request+0x1b8/0x308) from [<c02215ac>] (generic_make_request+0x4dc/0x558)
[<c02215ac>] (generic_make_request+0x4dc/0x558) from [<c022173c>] (submit_bio+0x114/0x138)
[<c022173c>] (submit_bio+0x114/0x138) from [<c011f504>] (submit_bh+0x148/0x16c)
[<c011f504>] (submit_bh+0x148/0x16c) from [<c0121ed8>] (__sync_dirty_buffer+0x88/0xd8)
[<c0121ed8>] (__sync_dirty_buffer+0x88/0xd8) from [<c01aff78>] (journal_commit_transaction+0x1198/0x1688)
[<c01aff78>] (journal_commit_transaction+0x1198/0x1688) from [<c01b4034>] (kjournald+0xb4/0x224)
[<c01b4034>] (kjournald+0xb4/0x224) from [<c0069ea0>] (kthread+0x8c/0x94)
[<c0069ea0>] (kthread+0x8c/0x94) from [<c00137f8>] (kernel_thread_exit+0x0/0x8)

Fix this by just using raw_smp_processor_id(), it's just a hint
after all. There's no pinning of the CPU or accessing per-cpu
structures involved.

Reported-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index f8cb099..f925581 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1283,7 +1283,7 @@ get_rq:
 
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
 	    bio_flagged(bio, BIO_CPU_AFFINE))
-		req->cpu = smp_processor_id();
+		req->cpu = raw_smp_processor_id();
 
 	plug = current->plug;
 	if (plug) {
-- 
cgit v1.1


From b2c9cd3793e5878e301ec2219785a7b8ca402ef1 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 26 Jul 2011 16:09:03 -0700
Subject: fail_make_request: cleanup should_fail_request

This changes should_fail_request() to more usable wrapper function of
should_fail().  It can avoid putting #ifdef CONFIG_FAIL_MAKE_REQUEST in
the middle of a function.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/blk-core.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index f925581..b850bed 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1361,14 +1361,9 @@ static int __init setup_fail_make_request(char *str)
 }
 __setup("fail_make_request=", setup_fail_make_request);
 
-static int should_fail_request(struct bio *bio)
+static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
 {
-	struct hd_struct *part = bio->bi_bdev->bd_part;
-
-	if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
-		return should_fail(&fail_make_request, bio->bi_size);
-
-	return 0;
+	return part->make_it_fail && should_fail(&fail_make_request, bytes);
 }
 
 static int __init fail_make_request_debugfs(void)
@@ -1381,9 +1376,10 @@ late_initcall(fail_make_request_debugfs);
 
 #else /* CONFIG_FAIL_MAKE_REQUEST */
 
-static inline int should_fail_request(struct bio *bio)
+static inline bool should_fail_request(struct hd_struct *part,
+					unsigned int bytes)
 {
-	return 0;
+	return false;
 }
 
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
@@ -1466,6 +1462,7 @@ static inline void __generic_make_request(struct bio *bio)
 	old_dev = 0;
 	do {
 		char b[BDEVNAME_SIZE];
+		struct hd_struct *part;
 
 		q = bdev_get_queue(bio->bi_bdev);
 		if (unlikely(!q)) {
@@ -1489,7 +1486,10 @@ static inline void __generic_make_request(struct bio *bio)
 		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
 			goto end_io;
 
-		if (should_fail_request(bio))
+		part = bio->bi_bdev->bd_part;
+		if (should_fail_request(part, bio->bi_size) ||
+		    should_fail_request(&part_to_disk(part)->part0,
+					bio->bi_size))
 			goto end_io;
 
 		/*
@@ -1704,11 +1704,9 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	if (blk_rq_check_limits(q, rq))
 		return -EIO;
 
-#ifdef CONFIG_FAIL_MAKE_REQUEST
-	if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
-	    should_fail(&fail_make_request, blk_rq_bytes(rq)))
+	if (rq->rq_disk &&
+	    should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
 		return -EIO;
-#endif
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
-- 
cgit v1.1


From aa387cc895672b00f807ad7c734a2defaf677712 Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Sun, 31 Jul 2011 22:05:09 +0200
Subject: block: add bsg helper library

This moves the FC classes bsg code to the block layer and
makes it a lib so that other classes like iscsi and SAS can use it.

It is helpful because working with the request queue, bios,
creating scatterlists, etc are a pain that the LLD does not
have to worry about with normal IOs and should not have to
worry about for bsg requests.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/Kconfig   |  10 ++
 block/Makefile  |   1 +
 block/bsg-lib.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 308 insertions(+)
 create mode 100644 block/bsg-lib.c

(limited to 'block')

diff --git a/block/Kconfig b/block/Kconfig
index 60be1e0..e97934e 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -65,6 +65,16 @@ config BLK_DEV_BSG
 
 	  If unsure, say Y.
 
+config BLK_DEV_BSGLIB
+	bool "Block layer SG support v4 helper lib"
+	default n
+	select BLK_DEV_BSG
+	help
+	  Subsystems will normally enable this if needed. Users will not
+	  normally need to manually enable this.
+
+	  If unsure, say N.
+
 config BLK_DEV_INTEGRITY
 	bool "Block layer data integrity support"
 	---help---
diff --git a/block/Makefile b/block/Makefile
index 0fec4b3..514c6e4 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
+obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
new file mode 100644
index 0000000..f8c0a61
--- /dev/null
+++ b/block/bsg-lib.c
@@ -0,0 +1,297 @@
+/*
+ *  BSG helper library
+ *
+ *  Copyright (C) 2008   James Smart, Emulex Corporation
+ *  Copyright (C) 2011   Red Hat, Inc.  All rights reserved.
+ *  Copyright (C) 2011   Mike Christie
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/scatterlist.h>
+#include <linux/bsg-lib.h>
+#include <scsi/scsi_cmnd.h>
+
+/**
+ * bsg_destroy_job - routine to teardown/delete a bsg job
+ * @job: bsg_job that is to be torn down
+ */
+static void bsg_destroy_job(struct bsg_job *job)
+{
+	put_device(job->dev);	/* release reference for the request */
+
+	kfree(job->request_payload.sg_list);
+	kfree(job->reply_payload.sg_list);
+	kfree(job);
+}
+
+/**
+ * bsg_job_done - completion routine for bsg requests
+ * @job: bsg_job that is complete
+ * @result: job reply result
+ * @reply_payload_rcv_len: length of payload recvd
+ *
+ * The LLD should call this when the bsg job has completed.
+ */
+void bsg_job_done(struct bsg_job *job, int result,
+		  unsigned int reply_payload_rcv_len)
+{
+	struct request *req = job->req;
+	struct request *rsp = req->next_rq;
+	int err;
+
+	err = job->req->errors = result;
+	if (err < 0)
+		/* we're only returning the result field in the reply */
+		job->req->sense_len = sizeof(u32);
+	else
+		job->req->sense_len = job->reply_len;
+	/* we assume all request payload was transferred, residual == 0 */
+	req->resid_len = 0;
+
+	if (rsp) {
+		WARN_ON(reply_payload_rcv_len > rsp->resid_len);
+
+		/* set reply (bidi) residual */
+		rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len);
+	}
+	blk_complete_request(req);
+}
+EXPORT_SYMBOL_GPL(bsg_job_done);
+
+/**
+ * bsg_softirq_done - softirq done routine for destroying the bsg requests
+ * @rq: BSG request that holds the job to be destroyed
+ */
+static void bsg_softirq_done(struct request *rq)
+{
+	struct bsg_job *job = rq->special;
+
+	blk_end_request_all(rq, rq->errors);
+	bsg_destroy_job(job);
+}
+
+static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
+{
+	size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments);
+
+	BUG_ON(!req->nr_phys_segments);
+
+	buf->sg_list = kzalloc(sz, GFP_KERNEL);
+	if (!buf->sg_list)
+		return -ENOMEM;
+	sg_init_table(buf->sg_list, req->nr_phys_segments);
+	buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
+	buf->payload_len = blk_rq_bytes(req);
+	return 0;
+}
+
+/**
+ * bsg_create_job - create the bsg_job structure for the bsg request
+ * @dev: device that is being sent the bsg request
+ * @req: BSG request that needs a job structure
+ */
+static int bsg_create_job(struct device *dev, struct request *req)
+{
+	struct request *rsp = req->next_rq;
+	struct request_queue *q = req->q;
+	struct bsg_job *job;
+	int ret;
+
+	BUG_ON(req->special);
+
+	job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL);
+	if (!job)
+		return -ENOMEM;
+
+	req->special = job;
+	job->req = req;
+	if (q->bsg_job_size)
+		job->dd_data = (void *)&job[1];
+	job->request = req->cmd;
+	job->request_len = req->cmd_len;
+	job->reply = req->sense;
+	job->reply_len = SCSI_SENSE_BUFFERSIZE;	/* Size of sense buffer
+						 * allocated */
+	if (req->bio) {
+		ret = bsg_map_buffer(&job->request_payload, req);
+		if (ret)
+			goto failjob_rls_job;
+	}
+	if (rsp && rsp->bio) {
+		ret = bsg_map_buffer(&job->reply_payload, rsp);
+		if (ret)
+			goto failjob_rls_rqst_payload;
+	}
+	job->dev = dev;
+	/* take a reference for the request */
+	get_device(job->dev);
+	return 0;
+
+failjob_rls_rqst_payload:
+	kfree(job->request_payload.sg_list);
+failjob_rls_job:
+	kfree(job);
+	return -ENOMEM;
+}
+
+/*
+ * bsg_goose_queue - restart queue in case it was stopped
+ * @q: request q to be restarted
+ */
+void bsg_goose_queue(struct request_queue *q)
+{
+	if (!q)
+		return;
+
+	blk_run_queue_async(q);
+}
+EXPORT_SYMBOL_GPL(bsg_goose_queue);
+
+/**
+ * bsg_request_fn - generic handler for bsg requests
+ * @q: request queue to manage
+ *
+ * On error the create_bsg_job function should return a -Exyz error value
+ * that will be set to the req->errors.
+ *
+ * Drivers/subsys should pass this to the queue init function.
+ */
+void bsg_request_fn(struct request_queue *q)
+{
+	struct device *dev = q->queuedata;
+	struct request *req;
+	struct bsg_job *job;
+	int ret;
+
+	if (!get_device(dev))
+		return;
+
+	while (1) {
+		req = blk_fetch_request(q);
+		if (!req)
+			break;
+		spin_unlock_irq(q->queue_lock);
+
+		ret = bsg_create_job(dev, req);
+		if (ret) {
+			req->errors = ret;
+			blk_end_request_all(req, ret);
+			spin_lock_irq(q->queue_lock);
+			continue;
+		}
+
+		job = req->special;
+		ret = q->bsg_job_fn(job);
+		spin_lock_irq(q->queue_lock);
+		if (ret)
+			break;
+	}
+
+	spin_unlock_irq(q->queue_lock);
+	put_device(dev);
+	spin_lock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(bsg_request_fn);
+
+/**
+ * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
+ * @dev: device to attach bsg device to
+ * @q: request queue setup by caller
+ * @name: device to give bsg device
+ * @job_fn: bsg job handler
+ * @dd_job_size: size of LLD data needed for each job
+ *
+ * The caller should have setup the reuqest queue with bsg_request_fn
+ * as the request_fn.
+ */
+int bsg_setup_queue(struct device *dev, struct request_queue *q,
+		    char *name, bsg_job_fn *job_fn, int dd_job_size)
+{
+	int ret;
+
+	q->queuedata = dev;
+	q->bsg_job_size = dd_job_size;
+	q->bsg_job_fn = job_fn;
+	queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
+	blk_queue_softirq_done(q, bsg_softirq_done);
+	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
+
+	ret = bsg_register_queue(q, dev, name, NULL);
+	if (ret) {
+		printk(KERN_ERR "%s: bsg interface failed to "
+		       "initialize - register queue\n", dev->kobj.name);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bsg_setup_queue);
+
+/**
+ * bsg_remove_queue - Deletes the bsg dev from the q
+ * @q:	the request_queue that is to be torn down.
+ *
+ * Notes:
+ *   Before unregistering the queue empty any requests that are blocked
+ */
+void bsg_remove_queue(struct request_queue *q)
+{
+	struct request *req; /* block request */
+	int counts; /* totals for request_list count and starved */
+
+	if (!q)
+		return;
+
+	/* Stop taking in new requests */
+	spin_lock_irq(q->queue_lock);
+	blk_stop_queue(q);
+
+	/* drain all requests in the queue */
+	while (1) {
+		/* need the lock to fetch a request
+		 * this may fetch the same reqeust as the previous pass
+		 */
+		req = blk_fetch_request(q);
+		/* save requests in use and starved */
+		counts = q->rq.count[0] + q->rq.count[1] +
+			 q->rq.starved[0] + q->rq.starved[1];
+		spin_unlock_irq(q->queue_lock);
+		/* any requests still outstanding? */
+		if (counts == 0)
+			break;
+
+		/* This may be the same req as the previous iteration,
+		 * always send the blk_end_request_all after a prefetch.
+		 * It is not okay to not end the request because the
+		 * prefetch started the request.
+		 */
+		if (req) {
+			/* return -ENXIO to indicate that this queue is
+			 * going away
+			 */
+			req->errors = -ENXIO;
+			blk_end_request_all(req, -ENXIO);
+		}
+
+		msleep(200); /* allow bsg to possibly finish */
+		spin_lock_irq(q->queue_lock);
+	}
+	bsg_unregister_queue(q);
+}
+EXPORT_SYMBOL_GPL(bsg_remove_queue);
-- 
cgit v1.1


From e5a94f56845bb4b272d82e84b5a1e2080b07ba82 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 1 Aug 2011 10:31:06 +0200
Subject: blk-throttle: correctly determine sync bio

read request is always sync. Using rw_is_sync() to determine
if a bio is sync.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f6a7941..a19f58c 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -746,7 +746,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
 	bool rw = bio_data_dir(bio);
-	bool sync = bio->bi_rw & REQ_SYNC;
+	bool sync = rw_is_sync(bio->bi_rw);
 
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_size;
@@ -1150,7 +1150,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 
 		if (tg_no_rule_group(tg, rw)) {
 			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
-					rw, bio->bi_rw & REQ_SYNC);
+					rw, rw_is_sync(bio->bi_rw));
 			rcu_read_unlock();
 			return 0;
 		}
-- 
cgit v1.1


From a5395b83b78f62ccf5e3af854aacd025c2a6e7b5 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 2 Aug 2011 09:24:09 +0200
Subject: cfq-iosched: Reduce linked group count upon group destruction

FQ keeps track of number of groups which are linked on blkcg->blkg_list.
This is useful to avoid races between queue exit and cgroup exit code
paths. So if at the request queue exit time linked group count is not
zero, that means there are some group out there which is yet to be
deleted under rcu read period and queue exit code should wait for
on rcu period.

In my previous patch I forgot to decrease the number of group count.
So in current form, we nr_blkcg_linked_grps is always non-zero and
we will always wait one rcu period (if BLK_CGROUP=y). The side effect
of this is that it can increase boot time. I am surprised, nobody
complained so far.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1f96ad6..6508345 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1209,6 +1209,9 @@ static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
 
 	hlist_del_init(&cfqg->cfqd_node);
 
+	BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
+	cfqd->nr_blkcg_linked_grps--;
+
 	/*
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
-- 
cgit v1.1


From e2a5429ff7947ad251310376384f449297b7492a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Tue, 2 Aug 2011 10:43:35 +0200
Subject: bsg-lib: add module.h include

Due to conflicts with the moduleh tree in linux-next, we
run into an include file mess. We really need export.h
in that tree, but if we add module.h locally then the
issue is easier to resolve.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/bsg-lib.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block')

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index f8c0a61..6690e6e 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -25,6 +25,7 @@
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/bsg-lib.h>
+#include <linux/module.h>
 #include <scsi/scsi_cmnd.h>
 
 /**
-- 
cgit v1.1


From f95fe9cfb49f6e625fbb5888cae2ed6f3a276b89 Mon Sep 17 00:00:00 2001
From: Herbert Poetzl <herbert@13thfloor.at>
Date: Tue, 2 Aug 2011 12:43:50 +0200
Subject: block/genhd.c: remove useless cast in diskstats_show()

Remove the (unsigned long long) cast in diskstats_show() and adjusts the
seq_printf() format string to 'unsigned long'

diskstats_show() uses part_stat_read() to get the stats, which either
accesses the specified field in the struct disk_stats directly (non SMP)
or sums up the per CPU values in a variable of the same type as the field,
so in any case the result will have the same type and range as the
specified field which for all disk_stats entries is unsigned long

Also, for unsigned long ranges the output of %lu should be identical to
the one of %llu, so no change in the actual proc entry contents.

Signed-off-by: Herbert Poetzl <herbert@13thfloor.at>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 5cb51c5..e2f6790 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1146,17 +1146,17 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		cpu = part_stat_lock();
 		part_round_stats(cpu, hd);
 		part_stat_unlock();
-		seq_printf(seqf, "%4d %7d %s %lu %lu %llu "
-			   "%u %lu %lu %llu %u %u %u %u\n",
+		seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
+			   "%u %lu %lu %lu %u %u %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
 			   part_stat_read(hd, ios[READ]),
 			   part_stat_read(hd, merges[READ]),
-			   (unsigned long long)part_stat_read(hd, sectors[READ]),
+			   part_stat_read(hd, sectors[READ]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
 			   part_stat_read(hd, ios[WRITE]),
 			   part_stat_read(hd, merges[WRITE]),
-			   (unsigned long long)part_stat_read(hd, sectors[WRITE]),
+			   part_stat_read(hd, sectors[WRITE]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
 			   part_in_flight(hd),
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
-- 
cgit v1.1


From dd48c085c1cdf9446f92826f1fd451167fb6c2fd Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 3 Aug 2011 16:21:01 -0700
Subject: fault-injection: add ability to export fault_attr in arbitrary
 directory

init_fault_attr_dentries() is used to export fault_attr via debugfs.
But it can only export it in debugfs root directory.

Per Forlin is working on mmc_fail_request which adds support to inject
data errors after a completed host transfer in MMC subsystem.

The fault_attr for mmc_fail_request should be defined per mmc host and
export it in debugfs directory per mmc host like
/sys/kernel/debug/mmc0/mmc_fail_request.

init_fault_attr_dentries() doesn't help for mmc_fail_request.  So this
introduces fault_create_debugfs_attr() which is able to create a
directory in the arbitrary directory and replace
init_fault_attr_dentries().

[akpm@linux-foundation.org: extraneous semicolon, per Randy]
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Tested-by: Per Forlin <per.forlin@linaro.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/blk-core.c    | 6 ++++--
 block/blk-timeout.c | 5 ++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index b850bed..b627558 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1368,8 +1368,10 @@ static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
 
 static int __init fail_make_request_debugfs(void)
 {
-	return init_fault_attr_dentries(&fail_make_request,
-					"fail_make_request");
+	struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
+						NULL, &fail_make_request);
+
+	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
 }
 
 late_initcall(fail_make_request_debugfs);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 4f0c06c..7803548 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -28,7 +28,10 @@ int blk_should_fake_timeout(struct request_queue *q)
 
 static int __init fail_io_timeout_debugfs(void)
 {
-	return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout");
+	struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
+						NULL, &fail_io_timeout);
+
+	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
 }
 
 late_initcall(fail_io_timeout_debugfs);
-- 
cgit v1.1


From 35ae66e0a09ab70ed588e65f26b4c725cd1656b6 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Fri, 5 Aug 2011 09:37:10 +0200
Subject: block: Make rq_affinity = 1 work as expected

Commit 5757a6d76c introduced a new rq_affinity = 2 so as to make
the request completed in the __make_request cpu. But it makes the
old rq_affinity = 1 not work any more. The root cause is that
if the 'cpu' and 'req->cpu' is in the same group and cpu != req->cpu,
ccpu will be the same as group_cpu, so the completion will be
excuted in the 'cpu' not 'group_cpu'.

This patch fix problem by simpling removing group_cpu and the codes
are more explicit now. If ccpu == cpu, we complete in cpu, otherwise
we raise_blk_irq to ccpu.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Roland Dreier <roland@purestorage.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Reviewed-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-softirq.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'block')

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 475fab8..487addc 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -103,7 +103,7 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 
 void __blk_complete_request(struct request *req)
 {
-	int ccpu, cpu, group_cpu = NR_CPUS;
+	int ccpu, cpu;
 	struct request_queue *q = req->q;
 	unsigned long flags;
 
@@ -117,14 +117,12 @@ void __blk_complete_request(struct request *req)
 	 */
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
 		ccpu = req->cpu;
-		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
+		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
 			ccpu = blk_cpu_to_group(ccpu);
-			group_cpu = blk_cpu_to_group(cpu);
-		}
 	} else
 		ccpu = cpu;
 
-	if (ccpu == cpu || ccpu == group_cpu) {
+	if (ccpu == cpu) {
 		struct list_head *list;
 do_local:
 		list = &__get_cpu_var(blk_cpu_done);
-- 
cgit v1.1


From fa1bf42ff9296ac4cf211b0a1b450a6071d26a95 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 9 Aug 2011 20:32:09 +0200
Subject: allow blk_flush_policy to return REQ_FSEQ_DATA independent of *FLUSH

blk_insert_flush has the following check:

	/*
	 * If there's data but flush is not necessary, the request can be
	 * processed directly without going through flush machinery.  Queue
	 * for normal execution.
	 */
	if ((policy & REQ_FSEQ_DATA) &&
	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
		list_add_tail(&rq->queuelist, &q->queue_head);
		return;
	}

However, blk_flush_policy will not return with policy set to only
REQ_FSEQ_DATA:

static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
{
	unsigned int policy = 0;

	if (fflags & REQ_FLUSH) {
		if (rq->cmd_flags & REQ_FLUSH)
			policy |= REQ_FSEQ_PREFLUSH;
		if (blk_rq_sectors(rq))
			policy |= REQ_FSEQ_DATA;
		if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
			policy |= REQ_FSEQ_POSTFLUSH;
	}
	return policy;
}

Notice that REQ_FSEQ_DATA is only set if REQ_FLUSH is set.  Fix this
mismatch by moving the setting of REQ_FSEQ_DATA outside of the REQ_FLUSH
check.

Tejun notes:

  Hmmm... yes, this can become a correctness issue if (and only if)
  blk_queue_flush() is called to change q->flush_flags while requests
  are in-flight; otherwise, requests wouldn't reach the function at all.
  Also, I think it would be a generally good idea to always set
  FSEQ_DATA if the request has data.

Cheers,
Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index bb21e4c..2d162bd 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -95,11 +95,12 @@ static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
 {
 	unsigned int policy = 0;
 
+	if (blk_rq_sectors(rq))
+		policy |= REQ_FSEQ_DATA;
+
 	if (fflags & REQ_FLUSH) {
 		if (rq->cmd_flags & REQ_FLUSH)
 			policy |= REQ_FSEQ_PREFLUSH;
-		if (blk_rq_sectors(rq))
-			policy |= REQ_FSEQ_DATA;
 		if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
 			policy |= REQ_FSEQ_POSTFLUSH;
 	}
-- 
cgit v1.1


From bcf30e75b773b60379338768677a1301ef602ff9 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 11 Aug 2011 10:39:04 +0200
Subject: block: improve rq_affinity placement

This patch reverts commit 35ae66e0a09ab70ed(block: Make rq_affinity = 1
work as expected). The purpose is to avoid an unnecessary IPI.
Let's take an example. My test box has cpu 0-7, one socket. Say request is
added from CPU 1, blk_complete_request() occurs at CPU 7. Without the reverted
patch, softirq will be done at CPU 7. With it, an IPI will be directed to CPU
0, and softirq will be done at CPU 0. In this case, doing softirq at CPU 0 and
CPU 7 have no difference from cache sharing point view and we can avoid an
ipi if doing it in CPU 7.
An immediate concern is this is just like QUEUE_FLAG_SAME_FORCE, but actually
not. blk_complete_request() is running in interrupt handler, and currently
I/O controller doesn't support multiple interrupts (I checked several LSI
cards and AHCI), so only one CPU can run blk_complete_request(). This is
still quite different as QUEUE_FLAG_SAME_FORCE.
Since only one CPU runs softirq, the only difference with below patch is
softirq not always runs at the first CPU of a group.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-softirq.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 487addc..58340d0 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -103,7 +103,7 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 
 void __blk_complete_request(struct request *req)
 {
-	int ccpu, cpu;
+	int ccpu, cpu, group_cpu = NR_CPUS;
 	struct request_queue *q = req->q;
 	unsigned long flags;
 
@@ -117,12 +117,22 @@ void __blk_complete_request(struct request *req)
 	 */
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
 		ccpu = req->cpu;
-		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
+		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
 			ccpu = blk_cpu_to_group(ccpu);
+			group_cpu = blk_cpu_to_group(cpu);
+		}
 	} else
 		ccpu = cpu;
 
-	if (ccpu == cpu) {
+	/*
+	 * If current CPU and requested CPU are in the same group, running
+	 * softirq in current CPU. One might concern this is just like
+	 * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
+	 * running in interrupt handler, and currently I/O controller doesn't
+	 * support multiple interrupts, so current CPU is unique actually. This
+	 * avoids IPI sending from current CPU to the first CPU of a group.
+	 */
+	if (ccpu == cpu || ccpu == group_cpu) {
 		struct list_head *list;
 do_local:
 		list = &__get_cpu_var(blk_cpu_done);
-- 
cgit v1.1


From 4853abaae7e4a2af938115ce9071ef8684fb7af4 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Mon, 15 Aug 2011 21:37:25 +0200
Subject: block: fix flush machinery for stacking drivers with differring flush
 flags

Commit ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae, block: reimplement
FLUSH/FUA to support merge, introduced a performance regression when
running any sort of fsyncing workload using dm-multipath and certain
storage (in our case, an HP EVA).  The test I ran was fs_mark, and it
dropped from ~800 files/sec on ext4 to ~100 files/sec.  It turns out
that dm-multipath always advertised flush+fua support, and passed
commands on down the stack, where those flags used to get stripped off.
The above commit changed that behavior:

static inline struct request *__elv_next_request(struct request_queue *q)
{
        struct request *rq;

        while (1) {
-               while (!list_empty(&q->queue_head)) {
+               if (!list_empty(&q->queue_head)) {
                        rq = list_entry_rq(q->queue_head.next);
-                       if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
-                           (rq->cmd_flags & REQ_FLUSH_SEQ))
-                               return rq;
-                       rq = blk_do_flush(q, rq);
-                       if (rq)
-                               return rq;
+                       return rq;
                }

Note that previously, a command would come in here, have
REQ_FLUSH|REQ_FUA set, and then get handed off to blk_do_flush:

struct request *blk_do_flush(struct request_queue *q, struct request *rq)
{
        unsigned int fflags = q->flush_flags; /* may change, cache it */
        bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
        bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
        bool do_postflush = has_flush && !has_fua && (rq->cmd_flags &
        REQ_FUA);
        unsigned skip = 0;
...
        if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
                rq->cmd_flags &= ~REQ_FLUSH;
		if (!has_fua)
			rq->cmd_flags &= ~REQ_FUA;
	        return rq;
	}

So, the flush machinery was bypassed in such cases (q->flush_flags == 0
&& rq->cmd_flags & (REQ_FLUSH|REQ_FUA)).

Now, however, we don't get into the flush machinery at all.  Instead,
__elv_next_request just hands a request with flush and fua bits set to
the scsi_request_fn, even if the underlying request_queue does not
support flush or fua.

The agreed upon approach is to fix the flush machinery to allow
stacking.  While this isn't used in practice (since there is only one
request-based dm target, and that target will now reflect the flush
flags of the underlying device), it does future-proof the solution, and
make it function as designed.

In order to make this work, I had to add a field to the struct request,
inside the flush structure (to store the original req->end_io).  Shaohua
had suggested overloading the union with rb_node and completion_data,
but the completion data is used by device mapper and can also be used by
other drivers.  So, I didn't see a way around the additional field.

I tested this patch on an HP EVA with both ext4 and xfs, and it recovers
the lost performance.  Comments and other testers, as always, are
appreciated.

Cheers,
Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c  |  8 ++++++--
 block/blk-flush.c | 20 ++++++++++++++++----
 block/blk.h       |  2 ++
 3 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index b850bed..7c59b0f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1700,6 +1700,7 @@ EXPORT_SYMBOL_GPL(blk_rq_check_limits);
 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
 	unsigned long flags;
+	int where = ELEVATOR_INSERT_BACK;
 
 	if (blk_rq_check_limits(q, rq))
 		return -EIO;
@@ -1716,7 +1717,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	 */
 	BUG_ON(blk_queued_rq(rq));
 
-	add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
+	if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
+		where = ELEVATOR_INSERT_FLUSH;
+
+	add_acct_request(q, rq, where);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	return 0;
@@ -2273,7 +2277,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-static bool __blk_end_bidi_request(struct request *rq, int error,
+bool __blk_end_bidi_request(struct request *rq, int error,
 				   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 2d162bd..491eb30 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -123,7 +123,7 @@ static void blk_flush_restore_request(struct request *rq)
 
 	/* make @rq a normal request */
 	rq->cmd_flags &= ~REQ_FLUSH_SEQ;
-	rq->end_io = NULL;
+	rq->end_io = rq->flush.saved_end_io;
 }
 
 /**
@@ -301,9 +301,6 @@ void blk_insert_flush(struct request *rq)
 	unsigned int fflags = q->flush_flags;	/* may change, cache */
 	unsigned int policy = blk_flush_policy(fflags, rq);
 
-	BUG_ON(rq->end_io);
-	BUG_ON(!rq->bio || rq->bio != rq->biotail);
-
 	/*
 	 * @policy now records what operations need to be done.  Adjust
 	 * REQ_FLUSH and FUA for the driver.
@@ -313,6 +310,19 @@ void blk_insert_flush(struct request *rq)
 		rq->cmd_flags &= ~REQ_FUA;
 
 	/*
+	 * An empty flush handed down from a stacking driver may
+	 * translate into nothing if the underlying device does not
+	 * advertise a write-back cache.  In this case, simply
+	 * complete the request.
+	 */
+	if (!policy) {
+		__blk_end_bidi_request(rq, 0, 0, 0);
+		return;
+	}
+
+	BUG_ON(!rq->bio || rq->bio != rq->biotail);
+
+	/*
 	 * If there's data but flush is not necessary, the request can be
 	 * processed directly without going through flush machinery.  Queue
 	 * for normal execution.
@@ -320,6 +330,7 @@ void blk_insert_flush(struct request *rq)
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
 		list_add_tail(&rq->queuelist, &q->queue_head);
+		blk_run_queue_async(q);
 		return;
 	}
 
@@ -330,6 +341,7 @@ void blk_insert_flush(struct request *rq)
 	memset(&rq->flush, 0, sizeof(rq->flush));
 	INIT_LIST_HEAD(&rq->flush.list);
 	rq->cmd_flags |= REQ_FLUSH_SEQ;
+	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
 	rq->end_io = flush_data_end_io;
 
 	blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
diff --git a/block/blk.h b/block/blk.h
index d658628..20b900a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -17,6 +17,8 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
+bool __blk_end_bidi_request(struct request *rq, int error,
+			    unsigned int nr_bytes, unsigned int bidi_bytes);
 
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_delete_timer(struct request *);
-- 
cgit v1.1


From b53d1ed734a2b9af8da115b836b658daa7d47a48 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Fri, 19 Aug 2011 08:34:48 +0200
Subject: Revert "cfq: Remove special treatment for metadata rqs."

We have a kernel build regression since 3.1-rc1, which is about 10%
regression. The kernel source is in an ext3 filesystem.
Alex Shi bisect it to commit:
commit a07405b7802691d29ab3b23bdc76ee6d006aad0b
Author: Justin TerAvest <teravest@google.com>
Date:   Sun Jul 10 22:09:19 2011 +0200

    cfq: Remove special treatment for metadata rqs.

Apparently this is caused by lack metadata preemption, where ext3/ext4
do use READ_META. I didn't see a way to fix the issue, so suggest
reverting the patch.

This reverts commit a07405b7802691d29ab3b23bdc76ee6d006aad0b.

Reported-by: Alex Shi<alex.shi@intel.com>
Reported-by: Shaohua Li<shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6508345..a33bd43 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -130,6 +130,8 @@ struct cfq_queue {
 	unsigned long slice_end;
 	long slice_resid;
 
+	/* pending metadata requests */
+	int meta_pending;
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 
@@ -682,6 +684,9 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
 	if (rq_is_sync(rq1) != rq_is_sync(rq2))
 		return rq_is_sync(rq1) ? rq1 : rq2;
 
+	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
+		return rq1->cmd_flags & REQ_META ? rq1 : rq2;
+
 	s1 = blk_rq_pos(rq1);
 	s2 = blk_rq_pos(rq2);
 
@@ -1607,6 +1612,10 @@ static void cfq_remove_request(struct request *rq)
 	cfqq->cfqd->rq_queued--;
 	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
 					rq_data_dir(rq), rq_is_sync(rq));
+	if (rq->cmd_flags & REQ_META) {
+		WARN_ON(!cfqq->meta_pending);
+		cfqq->meta_pending--;
+	}
 }
 
 static int cfq_merge(struct request_queue *q, struct request **req,
@@ -3360,6 +3369,13 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 		return true;
 
 	/*
+	 * So both queues are sync. Let the new request get disk time if
+	 * it's a metadata request and the current queue is doing regular IO.
+	 */
+	if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
+		return true;
+
+	/*
 	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
 	 */
 	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
@@ -3423,6 +3439,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_io_context *cic = RQ_CIC(rq);
 
 	cfqd->rq_queued++;
+	if (rq->cmd_flags & REQ_META)
+		cfqq->meta_pending++;
 
 	cfq_update_io_thinktime(cfqd, cfqq, cic);
 	cfq_update_io_seektime(cfqd, cfqq, rq);
-- 
cgit v1.1


From 65299a3b788bd274bed92f9fa3232082c9f3ea70 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 14:50:29 +0200
Subject: block: separate priority boosting from REQ_META

Add a new REQ_PRIO to let requests preempt others in the cfq I/O schedule,
and lave REQ_META purely for marking requests as metadata in blktrace.

All existing callers of REQ_META except for XFS are updated to also
set REQ_PRIO for now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a33bd43..16ace89 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -130,8 +130,8 @@ struct cfq_queue {
 	unsigned long slice_end;
 	long slice_resid;
 
-	/* pending metadata requests */
-	int meta_pending;
+	/* pending priority requests */
+	int prio_pending;
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 
@@ -684,8 +684,8 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
 	if (rq_is_sync(rq1) != rq_is_sync(rq2))
 		return rq_is_sync(rq1) ? rq1 : rq2;
 
-	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
-		return rq1->cmd_flags & REQ_META ? rq1 : rq2;
+	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
+		return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
 
 	s1 = blk_rq_pos(rq1);
 	s2 = blk_rq_pos(rq2);
@@ -1612,9 +1612,9 @@ static void cfq_remove_request(struct request *rq)
 	cfqq->cfqd->rq_queued--;
 	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
 					rq_data_dir(rq), rq_is_sync(rq));
-	if (rq->cmd_flags & REQ_META) {
-		WARN_ON(!cfqq->meta_pending);
-		cfqq->meta_pending--;
+	if (rq->cmd_flags & REQ_PRIO) {
+		WARN_ON(!cfqq->prio_pending);
+		cfqq->prio_pending--;
 	}
 }
 
@@ -3372,7 +3372,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	 * So both queues are sync. Let the new request get disk time if
 	 * it's a metadata request and the current queue is doing regular IO.
 	 */
-	if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
+	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
 		return true;
 
 	/*
@@ -3439,8 +3439,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_io_context *cic = RQ_CIC(rq);
 
 	cfqd->rq_queued++;
-	if (rq->cmd_flags & REQ_META)
-		cfqq->meta_pending++;
+	if (rq->cmd_flags & REQ_PRIO)
+		cfqq->prio_pending++;
 
 	cfq_update_io_thinktime(cfqd, cfqq, cic);
 	cfq_update_io_seektime(cfqd, cfqq, rq);
-- 
cgit v1.1


From d27769ec3df1a8de9ca450d2dcd72d1ab259ba32 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 23 Aug 2011 20:01:04 +0200
Subject: block: add GENHD_FL_NO_PART_SCAN

There are cases where suppressing partition scan is useful - e.g. for
lo devices and pseudo SATA devices which advertise to be a disk but
get upset on partition scan (some port multiplier control devices show
such behavior).

This patch adds GENHD_FL_NO_PART_SCAN which suppresses partition scan
regardless of the number of possible partitions.  disk_partitionable()
is renamed to disk_part_scan_enabled() as suppressing partition scan
doesn't imply the device can't be partitioned using
BLKPG_ADD/DEL_PARTITION calls from userland.  show_partition() now
directly tests disk_max_parts() to maintain backward-compatibility.

-v2: Updated to make it clear that only partition scan is suppressed
     not partitioning itself as suggested by Kay Sievers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c | 4 ++--
 block/ioctl.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 5cb51c5..2429ecb 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -536,7 +536,7 @@ void register_disk(struct gendisk *disk)
 	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
 	/* No minors to use for partitions */
-	if (!disk_partitionable(disk))
+	if (!disk_part_scan_enabled(disk))
 		goto exit;
 
 	/* No such device (e.g., media were just removed) */
@@ -841,7 +841,7 @@ static int show_partition(struct seq_file *seqf, void *v)
 	char buf[BDEVNAME_SIZE];
 
 	/* Don't show non-partitionable removeable devices or empty devices */
-	if (!get_capacity(sgp) || (!disk_partitionable(sgp) &&
+	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
 				   (sgp->flags & GENHD_FL_REMOVABLE)))
 		return 0;
 	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/block/ioctl.c b/block/ioctl.c
index 1124cd2..5c74efc 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -101,7 +101,7 @@ static int blkdev_reread_part(struct block_device *bdev)
 	struct gendisk *disk = bdev->bd_disk;
 	int res;
 
-	if (!disk_partitionable(disk) || bdev != bdev->bd_contains)
+	if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-- 
cgit v1.1


From e8037d49835482c9534a9a07bed0d0ea831135ae Mon Sep 17 00:00:00 2001
From: Eric Seppanen <eric@purestorage.com>
Date: Tue, 23 Aug 2011 21:25:12 +0200
Subject: block: Fix queue_flag update when rq_affinity goes from 2 to 1

Commit 5757a6d76cdf added the QUEUE_FLAG_SAME_FORCE flag, but fails to
clear that flag when the current state is '2' (SAME_COMP + SAME_FORCE)
and the new state is '1' (SAME_COMP).

Acked-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Roland Dreier <roland@purestorage.com>
Signed-off-by: Eric Seppanen <eric@purestorage.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-sysfs.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0ee17b5..e681805 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -258,11 +258,13 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
 	ret = queue_var_store(&val, page, count);
 	spin_lock_irq(q->queue_lock);
-	if (val) {
+	if (val == 2) {
 		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-		if (val == 2)
-			queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
-	} else {
+		queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+	} else if (val == 1) {
+		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+		queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+	} else if (val == 0) {
 		queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
 		queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
 	}
-- 
cgit v1.1


From a63271627521b825b0dd0a564e9a9c62b4c1ca89 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Wed, 24 Aug 2011 16:04:32 +0200
Subject: block: change force plug flush call order

Do blk_flush_plug_list() first and then add new request aDo blk_flush_plug_list() first and then add new request aDo blk_flush_plug_list() first and then add new request at the tail. New
request can't be merged to existing requests, but later new requests might
be merged with this new one. If blk_flush_plug_list() is done later, the
merge doesn't happen.
Believe it or not, this fixes a 10% regression running sysbench workload.

Signed-off-by: Shaohua Li <shli@kernel.org>
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 90e1ffd..67dba69 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1302,11 +1302,11 @@ get_rq:
 			if (__rq->q != q)
 				plug->should_sort = 1;
 		}
-		list_add_tail(&req->queuelist, &plug->list);
-		plug->count++;
-		drive_stat_acct(req, 1);
 		if (plug->count >= BLK_MAX_REQUEST_COUNT)
 			blk_flush_plug_list(plug, false);
+		plug->count++;
+		list_add_tail(&req->queuelist, &plug->list);
+		drive_stat_acct(req, 1);
 	} else {
 		spin_lock_irq(q->queue_lock);
 		add_acct_request(q, req, where);
-- 
cgit v1.1


From 56ebdaf2fa3c5276be201c5d1aff1490b682ecf2 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Wed, 24 Aug 2011 16:04:34 +0200
Subject: block: simplify force plug flush code a little bit

Cleaning up the code a little bit. attempt_plug_merge() traverses the plug
list anyway, we can do the request counting there, so stack size is reduced
a little bit.
The motivation here is I suspect if we should count the requests for each
queue (task could handle multiple disks in the meantime), but my test doesn't
show it's worthy doing. If somebody proves we should do it, below change
will make that more easier.

Signed-off-by: Shaohua Li <shli@kernel.org>
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 67dba69..b2ed78a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1167,7 +1167,7 @@ static bool bio_attempt_front_merge(struct request_queue *q,
  * true if merge was successful, otherwise false.
  */
 static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
-			       struct bio *bio)
+			       struct bio *bio, unsigned int *request_count)
 {
 	struct blk_plug *plug;
 	struct request *rq;
@@ -1176,10 +1176,13 @@ static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
 	plug = tsk->plug;
 	if (!plug)
 		goto out;
+	*request_count = 0;
 
 	list_for_each_entry_reverse(rq, &plug->list, queuelist) {
 		int el_ret;
 
+		(*request_count)++;
+
 		if (rq->q != q)
 			continue;
 
@@ -1219,6 +1222,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	struct blk_plug *plug;
 	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
 	struct request *req;
+	unsigned int request_count = 0;
 
 	/*
 	 * low level driver can indicate that it wants pages above a
@@ -1237,7 +1241,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	 * Check if we can merge with the plugged list before grabbing
 	 * any locks.
 	 */
-	if (attempt_plug_merge(current, q, bio))
+	if (attempt_plug_merge(current, q, bio, &request_count))
 		goto out;
 
 	spin_lock_irq(q->queue_lock);
@@ -1302,9 +1306,8 @@ get_rq:
 			if (__rq->q != q)
 				plug->should_sort = 1;
 		}
-		if (plug->count >= BLK_MAX_REQUEST_COUNT)
+		if (request_count >= BLK_MAX_REQUEST_COUNT)
 			blk_flush_plug_list(plug, false);
-		plug->count++;
 		list_add_tail(&req->queuelist, &plug->list);
 		drive_stat_acct(req, 1);
 	} else {
@@ -2634,7 +2637,6 @@ void blk_start_plug(struct blk_plug *plug)
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->cb_list);
 	plug->should_sort = 0;
-	plug->count = 0;
 
 	/*
 	 * If this is a nested plug, don't actually assign it. It will be
@@ -2718,7 +2720,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		return;
 
 	list_splice_init(&plug->list, &list);
-	plug->count = 0;
 
 	if (plug->should_sort) {
 		list_sort(NULL, &list, plug_rq_cmp);
-- 
cgit v1.1


From a72c5e5eb738033938ab30d6a634b74d1d060f10 Mon Sep 17 00:00:00 2001
From: Nao Nishijima <nao.nishijima.xt@hitachi.com>
Date: Thu, 25 Aug 2011 18:04:06 +0900
Subject: [SCSI] genhd: add a new attribute "alias" in gendisk

This patch allows the user to set an "alias" of the disk via sysfs interface.

This patch only adds a new attribute "alias" in gendisk structure.
To show the alias instead of the device name in kernel messages,
we need to revise printk messages and use alias_name() in them.

Example:
(current) printk("disk name is %s\n", disk->disk_name);
(new)     printk("disk name is %s\n", alias_name(disk));

Users can use alphabets, numbers, '-' and '_' in "alias" attribute. A disk can
have an "alias" which length is up to 255 bytes. This attribute is write-once.

Suggested-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Suggested-by: Jon Masters <jcm@redhat.com>
Signed-off-by: Nao Nishijima <nao.nishijima.xt@hitachi.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 block/genhd.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index e2f6790..94855a9 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -19,6 +19,7 @@
 #include <linux/mutex.h>
 #include <linux/idr.h>
 #include <linux/log2.h>
+#include <linux/ctype.h>
 
 #include "blk.h"
 
@@ -909,6 +910,74 @@ static int __init genhd_device_init(void)
 
 subsys_initcall(genhd_device_init);
 
+static ssize_t alias_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	ssize_t ret = 0;
+
+	if (disk->alias)
+		ret = snprintf(buf, ALIAS_LEN, "%s\n", disk->alias);
+	return ret;
+}
+
+static ssize_t alias_store(struct device *dev, struct device_attribute *attr,
+			   const char *buf, size_t count)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	char *alias;
+	char *envp[] = { NULL, NULL };
+	unsigned char c;
+	int i;
+	ssize_t ret = count;
+
+	if (!count)
+		return -EINVAL;
+
+	if (count >= ALIAS_LEN) {
+		printk(KERN_ERR "alias: alias is too long\n");
+		return -EINVAL;
+	}
+
+	/* Validation check */
+	for (i = 0; i < count; i++) {
+		c = buf[i];
+		if (i == count - 1 && c == '\n')
+			break;
+		if (!isalnum(c) && c != '_' && c != '-') {
+			printk(KERN_ERR "alias: invalid alias\n");
+			return -EINVAL;
+		}
+	}
+
+	if (disk->alias) {
+		printk(KERN_INFO "alias: %s is already assigned (%s)\n",
+		       disk->disk_name, disk->alias);
+		return -EINVAL;
+	}
+
+	alias = kasprintf(GFP_KERNEL, "%s", buf);
+	if (!alias)
+		return -ENOMEM;
+
+	if (alias[count - 1] == '\n')
+		alias[count - 1] = '\0';
+
+	envp[0] = kasprintf(GFP_KERNEL, "ALIAS=%s", alias);
+	if (!envp[0]) {
+		kfree(alias);
+		return -ENOMEM;
+	}
+
+	disk->alias = alias;
+	printk(KERN_INFO "alias: assigned %s to %s\n", alias, disk->disk_name);
+
+	kobject_uevent_env(&dev->kobj, KOBJ_ADD, envp);
+
+	kfree(envp[0]);
+	return ret;
+}
+
 static ssize_t disk_range_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
 {
@@ -968,6 +1037,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
 	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
 }
 
+static DEVICE_ATTR(alias, S_IRUGO|S_IWUSR, alias_show, alias_store);
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
@@ -990,6 +1060,7 @@ static struct device_attribute dev_attr_fail_timeout =
 #endif
 
 static struct attribute *disk_attrs[] = {
+	&dev_attr_alias.attr,
 	&dev_attr_range.attr,
 	&dev_attr_ext_range.attr,
 	&dev_attr_removable.attr,
-- 
cgit v1.1


From 484fc254b88257a2d8b3759aa062e8e8b35e0988 Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Thu, 8 Sep 2011 12:32:14 +0200
Subject: elevator: use ELV_NAME_MAX instead of magic number 16 for
 chosen_elevator

We have ELV_NAME_MAX defined to 16, and hence we should use it
instead of the magic nubmer 16 for elevator's name string.

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/elevator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/elevator.c b/block/elevator.c
index a3b64bc..cb332cb 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -182,7 +182,7 @@ static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
 	eq->elevator_data = data;
 }
 
-static char chosen_elevator[16];
+static char chosen_elevator[ELV_NAME_MAX];
 
 static int __init elevator_setup(char *str)
 {
-- 
cgit v1.1


From 166e1f901b01872e8b70733a3f2e2c6980389cf8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 12 Sep 2011 12:08:27 +0200
Subject: block: export __make_request

Avoid the hacks need for request based device mappers currently by simply
exporting the symbol instead of trying to get it through the back door.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index b627558..56ef387 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -38,8 +38,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
-static int __make_request(struct request_queue *q, struct bio *bio);
-
 /*
  * For the allocated request tables
  */
@@ -1213,7 +1211,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-static int __make_request(struct request_queue *q, struct bio *bio)
+int __make_request(struct request_queue *q, struct bio *bio)
 {
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
 	struct blk_plug *plug;
@@ -1317,6 +1315,7 @@ out_unlock:
 out:
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__make_request);	/* for device mapper only */
 
 /*
  * If bio->bi_dev is a partition, remap the location
-- 
cgit v1.1


From c20e8de27fef9f59869c81c288ad6cf28200e00c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 12 Sep 2011 12:03:37 +0200
Subject: block: rename __make_request() to blk_queue_bio()

Now that it's exported, lets put it in a more sane namespace.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 56ef387..ab673f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -540,7 +540,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 	/*
 	 * This also sets hw/phys segments, boundary and size
 	 */
-	blk_queue_make_request(q, __make_request);
+	blk_queue_make_request(q, blk_queue_bio);
 
 	q->sg_reserved_size = INT_MAX;
 
@@ -1211,7 +1211,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-int __make_request(struct request_queue *q, struct bio *bio)
+int blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
 	struct blk_plug *plug;
@@ -1315,7 +1315,7 @@ out_unlock:
 out:
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__make_request);	/* for device mapper only */
+EXPORT_SYMBOL_GPL(blk_queue_bio);	/* for device mapper only */
 
 /*
  * If bio->bi_dev is a partition, remap the location
-- 
cgit v1.1


From 5a7bbad27a410350e64a2d7f5ec18fc73836c14f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 12 Sep 2011 12:12:01 +0200
Subject: block: remove support for bio remapping from ->make_request

There is very little benefit in allowing to let a ->make_request
instance update the bios device and sector and loop around it in
__generic_make_request when we can archive the same through calling
generic_make_request from the driver and letting the loop in
generic_make_request handle it.

Note that various drivers got the return value from ->make_request and
returned non-zero values for errors.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 153 ++++++++++++++++++++++---------------------------------
 1 file changed, 62 insertions(+), 91 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index ab673f0..f58e019 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1211,7 +1211,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-int blk_queue_bio(struct request_queue *q, struct bio *bio)
+void blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
 	struct blk_plug *plug;
@@ -1236,7 +1236,7 @@ int blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * any locks.
 	 */
 	if (attempt_plug_merge(current, q, bio))
-		goto out;
+		return;
 
 	spin_lock_irq(q->queue_lock);
 
@@ -1312,8 +1312,6 @@ get_rq:
 out_unlock:
 		spin_unlock_irq(q->queue_lock);
 	}
-out:
-	return 0;
 }
 EXPORT_SYMBOL_GPL(blk_queue_bio);	/* for device mapper only */
 
@@ -1441,112 +1439,85 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 static inline void __generic_make_request(struct bio *bio)
 {
 	struct request_queue *q;
-	sector_t old_sector;
-	int ret, nr_sectors = bio_sectors(bio);
-	dev_t old_dev;
+	int nr_sectors = bio_sectors(bio);
 	int err = -EIO;
+	char b[BDEVNAME_SIZE];
+	struct hd_struct *part;
 
 	might_sleep();
 
 	if (bio_check_eod(bio, nr_sectors))
 		goto end_io;
 
-	/*
-	 * Resolve the mapping until finished. (drivers are
-	 * still free to implement/resolve their own stacking
-	 * by explicitly returning 0)
-	 *
-	 * NOTE: we don't repeat the blk_size check for each new device.
-	 * Stacking drivers are expected to know what they are doing.
-	 */
-	old_sector = -1;
-	old_dev = 0;
-	do {
-		char b[BDEVNAME_SIZE];
-		struct hd_struct *part;
-
-		q = bdev_get_queue(bio->bi_bdev);
-		if (unlikely(!q)) {
-			printk(KERN_ERR
-			       "generic_make_request: Trying to access "
-				"nonexistent block-device %s (%Lu)\n",
-				bdevname(bio->bi_bdev, b),
-				(long long) bio->bi_sector);
-			goto end_io;
-		}
-
-		if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
-			     nr_sectors > queue_max_hw_sectors(q))) {
-			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-			       bdevname(bio->bi_bdev, b),
-			       bio_sectors(bio),
-			       queue_max_hw_sectors(q));
-			goto end_io;
-		}
-
-		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-			goto end_io;
-
-		part = bio->bi_bdev->bd_part;
-		if (should_fail_request(part, bio->bi_size) ||
-		    should_fail_request(&part_to_disk(part)->part0,
-					bio->bi_size))
-			goto end_io;
+	q = bdev_get_queue(bio->bi_bdev);
+	if (unlikely(!q)) {
+		printk(KERN_ERR
+		       "generic_make_request: Trying to access "
+			"nonexistent block-device %s (%Lu)\n",
+			bdevname(bio->bi_bdev, b),
+			(long long) bio->bi_sector);
+		goto end_io;
+	}
 
-		/*
-		 * If this device has partitions, remap block n
-		 * of partition p to block n+start(p) of the disk.
-		 */
-		blk_partition_remap(bio);
+	if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
+		     nr_sectors > queue_max_hw_sectors(q))) {
+		printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+		       bdevname(bio->bi_bdev, b),
+		       bio_sectors(bio),
+		       queue_max_hw_sectors(q));
+		goto end_io;
+	}
 
-		if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
-			goto end_io;
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		goto end_io;
 
-		if (old_sector != -1)
-			trace_block_bio_remap(q, bio, old_dev, old_sector);
+	part = bio->bi_bdev->bd_part;
+	if (should_fail_request(part, bio->bi_size) ||
+	    should_fail_request(&part_to_disk(part)->part0,
+				bio->bi_size))
+		goto end_io;
 
-		old_sector = bio->bi_sector;
-		old_dev = bio->bi_bdev->bd_dev;
+	/*
+	 * If this device has partitions, remap block n
+	 * of partition p to block n+start(p) of the disk.
+	 */
+	blk_partition_remap(bio);
 
-		if (bio_check_eod(bio, nr_sectors))
-			goto end_io;
+	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
+		goto end_io;
 
-		/*
-		 * Filter flush bio's early so that make_request based
-		 * drivers without flush support don't have to worry
-		 * about them.
-		 */
-		if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
-			bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
-			if (!nr_sectors) {
-				err = 0;
-				goto end_io;
-			}
-		}
+	if (bio_check_eod(bio, nr_sectors))
+		goto end_io;
 
-		if ((bio->bi_rw & REQ_DISCARD) &&
-		    (!blk_queue_discard(q) ||
-		     ((bio->bi_rw & REQ_SECURE) &&
-		      !blk_queue_secdiscard(q)))) {
-			err = -EOPNOTSUPP;
+	/*
+	 * Filter flush bio's early so that make_request based
+	 * drivers without flush support don't have to worry
+	 * about them.
+	 */
+	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+		bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+		if (!nr_sectors) {
+			err = 0;
 			goto end_io;
 		}
+	}
 
-		if (blk_throtl_bio(q, &bio))
-			goto end_io;
-
-		/*
-		 * If bio = NULL, bio has been throttled and will be submitted
-		 * later.
-		 */
-		if (!bio)
-			break;
-
-		trace_block_bio_queue(q, bio);
+	if ((bio->bi_rw & REQ_DISCARD) &&
+	    (!blk_queue_discard(q) ||
+	     ((bio->bi_rw & REQ_SECURE) &&
+	      !blk_queue_secdiscard(q)))) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
 
-		ret = q->make_request_fn(q, bio);
-	} while (ret);
+	if (blk_throtl_bio(q, &bio))
+		goto end_io;
 
+	/* if bio = NULL, bio has been throttled and will be submitted later. */
+	if (!bio)
+		return;
+	trace_block_bio_queue(q, bio);
+	q->make_request_fn(q, bio);
 	return;
 
 end_io:
-- 
cgit v1.1


From 8ad6a56f5679a987bfeacad1bd818a2a381aa98e Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 14 Sep 2011 09:31:01 +0200
Subject: block: Don't check QUEUE_FLAG_SAME_COMP in __blk_complete_request

In __blk_complete_request, we check both QUEUE_FLAG_SAME_COMP and req->cpu
to decide whether we should use req->cpu. Actually the user can also
select the complete cpu by either setting BIO_CPU_AFFINE or by calling
bio_set_completion_cpu. Current solution makes these 2 ways don't work
any more. So we'd better just check req->cpu.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-softirq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 58340d0..1366a89 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -115,7 +115,7 @@ void __blk_complete_request(struct request *req)
 	/*
 	 * Select completion CPU
 	 */
-	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
+	if (req->cpu != -1) {
 		ccpu = req->cpu;
 		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
 			ccpu = blk_cpu_to_group(ccpu);
-- 
cgit v1.1


From 27a84d54c02591e815d291ae0ee4bfb9cfd21065 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 15 Sep 2011 14:01:40 +0200
Subject: block: refactor generic_make_request

Move all the checks performed on a bio into a new helper, and call it as
soon as bio is submitted even if it is a re-submission from ->make_request.

We explicitly mark the new helper as beeing non-inlined as the stack
usage for printing the block device name in the failure case is quite
high and this a patch where we have to be extremely conservative about
stack usage.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 95 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 46 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index f58e019..684d7eb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1412,31 +1412,8 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 	return 0;
 }
 
-/**
- * generic_make_request - hand a buffer to its device driver for I/O
- * @bio:  The bio describing the location in memory and on the device.
- *
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status.  The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) else where.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may change bi_dev and
- * bi_sector for remaps as it sees fit.  So the values of these fields
- * should NOT be depended on after the call to generic_make_request.
- */
-static inline void __generic_make_request(struct bio *bio)
+static noinline_for_stack bool
+generic_make_request_checks(struct bio *bio)
 {
 	struct request_queue *q;
 	int nr_sectors = bio_sectors(bio);
@@ -1515,35 +1492,62 @@ static inline void __generic_make_request(struct bio *bio)
 
 	/* if bio = NULL, bio has been throttled and will be submitted later. */
 	if (!bio)
-		return;
+		return false;
+
 	trace_block_bio_queue(q, bio);
-	q->make_request_fn(q, bio);
-	return;
+	return true;
 
 end_io:
 	bio_endio(bio, err);
+	return false;
 }
 
-/*
- * We only want one ->make_request_fn to be active at a time,
- * else stack usage with stacked devices could be a problem.
- * So use current->bio_list to keep a list of requests
- * submited by a make_request_fn function.
- * current->bio_list is also used as a flag to say if
- * generic_make_request is currently active in this task or not.
- * If it is NULL, then no make_request is active.  If it is non-NULL,
- * then a make_request is active, and new requests should be added
- * at the tail
+/**
+ * generic_make_request - hand a buffer to its device driver for I/O
+ * @bio:  The bio describing the location in memory and on the device.
+ *
+ * generic_make_request() is used to make I/O requests of block
+ * devices. It is passed a &struct bio, which describes the I/O that needs
+ * to be done.
+ *
+ * generic_make_request() does not return any status.  The
+ * success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the bio->bi_end_io
+ * function described (one day) else where.
+ *
+ * The caller of generic_make_request must make sure that bi_io_vec
+ * are set to describe the memory buffer, and that bi_dev and bi_sector are
+ * set to describe the device address, and the
+ * bi_end_io and optionally bi_private are set to describe how
+ * completion notification should be signaled.
+ *
+ * generic_make_request and the drivers it calls may use bi_next if this
+ * bio happens to be merged with someone else, and may resubmit the bio to
+ * a lower device by calling into generic_make_request recursively, which
+ * means the bio should NOT be touched after the call to ->make_request_fn.
  */
 void generic_make_request(struct bio *bio)
 {
 	struct bio_list bio_list_on_stack;
 
+	if (!generic_make_request_checks(bio))
+		return;
+
+	/*
+	 * We only want one ->make_request_fn to be active at a time, else
+	 * stack usage with stacked devices could be a problem.  So use
+	 * current->bio_list to keep a list of requests submited by a
+	 * make_request_fn function.  current->bio_list is also used as a
+	 * flag to say if generic_make_request is currently active in this
+	 * task or not.  If it is NULL, then no make_request is active.  If
+	 * it is non-NULL, then a make_request is active, and new requests
+	 * should be added at the tail
+	 */
 	if (current->bio_list) {
-		/* make_request is active */
 		bio_list_add(current->bio_list, bio);
 		return;
 	}
+
 	/* following loop may be a bit non-obvious, and so deserves some
 	 * explanation.
 	 * Before entering the loop, bio->bi_next is NULL (as all callers
@@ -1551,22 +1555,21 @@ void generic_make_request(struct bio *bio)
 	 * We pretend that we have just taken it off a longer list, so
 	 * we assign bio_list to a pointer to the bio_list_on_stack,
 	 * thus initialising the bio_list of new bios to be
-	 * added.  __generic_make_request may indeed add some more bios
+	 * added.  ->make_request() may indeed add some more bios
 	 * through a recursive call to generic_make_request.  If it
 	 * did, we find a non-NULL value in bio_list and re-enter the loop
 	 * from the top.  In this case we really did just take the bio
 	 * of the top of the list (no pretending) and so remove it from
-	 * bio_list, and call into __generic_make_request again.
-	 *
-	 * The loop was structured like this to make only one call to
-	 * __generic_make_request (which is important as it is large and
-	 * inlined) and to keep the structure simple.
+	 * bio_list, and call into ->make_request() again.
 	 */
 	BUG_ON(bio->bi_next);
 	bio_list_init(&bio_list_on_stack);
 	current->bio_list = &bio_list_on_stack;
 	do {
-		__generic_make_request(bio);
+		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+		q->make_request_fn(q, bio);
+
 		bio = bio_list_pop(current->bio_list);
 	} while (bio);
 	current->bio_list = NULL; /* deactivate */
-- 
cgit v1.1


From 75df713627f28f88b901b329c8857747545fd4ab Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Wed, 21 Sep 2011 10:00:16 +0200
Subject: block: document blk-plug

Thus spake Andrew Morton:

"And I have the usual maintainability whine.  If someone comes up to
vmscan.c and sees it calling blk_start_plug(), how are they supposed to
work out why that call is there?  They go look at the blk_start_plug()
definition and it is undocumented.  I think we can do better than this?"

Adapted from the LWN article - http://lwn.net/Articles/438256/ by Jens
Axboe and from an earlier attempt by Shaohua Li to document blk-plug.

[akpm@linux-foundation.org: grammatical and spelling tweaks]
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 684d7eb..97e9e54 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2595,6 +2595,20 @@ EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 
 #define PLUG_MAGIC	0x91827364
 
+/**
+ * blk_start_plug - initialize blk_plug and track it inside the task_struct
+ * @plug:	The &struct blk_plug that needs to be initialized
+ *
+ * Description:
+ *   Tracking blk_plug inside the task_struct will help with auto-flushing the
+ *   pending I/O should the task end up blocking between blk_start_plug() and
+ *   blk_finish_plug(). This is important from a performance perspective, but
+ *   also ensures that we don't deadlock. For instance, if the task is blocking
+ *   for a memory allocation, memory reclaim could end up wanting to free a
+ *   page belonging to that request that is currently residing in our private
+ *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
+ *   this kind of deadlock.
+ */
 void blk_start_plug(struct blk_plug *plug)
 {
 	struct task_struct *tsk = current;
-- 
cgit v1.1


From 499337bb6511e665a236a6a947f819d98ea340c6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 21 Sep 2011 10:01:22 +0200
Subject: block/blk-sysfs.c: fix kerneldoc references

The kerneldoc for blk_release_queue() is referring to blk_cleanup_queue().

Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-sysfs.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0ee17b5..adc923e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -455,11 +455,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 }
 
 /**
- * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
- * @kobj:    the kobj belonging of the request queue to be released
+ * blk_release_queue: - release a &struct request_queue when it is no longer needed
+ * @kobj:    the kobj belonging to the request queue to be released
  *
  * Description:
- *     blk_cleanup_queue is the pair to blk_init_queue() or
+ *     blk_release_queue is the pair to blk_init_queue() or
  *     blk_queue_make_request().  It should be called when a request queue is
  *     being released; typically when a block device is being de-registered.
  *     Currently, its primary task it to free all the &struct request
-- 
cgit v1.1


From d11bb4462c4cc6ddd45c6927c617ad79fa6fb8fc Mon Sep 17 00:00:00 2001
From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Date: Wed, 21 Sep 2011 10:22:10 +0200
Subject: blk-cgroup: be able to remove the record of unplugged device

The bug is we're not able to remove the device from blkio cgroup's
per-device control files if it gets unplugged.

To reproduce the bug:

  # mount -t cgroup -o blkio xxx /cgroup
  # cd /cgroup
  # echo "8:0 1000" > blkio.throttle.read_bps_device
  # unplug the device
  # cat blkio.throttle.read_bps_device
  8:0	1000
  # echo "8:0 0" > blkio.throttle.read_bps_device
  -bash: echo: write error: No such device

After patching, the device removal will succeed.

Thanks for the comments of Paul, Zefan, and Vivek.

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <paul@paulmenage.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bcaf16e..b596e54 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -785,10 +785,10 @@ static int blkio_policy_parse_and_set(char *buf,
 {
 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
 	int ret;
-	unsigned long major, minor, temp;
+	unsigned long major, minor;
 	int i = 0;
 	dev_t dev;
-	u64 bps, iops;
+	u64 temp;
 
 	memset(s, 0, sizeof(s));
 
@@ -826,20 +826,23 @@ static int blkio_policy_parse_and_set(char *buf,
 
 	dev = MKDEV(major, minor);
 
-	ret = blkio_check_dev_num(dev);
+	ret = strict_strtoull(s[1], 10, &temp);
 	if (ret)
-		return ret;
+		return -EINVAL;
 
-	newpn->dev = dev;
+	/* For rule removal, do not check for device presence. */
+	if (temp) {
+		ret = blkio_check_dev_num(dev);
+		if (ret)
+			return ret;
+	}
 
-	if (s[1] == NULL)
-		return -EINVAL;
+	newpn->dev = dev;
 
 	switch (plid) {
 	case BLKIO_POLICY_PROP:
-		ret = strict_strtoul(s[1], 10, &temp);
-		if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
-			temp > BLKIO_WEIGHT_MAX)
+		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+		     temp > BLKIO_WEIGHT_MAX)
 			return -EINVAL;
 
 		newpn->plid = plid;
@@ -850,26 +853,18 @@ static int blkio_policy_parse_and_set(char *buf,
 		switch(fileid) {
 		case BLKIO_THROTL_read_bps_device:
 		case BLKIO_THROTL_write_bps_device:
-			ret = strict_strtoull(s[1], 10, &bps);
-			if (ret)
-				return -EINVAL;
-
 			newpn->plid = plid;
 			newpn->fileid = fileid;
-			newpn->val.bps = bps;
+			newpn->val.bps = temp;
 			break;
 		case BLKIO_THROTL_read_iops_device:
 		case BLKIO_THROTL_write_iops_device:
-			ret = strict_strtoull(s[1], 10, &iops);
-			if (ret)
-				return -EINVAL;
-
-			if (iops > THROTL_IOPS_MAX)
+			if (temp > THROTL_IOPS_MAX)
 				return -EINVAL;
 
 			newpn->plid = plid;
 			newpn->fileid = fileid;
-			newpn->val.iops = (unsigned int)iops;
+			newpn->val.iops = (unsigned int)temp;
 			break;
 		}
 		break;
-- 
cgit v1.1


From 777eb1bf15b8532c396821774bf6451e563438f5 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Wed, 28 Sep 2011 08:07:01 -0600
Subject: block: Free queue resources at blk_release_queue()

A kernel crash is observed when a mounted ext3/ext4 filesystem is
physically removed. The problem is that blk_cleanup_queue() frees up
some resources eg by calling elevator_exit(), which are not checked for
in normal operation. So we should rather move these calls to the
destructor function blk_release_queue() as at that point all remaining
references are gone. However, in doing so we have to ensure that any
externally supplied queue_lock is disconnected as the driver might free
up the lock after the call of blk_cleanup_queue(),

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c  | 13 ++++++-------
 block/blk-sysfs.c |  5 +++++
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index b2ed78a..d34433a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -348,9 +348,10 @@ void blk_put_queue(struct request_queue *q)
 EXPORT_SYMBOL(blk_put_queue);
 
 /*
- * Note: If a driver supplied the queue lock, it should not zap that lock
- * unexpectedly as some queue cleanup components like elevator_exit() and
- * blk_throtl_exit() need queue lock.
+ * Note: If a driver supplied the queue lock, it is disconnected
+ * by this function. The actual state of the lock doesn't matter
+ * here as the request_queue isn't accessible after this point
+ * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
  */
 void blk_cleanup_queue(struct request_queue *q)
 {
@@ -367,10 +368,8 @@ void blk_cleanup_queue(struct request_queue *q)
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
 	mutex_unlock(&q->sysfs_lock);
 
-	if (q->elevator)
-		elevator_exit(q->elevator);
-
-	blk_throtl_exit(q);
+	if (q->queue_lock != &q->__queue_lock)
+		q->queue_lock = &q->__queue_lock;
 
 	blk_put_queue(q);
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e681805..60fda88 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -479,6 +479,11 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_sync_queue(q);
 
+	if (q->elevator)
+		elevator_exit(q->elevator);
+
+	blk_throtl_exit(q);
+
 	if (rl->rq_pool)
 		mempool_destroy(rl->rq_pool);
 
-- 
cgit v1.1


From 523e1d399ce0e23bec562abe2b2f8d297af81161 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:07 +0200
Subject: block: make gendisk hold a reference to its queue

The following command sequence triggers an oops.

# mount /dev/sdb1 /mnt
# echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete
# umount /mnt

 general protection fault: 0000 [#1] PREEMPT SMP
 CPU 2
 Modules linked in:

 Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs
 RIP: 0010:[<ffffffff810d0879>]  [<ffffffff810d0879>] __lock_acquire+0x389/0x1d60
...
 Call Trace:
  [<ffffffff810d2845>] lock_acquire+0x95/0x140
  [<ffffffff81aed87b>] _raw_spin_lock+0x3b/0x50
  [<ffffffff811573bc>] bdi_lock_two+0x5c/0x70
  [<ffffffff811c2f6c>] bdev_inode_switch_bdi+0x4c/0xf0
  [<ffffffff811c3fcb>] __blkdev_put+0x11b/0x1d0
  [<ffffffff811c4010>] __blkdev_put+0x160/0x1d0
  [<ffffffff811c40df>] blkdev_put+0x5f/0x190
  [<ffffffff8118f18d>] kill_block_super+0x4d/0x80
  [<ffffffff8118f4a5>] deactivate_locked_super+0x45/0x70
  [<ffffffff8119003a>] deactivate_super+0x4a/0x70
  [<ffffffff811ac4ad>] mntput_no_expire+0xed/0x130
  [<ffffffff811acf2e>] sys_umount+0x7e/0x3a0
  [<ffffffff81aeeeab>] system_call_fastpath+0x16/0x1b

This is because bdev holds on to disk but disk doesn't pin the
associated queue.  If a SCSI device is removed while the device is
still open, the sdev puts the base reference to the queue on release.
When the bdev is finally released, the associated queue is already
gone along with the bdi and bdev_inode_switch_bdi() ends up
dereferencing already freed bdi.

Even if it were not for this bug, disk not holding onto the associated
queue is very unusual and error-prone.

Fix it by making add_disk() take an extra reference to its queue and
put it on disk_release() and ensuring that disk and its fops owner are
put in that order after all accesses to the disk and queue are
complete.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index e2f6790..d261b73 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk)
 	register_disk(disk);
 	blk_register_queue(disk);
 
+	/*
+	 * Take an extra ref on queue which will be put on disk_release()
+	 * so that it sticks around as long as @disk is there.
+	 */
+	WARN_ON_ONCE(blk_get_queue(disk->queue));
+
 	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 				   "bdi");
 	WARN_ON(retval);
@@ -1095,6 +1101,8 @@ static void disk_release(struct device *dev)
 	disk_replace_part_tbl(disk, NULL);
 	free_part_stats(&disk->part0);
 	free_part_info(&disk->part0);
+	if (disk->queue)
+		blk_put_queue(disk->queue);
 	kfree(disk);
 }
 struct class block_class = {
-- 
cgit v1.1


From ece84241b93c4693bfe0a8ec9a043a16d216d0cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:15 +0200
Subject: block: fix genhd refcounting in blkio_policy_parse_and_set()

blkio_policy_parse_and_set() calls blkio_check_dev_num() to check
whether the given dev_t is valid.  blkio_check_dev_num() uses
get_gendisk() for verification but never puts the returned genhd
leaking the reference.

This patch collapses blkio_check_dev_num() into its caller and updates
it such that the genhd is put before returning.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 56 ++++++++++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b596e54..d61ec56 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -768,25 +768,14 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 	return disk_total;
 }
 
-static int blkio_check_dev_num(dev_t dev)
-{
-	int part = 0;
-	struct gendisk *disk;
-
-	disk = get_gendisk(dev, &part);
-	if (!disk || part)
-		return -ENODEV;
-
-	return 0;
-}
-
 static int blkio_policy_parse_and_set(char *buf,
 	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
 {
+	struct gendisk *disk = NULL;
 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
-	int ret;
 	unsigned long major, minor;
-	int i = 0;
+	int i = 0, ret = -EINVAL;
+	int part;
 	dev_t dev;
 	u64 temp;
 
@@ -804,37 +793,36 @@ static int blkio_policy_parse_and_set(char *buf,
 	}
 
 	if (i != 2)
-		return -EINVAL;
+		goto out;
 
 	p = strsep(&s[0], ":");
 	if (p != NULL)
 		major_s = p;
 	else
-		return -EINVAL;
+		goto out;
 
 	minor_s = s[0];
 	if (!minor_s)
-		return -EINVAL;
+		goto out;
 
-	ret = strict_strtoul(major_s, 10, &major);
-	if (ret)
-		return -EINVAL;
+	if (strict_strtoul(major_s, 10, &major))
+		goto out;
 
-	ret = strict_strtoul(minor_s, 10, &minor);
-	if (ret)
-		return -EINVAL;
+	if (strict_strtoul(minor_s, 10, &minor))
+		goto out;
 
 	dev = MKDEV(major, minor);
 
-	ret = strict_strtoull(s[1], 10, &temp);
-	if (ret)
-		return -EINVAL;
+	if (strict_strtoull(s[1], 10, &temp))
+		goto out;
 
 	/* For rule removal, do not check for device presence. */
 	if (temp) {
-		ret = blkio_check_dev_num(dev);
-		if (ret)
-			return ret;
+		disk = get_gendisk(dev, &part);
+		if (!disk || part) {
+			ret = -ENODEV;
+			goto out;
+		}
 	}
 
 	newpn->dev = dev;
@@ -843,7 +831,7 @@ static int blkio_policy_parse_and_set(char *buf,
 	case BLKIO_POLICY_PROP:
 		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
 		     temp > BLKIO_WEIGHT_MAX)
-			return -EINVAL;
+			goto out;
 
 		newpn->plid = plid;
 		newpn->fileid = fileid;
@@ -860,7 +848,7 @@ static int blkio_policy_parse_and_set(char *buf,
 		case BLKIO_THROTL_read_iops_device:
 		case BLKIO_THROTL_write_iops_device:
 			if (temp > THROTL_IOPS_MAX)
-				return -EINVAL;
+				goto out;
 
 			newpn->plid = plid;
 			newpn->fileid = fileid;
@@ -871,8 +859,10 @@ static int blkio_policy_parse_and_set(char *buf,
 	default:
 		BUG();
 	}
-
-	return 0;
+	ret = 0;
+out:
+	put_disk(disk);
+	return ret;
 }
 
 unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
-- 
cgit v1.1


From bc9fcbf9cb8ec76d340da16fbf48a9a316e14c52 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:18 +0200
Subject: block: move blk_throtl prototypes to block/blk.h

blk_throtl interface is block internal and there's no reason to have
them in linux/blkdev.h.  Move them to block/blk.h.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c |  1 +
 block/blk.h          | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a19f58c..f3f495e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,6 +10,7 @@
 #include <linux/bio.h>
 #include <linux/blktrace_api.h>
 #include "blk-cgroup.h"
+#include "blk.h"
 
 /* Max dispatch from a group in 1 round */
 static int throtl_grp_quantum = 8;
diff --git a/block/blk.h b/block/blk.h
index 20b900a..da247ba 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -188,4 +188,17 @@ static inline int blk_do_io_stat(struct request *rq)
 	        (rq->cmd_flags & REQ_DISCARD));
 }
 
-#endif
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
+extern int blk_throtl_init(struct request_queue *q);
+extern void blk_throtl_exit(struct request_queue *q);
+#else /* CONFIG_BLK_DEV_THROTTLING */
+static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
+{
+	return 0;
+}
+static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+static inline void blk_throtl_exit(struct request_queue *q) { }
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
+#endif /* BLK_INTERNAL_H */
-- 
cgit v1.1


From 75eb6c372d41d6d140b893873f6687d78c987a44 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:22 +0200
Subject: block: pass around REQ_* flags instead of broken down booleans during
 request alloc/free

blk_alloc_request() and freed_request() take different combinations of
REQ_* @flags, @priv and @is_sync when @flags is superset of the latter
two.  Make them take @flags only.  This cleans up the code a bit and
will ease updating allocation related REQ_* flags.

This patch doesn't introduce any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 79e41a7..a3d2fdc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -574,7 +574,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq)
 }
 
 static struct request *
-blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
+blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
 {
 	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 
@@ -585,12 +585,10 @@ blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
 
 	rq->cmd_flags = flags | REQ_ALLOCED;
 
-	if (priv) {
-		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
-			mempool_free(rq, q->rq.rq_pool);
-			return NULL;
-		}
-		rq->cmd_flags |= REQ_ELVPRIV;
+	if ((flags & REQ_ELVPRIV) &&
+	    unlikely(elv_set_request(q, rq, gfp_mask))) {
+		mempool_free(rq, q->rq.rq_pool);
+		return NULL;
 	}
 
 	return rq;
@@ -649,12 +647,13 @@ static void __freed_request(struct request_queue *q, int sync)
  * A request has just been released.  Account for it, update the full and
  * congestion status, wake up any waiters.   Called under q->queue_lock.
  */
-static void freed_request(struct request_queue *q, int sync, int priv)
+static void freed_request(struct request_queue *q, unsigned int flags)
 {
 	struct request_list *rl = &q->rq;
+	int sync = rw_is_sync(flags);
 
 	rl->count[sync]--;
-	if (priv)
+	if (flags & REQ_ELVPRIV)
 		rl->elvpriv--;
 
 	__freed_request(q, sync);
@@ -694,7 +693,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	struct request_list *rl = &q->rq;
 	struct io_context *ioc = NULL;
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
-	int may_queue, priv = 0;
+	int may_queue;
 
 	may_queue = elv_may_queue(q, rw_flags);
 	if (may_queue == ELV_MQUEUE_NO)
@@ -738,17 +737,17 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	rl->count[is_sync]++;
 	rl->starved[is_sync] = 0;
 
-	if (blk_rq_should_init_elevator(bio)) {
-		priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-		if (priv)
-			rl->elvpriv++;
+	if (blk_rq_should_init_elevator(bio) &&
+	    !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
+		rw_flags |= REQ_ELVPRIV;
+		rl->elvpriv++;
 	}
 
 	if (blk_queue_io_stat(q))
 		rw_flags |= REQ_IO_STAT;
 	spin_unlock_irq(q->queue_lock);
 
-	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
+	rq = blk_alloc_request(q, rw_flags, gfp_mask);
 	if (unlikely(!rq)) {
 		/*
 		 * Allocation failed presumably due to memory. Undo anything
@@ -758,7 +757,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 		 * wait queue, but this is pretty rare.
 		 */
 		spin_lock_irq(q->queue_lock);
-		freed_request(q, is_sync, priv);
+		freed_request(q, rw_flags);
 
 		/*
 		 * in the very unlikely event that allocation failed and no
@@ -1050,14 +1049,13 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	 * it didn't come out of our reserved rq pools
 	 */
 	if (req->cmd_flags & REQ_ALLOCED) {
-		int is_sync = rq_is_sync(req) != 0;
-		int priv = req->cmd_flags & REQ_ELVPRIV;
+		unsigned int flags = req->cmd_flags;
 
 		BUG_ON(!list_empty(&req->queuelist));
 		BUG_ON(!hlist_unhashed(&req->hash));
 
 		blk_free_request(q, req);
-		freed_request(q, is_sync, priv);
+		freed_request(q, flags);
 	}
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);
-- 
cgit v1.1


From 315fceee81155ef2aeed9316ca72aeea9347db5c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:31:25 +0200
Subject: block: drop unnecessary blk_get/put_queue() in scsi_cmd_ioctl() and
 blk_get_tg()

blk_get/put_queue() in scsi_cmd_ioctl() and throtl_get_tg() are
completely bogus.  The caller must have a reference to the queue on
entry and taking an extra reference doesn't change anything.

For scsi_cmd_ioctl(), the only effect is that it ends up checking
QUEUE_FLAG_DEAD on entry; however, this is bogus as queue can die
right after blk_get_queue().  Dead queue should be and is handled in
request issue path (it's somewhat broken now but that's a separate
problem and doesn't affect this one much).

throtl_get_tg() incorrectly assumes that q is rcu freed.  Also, it
doesn't check return value of blk_get_queue().  If the queue is
already dead, it ends up doing an extra put.

Drop them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c | 8 +-------
 block/scsi_ioctl.c   | 3 +--
 2 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f3f495e..ecba5fc 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -324,12 +324,8 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	/*
 	 * Need to allocate a group. Allocation of group also needs allocation
 	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
-	 * we need to drop rcu lock and queue_lock before we call alloc
-	 *
-	 * Take the request queue reference to make sure queue does not
-	 * go away once we return from allocation.
+	 * we need to drop rcu lock and queue_lock before we call alloc.
 	 */
-	blk_get_queue(q);
 	rcu_read_unlock();
 	spin_unlock_irq(q->queue_lock);
 
@@ -339,13 +335,11 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	 * dead
 	 */
 	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-		blk_put_queue(q);
 		if (tg)
 			kfree(tg);
 
 		return ERR_PTR(-ENODEV);
 	}
-	blk_put_queue(q);
 
 	/* Group allocated and queue is still alive. take the lock */
 	spin_lock_irq(q->queue_lock);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 4f4230b..fbdf0d8 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -565,7 +565,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 {
 	int err;
 
-	if (!q || blk_get_queue(q))
+	if (!q)
 		return -ENXIO;
 
 	switch (cmd) {
@@ -686,7 +686,6 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 			err = -ENOTTY;
 	}
 
-	blk_put_queue(q);
 	return err;
 }
 EXPORT_SYMBOL(scsi_cmd_ioctl);
-- 
cgit v1.1


From e3c78ca524d230bc145e902625e88c392a58ddf3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:32:38 +0200
Subject: block: reorganize queue draining

Reorganize queue draining related code in preparation of queue exit
changes.

* Factor out actual draining from elv_quiesce_start() to
  blk_drain_queue().

* Make elv_quiesce_start/end() responsible for their own locking.

* Replace open-coded ELVSWITCH clearing in elevator_switch() with
  elv_quiesce_end().

This patch doesn't cause any visible functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 28 ++++++++++++++++++++++++++++
 block/blk.h      |  1 +
 block/elevator.c | 37 +++++++++++--------------------------
 3 files changed, 40 insertions(+), 26 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index a3d2fdc..149149d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/fault-inject.h>
 #include <linux/list_sort.h>
+#include <linux/delay.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -345,6 +346,33 @@ void blk_put_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_put_queue);
 
+/**
+ * blk_drain_queue - drain requests from request_queue
+ * @q: queue to drain
+ *
+ * Drain ELV_PRIV requests from @q.  The caller is responsible for ensuring
+ * that no new requests which need to be drained are queued.
+ */
+void blk_drain_queue(struct request_queue *q)
+{
+	while (true) {
+		int nr_rqs;
+
+		spin_lock_irq(q->queue_lock);
+
+		elv_drain_elevator(q);
+
+		__blk_run_queue(q);
+		nr_rqs = q->rq.elvpriv;
+
+		spin_unlock_irq(q->queue_lock);
+
+		if (!nr_rqs)
+			break;
+		msleep(10);
+	}
+}
+
 /*
  * Note: If a driver supplied the queue lock, it is disconnected
  * by this function. The actual state of the lock doesn't matter
diff --git a/block/blk.h b/block/blk.h
index da247ba..2b66dc2 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -15,6 +15,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio);
+void blk_drain_queue(struct request_queue *q);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 bool __blk_end_bidi_request(struct request *rq, int error,
diff --git a/block/elevator.c b/block/elevator.c
index cb332cb..74a277f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -31,7 +31,6 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
-#include <linux/delay.h>
 #include <linux/blktrace_api.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
@@ -606,43 +605,35 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 void elv_drain_elevator(struct request_queue *q)
 {
 	static int printed;
+
+	lockdep_assert_held(q->queue_lock);
+
 	while (q->elevator->ops->elevator_dispatch_fn(q, 1))
 		;
-	if (q->nr_sorted == 0)
-		return;
-	if (printed++ < 10) {
+	if (q->nr_sorted && printed++ < 10) {
 		printk(KERN_ERR "%s: forced dispatching is broken "
 		       "(nr_sorted=%u), please report this\n",
 		       q->elevator->elevator_type->elevator_name, q->nr_sorted);
 	}
 }
 
-/*
- * Call with queue lock held, interrupts disabled
- */
 void elv_quiesce_start(struct request_queue *q)
 {
 	if (!q->elevator)
 		return;
 
+	spin_lock_irq(q->queue_lock);
 	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
+	spin_unlock_irq(q->queue_lock);
 
-	/*
-	 * make sure we don't have any requests in flight
-	 */
-	elv_drain_elevator(q);
-	while (q->rq.elvpriv) {
-		__blk_run_queue(q);
-		spin_unlock_irq(q->queue_lock);
-		msleep(10);
-		spin_lock_irq(q->queue_lock);
-		elv_drain_elevator(q);
-	}
+	blk_drain_queue(q);
 }
 
 void elv_quiesce_end(struct request_queue *q)
 {
+	spin_lock_irq(q->queue_lock);
 	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
+	spin_unlock_irq(q->queue_lock);
 }
 
 void __elv_add_request(struct request_queue *q, struct request *rq, int where)
@@ -972,7 +963,6 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data
 	 */
-	spin_lock_irq(q->queue_lock);
 	elv_quiesce_start(q);
 
 	/*
@@ -983,8 +973,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	/*
 	 * attach and start new elevator
 	 */
+	spin_lock_irq(q->queue_lock);
 	elevator_attach(q, e, data);
-
 	spin_unlock_irq(q->queue_lock);
 
 	if (old_elevator->registered) {
@@ -999,9 +989,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	 * finally exit old elevator and turn off BYPASS.
 	 */
 	elevator_exit(old_elevator);
-	spin_lock_irq(q->queue_lock);
 	elv_quiesce_end(q);
-	spin_unlock_irq(q->queue_lock);
 
 	blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
 
@@ -1015,10 +1003,7 @@ fail_register:
 	elevator_exit(e);
 	q->elevator = old_elevator;
 	elv_register_queue(q);
-
-	spin_lock_irq(q->queue_lock);
-	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
-	spin_unlock_irq(q->queue_lock);
+	elv_quiesce_end(q);
 
 	return err;
 }
-- 
cgit v1.1


From bc16a4f933bc5ed50826b20561e4c3515061998b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:33:01 +0200
Subject: block: reorganize throtl_get_tg() and blk_throtl_bio()

blk_throtl_bio() and throtl_get_tg() have rather unusual interface.

* throtl_get_tg() returns pointer to a valid tg or ERR_PTR(-ENODEV),
  and drops queue_lock in the latter case.  Different locking context
  depending on return value is error-prone and DEAD state is scheduled
  to be protected by queue_lock anyway.  Move DEAD check inside
  queue_lock and return valid tg or NULL.

* blk_throtl_bio() indicates return status both with its return value
  and in/out param **@bio.  The former is used to indicate whether
  queue is found to be dead during throtl processing.  The latter
  whether the bio is throttled.

  There's no point in returning DEAD check result from
  blk_throtl_bio().  The queue can die after blk_throtl_bio() is
  finished but before make_request_fn() grabs queue lock.

  Make it take *@bio instead and return boolean result indicating
  whether the request is throttled or not.

This patch doesn't cause any visible functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c     |  8 ++------
 block/blk-throttle.c | 49 +++++++++++++++++--------------------------------
 block/blk.h          |  6 +++---
 3 files changed, 22 insertions(+), 41 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 149149d..6c491f2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1515,12 +1515,8 @@ generic_make_request_checks(struct bio *bio)
 		goto end_io;
 	}
 
-	if (blk_throtl_bio(q, &bio))
-		goto end_io;
-
-	/* if bio = NULL, bio has been throttled and will be submitted later. */
-	if (!bio)
-		return false;
+	if (blk_throtl_bio(q, bio))
+		return false;	/* throttled, will be resubmitted later */
 
 	trace_block_bio_queue(q, bio);
 	return true;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ecba5fc..900a0c9 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -303,10 +303,6 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 	return tg;
 }
 
-/*
- * This function returns with queue lock unlocked in case of error, like
- * request queue is no more
- */
 static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 {
 	struct throtl_grp *tg = NULL, *__tg = NULL;
@@ -330,20 +326,16 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	spin_unlock_irq(q->queue_lock);
 
 	tg = throtl_alloc_tg(td);
-	/*
-	 * We might have slept in group allocation. Make sure queue is not
-	 * dead
-	 */
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-		if (tg)
-			kfree(tg);
-
-		return ERR_PTR(-ENODEV);
-	}
 
 	/* Group allocated and queue is still alive. take the lock */
 	spin_lock_irq(q->queue_lock);
 
+	/* Make sure @q is still alive */
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+		kfree(tg);
+		return NULL;
+	}
+
 	/*
 	 * Initialize the new group. After sleeping, read the blkcg again.
 	 */
@@ -1118,17 +1110,17 @@ static struct blkio_policy_type blkio_policy_throtl = {
 	.plid = BLKIO_POLICY_THROTL,
 };
 
-int blk_throtl_bio(struct request_queue *q, struct bio **biop)
+bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	struct throtl_data *td = q->td;
 	struct throtl_grp *tg;
-	struct bio *bio = *biop;
 	bool rw = bio_data_dir(bio), update_disptime = true;
 	struct blkio_cgroup *blkcg;
+	bool throttled = false;
 
 	if (bio->bi_rw & REQ_THROTTLED) {
 		bio->bi_rw &= ~REQ_THROTTLED;
-		return 0;
+		goto out;
 	}
 
 	/*
@@ -1147,7 +1139,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
 					rw, rw_is_sync(bio->bi_rw));
 			rcu_read_unlock();
-			return 0;
+			goto out;
 		}
 	}
 	rcu_read_unlock();
@@ -1156,18 +1148,10 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 	 * Either group has not been allocated yet or it is not an unlimited
 	 * IO group
 	 */
-
 	spin_lock_irq(q->queue_lock);
 	tg = throtl_get_tg(td);
-
-	if (IS_ERR(tg)) {
-		if (PTR_ERR(tg)	== -ENODEV) {
-			/*
-			 * Queue is gone. No queue lock held here.
-			 */
-			return -ENODEV;
-		}
-	}
+	if (unlikely(!tg))
+		goto out_unlock;
 
 	if (tg->nr_queued[rw]) {
 		/*
@@ -1195,7 +1179,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 		 * So keep on trimming slice even if bio is not queued.
 		 */
 		throtl_trim_slice(td, tg, rw);
-		goto out;
+		goto out_unlock;
 	}
 
 queue_bio:
@@ -1207,16 +1191,17 @@ queue_bio:
 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
 	throtl_add_bio_tg(q->td, tg, bio);
-	*biop = NULL;
+	throttled = true;
 
 	if (update_disptime) {
 		tg_update_disptime(td, tg);
 		throtl_schedule_next_dispatch(td);
 	}
 
-out:
+out_unlock:
 	spin_unlock_irq(q->queue_lock);
-	return 0;
+out:
+	return throttled;
 }
 
 int blk_throtl_init(struct request_queue *q)
diff --git a/block/blk.h b/block/blk.h
index 2b66dc2..c018dba 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -190,13 +190,13 @@ static inline int blk_do_io_stat(struct request *rq)
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING
-extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
+extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
-static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
+static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
-	return 0;
+	return false;
 }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
-- 
cgit v1.1


From da8303c63b8de73619884382d6e573d44aae0810 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:33:05 +0200
Subject: block: make get_request[_wait]() fail if queue is dead

Currently get_request[_wait]() allocates request whether queue is dead
or not.  This patch makes get_request[_wait]() return NULL if @q is
dead.  blk_queue_bio() is updated to fail the submitted bio if request
allocation fails.  While at it, add docbook comments for
get_request[_wait]().

Note that the current code has rather unclear (there are spurious DEAD
tests scattered around) assumption that the owner of a queue
guarantees that no request travels block layer if the queue is dead
and this patch in itself doesn't change much; however, this will allow
fixing the broken assumption in the next patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 54 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 16 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6c491f2..3508751 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -709,10 +709,19 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 	return true;
 }
 
-/*
- * Get a free request, queue_lock must be held.
- * Returns NULL on failure, with queue_lock held.
- * Returns !NULL on success, with queue_lock *not held*.
+/**
+ * get_request - get a free request
+ * @q: request_queue to allocate request from
+ * @rw_flags: RW and SYNC flags
+ * @bio: bio to allocate request for (can be %NULL)
+ * @gfp_mask: allocation mask
+ *
+ * Get a free request from @q.  This function may fail under memory
+ * pressure or if @q is dead.
+ *
+ * Must be callled with @q->queue_lock held and,
+ * Returns %NULL on failure, with @q->queue_lock held.
+ * Returns !%NULL on success, with @q->queue_lock *not held*.
  */
 static struct request *get_request(struct request_queue *q, int rw_flags,
 				   struct bio *bio, gfp_t gfp_mask)
@@ -723,6 +732,9 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	int may_queue;
 
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		return NULL;
+
 	may_queue = elv_may_queue(q, rw_flags);
 	if (may_queue == ELV_MQUEUE_NO)
 		goto rq_starved;
@@ -815,11 +827,18 @@ out:
 	return rq;
 }
 
-/*
- * No available requests for this queue, wait for some requests to become
- * available.
+/**
+ * get_request_wait - get a free request with retry
+ * @q: request_queue to allocate request from
+ * @rw_flags: RW and SYNC flags
+ * @bio: bio to allocate request for (can be %NULL)
  *
- * Called with q->queue_lock held, and returns with it unlocked.
+ * Get a free request from @q.  This function keeps retrying under memory
+ * pressure and fails iff @q is dead.
+ *
+ * Must be callled with @q->queue_lock held and,
+ * Returns %NULL on failure, with @q->queue_lock held.
+ * Returns !%NULL on success, with @q->queue_lock *not held*.
  */
 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 					struct bio *bio)
@@ -833,6 +852,9 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		struct io_context *ioc;
 		struct request_list *rl = &q->rq;
 
+		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+			return NULL;
+
 		prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
 				TASK_UNINTERRUPTIBLE);
 
@@ -863,19 +885,15 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
 	struct request *rq;
 
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-		return NULL;
-
 	BUG_ON(rw != READ && rw != WRITE);
 
 	spin_lock_irq(q->queue_lock);
-	if (gfp_mask & __GFP_WAIT) {
+	if (gfp_mask & __GFP_WAIT)
 		rq = get_request_wait(q, rw, NULL);
-	} else {
+	else
 		rq = get_request(q, rw, NULL, gfp_mask);
-		if (!rq)
-			spin_unlock_irq(q->queue_lock);
-	}
+	if (!rq)
+		spin_unlock_irq(q->queue_lock);
 	/* q->queue_lock is unlocked at this point */
 
 	return rq;
@@ -1299,6 +1317,10 @@ get_rq:
 	 * Returns with the queue unlocked.
 	 */
 	req = get_request_wait(q, rw_flags, bio);
+	if (unlikely(!req)) {
+		bio_endio(bio, -ENODEV);	/* @q is dead */
+		goto out_unlock;
+	}
 
 	/*
 	 * After dropping the lock and possibly sleeping here, our request
-- 
cgit v1.1


From bd87b5898a72b1aef6acf3705c61c9f6372adf0c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:33:08 +0200
Subject: block: drop @tsk from attempt_plug_merge() and explain sync rules

attempt_plug_merge() accesses elevator without holding queue_lock and
may call into ->elevator_bio_merge_fn().  The elvator is guaranteed to
be valid because it's accessed iff the plugged list has requests and
elevator is never exited with live requests, so as long as the
elevator method can deal with unlocked access, this is safe.

Explain the sync rules around attempt_plug_merge() and drop the
unnecessary @tsk parameter.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 3508751..034cbb2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1203,18 +1203,32 @@ static bool bio_attempt_front_merge(struct request_queue *q,
 	return true;
 }
 
-/*
- * Attempts to merge with the plugged list in the current process. Returns
- * true if merge was successful, otherwise false.
+/**
+ * attempt_plug_merge - try to merge with %current's plugged list
+ * @q: request_queue new bio is being queued at
+ * @bio: new bio being queued
+ * @request_count: out parameter for number of traversed plugged requests
+ *
+ * Determine whether @bio being queued on @q can be merged with a request
+ * on %current's plugged list.  Returns %true if merge was successful,
+ * otherwise %false.
+ *
+ * This function is called without @q->queue_lock; however, elevator is
+ * accessed iff there already are requests on the plugged list which in
+ * turn guarantees validity of the elevator.
+ *
+ * Note that, on successful merge, elevator operation
+ * elevator_bio_merged_fn() will be called without queue lock.  Elevator
+ * must be ready for this.
  */
-static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
-			       struct bio *bio, unsigned int *request_count)
+static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
+			       unsigned int *request_count)
 {
 	struct blk_plug *plug;
 	struct request *rq;
 	bool ret = false;
 
-	plug = tsk->plug;
+	plug = current->plug;
 	if (!plug)
 		goto out;
 	*request_count = 0;
@@ -1282,7 +1296,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * Check if we can merge with the plugged list before grabbing
 	 * any locks.
 	 */
-	if (attempt_plug_merge(current, q, bio, &request_count))
+	if (attempt_plug_merge(q, bio, &request_count))
 		return;
 
 	spin_lock_irq(q->queue_lock);
-- 
cgit v1.1


From c9a929dde3913780b5c416f4bb9d9ed804f509ce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 19 Oct 2011 14:42:16 +0200
Subject: block: fix request_queue lifetime handling by making
 blk_queue_cleanup() properly shutdown

request_queue is refcounted but actually depdends on lifetime
management from the queue owner - on blk_cleanup_queue(), block layer
expects that there's no request passing through request_queue and no
new one will.

This is fundamentally broken.  The queue owner (e.g. SCSI layer)
doesn't have a way to know whether there are other active users before
calling blk_cleanup_queue() and other users (e.g. bsg) don't have any
guarantee that the queue is and would stay valid while it's holding a
reference.

With delay added in blk_queue_bio() before queue_lock is grabbed, the
following oops can be easily triggered when a device is removed with
in-flight IOs.

 sd 0:0:1:0: [sdb] Stopping disk
 ata1.01: disabled
 general protection fault: 0000 [#1] PREEMPT SMP
 CPU 2
 Modules linked in:

 Pid: 648, comm: test_rawio Not tainted 3.1.0-rc3-work+ #56 Bochs Bochs
 RIP: 0010:[<ffffffff8137d651>]  [<ffffffff8137d651>] elv_rqhash_find+0x61/0x100
 ...
 Process test_rawio (pid: 648, threadinfo ffff880019efa000, task ffff880019ef8a80)
 ...
 Call Trace:
  [<ffffffff8137d774>] elv_merge+0x84/0xe0
  [<ffffffff81385b54>] blk_queue_bio+0xf4/0x400
  [<ffffffff813838ea>] generic_make_request+0xca/0x100
  [<ffffffff81383994>] submit_bio+0x74/0x100
  [<ffffffff811c53ec>] dio_bio_submit+0xbc/0xc0
  [<ffffffff811c610e>] __blockdev_direct_IO+0x92e/0xb40
  [<ffffffff811c39f7>] blkdev_direct_IO+0x57/0x60
  [<ffffffff8113b1c5>] generic_file_aio_read+0x6d5/0x760
  [<ffffffff8118c1ca>] do_sync_read+0xda/0x120
  [<ffffffff8118ce55>] vfs_read+0xc5/0x180
  [<ffffffff8118cfaa>] sys_pread64+0x9a/0xb0
  [<ffffffff81afaf6b>] system_call_fastpath+0x16/0x1b

This happens because blk_queue_cleanup() destroys the queue and
elevator whether IOs are in progress or not and DEAD tests are
sprinkled in the request processing path without proper
synchronization.

Similar problem exists for blk-throtl.  On queue cleanup, blk-throtl
is shutdown whether it has requests in it or not.  Depending on
timing, it either oopses or throttled bios are lost putting tasks
which are waiting for bio completion into eternal D state.

The way it should work is having the usual clear distinction between
shutdown and release.  Shutdown drains all currently pending requests,
marks the queue dead, and performs partial teardown of the now
unnecessary part of the queue.  Even after shutdown is complete,
reference holders are still allowed to issue requests to the queue
although they will be immmediately failed.  The rest of teardown
happens on release.

This patch makes the following changes to make blk_queue_cleanup()
behave as proper shutdown.

* QUEUE_FLAG_DEAD is now set while holding both q->exit_mutex and
  queue_lock.

* Unsynchronized DEAD check in generic_make_request_checks() removed.
  This couldn't make any meaningful difference as the queue could die
  after the check.

* blk_drain_queue() updated such that it can drain all requests and is
  now called during cleanup.

* blk_throtl updated such that it checks DEAD on grabbing queue_lock,
  drains all throttled bios during cleanup and free td when queue is
  released.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c     | 57 +++++++++++++++++++++++++++++++++-------------------
 block/blk-sysfs.c    |  1 +
 block/blk-throttle.c | 50 +++++++++++++++++++++++++++++++++++++++------
 block/blk.h          |  6 +++++-
 block/elevator.c     |  2 +-
 5 files changed, 87 insertions(+), 29 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 034cbb2..7e15235 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -349,11 +349,13 @@ EXPORT_SYMBOL(blk_put_queue);
 /**
  * blk_drain_queue - drain requests from request_queue
  * @q: queue to drain
+ * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
  *
- * Drain ELV_PRIV requests from @q.  The caller is responsible for ensuring
- * that no new requests which need to be drained are queued.
+ * Drain requests from @q.  If @drain_all is set, all requests are drained.
+ * If not, only ELVPRIV requests are drained.  The caller is responsible
+ * for ensuring that no new requests which need to be drained are queued.
  */
-void blk_drain_queue(struct request_queue *q)
+void blk_drain_queue(struct request_queue *q, bool drain_all)
 {
 	while (true) {
 		int nr_rqs;
@@ -361,9 +363,15 @@ void blk_drain_queue(struct request_queue *q)
 		spin_lock_irq(q->queue_lock);
 
 		elv_drain_elevator(q);
+		if (drain_all)
+			blk_throtl_drain(q);
 
 		__blk_run_queue(q);
-		nr_rqs = q->rq.elvpriv;
+
+		if (drain_all)
+			nr_rqs = q->rq.count[0] + q->rq.count[1];
+		else
+			nr_rqs = q->rq.elvpriv;
 
 		spin_unlock_irq(q->queue_lock);
 
@@ -373,30 +381,40 @@ void blk_drain_queue(struct request_queue *q)
 	}
 }
 
-/*
- * Note: If a driver supplied the queue lock, it is disconnected
- * by this function. The actual state of the lock doesn't matter
- * here as the request_queue isn't accessible after this point
- * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
+/**
+ * blk_cleanup_queue - shutdown a request queue
+ * @q: request queue to shutdown
+ *
+ * Mark @q DEAD, drain all pending requests, destroy and put it.  All
+ * future requests will be failed immediately with -ENODEV.
  */
 void blk_cleanup_queue(struct request_queue *q)
 {
-	/*
-	 * We know we have process context here, so we can be a little
-	 * cautious and ensure that pending block actions on this device
-	 * are done before moving on. Going into this function, we should
-	 * not have processes doing IO to this device.
-	 */
-	blk_sync_queue(q);
+	spinlock_t *lock = q->queue_lock;
 
-	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
+	/* mark @q DEAD, no new request or merges will be allowed afterwards */
 	mutex_lock(&q->sysfs_lock);
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
-	mutex_unlock(&q->sysfs_lock);
+
+	spin_lock_irq(lock);
+	queue_flag_set(QUEUE_FLAG_NOMERGES, q);
+	queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
+	queue_flag_set(QUEUE_FLAG_DEAD, q);
 
 	if (q->queue_lock != &q->__queue_lock)
 		q->queue_lock = &q->__queue_lock;
 
+	spin_unlock_irq(lock);
+	mutex_unlock(&q->sysfs_lock);
+
+	/* drain all requests queued before DEAD marking */
+	blk_drain_queue(q, true);
+
+	/* @q won't process any more request, flush async actions */
+	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
+	blk_sync_queue(q);
+
+	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
@@ -1509,9 +1527,6 @@ generic_make_request_checks(struct bio *bio)
 		goto end_io;
 	}
 
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-		goto end_io;
-
 	part = bio->bi_bdev->bd_part;
 	if (should_fail_request(part, bio->bi_size) ||
 	    should_fail_request(&part_to_disk(part)->part0,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index a8eff5f..e7f9f65 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -490,6 +490,7 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	blk_throtl_release(q);
 	blk_trace_shutdown(q);
 
 	bdi_destroy(&q->backing_dev_info);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 900a0c9..8edb949 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -309,6 +309,10 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	struct blkio_cgroup *blkcg;
 	struct request_queue *q = td->queue;
 
+	/* no throttling for dead queue */
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		return NULL;
+
 	rcu_read_lock();
 	blkcg = task_blkio_cgroup(current);
 	tg = throtl_find_tg(td, blkcg);
@@ -1001,11 +1005,6 @@ static void throtl_release_tgs(struct throtl_data *td)
 	}
 }
 
-static void throtl_td_free(struct throtl_data *td)
-{
-	kfree(td);
-}
-
 /*
  * Blk cgroup controller notification saying that blkio_group object is being
  * delinked as associated cgroup object is going away. That also means that
@@ -1204,6 +1203,41 @@ out:
 	return throttled;
 }
 
+/**
+ * blk_throtl_drain - drain throttled bios
+ * @q: request_queue to drain throttled bios for
+ *
+ * Dispatch all currently throttled bios on @q through ->make_request_fn().
+ */
+void blk_throtl_drain(struct request_queue *q)
+	__releases(q->queue_lock) __acquires(q->queue_lock)
+{
+	struct throtl_data *td = q->td;
+	struct throtl_rb_root *st = &td->tg_service_tree;
+	struct throtl_grp *tg;
+	struct bio_list bl;
+	struct bio *bio;
+
+	lockdep_is_held(q->queue_lock);
+
+	bio_list_init(&bl);
+
+	while ((tg = throtl_rb_first(st))) {
+		throtl_dequeue_tg(td, tg);
+
+		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
+			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
+			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+	}
+	spin_unlock_irq(q->queue_lock);
+
+	while ((bio = bio_list_pop(&bl)))
+		generic_make_request(bio);
+
+	spin_lock_irq(q->queue_lock);
+}
+
 int blk_throtl_init(struct request_queue *q)
 {
 	struct throtl_data *td;
@@ -1276,7 +1310,11 @@ void blk_throtl_exit(struct request_queue *q)
 	 * it.
 	 */
 	throtl_shutdown_wq(q);
-	throtl_td_free(td);
+}
+
+void blk_throtl_release(struct request_queue *q)
+{
+	kfree(q->td);
 }
 
 static int __init throtl_init(void)
diff --git a/block/blk.h b/block/blk.h
index c018dba..3f6551b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -15,7 +15,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio);
-void blk_drain_queue(struct request_queue *q);
+void blk_drain_queue(struct request_queue *q, bool drain_all);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 bool __blk_end_bidi_request(struct request *rq, int error,
@@ -191,15 +191,19 @@ static inline int blk_do_io_stat(struct request *rq)
 
 #ifdef CONFIG_BLK_DEV_THROTTLING
 extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
+extern void blk_throtl_drain(struct request_queue *q);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
+extern void blk_throtl_release(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	return false;
 }
+static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_release(struct request_queue *q) { }
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 
 #endif /* BLK_INTERNAL_H */
diff --git a/block/elevator.c b/block/elevator.c
index 74a277f..66343d6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -626,7 +626,7 @@ void elv_quiesce_start(struct request_queue *q)
 	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
 	spin_unlock_irq(q->queue_lock);
 
-	blk_drain_queue(q);
+	blk_drain_queue(q, false);
 }
 
 void elv_quiesce_end(struct request_queue *q)
-- 
cgit v1.1


From e890413af4c2dfebf5432ef30cc70cb11dad3213 Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Mon, 24 Oct 2011 16:08:38 +0200
Subject: block: fix a typo in the blk-cgroup.h file

byptes -> bytes.

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index a71d290..6f3ace7 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -188,7 +188,7 @@ struct blkio_policy_node {
 	union {
 		unsigned int weight;
 		/*
-		 * Rate read/write in terms of byptes per second
+		 * Rate read/write in terms of bytes per second
 		 * Whether this rate represents read or write is determined
 		 * by file type "fileid".
 		 */
-- 
cgit v1.1


From 9562ad9ab36df7ccef920d119f3b5100025db95f Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 24 Oct 2011 16:11:30 +0200
Subject: block: Remove the control of complete cpu from bio.

bio originally has the functionality to set the complete cpu, but
it is broken.

Chirstoph said that "This code is unused, and from the all the
discussions lately pretty obviously broken.  The only thing keeping
it serves is creating more confusion and possibly more bugs."

And Jens replied with "We can kill bio_set_completion_cpu(). I'm fine
with leaving cpu control to the request based drivers, they are the
only ones that can toggle the setting anyway".

So this patch tries to remove all the work of controling complete cpu
from a bio.

Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 7e15235..da69793 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1276,7 +1276,6 @@ out:
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
-	req->cpu = bio->bi_comp_cpu;
 	req->cmd_type = REQ_TYPE_FS;
 
 	req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
@@ -1362,8 +1361,7 @@ get_rq:
 	 */
 	init_request_from_bio(req, bio);
 
-	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-	    bio_flagged(bio, BIO_CPU_AFFINE))
+	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
 		req->cpu = raw_smp_processor_id();
 
 	plug = current->plug;
-- 
cgit v1.1


From 834f9f61a525d2f6d3d0c93894e26326c8d3ceed Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Mon, 17 Oct 2011 12:57:22 +0200
Subject: blk-flush: fix invalid BUG_ON in blk_insert_flush

A user reported a regression due to commit
4853abaae7e4a2af938115ce9071ef8684fb7af4 (block: fix flush
machinery for stacking drivers with differring flush flags).
Part of the problem is that blk_insert_flush required a
single bio be attached to the request.  In reality, having
no attached bio is also a valid case, as can be observed with
an empty flush.

[1] http://www.redhat.com/archives/dm-devel/2011-September/msg00154.html

Reported-by: Christophe Saout <christophe@saout.de>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com
Acked-by: Tejun Heo <tj@kernel.org>

Stable note: 3.1
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 491eb30..89ae3b9 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -320,7 +320,7 @@ void blk_insert_flush(struct request *rq)
 		return;
 	}
 
-	BUG_ON(!rq->bio || rq->bio != rq->biotail);
+	BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
 
 	/*
 	 * If there's data but flush is not necessary, the request can be
-- 
cgit v1.1


From e67b77c791ca2778198c9e7088f3266ed2da7a55 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Mon, 17 Oct 2011 12:57:23 +0200
Subject: blk-flush: move the queue kick into

A dm-multipath user reported[1] a problem when trying to boot
a kernel with commit 4853abaae7e4a2af938115ce9071ef8684fb7af4
(block: fix flush machinery for stacking drivers with differring
flush flags) applied.  It turns out that an empty flush request
can be sent into blk_insert_flush.  When the BUG_ON was fixed
to allow for this, I/O on the underlying device would stall.  The
reason is that blk_insert_cloned_request does not kick the queue.
In the aforementioned commit, I had added a special case to
kick the queue if data was sent down but the queue flags did
not require a flush.  A better solution is to push the queue
kick up into blk_insert_cloned_request.

This patch, along with a follow-on which fixes the BUG_ON, fixes
the issue reported.

[1] http://www.redhat.com/archives/dm-devel/2011-September/msg00154.html

Reported-by: Christophe Saout <christophe@saout.de>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>

Stable note: 3.1
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c  | 2 ++
 block/blk-flush.c | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index d34433a..795154e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1725,6 +1725,8 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 		where = ELEVATOR_INSERT_FLUSH;
 
 	add_acct_request(q, rq, where);
+	if (where == ELEVATOR_INSERT_FLUSH)
+		__blk_run_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	return 0;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 89ae3b9..720ad60 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -330,7 +330,6 @@ void blk_insert_flush(struct request *rq)
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
 		list_add_tail(&rq->queuelist, &q->queue_head);
-		blk_run_queue_async(q);
 		return;
 	}
 
-- 
cgit v1.1


From f992ae801a7dec34a4ed99a6598bbbbfb82af4fb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 17 Oct 2011 13:42:43 +0200
Subject: block: make gendisk hold a reference to its queue

The following command sequence triggers an oops.

# mount /dev/sdb1 /mnt
# echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete
# umount /mnt

 general protection fault: 0000 [#1] PREEMPT SMP
 CPU 2
 Modules linked in:

 Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs
 RIP: 0010:[<ffffffff810d0879>]  [<ffffffff810d0879>] __lock_acquire+0x389/0x1d60
...
 Call Trace:
  [<ffffffff810d2845>] lock_acquire+0x95/0x140
  [<ffffffff81aed87b>] _raw_spin_lock+0x3b/0x50
  [<ffffffff811573bc>] bdi_lock_two+0x5c/0x70
  [<ffffffff811c2f6c>] bdev_inode_switch_bdi+0x4c/0xf0
  [<ffffffff811c3fcb>] __blkdev_put+0x11b/0x1d0
  [<ffffffff811c4010>] __blkdev_put+0x160/0x1d0
  [<ffffffff811c40df>] blkdev_put+0x5f/0x190
  [<ffffffff8118f18d>] kill_block_super+0x4d/0x80
  [<ffffffff8118f4a5>] deactivate_locked_super+0x45/0x70
  [<ffffffff8119003a>] deactivate_super+0x4a/0x70
  [<ffffffff811ac4ad>] mntput_no_expire+0xed/0x130
  [<ffffffff811acf2e>] sys_umount+0x7e/0x3a0
  [<ffffffff81aeeeab>] system_call_fastpath+0x16/0x1b

This is because bdev holds on to disk but disk doesn't pin the
associated queue.  If a SCSI device is removed while the device is
still open, the sdev puts the base reference to the queue on release.
When the bdev is finally released, the associated queue is already
gone along with the bdi and bdev_inode_switch_bdi() ends up
dereferencing already freed bdi.

Even if it were not for this bug, disk not holding onto the associated
queue is very unusual and error-prone.

Fix it by making add_disk() take an extra reference to its queue and
put it on disk_release() and ensuring that disk and its fops owner are
put in that order after all accesses to the disk and queue are
complete.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index e2f6790..d261b73 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk)
 	register_disk(disk);
 	blk_register_queue(disk);
 
+	/*
+	 * Take an extra ref on queue which will be put on disk_release()
+	 * so that it sticks around as long as @disk is there.
+	 */
+	WARN_ON_ONCE(blk_get_queue(disk->queue));
+
 	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 				   "bdi");
 	WARN_ON(retval);
@@ -1095,6 +1101,8 @@ static void disk_release(struct device *dev)
 	disk_replace_part_tbl(disk, NULL);
 	free_part_stats(&disk->part0);
 	free_part_info(&disk->part0);
+	if (disk->queue)
+		blk_put_queue(disk->queue);
 	kfree(disk);
 }
 struct class block_class = {
-- 
cgit v1.1


From 5e08159197b5b98a6648a172008de23f420e6c11 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 25 Oct 2011 10:20:05 +0200
Subject: block: warn if tag is greater than real_max_depth.

In case tag depth is reduced, it is max_depth not real_max_depth.
So we should allow a request with tag >= max_depth, but for a
tag >= real_max_depth, there really should be some problem.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-tag.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-tag.c b/block/blk-tag.c
index ece65fc..e74d6d1 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -286,12 +286,14 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
 
 	BUG_ON(tag == -1);
 
-	if (unlikely(tag >= bqt->real_max_depth))
+	if (unlikely(tag >= bqt->max_depth)) {
 		/*
 		 * This can happen after tag depth has been reduced.
-		 * FIXME: how about a warning or info message here?
+		 * But tag shouldn't be larger than real_max_depth.
 		 */
+		WARN_ON(tag >= bqt->real_max_depth);
 		return;
+	}
 
 	list_del_init(&rq->queuelist);
 	rq->cmd_flags &= ~REQ_QUEUED;
-- 
cgit v1.1


From e060f00beee23568fe2c4faf1e88ff22edefd7b2 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 25 Oct 2011 15:48:12 +0200
Subject: blk-throttle: Free up policy node associated with deleted rule

If a rule is being deleted, free up associated policy node. Otherwise
that memory is leaked.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d61ec56..6155367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1075,6 +1075,7 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 
 	if (blkio_delete_rule_command(newpn)) {
 		blkio_policy_delete_node(pn);
+		kfree(pn);
 		spin_unlock_irq(&blkcg->lock);
 		goto update_io_group;
 	}
-- 
cgit v1.1


From a38eb630fa224d6fba8c14a4063174bc5e0f63bb Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 25 Oct 2011 15:48:12 +0200
Subject: blk-throttle: Take blkcg->lock while traversing blkcg->policy_list

blkcg->policy_list is protected by blkcg->lock. Its not rcu protected
list. So even for readers, they need to take blkcg->lock. There are
few functions which were reading the list without taking lock. Fix it.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 54 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 14 deletions(-)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6155367..8f630ce 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -869,60 +869,86 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 			      dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	unsigned int weight;
+
+	spin_lock_irqsave(&blkcg->lock, flags);
 
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
 				BLKIO_PROP_weight_device);
 	if (pn)
-		return pn->val.weight;
+		weight = pn->val.weight;
 	else
-		return blkcg->weight;
+		weight = blkcg->weight;
+
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return weight;
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	uint64_t bps = -1;
 
+	spin_lock_irqsave(&blkcg->lock, flags);
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_read_bps_device);
 	if (pn)
-		return pn->val.bps;
-	else
-		return -1;
+		bps = pn->val.bps;
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return bps;
 }
 
 uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	uint64_t bps = -1;
+
+	spin_lock_irqsave(&blkcg->lock, flags);
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_write_bps_device);
 	if (pn)
-		return pn->val.bps;
-	else
-		return -1;
+		bps = pn->val.bps;
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return bps;
 }
 
 unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	unsigned int iops = -1;
 
+	spin_lock_irqsave(&blkcg->lock, flags);
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_read_iops_device);
 	if (pn)
-		return pn->val.iops;
-	else
-		return -1;
+		iops = pn->val.iops;
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return iops;
 }
 
 unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
+	unsigned long flags;
+	unsigned int iops = -1;
+
+	spin_lock_irqsave(&blkcg->lock, flags);
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 				BLKIO_THROTL_write_iops_device);
 	if (pn)
-		return pn->val.iops;
-	else
-		return -1;
+		iops = pn->val.iops;
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	return iops;
 }
 
 /* Checks whether user asked for deleting a policy rule */
-- 
cgit v1.1


From 334c2b0b8b2ab186fa198413386cba41fffcb4f2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 25 Oct 2011 15:51:48 +0200
Subject: blk-throttle: use queue_is_locked() instead of lockdep_is_held()

We can't use the latter if !CONFIG_LOCKDEP.

Reported-by: Sedat Dilek <sedat.dilek@googlemail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8edb949..4553245 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1218,7 +1218,7 @@ void blk_throtl_drain(struct request_queue *q)
 	struct bio_list bl;
 	struct bio *bio;
 
-	lockdep_is_held(q->queue_lock);
+	WARN_ON_ONCE(!queue_is_locked(q));
 
 	bio_list_init(&bl);
 
-- 
cgit v1.1


From d5decd3b9512e35c87492312a72443192eebdda9 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 26 May 2011 16:00:52 -0400
Subject: block: add export.h to files using EXPORT_SYMBOL/THIS_MODULE macros

These files were getting <linux/module.h> via an implicit include
path, but we want to crush those out of existence since they cost
time during compiles of processing thousands of lines of headers
for no reason.  Give them the lightweight header that just contains
the EXPORT_SYMBOL infrastructure.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 block/blk-integrity.c | 1 +
 block/ioctl.c         | 1 +
 2 files changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 129b9e2..da2a818 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -24,6 +24,7 @@
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/scatterlist.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 
 #include "blk.h"
diff --git a/block/ioctl.c b/block/ioctl.c
index 1124cd2..5554403 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -1,5 +1,6 @@
 #include <linux/capability.h>
 #include <linux/blkdev.h>
+#include <linux/export.h>
 #include <linux/gfp.h>
 #include <linux/blkpg.h>
 #include <linux/hdreg.h>
-- 
cgit v1.1


From 6adb1236b5c1220987209aa68192e0cbad73e9fc Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Wed, 28 Sep 2011 18:26:05 -0400
Subject: block: Change module.h -> export.h in bsg-lib.c

This file isn't using full modular functionality, and hence
can be "downgraded" to just using the export.h header.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 block/bsg-lib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 6690e6e..7ad49c8 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -25,7 +25,7 @@
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/bsg-lib.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <scsi/scsi_cmnd.h>
 
 /**
-- 
cgit v1.1


From 6dd9ad7df2019b1e33a372a501907db293ebcd0d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 3 Nov 2011 18:52:11 +0100
Subject: block: don't call blk_drain_queue() if elevator is not up

blk_cleanup_queue() may be called before elevator is set up on a
queue which triggers the following oops.

 BUG: unable to handle kernel NULL pointer dereference at           (null)
 IP: [<ffffffff8125a69c>] elv_drain_elevator+0x1c/0x70
 ...
 Pid: 830, comm: kworker/0:2 Not tainted 3.1.0-next-20111025_64+ #1590
 Bochs Bochs
 RIP: 0010:[<ffffffff8125a69c>]  [<ffffffff8125a69c>] elv_drain_elevator+0x1c/0x70
 ...
 Call Trace:
  [<ffffffff8125da92>] blk_drain_queue+0x42/0x70
  [<ffffffff8125db90>] blk_cleanup_queue+0xd0/0x1c0
  [<ffffffff81469640>] md_free+0x50/0x70
  [<ffffffff8126f43b>] kobject_release+0x8b/0x1d0
  [<ffffffff81270d56>] kref_put+0x36/0xa0
  [<ffffffff8126f2b7>] kobject_put+0x27/0x60
  [<ffffffff814693af>] mddev_delayed_delete+0x2f/0x40
  [<ffffffff81083450>] process_one_work+0x100/0x3b0
  [<ffffffff8108527f>] worker_thread+0x15f/0x3a0
  [<ffffffff81089937>] kthread+0x87/0x90
  [<ffffffff81621834>] kernel_thread_helper+0x4/0x10

Fix it by making blk_cleanup_queue() check whether q->elevator is set
up before invoking blk_drain_queue.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index f658711..f43c8a5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -407,8 +407,13 @@ void blk_cleanup_queue(struct request_queue *q)
 	spin_unlock_irq(lock);
 	mutex_unlock(&q->sysfs_lock);
 
-	/* drain all requests queued before DEAD marking */
-	blk_drain_queue(q, true);
+	/*
+	 * Drain all requests queued before DEAD marking.  The caller might
+	 * be trying to tear down @q before its elevator is initialized, in
+	 * which case we don't want to call into draining.
+	 */
+	if (q->elevator)
+		blk_drain_queue(q, true);
 
 	/* @q won't process any more request, flush async actions */
 	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
-- 
cgit v1.1


From d0985394e7fee6b25a7cc8335d45bc1c1a8ab2d3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 10 Nov 2011 09:03:55 +0100
Subject: block: Revert "[SCSI] genhd: add a new attribute "alias" in gendisk"

This reverts commit a72c5e5eb738033938ab30d6a634b74d1d060f10.

The commit introduced alias for block devices which is intended to be
used during logging although actual usage hasn't been committed yet.
This approach adds very limited benefit (raw log might be easier to
follow) which can be trivially implemented in userland but has a lot
of problems.

It is much worse than netif renames because it doesn't rename the
actual device but just adds conveninence name which isn't used
universally or enforced.  Everything internal including device lookup
and sysfs still uses the internal name and nothing prevents two
devices from using conflicting alias - ie. sda can have sdb as its
alias.

This has been nacked by people working on device driver core, block
layer and kernel-userland interface and shouldn't have been
upstreamed.  Revert it.

 http://thread.gmane.org/gmane.linux.kernel/1155104
 http://thread.gmane.org/gmane.linux.scsi/68632
 http://thread.gmane.org/gmane.linux.scsi/69776

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
 Acked-by: Kay Sievers <kay.sievers@vrfy.org>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Nao Nishijima <nao.nishijima.xt@hitachi.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c | 71 -----------------------------------------------------------
 1 file changed, 71 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 9253839..02e9fca 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -19,7 +19,6 @@
 #include <linux/mutex.h>
 #include <linux/idr.h>
 #include <linux/log2.h>
-#include <linux/ctype.h>
 
 #include "blk.h"
 
@@ -916,74 +915,6 @@ static int __init genhd_device_init(void)
 
 subsys_initcall(genhd_device_init);
 
-static ssize_t alias_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
-{
-	struct gendisk *disk = dev_to_disk(dev);
-	ssize_t ret = 0;
-
-	if (disk->alias)
-		ret = snprintf(buf, ALIAS_LEN, "%s\n", disk->alias);
-	return ret;
-}
-
-static ssize_t alias_store(struct device *dev, struct device_attribute *attr,
-			   const char *buf, size_t count)
-{
-	struct gendisk *disk = dev_to_disk(dev);
-	char *alias;
-	char *envp[] = { NULL, NULL };
-	unsigned char c;
-	int i;
-	ssize_t ret = count;
-
-	if (!count)
-		return -EINVAL;
-
-	if (count >= ALIAS_LEN) {
-		printk(KERN_ERR "alias: alias is too long\n");
-		return -EINVAL;
-	}
-
-	/* Validation check */
-	for (i = 0; i < count; i++) {
-		c = buf[i];
-		if (i == count - 1 && c == '\n')
-			break;
-		if (!isalnum(c) && c != '_' && c != '-') {
-			printk(KERN_ERR "alias: invalid alias\n");
-			return -EINVAL;
-		}
-	}
-
-	if (disk->alias) {
-		printk(KERN_INFO "alias: %s is already assigned (%s)\n",
-		       disk->disk_name, disk->alias);
-		return -EINVAL;
-	}
-
-	alias = kasprintf(GFP_KERNEL, "%s", buf);
-	if (!alias)
-		return -ENOMEM;
-
-	if (alias[count - 1] == '\n')
-		alias[count - 1] = '\0';
-
-	envp[0] = kasprintf(GFP_KERNEL, "ALIAS=%s", alias);
-	if (!envp[0]) {
-		kfree(alias);
-		return -ENOMEM;
-	}
-
-	disk->alias = alias;
-	printk(KERN_INFO "alias: assigned %s to %s\n", alias, disk->disk_name);
-
-	kobject_uevent_env(&dev->kobj, KOBJ_ADD, envp);
-
-	kfree(envp[0]);
-	return ret;
-}
-
 static ssize_t disk_range_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
 {
@@ -1043,7 +974,6 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
 	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
 }
 
-static DEVICE_ATTR(alias, S_IRUGO|S_IWUSR, alias_show, alias_store);
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
@@ -1066,7 +996,6 @@ static struct device_attribute dev_attr_fail_timeout =
 #endif
 
 static struct attribute *disk_attrs[] = {
-	&dev_attr_alias.attr,
 	&dev_attr_range.attr,
 	&dev_attr_ext_range.attr,
 	&dev_attr_removable.attr,
-- 
cgit v1.1


From 6b76106d8ef31111d6fc469564b83b5f5542794f Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Sun, 13 Nov 2011 19:58:09 +0100
Subject: block: Always check length of all iov entries in
 blk_rq_map_user_iov()

Even after commit 5478755616ae2ef1ce144dded589b62b2a50d575
("block: check for proper length of iov entries earlier ...")
we still won't check for zero-length entries after an unaligned
entry.  Remove the break-statement, so all entries are checked.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-map.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-map.c b/block/blk-map.c
index e663ac2..164cd00 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -204,10 +204,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		if (!iov[i].iov_len)
 			return -EINVAL;
 
-		if (uaddr & queue_dma_alignment(q)) {
+		/*
+		 * Keep going so we check length of all segments
+		 */
+		if (uaddr & queue_dma_alignment(q))
 			unaligned = 1;
-			break;
-		}
 	}
 
 	if (unaligned || (q->dma_pad_mask & len) || map_data)
-- 
cgit v1.1


From 3540d5e89b2ac268fcfc9b07a50a9ba4acc2e5e5 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Wed, 16 Nov 2011 09:21:50 +0100
Subject: block: avoid unnecessary plug list flush

get_request_wait() could sleep and flush the plug list.  If the list is
already flushed, don't flush again.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index f43c8a5..6403c12 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1379,15 +1379,17 @@ get_rq:
 		 */
 		if (list_empty(&plug->list))
 			trace_block_plug(q);
-		else if (!plug->should_sort) {
-			struct request *__rq;
+		else {
+			if (!plug->should_sort) {
+				struct request *__rq;
 
-			__rq = list_entry_rq(plug->list.prev);
-			if (__rq->q != q)
-				plug->should_sort = 1;
+				__rq = list_entry_rq(plug->list.prev);
+				if (__rq->q != q)
+					plug->should_sort = 1;
+			}
+			if (request_count >= BLK_MAX_REQUEST_COUNT)
+				blk_flush_plug_list(plug, false);
 		}
-		if (request_count >= BLK_MAX_REQUEST_COUNT)
-			blk_flush_plug_list(plug, false);
 		list_add_tail(&req->queuelist, &plug->list);
 		drive_stat_acct(req, 1);
 	} else {
-- 
cgit v1.1


From 019ceb7d5d252ce71001a157cf29f4ac28501b72 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Wed, 16 Nov 2011 09:21:50 +0100
Subject: block: add missed trace_block_plug

After flush plug list, the list has no request, so we need to add a
trace_block_plug().

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6403c12..ea70e6c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1387,8 +1387,10 @@ get_rq:
 				if (__rq->q != q)
 					plug->should_sort = 1;
 			}
-			if (request_count >= BLK_MAX_REQUEST_COUNT)
+			if (request_count >= BLK_MAX_REQUEST_COUNT) {
 				blk_flush_plug_list(plug, false);
+				trace_block_plug(q);
+			}
 		}
 		list_add_tail(&req->queuelist, &plug->list);
 		drive_stat_acct(req, 1);
-- 
cgit v1.1


From 5151412dd4338b273afdb107c3772528e9e67d92 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 23 Nov 2011 10:59:13 +0100
Subject: block: initialize request_queue's numa node during

struct request_queue is allocated with __GFP_ZERO so its "node" field is
zero before initialization.  This causes an oops if node 0 is offline in
the page allocator because its zonelists are not initialized.  From Dave
Young's dmesg:

	SRAT: Node 1 PXM 2 0-d0000000
	SRAT: Node 1 PXM 2 100000000-330000000
	SRAT: Node 0 PXM 1 330000000-630000000
	Initmem setup node 1 0000000000000000-000000000affb000
	...
	Built 1 zonelists in Node order, mobility grouping on.
	...
	BUG: unable to handle kernel paging request at 0000000000001c08
	IP: [<ffffffff8111c355>] __alloc_pages_nodemask+0xb5/0x870

and __alloc_pages_nodemask+0xb5 translates to a NULL pointer on
zonelist->_zonerefs.

The fix is to initialize q->node at the time of allocation so the correct
node is passed to the slab allocator later.

Since blk_init_allocated_queue_node() is no longer needed, merge it with
blk_init_allocated_queue().

[rientjes@google.com: changelog, initializing q->node]
Cc: stable@vger.kernel.org [2.6.37+]
Reported-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Tested-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index ea70e6c..20d69f6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -467,6 +467,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	q->backing_dev_info.state = 0;
 	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
 	q->backing_dev_info.name = "block";
+	q->node = node_id;
 
 	err = bdi_init(&q->backing_dev_info);
 	if (err) {
@@ -551,7 +552,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	if (!uninit_q)
 		return NULL;
 
-	q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
+	q = blk_init_allocated_queue(uninit_q, rfn, lock);
 	if (!q)
 		blk_cleanup_queue(uninit_q);
 
@@ -563,18 +564,9 @@ struct request_queue *
 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 			 spinlock_t *lock)
 {
-	return blk_init_allocated_queue_node(q, rfn, lock, -1);
-}
-EXPORT_SYMBOL(blk_init_allocated_queue);
-
-struct request_queue *
-blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
-			      spinlock_t *lock, int node_id)
-{
 	if (!q)
 		return NULL;
 
-	q->node = node_id;
 	if (blk_init_free_list(q))
 		return NULL;
 
@@ -604,7 +596,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 
 	return NULL;
 }
-EXPORT_SYMBOL(blk_init_allocated_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue);
 
 int blk_get_queue(struct request_queue *q)
 {
-- 
cgit v1.1


From 2984ff38ccf6cbc02a7a996a36c7d6f69f3c6146 Mon Sep 17 00:00:00 2001
From: majianpeng <majianpeng@gmail.com>
Date: Wed, 30 Nov 2011 15:47:48 +0100
Subject: cfq-iosched: free cic_index if blkio_alloc_blkg_stats fails

If we fail allocating the blkpg stats, we free cfqd and cfgq.
But we need to free the IDA cfqd->cic_index as well.

Signed-off-by: majianpeng <majianpeng@gmail.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 16ace89..3beed83 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4036,6 +4036,11 @@ static void *cfq_init_queue(struct request_queue *q)
 
 	if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
 		kfree(cfqg);
+
+		spin_lock(&cic_index_lock);
+		ida_remove(&cic_index_ida, cfqd->cic_index);
+		spin_unlock(&cic_index_lock);
+
 		kfree(cfqd);
 		return NULL;
 	}
-- 
cgit v1.1


From 5eb46851de3904cd1be9192fdacb8d34deadc1fc Mon Sep 17 00:00:00 2001
From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Date: Fri, 2 Dec 2011 10:07:07 +0100
Subject: cfq-iosched: fix cfq_cic_link() race confition

cfq_cic_link() has race condition. When some processes which shared ioc
issue I/O to same block device simultaneously, cfq_cic_link() returns -EEXIST
sometimes. The race condition might stop I/O by following steps:

step  1: Process A: Issue an I/O to /dev/sda
step  2: Process A: Get an ioc (iocA here) in get_io_context() which does not
		    linked with a cic for the device
step  3: Process A: Get a new cic for the device (cicA here) in
		    cfq_alloc_io_context()

step  4: Process B: Issue an I/O to /dev/sda
step  5: Process B: Get iocA in get_io_context() since process A and B share the
		    same ioc
step  6: Process B: Get a new cic for the device (cicB here) in
		    cfq_alloc_io_context() since iocA has not been linked with a
		    cic for the device yet

step  7: Process A: Link cicA to iocA in cfq_cic_link()
step  8: Process A: Dispatch I/O to driver and finish it

step  9: Process B: Try to link cicB to iocA in cfq_cic_link()
		    But it fails with showing "cfq: cic link failed!" kernel
		    message, since iocA has already linked with cicA at step 7.
step 10: Process B: Wait for finishig I/O in get_request_wait()
		    The function does not wake up, when there is no I/O to the
		    device.

When cfq_cic_link() returns -EEXIST, it means ioc has already linked with cic.
So when cfq_cic_link() return -EEXIST, retry cfq_cic_lookup().

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3beed83..4c12869 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3184,7 +3184,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
 		}
 	}
 
-	if (ret)
+	if (ret && ret != -EEXIST)
 		printk(KERN_ERR "cfq: cic link failed!\n");
 
 	return ret;
@@ -3200,6 +3200,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
 	struct io_context *ioc = NULL;
 	struct cfq_io_context *cic;
+	int ret;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
@@ -3207,6 +3208,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	if (!ioc)
 		return NULL;
 
+retry:
 	cic = cfq_cic_lookup(cfqd, ioc);
 	if (cic)
 		goto out;
@@ -3215,7 +3217,12 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	if (cic == NULL)
 		goto err;
 
-	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
+	ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask);
+	if (ret == -EEXIST) {
+		/* someone has linked cic to ioc already */
+		cfq_cic_free(cic);
+		goto retry;
+	} else if (ret)
 		goto err_free;
 
 out:
-- 
cgit v1.1


From 4eabc941259f9d8c8fb71746d3f30c87e1d9e49b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Dec 2011 20:03:04 +0100
Subject: block: don't kick empty queue in blk_drain_queue()

While probing, fd sets up queue, probes hardware and tears down the
queue if probing fails.  In the process, blk_drain_queue() kicks the
queue which failed to finish initialization and fd is unhappy about
that.

  floppy0: no floppy controllers found
  ------------[ cut here ]------------
  WARNING: at drivers/block/floppy.c:2929 do_fd_request+0xbf/0xd0()
  Hardware name: To Be Filled By O.E.M.
  VFS: do_fd_request called on non-open device
  Modules linked in:
  Pid: 1, comm: swapper Not tainted 3.2.0-rc4-00077-g5983fe2 #2
  Call Trace:
   [<ffffffff81039a6a>] warn_slowpath_common+0x7a/0xb0
   [<ffffffff81039b41>] warn_slowpath_fmt+0x41/0x50
   [<ffffffff813d657f>] do_fd_request+0xbf/0xd0
   [<ffffffff81322b95>] blk_drain_queue+0x65/0x80
   [<ffffffff81322c93>] blk_cleanup_queue+0xe3/0x1a0
   [<ffffffff818a809d>] floppy_init+0xdeb/0xe28
   [<ffffffff818a72b2>] ? daring+0x6b/0x6b
   [<ffffffff810002af>] do_one_initcall+0x3f/0x170
   [<ffffffff81884b34>] kernel_init+0x9d/0x11e
   [<ffffffff810317c2>] ? schedule_tail+0x22/0xa0
   [<ffffffff815dbb14>] kernel_thread_helper+0x4/0x10
   [<ffffffff81884a97>] ? start_kernel+0x2be/0x2be
   [<ffffffff815dbb10>] ? gs_change+0xb/0xb

Avoid it by making blk_drain_queue() kick queue iff dispatch queue has
something on it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ralf Hildebrandt <Ralf.Hildebrandt@charite.de>
Reported-by: Wu Fengguang <fengguang.wu@intel.com>
Tested-by: Sergei Trofimovich <slyich@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 20d69f6..15de223 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -366,7 +366,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
 		if (drain_all)
 			blk_throtl_drain(q);
 
-		__blk_run_queue(q);
+		/*
+		 * This function might be called on a queue which failed
+		 * driver init after queue creation.  Some drivers
+		 * (e.g. fd) get unhappy in such cases.  Kick queue iff
+		 * dispatch queue has something on it.
+		 */
+		if (!list_empty(&q->queue_head))
+			__blk_run_queue(q);
 
 		if (drain_all)
 			nr_rqs = q->rq.count[0] + q->rq.count[1];
-- 
cgit v1.1


From 6ae0516b8a50ece5d766be608a305707e0450060 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 16 Dec 2011 14:04:23 +0100
Subject: block, cfq: fix empty queue crash caused by request merge

All requests of a queue could be merged to other requests of other queue.
Such queue will not have request in it, but it's in service tree. This
will cause kernel oops.
I encounter a BUG_ON() in cfq_dispatch_request() with next patch, but the
issue should exist without the patch.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4c12869..3548705 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1655,6 +1655,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 		    struct request *next)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
+	struct cfq_data *cfqd = q->elevator->elevator_data;
+
 	/*
 	 * reposition in fifo if next is older than rq
 	 */
@@ -1669,6 +1671,16 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 	cfq_remove_request(next);
 	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
 					rq_data_dir(next), rq_is_sync(next));
+
+	cfqq = RQ_CFQQ(next);
+	/*
+	 * all requests of this queue are merged to other queues, delete it
+	 * from the service tree. If it's the active_queue,
+	 * cfq_dispatch_requests() will choose to expire it or do idle
+	 */
+	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
+	    cfqq != cfqd->active_queue)
+		cfq_del_cfqq_rr(cfqd, cfqq);
 }
 
 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
-- 
cgit v1.1


From 609f6ea1c9cdfe0c43a927e13205a57d0c266d5a Mon Sep 17 00:00:00 2001
From: majianpeng <majianpeng@gmail.com>
Date: Wed, 21 Dec 2011 15:27:24 +0100
Subject: block: re-use existing 'reading' variable instead of checking
 direction again

Signed-off-by: majianpeng <majianpeng@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-map.c b/block/blk-map.c
index 164cd00..623e1cd 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
 
-	if (rq_data_dir(rq) == WRITE)
+	if (!reading)
 		bio->bi_rw |= REQ_WRITE;
 
 	if (do_copy)
-- 
cgit v1.1


From f2b20d436534f22ccc3f5ad172499fcb013bb315 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 29 Dec 2011 09:16:28 +0100
Subject: block: fix blk_queue_end_tag()

Commit 5e081591 "block: warn if tag is greater than real_max_depth"
cleaned up blk_queue_end_tag() to warn when the tag is truly invalid
(greater than real_max_depth).  However, it changed behavior in the tag <
max_depth case to not end the request.  Leading to triggering of
BUG_ON(blk_queued_rq(rq)) in the request completion path:

  http://marc.info/?l=linux-kernel&m=132204370518629&w=2

In order to allow blk_queue_resize_tags() to shrink the tag space
blk_queue_end_tag() must always complete tags with a value less than
real_max_depth regardless of the current max_depth.  The comment about
"handling the shrink case" seems to be what prompted changes in this
space, so remove it and BUG on all invalid tags (made even simpler by
Matthew's suggestion to use an unsigned compare).

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Tao Ma <boyu.mt@taobao.com>
Cc: Matthew Wilcox <matthew@wil.cx>
Reported-by: Meelis Roos <mroos@ut.ee>
Reported-by: Ed Nadolski <edmund.nadolski@intel.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-tag.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'block')

diff --git a/block/blk-tag.c b/block/blk-tag.c
index e74d6d1..4af6f5c 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -282,18 +282,9 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
 void blk_queue_end_tag(struct request_queue *q, struct request *rq)
 {
 	struct blk_queue_tag *bqt = q->queue_tags;
-	int tag = rq->tag;
+	unsigned tag = rq->tag; /* negative tags invalid */
 
-	BUG_ON(tag == -1);
-
-	if (unlikely(tag >= bqt->max_depth)) {
-		/*
-		 * This can happen after tag depth has been reduced.
-		 * But tag shouldn't be larger than real_max_depth.
-		 */
-		WARN_ON(tag >= bqt->real_max_depth);
-		return;
-	}
+	BUG_ON(tag >= bqt->real_max_depth);
 
 	list_del_init(&rq->queuelist);
 	rq->cmd_flags &= ~REQ_QUEUED;
-- 
cgit v1.1


From 36a7ce632fb11cce578b7fdf4b4bbc8fbb99987a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 12 Jan 2012 16:01:27 +0100
Subject: block: add and use scsi_blk_cmd_ioctl

commit 577ebb374c78314ac4617242f509e2f5e7156649 upstream.

Introduce a wrapper around scsi_cmd_ioctl that takes a block device.

The function will then be enhanced to detect partition block devices
and, in that case, subject the ioctls to whitelisting.

Cc: linux-scsi@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Cc: James Bottomley <JBottomley@parallels.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/scsi_ioctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index fbdf0d8..a2c11f3 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -690,6 +690,13 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 }
 EXPORT_SYMBOL(scsi_cmd_ioctl);
 
+int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
+		       unsigned int cmd, void __user *arg)
+{
+	return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
+}
+EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
+
 static int __init blk_scsi_ioctl_init(void)
 {
 	blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
-- 
cgit v1.1


From d33310de0d7914f2e27ed4fef67a1979f10037e1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 12 Jan 2012 16:01:28 +0100
Subject: block: fail SCSI passthrough ioctls on partition devices

commit 0bfc96cb77224736dfa35c3c555d37b3646ef35e upstream.

[ Changes with respect to 3.3: return -ENOTTY from scsi_verify_blk_ioctl
  and -ENOIOCTLCMD from sd_compat_ioctl. ]

Linux allows executing the SG_IO ioctl on a partition or LVM volume, and
will pass the command to the underlying block device.  This is
well-known, but it is also a large security problem when (via Unix
permissions, ACLs, SELinux or a combination thereof) a program or user
needs to be granted access only to part of the disk.

This patch lets partitions forward a small set of harmless ioctls;
others are logged with printk so that we can see which ioctls are
actually sent.  In my tests only CDROM_GET_CAPABILITY actually occurred.
Of course it was being sent to a (partition on a) hard disk, so it would
have failed with ENOTTY and the patch isn't changing anything in
practice.  Still, I'm treating it specially to avoid spamming the logs.

In principle, this restriction should include programs running with
CAP_SYS_RAWIO.  If for example I let a program access /dev/sda2 and
/dev/sdb, it still should not be able to read/write outside the
boundaries of /dev/sda2 independent of the capabilities.  However, for
now programs with CAP_SYS_RAWIO will still be allowed to send the
ioctls.  Their actions will still be logged.

This patch does not affect the non-libata IDE driver.  That driver
however already tests for bd != bd->bd_contains before issuing some
ioctl; it could be restricted further to forbid these ioctls even for
programs running with CAP_SYS_ADMIN/CAP_SYS_RAWIO.

Cc: linux-scsi@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Cc: James Bottomley <JBottomley@parallels.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[ Make it also print the command name when warning - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/scsi_ioctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a2c11f3..688be8a 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -24,6 +24,7 @@
 #include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/cdrom.h>
+#include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <linux/times.h>
 #include <asm/uaccess.h>
@@ -690,9 +691,53 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 }
 EXPORT_SYMBOL(scsi_cmd_ioctl);
 
+int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
+{
+	if (bd && bd == bd->bd_contains)
+		return 0;
+
+	/* Actually none of these is particularly useful on a partition,
+	 * but they are safe.
+	 */
+	switch (cmd) {
+	case SCSI_IOCTL_GET_IDLUN:
+	case SCSI_IOCTL_GET_BUS_NUMBER:
+	case SCSI_IOCTL_GET_PCI:
+	case SCSI_IOCTL_PROBE_HOST:
+	case SG_GET_VERSION_NUM:
+	case SG_SET_TIMEOUT:
+	case SG_GET_TIMEOUT:
+	case SG_GET_RESERVED_SIZE:
+	case SG_SET_RESERVED_SIZE:
+	case SG_EMULATED_HOST:
+		return 0;
+	case CDROM_GET_CAPABILITY:
+		/* Keep this until we remove the printk below.  udev sends it
+		 * and we do not want to spam dmesg about it.   CD-ROMs do
+		 * not have partitions, so we get here only for disks.
+		 */
+		return -ENOTTY;
+	default:
+		break;
+	}
+
+	/* In particular, rule out all resets and host-specific ioctls.  */
+	printk_ratelimited(KERN_WARNING
+			   "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
+
+	return capable(CAP_SYS_RAWIO) ? 0 : -ENOTTY;
+}
+EXPORT_SYMBOL(scsi_verify_blk_ioctl);
+
 int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
 		       unsigned int cmd, void __user *arg)
 {
+	int ret;
+
+	ret = scsi_verify_blk_ioctl(bd, cmd);
+	if (ret < 0)
+		return ret;
+
 	return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
 }
 EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
-- 
cgit v1.1


From 6e118375b14e1cd89cab325c79faf6f48dc9c50b Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Wed, 8 Feb 2012 20:02:03 +0100
Subject: bsg: fix sysfs link remove warning

commit 37b40adf2d1b4a5e51323be73ccf8ddcf3f15dd3 upstream.

We create "bsg" link if q->kobj.sd is not NULL, so remove it only
when the same condition is true.

Fixes:

WARNING: at fs/sysfs/inode.c:323 sysfs_hash_and_remove+0x2b/0x77()
sysfs: can not remove 'bsg', no directory
Call Trace:
  [<c0429683>] warn_slowpath_common+0x6a/0x7f
  [<c0537a68>] ? sysfs_hash_and_remove+0x2b/0x77
  [<c042970b>] warn_slowpath_fmt+0x2b/0x2f
  [<c0537a68>] sysfs_hash_and_remove+0x2b/0x77
  [<c053969a>] sysfs_remove_link+0x20/0x23
  [<c05d88f1>] bsg_unregister_queue+0x40/0x6d
  [<c0692263>] __scsi_remove_device+0x31/0x9d
  [<c069149f>] scsi_forget_host+0x41/0x52
  [<c0689fa9>] scsi_remove_host+0x71/0xe0
  [<f7de5945>] quiesce_and_remove_host+0x51/0x83 [usb_storage]
  [<f7de5a1e>] usb_stor_disconnect+0x18/0x22 [usb_storage]
  [<c06c29de>] usb_unbind_interface+0x4e/0x109
  [<c067a80f>] __device_release_driver+0x6b/0xa6
  [<c067a861>] device_release_driver+0x17/0x22
  [<c067a46a>] bus_remove_device+0xd6/0xe6
  [<c06785e2>] device_del+0xf2/0x137
  [<c06c101f>] usb_disable_device+0x94/0x1a0

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 block/bsg.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bsg.c b/block/bsg.c
index 702f131..c0ab25c 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -985,7 +985,8 @@ void bsg_unregister_queue(struct request_queue *q)
 
 	mutex_lock(&bsg_mutex);
 	idr_remove(&bsg_minor_idr, bcd->minor);
-	sysfs_remove_link(&q->kobj, "bsg");
+	if (q->kobj.sd)
+		sysfs_remove_link(&q->kobj, "bsg");
 	device_unregister(bcd->class_dev);
 	bcd->class_dev = NULL;
 	kref_put(&bcd->ref, bsg_kref_release_function);
-- 
cgit v1.1


From eee92c36399fe1c0d19de4982e9cfdf526b37922 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Fri, 2 Mar 2012 10:43:28 +0100
Subject: block: fix __blkdev_get and add_disk race condition

commit 9f53d2fe815b4011ff930a7b6db98385d45faa68 upstream.

The following situation might occur:

__blkdev_get:			add_disk:

				register_disk()
get_gendisk()

disk_block_events()
	disk->ev == NULL

				disk_add_events()

__disk_unblock_events()
	disk->ev != NULL
	--ev->block

Then we unblock events, when they are suppose to be blocked. This can
trigger events related block/genhd.c warnings, but also can crash in
sd_check_events() or other places.

I'm able to reproduce crashes with the following scripts (with
connected usb dongle as sdb disk).

<snip>
DEV=/dev/sdb
ENABLE=/sys/bus/usb/devices/1-2/bConfigurationValue

function stop_me()
{
	for i in `jobs -p` ; do kill $i 2> /dev/null ; done
	exit
}

trap stop_me SIGHUP SIGINT SIGTERM

for ((i = 0; i < 10; i++)) ; do
	while true; do fdisk -l $DEV  2>&1 > /dev/null ; done &
done

while true ; do
echo 1 > $ENABLE
sleep 1
echo 0 > $ENABLE
done
</snip>

I use the script to verify patch fixing oops in sd_revalidate_disk
http://marc.info/?l=linux-scsi&m=132935572512352&w=2
Without Jun'ichi Nomura patch titled "Fix NULL pointer dereference in
sd_revalidate_disk" or this one, script easily crash kernel within
a few seconds. With both patches applied I do not observe crash.
Unfortunately after some time (dozen of minutes), script will hung in:

[ 1563.906432]  [<c08354f5>] schedule_timeout_uninterruptible+0x15/0x20
[ 1563.906437]  [<c04532d5>] msleep+0x15/0x20
[ 1563.906443]  [<c05d60b2>] blk_drain_queue+0x32/0xd0
[ 1563.906447]  [<c05d6e00>] blk_cleanup_queue+0xd0/0x170
[ 1563.906454]  [<c06d278f>] scsi_free_queue+0x3f/0x60
[ 1563.906459]  [<c06d7e6e>] __scsi_remove_device+0x6e/0xb0
[ 1563.906463]  [<c06d4aff>] scsi_forget_host+0x4f/0x60
[ 1563.906468]  [<c06cd84a>] scsi_remove_host+0x5a/0xf0
[ 1563.906482]  [<f7f030fb>] quiesce_and_remove_host+0x5b/0xa0 [usb_storage]
[ 1563.906490]  [<f7f03203>] usb_stor_disconnect+0x13/0x20 [usb_storage]

Anyway I think this patch is some step forward.

As drawback, I do not teardown on sysfs file create error, because I do
not know how to nullify disk->ev (since it can be used). However add_disk
error handling practically does not exist too, and things will work
without this sysfs file, except events will not be exported to user
space.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 block/genhd.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 02e9fca..9db720d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -36,6 +36,7 @@ static DEFINE_IDR(ext_devt_idr);
 
 static struct device_type disk_type;
 
+static void disk_alloc_events(struct gendisk *disk);
 static void disk_add_events(struct gendisk *disk);
 static void disk_del_events(struct gendisk *disk);
 static void disk_release_events(struct gendisk *disk);
@@ -602,6 +603,8 @@ void add_disk(struct gendisk *disk)
 	disk->major = MAJOR(devt);
 	disk->first_minor = MINOR(devt);
 
+	disk_alloc_events(disk);
+
 	/* Register BDI before referencing it from bdev */
 	bdi = &disk->queue->backing_dev_info;
 	bdi_register_dev(bdi, disk_devt(disk));
@@ -1734,9 +1737,9 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
 		&disk_events_dfl_poll_msecs, 0644);
 
 /*
- * disk_{add|del|release}_events - initialize and destroy disk_events.
+ * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
  */
-static void disk_add_events(struct gendisk *disk)
+static void disk_alloc_events(struct gendisk *disk)
 {
 	struct disk_events *ev;
 
@@ -1749,16 +1752,6 @@ static void disk_add_events(struct gendisk *disk)
 		return;
 	}
 
-	if (sysfs_create_files(&disk_to_dev(disk)->kobj,
-			       disk_events_attrs) < 0) {
-		pr_warn("%s: failed to create sysfs files for events\n",
-			disk->disk_name);
-		kfree(ev);
-		return;
-	}
-
-	disk->ev = ev;
-
 	INIT_LIST_HEAD(&ev->node);
 	ev->disk = disk;
 	spin_lock_init(&ev->lock);
@@ -1767,8 +1760,21 @@ static void disk_add_events(struct gendisk *disk)
 	ev->poll_msecs = -1;
 	INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
 
+	disk->ev = ev;
+}
+
+static void disk_add_events(struct gendisk *disk)
+{
+	if (!disk->ev)
+		return;
+
+	/* FIXME: error handling */
+	if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
+		pr_warn("%s: failed to create sysfs files for events\n",
+			disk->disk_name);
+
 	mutex_lock(&disk_events_mutex);
-	list_add_tail(&ev->node, &disk_events);
+	list_add_tail(&disk->ev->node, &disk_events);
 	mutex_unlock(&disk_events_mutex);
 
 	/*
-- 
cgit v1.1


From 2053689f68e19b8c1bb38aa68049c57576eed6e0 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 2 Mar 2012 10:51:00 +0100
Subject: Block: use a freezable workqueue for disk-event polling

commit 62d3c5439c534b0e6c653fc63e6d8c67be3a57b1 upstream.

This patch (as1519) fixes a bug in the block layer's disk-events
polling.  The polling is done by a work routine queued on the
system_nrt_wq workqueue.  Since that workqueue isn't freezable, the
polling continues even in the middle of a system sleep transition.

Obviously, polling a suspended drive for media changes and such isn't
a good thing to do; in the case of USB mass-storage devices it can
lead to real problems requiring device resets and even re-enumeration.

The patch fixes things by creating a new system-wide, non-reentrant,
freezable workqueue and using it for disk-events polling.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 block/genhd.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 9db720d..997afd6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1479,9 +1479,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
 	intv = disk_events_poll_jiffies(disk);
 	set_timer_slack(&ev->dwork.timer, intv / 4);
 	if (check_now)
-		queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0);
 	else if (intv)
-		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv);
 out_unlock:
 	spin_unlock_irqrestore(&ev->lock, flags);
 }
@@ -1525,7 +1525,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
 	ev->clearing |= mask;
 	if (!ev->block) {
 		cancel_delayed_work(&ev->dwork);
-		queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0);
 	}
 	spin_unlock_irq(&ev->lock);
 }
@@ -1562,7 +1562,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
 
 	/* uncondtionally schedule event check and wait for it to finish */
 	disk_block_events(disk);
-	queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+	queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0);
 	flush_delayed_work(&ev->dwork);
 	__disk_unblock_events(disk, false);
 
@@ -1599,7 +1599,7 @@ static void disk_events_workfn(struct work_struct *work)
 
 	intv = disk_events_poll_jiffies(disk);
 	if (!ev->block && intv)
-		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv);
 
 	spin_unlock_irq(&ev->lock);
 
-- 
cgit v1.1


From 9fd246e048bbd8569a84b412382cd299e0313a62 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 15 May 2012 08:22:04 +0200
Subject: block: fix buffer overflow when printing partition UUIDs

commit 05c69d298c96703741cac9a5cbbf6c53bd55a6e2 upstream.

6d1d8050b4bc8 "block, partition: add partition_meta_info to hd_struct"
added part_unpack_uuid() which assumes that the passed in buffer has
enough space for sprintfing "%pU" - 37 characters including '\0'.

Unfortunately, b5af921ec0233 "init: add support for root devices
specified by partition UUID" supplied 33 bytes buffer to the function
leading to the following panic with stackprotector enabled.

  Kernel panic - not syncing: stack-protector: Kernel stack corrupted in: ffffffff81b14c7e

  [<ffffffff815e226b>] panic+0xba/0x1c6
  [<ffffffff81b14c7e>] ? printk_all_partitions+0x259/0x26xb
  [<ffffffff810566bb>] __stack_chk_fail+0x1b/0x20
  [<ffffffff81b15c7e>] printk_all_paritions+0x259/0x26xb
  [<ffffffff81aedfe0>] mount_block_root+0x1bc/0x27f
  [<ffffffff81aee0fa>] mount_root+0x57/0x5b
  [<ffffffff81aee23b>] prepare_namespace+0x13d/0x176
  [<ffffffff8107eec0>] ? release_tgcred.isra.4+0x330/0x30
  [<ffffffff81aedd60>] kernel_init+0x155/0x15a
  [<ffffffff81087b97>] ? schedule_tail+0x27/0xb0
  [<ffffffff815f4d24>] kernel_thread_helper+0x5/0x10
  [<ffffffff81aedc0b>] ? start_kernel+0x3c5/0x3c5
  [<ffffffff815f4d20>] ? gs_change+0x13/0x13

Increase the buffer size, remove the dangerous part_unpack_uuid() and
use snprintf() directly from printk_all_partitions().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Szymon Gruszczynski <sz.gruszczynski@googlemail.com>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/genhd.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 997afd6..4927476 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -744,7 +744,7 @@ void __init printk_all_partitions(void)
 		struct hd_struct *part;
 		char name_buf[BDEVNAME_SIZE];
 		char devt_buf[BDEVT_SIZE];
-		u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
+		char uuid_buf[PARTITION_META_INFO_UUIDLTH * 2 + 5];
 
 		/*
 		 * Don't show empty devices or things that have been
@@ -763,14 +763,16 @@ void __init printk_all_partitions(void)
 		while ((part = disk_part_iter_next(&piter))) {
 			bool is_part0 = part == &disk->part0;
 
-			uuid[0] = 0;
+			uuid_buf[0] = '\0';
 			if (part->info)
-				part_unpack_uuid(part->info->uuid, uuid);
+				snprintf(uuid_buf, sizeof(uuid_buf), "%pU",
+					 part->info->uuid);
 
 			printk("%s%s %10llu %s %s", is_part0 ? "" : "  ",
 			       bdevt_str(part_devt(part), devt_buf),
 			       (unsigned long long)part->nr_sects >> 1,
-			       disk_name(disk, part->partno, name_buf), uuid);
+			       disk_name(disk, part->partno, name_buf),
+			       uuid_buf);
 			if (is_part0) {
 				if (disk->driverfs_dev != NULL &&
 				    disk->driverfs_dev->driver != NULL)
-- 
cgit v1.1


From f45c9a6eec20cd712421c442785e7a4e9215a230 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 15 Jun 2012 12:52:46 +0200
Subject: scsi: Silence unnecessary warnings about ioctl to partition

commit 6d9359280753d2955f86d6411047516a9431eb51 upstream.

Sometimes, warnings about ioctls to partition happen often enough that they
form majority of the warnings in the kernel log and users complain. In some
cases warnings are about ioctls such as SG_IO so it's not good to get rid of
the warnings completely as they can ease debugging of userspace problems
when ioctl is refused.

Since I have seen warnings from lots of commands, including some proprietary
userspace applications, I don't think disallowing the ioctls for processes
with CAP_SYS_RAWIO will happen in the near future if ever. So lets just
stop warning for processes with CAP_SYS_RAWIO for which ioctl is allowed.

CC: Paolo Bonzini <pbonzini@redhat.com>
CC: James Bottomley <JBottomley@parallels.com>
CC: linux-scsi@vger.kernel.org
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
[bwh: Backported to 3.2: use ENOTTY, not ENOIOCTLCMD]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/scsi_ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 688be8a..9e76a32 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -721,11 +721,14 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
 		break;
 	}
 
+	if (capable(CAP_SYS_RAWIO))
+		return 0;
+
 	/* In particular, rule out all resets and host-specific ioctls.  */
 	printk_ratelimited(KERN_WARNING
 			   "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
 
-	return capable(CAP_SYS_RAWIO) ? 0 : -ENOTTY;
+	return -ENOTTY;
 }
 EXPORT_SYMBOL(scsi_verify_blk_ioctl);
 
-- 
cgit v1.1


From 68e9e9fee2bbec4853a993e98b0df8479292f572 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Dec 2011 00:33:37 +0100
Subject: block: add blk_queue_dead()

commit 34f6055c80285e4efb3f602a9119db75239744dc upstream.

There are a number of QUEUE_FLAG_DEAD tests.  Add blk_queue_dead()
macro and use it.

This patch doesn't introduce any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/blk-core.c     | 6 +++---
 block/blk-exec.c     | 2 +-
 block/blk-sysfs.c    | 4 ++--
 block/blk-throttle.c | 4 ++--
 block/blk.h          | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 15de223..49d9e91 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -607,7 +607,7 @@ EXPORT_SYMBOL(blk_init_allocated_queue);
 
 int blk_get_queue(struct request_queue *q)
 {
-	if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+	if (likely(!blk_queue_dead(q))) {
 		kobject_get(&q->kobj);
 		return 0;
 	}
@@ -754,7 +754,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	int may_queue;
 
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+	if (unlikely(blk_queue_dead(q)))
 		return NULL;
 
 	may_queue = elv_may_queue(q, rw_flags);
@@ -874,7 +874,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		struct io_context *ioc;
 		struct request_list *rl = &q->rq;
 
-		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		if (unlikely(blk_queue_dead(q)))
 			return NULL;
 
 		prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
diff --git a/block/blk-exec.c b/block/blk-exec.c
index a1ebceb..6053285 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -50,7 +50,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+	if (unlikely(blk_queue_dead(q))) {
 		rq->errors = -ENXIO;
 		if (rq->end_io)
 			rq->end_io(rq, rq->errors);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e7f9f65..f0b2ca8 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -425,7 +425,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 	if (!entry->show)
 		return -EIO;
 	mutex_lock(&q->sysfs_lock);
-	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+	if (blk_queue_dead(q)) {
 		mutex_unlock(&q->sysfs_lock);
 		return -ENOENT;
 	}
@@ -447,7 +447,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 
 	q = container_of(kobj, struct request_queue, kobj);
 	mutex_lock(&q->sysfs_lock);
-	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+	if (blk_queue_dead(q)) {
 		mutex_unlock(&q->sysfs_lock);
 		return -ENOENT;
 	}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4553245..5eed6a7 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -310,7 +310,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	struct request_queue *q = td->queue;
 
 	/* no throttling for dead queue */
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+	if (unlikely(blk_queue_dead(q)))
 		return NULL;
 
 	rcu_read_lock();
@@ -335,7 +335,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	spin_lock_irq(q->queue_lock);
 
 	/* Make sure @q is still alive */
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+	if (unlikely(blk_queue_dead(q))) {
 		kfree(tg);
 		return NULL;
 	}
diff --git a/block/blk.h b/block/blk.h
index 3f6551b..e38691d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -85,7 +85,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 			q->flush_queue_delayed = 1;
 			return NULL;
 		}
-		if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
+		if (unlikely(blk_queue_dead(q)) ||
 		    !q->elevator->ops->elevator_dispatch_fn(q, 0))
 			return NULL;
 	}
-- 
cgit v1.1


From 32eb44e290d1151e9eb26f57564dae6966190685 Mon Sep 17 00:00:00 2001
From: Roland Dreier <roland@purestorage.com>
Date: Thu, 22 Nov 2012 02:00:11 -0800
Subject: block: Don't access request after it might be freed

commit 893d290f1d7496db97c9471bc352ad4a11dc8a25 upstream.

After we've done __elv_add_request() and __blk_run_queue() in
blk_execute_rq_nowait(), the request might finish and be freed
immediately.  Therefore checking if the type is REQ_TYPE_PM_RESUME
isn't safe afterwards, because if it isn't, rq might be gone.
Instead, check beforehand and stash the result in a temporary.

This fixes crashes in blk_execute_rq_nowait() I get occasionally when
running with lots of memory debugging options enabled -- I think this
race is usually harmless because the window for rq to be reallocated
is so small.

Signed-off-by: Roland Dreier <roland@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
[bwh: Backported to 3.2: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/blk-exec.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 6053285..ac2c6e7 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -49,6 +49,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 			   rq_end_io_fn *done)
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
+	bool is_pm_resume;
 
 	if (unlikely(blk_queue_dead(q))) {
 		rq->errors = -ENXIO;
@@ -59,12 +60,18 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
 	rq->rq_disk = bd_disk;
 	rq->end_io = done;
+	/*
+	 * need to check this before __blk_run_queue(), because rq can
+	 * be freed before that returns.
+	 */
+	is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
+
 	WARN_ON(irqs_disabled());
 	spin_lock_irq(q->queue_lock);
 	__elv_add_request(q, rq, where);
 	__blk_run_queue(q);
 	/* the queue is stopped so it won't be run */
-	if (rq->cmd_type == REQ_TYPE_PM_RESUME)
+	if (is_pm_resume)
 		q->request_fn(q);
 	spin_unlock_irq(q->queue_lock);
 }
-- 
cgit v1.1


From 2abb7d3a3cf05900a6d9cc6b98b1ba3aa432ce30 Mon Sep 17 00:00:00 2001
From: Tomas Henzl <thenzl@redhat.com>
Date: Wed, 27 Feb 2013 17:03:32 -0800
Subject: block: fix ext_devt_idr handling

commit 7b74e912785a11572da43292786ed07ada7e3e0c upstream.

While adding and removing a lot of disks disks and partitions this
sometimes shows up:

  WARNING: at fs/sysfs/dir.c:512 sysfs_add_one+0xc9/0x130() (Not tainted)
  Hardware name:
  sysfs: cannot create duplicate filename '/dev/block/259:751'
  Modules linked in: raid1 autofs4 bnx2fc cnic uio fcoe libfcoe libfc 8021q scsi_transport_fc scsi_tgt garp stp llc sunrpc cpufreq_ondemand powernow_k8 freq_table mperf ipv6 dm_mirror dm_region_hash dm_log power_meter microcode dcdbas serio_raw amd64_edac_mod edac_core edac_mce_amd i2c_piix4 i2c_core k10temp bnx2 sg ixgbe dca mdio ext4 mbcache jbd2 dm_round_robin sr_mod cdrom sd_mod crc_t10dif ata_generic pata_acpi pata_atiixp ahci mptsas mptscsih mptbase scsi_transport_sas dm_multipath dm_mod [last unloaded: scsi_wait_scan]
  Pid: 44103, comm: async/16 Not tainted 2.6.32-195.el6.x86_64 #1
  Call Trace:
    warn_slowpath_common+0x87/0xc0
    warn_slowpath_fmt+0x46/0x50
    sysfs_add_one+0xc9/0x130
    sysfs_do_create_link+0x12b/0x170
    sysfs_create_link+0x13/0x20
    device_add+0x317/0x650
    idr_get_new+0x13/0x50
    add_partition+0x21c/0x390
    rescan_partitions+0x32b/0x470
    sd_open+0x81/0x1f0 [sd_mod]
    __blkdev_get+0x1b6/0x3c0
    blkdev_get+0x10/0x20
    register_disk+0x155/0x170
    add_disk+0xa6/0x160
    sd_probe_async+0x13b/0x210 [sd_mod]
    add_wait_queue+0x46/0x60
    async_thread+0x102/0x250
    default_wake_function+0x0/0x20
    async_thread+0x0/0x250
    kthread+0x96/0xa0
    child_rip+0xa/0x20
    kthread+0x0/0xa0
    child_rip+0x0/0x20

This most likely happens because dev_t is freed while the number is
still used and idr_get_new() is not protected on every use.  The fix
adds a mutex where it wasn't before and moves the dev_t free function so
it is called after device del.

Signed-off-by: Tomas Henzl <thenzl@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[bwh: Backported to 3.2: adjust filename]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/genhd.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 4927476..ac9aeb4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -421,14 +421,18 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
 	do {
 		if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL))
 			return -ENOMEM;
+		mutex_lock(&ext_devt_mutex);
 		rc = idr_get_new(&ext_devt_idr, part, &idx);
+		mutex_unlock(&ext_devt_mutex);
 	} while (rc == -EAGAIN);
 
 	if (rc)
 		return rc;
 
 	if (idx > MAX_EXT_DEVT) {
+		mutex_lock(&ext_devt_mutex);
 		idr_remove(&ext_devt_idr, idx);
+		mutex_unlock(&ext_devt_mutex);
 		return -EBUSY;
 	}
 
@@ -645,7 +649,6 @@ void del_gendisk(struct gendisk *disk)
 	disk_part_iter_exit(&piter);
 
 	invalidate_partition(disk, 0);
-	blk_free_devt(disk_to_dev(disk)->devt);
 	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
 
@@ -663,6 +666,7 @@ void del_gendisk(struct gendisk *disk)
 	if (!sysfs_deprecated)
 		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 	device_del(disk_to_dev(disk));
+	blk_free_devt(disk_to_dev(disk)->devt);
 }
 EXPORT_SYMBOL(del_gendisk);
 
-- 
cgit v1.1


From a51db52c68e3ad5c718c1915d28086b2c0ef8642 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 27 Feb 2013 17:03:56 -0800
Subject: block: fix synchronization and limit check in blk_alloc_devt()

commit ce23bba842aee98092225d9576dba47c82352521 upstream.

idr allocation in blk_alloc_devt() wasn't synchronized against lookup
and removal, and its limit check was off by one - 1 << MINORBITS is
the number of minors allowed, not the maximum allowed minor.

Add locking and rename MAX_EXT_DEVT to NR_EXT_DEVT and fix limit
checking.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/genhd.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index ac9aeb4..6edf228 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -26,7 +26,7 @@ static DEFINE_MUTEX(block_class_lock);
 struct kobject *block_depr;
 
 /* for extended dynamic devt allocation, currently only one major is used */
-#define MAX_EXT_DEVT		(1 << MINORBITS)
+#define NR_EXT_DEVT		(1 << MINORBITS)
 
 /* For extended devt allocation.  ext_devt_mutex prevents look up
  * results from going away underneath its user.
@@ -423,19 +423,16 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
 			return -ENOMEM;
 		mutex_lock(&ext_devt_mutex);
 		rc = idr_get_new(&ext_devt_idr, part, &idx);
+		if (!rc && idx >= NR_EXT_DEVT) {
+			idr_remove(&ext_devt_idr, idx);
+			rc = -EBUSY;
+		}
 		mutex_unlock(&ext_devt_mutex);
 	} while (rc == -EAGAIN);
 
 	if (rc)
 		return rc;
 
-	if (idx > MAX_EXT_DEVT) {
-		mutex_lock(&ext_devt_mutex);
-		idr_remove(&ext_devt_idr, idx);
-		mutex_unlock(&ext_devt_mutex);
-		return -EBUSY;
-	}
-
 	*devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
 	return 0;
 }
-- 
cgit v1.1


From 4316936730689af432346fa1d71c3f69f22ec3ac Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 3 Apr 2013 21:53:57 +0200
Subject: block: avoid using uninitialized value in from queue_var_store

commit c678ef5286ddb5cf70384ad5af286b0afc9b73e1 upstream.

As found by gcc-4.8, the QUEUE_SYSFS_BIT_FNS macro creates functions
that use a value generated by queue_var_store independent of whether
that value was set or not.

block/blk-sysfs.c: In function 'queue_store_nonrot':
block/blk-sysfs.c:244:385: warning: 'val' may be used uninitialized in this function [-Wmaybe-uninitialized]

Unlike most other such warnings, this one is not a false positive,
writing any non-number string into the sysfs files indeed has
an undefined result, rather than returning an error.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/blk-sysfs.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f0b2ca8..1789e7a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -200,6 +200,8 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \
 	unsigned long val;						\
 	ssize_t ret;							\
 	ret = queue_var_store(&val, page, count);			\
+	if (ret < 0)							\
+		 return ret;						\
 	if (neg)							\
 		val = !val;						\
 									\
-- 
cgit v1.1


From b442223040adf969fd02124c29c856a06cf5649c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 3 Jul 2013 15:01:14 -0700
Subject: block: do not pass disk names as format strings

commit ffc8b30866879ed9ba62bd0a86fecdbd51cd3d19 upstream.

Disk names may contain arbitrary strings, so they must not be
interpreted as format strings.  It seems that only md allows arbitrary
strings to be used for disk names, but this could allow for a local
memory corruption from uid 0 into ring 0.

CVE-2013-2851

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[bwh: Backported to 3.2: adjust device pointer name in nbd.c]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/genhd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 6edf228..8bd4ef2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -519,7 +519,7 @@ void register_disk(struct gendisk *disk)
 
 	ddev->parent = disk->driverfs_dev;
 
-	dev_set_name(ddev, disk->disk_name);
+	dev_set_name(ddev, "%s", disk->disk_name);
 
 	/* delay uevents, until we scanned partition table */
 	dev_set_uevent_suppress(ddev, 1);
-- 
cgit v1.1


From c76cf3d0c317ebd020361ae737641bd53ab0d30b Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 8 Oct 2013 14:36:41 -0400
Subject: block: fix race between request completion and timeout handling

commit 4912aa6c11e6a5d910264deedbec2075c6f1bb73 upstream.

crocode i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support shpchp ioatdma dca be2net sg ses enclosure ext4 mbcache jbd2 sd_mod crc_t10dif ahci megaraid_sas(U) dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]

Pid: 491, comm: scsi_eh_0 Tainted: G        W  ----------------   2.6.32-220.13.1.el6.x86_64 #1 IBM  -[8722PAX]-/00D1461
RIP: 0010:[<ffffffff8124e424>]  [<ffffffff8124e424>] blk_requeue_request+0x94/0xa0
RSP: 0018:ffff881057eefd60  EFLAGS: 00010012
RAX: ffff881d99e3e8a8 RBX: ffff881d99e3e780 RCX: ffff881d99e3e8a8
RDX: ffff881d99e3e8a8 RSI: ffff881d99e3e780 RDI: ffff881d99e3e780
RBP: ffff881057eefd80 R08: ffff881057eefe90 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff881057f92338
R13: 0000000000000000 R14: ffff881057f92338 R15: ffff883058188000
FS:  0000000000000000(0000) GS:ffff880040200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 00000000006d3ec0 CR3: 000000302cd7d000 CR4: 00000000000406b0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process scsi_eh_0 (pid: 491, threadinfo ffff881057eee000, task ffff881057e29540)
Stack:
 0000000000001057 0000000000000286 ffff8810275efdc0 ffff881057f16000
<0> ffff881057eefdd0 ffffffff81362323 ffff881057eefe20 ffffffff8135f393
<0> ffff881057e29af8 ffff8810275efdc0 ffff881057eefe78 ffff881057eefe90
Call Trace:
 [<ffffffff81362323>] __scsi_queue_insert+0xa3/0x150
 [<ffffffff8135f393>] ? scsi_eh_ready_devs+0x5e3/0x850
 [<ffffffff81362a23>] scsi_queue_insert+0x13/0x20
 [<ffffffff8135e4d4>] scsi_eh_flush_done_q+0x104/0x160
 [<ffffffff8135fb6b>] scsi_error_handler+0x35b/0x660
 [<ffffffff8135f810>] ? scsi_error_handler+0x0/0x660
 [<ffffffff810908c6>] kthread+0x96/0xa0
 [<ffffffff8100c14a>] child_rip+0xa/0x20
 [<ffffffff81090830>] ? kthread+0x0/0xa0
 [<ffffffff8100c140>] ? child_rip+0x0/0x20
Code: 00 00 eb d1 4c 8b 2d 3c 8f 97 00 4d 85 ed 74 bf 49 8b 45 00 49 83 c5 08 48 89 de 4c 89 e7 ff d0 49 8b 45 00 48 85 c0 75 eb eb a4 <0f> 0b eb fe 0f 1f 84 00 00 00 00 00 55 48 89 e5 0f 1f 44 00 00
RIP  [<ffffffff8124e424>] blk_requeue_request+0x94/0xa0
 RSP <ffff881057eefd60>

The RIP is this line:
        BUG_ON(blk_queued_rq(rq));

After digging through the code, I think there may be a race between the
request completion and the timer handler running.

A timer is started for each request put on the device's queue (see
blk_start_request->blk_add_timer).  If the request does not complete
before the timer expires, the timer handler (blk_rq_timed_out_timer)
will mark the request complete atomically:

static inline int blk_mark_rq_complete(struct request *rq)
{
        return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}

and then call blk_rq_timed_out.  The latter function will call
scsi_times_out, which will return one of BLK_EH_HANDLED,
BLK_EH_RESET_TIMER or BLK_EH_NOT_HANDLED.  If BLK_EH_RESET_TIMER is
returned, blk_clear_rq_complete is called, and blk_add_timer is again
called to simply wait longer for the request to complete.

Now, if the request happens to complete while this is going on, what
happens?  Given that we know the completion handler will bail if it
finds the REQ_ATOM_COMPLETE bit set, we need to focus on the completion
handler running after that bit is cleared.  So, from the above
paragraph, after the call to blk_clear_rq_complete.  If the completion
sets REQ_ATOM_COMPLETE before the BUG_ON in blk_add_timer, we go boom
there (I haven't seen this in the cores).  Next, if we get the
completion before the call to list_add_tail, then the timer will
eventually fire for an old req, which may either be freed or reallocated
(there is evidence that this might be the case).  Finally, if the
completion comes in *after* the addition to the timeout list, I think
it's harmless.  The request will be removed from the timeout list,
req_atom_complete will be set, and all will be well.

This will only actually explain the coredumps *IF* the request
structure was freed, reallocated *and* queued before the error handler
thread had a chance to process it.  That is possible, but it may make
sense to keep digging for another race.  I think that if this is what
was happening, we would see other instances of this problem showing up
as null pointer or garbage pointer dereferences, for example when the
request structure was not re-used.  It looks like we actually do run
into that situation in other reports.

This patch moves the BUG_ON(test_bit(REQ_ATOM_COMPLETE,
&req->atomic_flags)); from blk_add_timer to the only caller that could
trip over it (blk_start_request).  It then inverts the calls to
blk_clear_rq_complete and blk_add_timer in blk_rq_timed_out to address
the race.  I've boot tested this patch, but nothing more.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/blk-core.c    | 1 +
 block/blk-timeout.c | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 49d9e91..50c8305 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2015,6 +2015,7 @@ void blk_start_request(struct request *req)
 	if (unlikely(blk_bidi_rq(req)))
 		req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
 
+	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
 	blk_add_timer(req);
 }
 EXPORT_SYMBOL(blk_start_request);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 7803548..b1182ea 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -90,8 +90,8 @@ static void blk_rq_timed_out(struct request *req)
 		__blk_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
-		blk_clear_rq_complete(req);
 		blk_add_timer(req);
+		blk_clear_rq_complete(req);
 		break;
 	case BLK_EH_NOT_HANDLED:
 		/*
@@ -173,7 +173,6 @@ void blk_add_timer(struct request *req)
 		return;
 
 	BUG_ON(!list_empty(&req->timeout_list));
-	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
 
 	/*
 	 * Some LLDs, like scsi, peek at the timeout to prevent a
-- 
cgit v1.1


From 9be9b3e8f63644f30360164ff0efa997366af67c Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 14 Oct 2013 12:11:36 -0400
Subject: blk-core: Fix memory corruption if blkcg_init_queue fails

commit fff4996b7db7955414ac74386efa5e07fd766b50 upstream.

If blkcg_init_queue fails, blk_alloc_queue_node doesn't call bdi_destroy
to clean up structures allocated by the backing dev.

------------[ cut here ]------------
WARNING: at lib/debugobjects.c:260 debug_print_object+0x85/0xa0()
ODEBUG: free active (active state 0) object type: percpu_counter hint:           (null)
Modules linked in: dm_loop dm_mod ip6table_filter ip6_tables uvesafb cfbcopyarea cfbimgblt cfbfillrect fbcon font bitblit fbcon_rotate fbcon_cw fbcon_ud fbcon_ccw softcursor fb fbdev ipt_MASQUERADE iptable_nat nf_nat_ipv4 msr nf_conntrack_ipv4 nf_defrag_ipv4 xt_state ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc tun ipv6 cpufreq_userspace cpufreq_stats cpufreq_powersave cpufreq_ondemand cpufreq_conservative spadfs fuse hid_generic usbhid hid raid0 md_mod dmi_sysfs nf_nat_ftp nf_nat nf_conntrack_ftp nf_conntrack lm85 hwmon_vid snd_usb_audio snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd_page_alloc snd_hwdep snd_usbmidi_lib snd_rawmidi snd soundcore acpi_cpufreq freq_table mperf sata_svw serverworks kvm_amd ide_core ehci_pci ohci_hcd libata ehci_hcd kvm usbcore tg3 usb_common libphy k10temp pcspkr ptp i2c_piix4 i2c_core evdev microcode hwmon rtc_cmos pps_core e100 skge floppy mii processor button unix
CPU: 0 PID: 2739 Comm: lvchange Tainted: G        W
3.10.15-devel #14
Hardware name: empty empty/S3992-E, BIOS 'V1.06   ' 06/09/2009
 0000000000000009 ffff88023c3c1ae8 ffffffff813c8fd4 ffff88023c3c1b20
 ffffffff810399eb ffff88043d35cd58 ffffffff81651940 ffff88023c3c1bf8
 ffffffff82479d90 0000000000000005 ffff88023c3c1b80 ffffffff81039a67
Call Trace:
 [<ffffffff813c8fd4>] dump_stack+0x19/0x1b
 [<ffffffff810399eb>] warn_slowpath_common+0x6b/0xa0
 [<ffffffff81039a67>] warn_slowpath_fmt+0x47/0x50
 [<ffffffff8122aaaf>] ? debug_check_no_obj_freed+0xcf/0x250
 [<ffffffff81229a15>] debug_print_object+0x85/0xa0
 [<ffffffff8122abe3>] debug_check_no_obj_freed+0x203/0x250
 [<ffffffff8113c4ac>] kmem_cache_free+0x20c/0x3a0
 [<ffffffff811f6709>] blk_alloc_queue_node+0x2a9/0x2c0
 [<ffffffff811f672e>] blk_alloc_queue+0xe/0x10
 [<ffffffffa04c0093>] dm_create+0x1a3/0x530 [dm_mod]
 [<ffffffffa04c6bb0>] ? list_version_get_info+0xe0/0xe0 [dm_mod]
 [<ffffffffa04c6c07>] dev_create+0x57/0x2b0 [dm_mod]
 [<ffffffffa04c6bb0>] ? list_version_get_info+0xe0/0xe0 [dm_mod]
 [<ffffffffa04c6bb0>] ? list_version_get_info+0xe0/0xe0 [dm_mod]
 [<ffffffffa04c6528>] ctl_ioctl+0x268/0x500 [dm_mod]
 [<ffffffff81097662>] ? get_lock_stats+0x22/0x70
 [<ffffffffa04c67ce>] dm_ctl_ioctl+0xe/0x20 [dm_mod]
 [<ffffffff81161aad>] do_vfs_ioctl+0x2ed/0x520
 [<ffffffff8116cfc7>] ? fget_light+0x377/0x4e0
 [<ffffffff81161d2b>] SyS_ioctl+0x4b/0x90
 [<ffffffff813cff16>] system_call_fastpath+0x1a/0x1f
---[ end trace 4b5ff0d55673d986 ]---
------------[ cut here ]------------

This fix should be backported to stable kernels starting with 2.6.37. Note
that in the kernels prior to 3.5 the affected code is different, but the
bug is still there - bdi_init is called and bdi_destroy isn't.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
[bwh: Backported to 3.2: add bdi_destroy() to the single error path
 after the call to bdi_init()]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/blk-core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 50c8305..a219c89 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -483,6 +483,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	}
 
 	if (blk_throtl_init(q)) {
+		bdi_destroy(&q->backing_dev_info);
 		kmem_cache_free(blk_requestq_cachep, q);
 		return NULL;
 	}
-- 
cgit v1.1


From e5b56dda4701918b565764e47a60aa252964411c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 12 Feb 2014 09:34:01 -0700
Subject: block: add cond_resched() to potentially long running ioctl discard
 loop

commit c8123f8c9cb517403b51aa41c3c46ff5e10b2c17 upstream.

When mkfs issues a full device discard and the device only
supports discards of a smallish size, we can loop in
blkdev_issue_discard() for a long time. If preempt isn't enabled,
this can turn into a softlock situation and the kernel will
start complaining.

Add an explicit cond_resched() at the end of the loop to avoid
that.

Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/blk-lib.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'block')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2b461b4..36751e2 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -101,6 +101,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 
 		atomic_inc(&bb.done);
 		submit_bio(type, bio);
+
+		/*
+		 * We can loop for a long time in here, if someone does
+		 * full device discards (like mkfs). Be nice and allow
+		 * us to schedule out to avoid softlocking if preempt
+		 * is disabled.
+		 */
+		cond_resched();
 	}
 
 	/* Wait for bios in-flight */
-- 
cgit v1.1


From 5b85afa68e4f56c27f1d5c6f49e5257bce6448e6 Mon Sep 17 00:00:00 2001
From: Roman Pen <r.peniaev@gmail.com>
Date: Tue, 4 Mar 2014 23:13:10 +0900
Subject: blktrace: fix accounting of partially completed requests

commit af5040da01ef980670b3741b3e10733ee3e33566 upstream.

trace_block_rq_complete does not take into account that request can
be partially completed, so we can get the following incorrect output
of blkparser:

  C   R 232 + 240 [0]
  C   R 240 + 232 [0]
  C   R 248 + 224 [0]
  C   R 256 + 216 [0]

but should be:

  C   R 232 + 8 [0]
  C   R 240 + 8 [0]
  C   R 248 + 8 [0]
  C   R 256 + 8 [0]

Also, the whole output summary statistics of completed requests and
final throughput will be incorrect.

This patch takes into account real completion size of the request and
fixes wrong completion accounting.

Signed-off-by: Roman Pen <r.peniaev@gmail.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@redhat.com>
CC: linux-kernel@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
[bwh: Backported to 3.2: drop change in blk-mq.c]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/blk-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index a219c89..ec494ff 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2077,7 +2077,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	if (!req->bio)
 		return false;
 
-	trace_block_rq_complete(req->q, req);
+	trace_block_rq_complete(req->q, req, nr_bytes);
 
 	/*
 	 * For fs requests, rq is just carrier of independent bio's
-- 
cgit v1.1


From 60d940c6fd6252f6307ecacd49acac54cc6d8e8d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 8 Jul 2014 12:25:28 +0200
Subject: block: don't assume last put of shared tags is for the host

commit d45b3279a5a2252cafcd665bbf2db8c9b31ef783 upstream.

There is no inherent reason why the last put of a tag structure must be
the one for the Scsi_Host, as device model objects can be held for
arbitrary periods.  Merge blk_free_tags and __blk_free_tags into a single
funtion that just release a references and get rid of the BUG() when the
host reference wasn't the last.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/blk-tag.c | 33 +++++++--------------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

(limited to 'block')

diff --git a/block/blk-tag.c b/block/blk-tag.c
index 4af6f5c..f606487 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -27,18 +27,15 @@ struct request *blk_queue_find_tag(struct request_queue *q, int tag)
 EXPORT_SYMBOL(blk_queue_find_tag);
 
 /**
- * __blk_free_tags - release a given set of tag maintenance info
+ * blk_free_tags - release a given set of tag maintenance info
  * @bqt:	the tag map to free
  *
- * Tries to free the specified @bqt.  Returns true if it was
- * actually freed and false if there are still references using it
+ * Drop the reference count on @bqt and frees it when the last reference
+ * is dropped.
  */
-static int __blk_free_tags(struct blk_queue_tag *bqt)
+void blk_free_tags(struct blk_queue_tag *bqt)
 {
-	int retval;
-
-	retval = atomic_dec_and_test(&bqt->refcnt);
-	if (retval) {
+	if (atomic_dec_and_test(&bqt->refcnt)) {
 		BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) <
 							bqt->max_depth);
 
@@ -50,9 +47,8 @@ static int __blk_free_tags(struct blk_queue_tag *bqt)
 
 		kfree(bqt);
 	}
-
-	return retval;
 }
+EXPORT_SYMBOL(blk_free_tags);
 
 /**
  * __blk_queue_free_tags - release tag maintenance info
@@ -69,28 +65,13 @@ void __blk_queue_free_tags(struct request_queue *q)
 	if (!bqt)
 		return;
 
-	__blk_free_tags(bqt);
+	blk_free_tags(bqt);
 
 	q->queue_tags = NULL;
 	queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
 }
 
 /**
- * blk_free_tags - release a given set of tag maintenance info
- * @bqt:	the tag map to free
- *
- * For externally managed @bqt frees the map.  Callers of this
- * function must guarantee to have released all the queues that
- * might have been using this tag map.
- */
-void blk_free_tags(struct blk_queue_tag *bqt)
-{
-	if (unlikely(!__blk_free_tags(bqt)))
-		BUG();
-}
-EXPORT_SYMBOL(blk_free_tags);
-
-/**
  * blk_queue_free_tags - release tag maintenance info
  * @q:  the request queue for the device
  *
-- 
cgit v1.1


From a288cafeceb6e18ba4fd10ec8d3270593472cca6 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 26 Aug 2014 09:05:36 -0600
Subject: block: Fix dev_t minor allocation lifetime

commit 2da78092dda13f1efd26edbbf99a567776913750 upstream.

Releases the dev_t minor when all references are closed to prevent
another device from acquiring the same major/minor.

Since the partition's release may be invoked from call_rcu's soft-irq
context, the ext_dev_idr's mutex had to be replaced with a spinlock so
as not so sleep.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
[bwh: Backported to 3.2:
 - Adjust filename
 - idr insertion API is different, and blk_alloc_devt() is preallocating
   a node in a different way]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/genhd.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 8bd4ef2..35415810 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -28,10 +28,10 @@ struct kobject *block_depr;
 /* for extended dynamic devt allocation, currently only one major is used */
 #define NR_EXT_DEVT		(1 << MINORBITS)
 
-/* For extended devt allocation.  ext_devt_mutex prevents look up
+/* For extended devt allocation.  ext_devt_lock prevents look up
  * results from going away underneath its user.
  */
-static DEFINE_MUTEX(ext_devt_mutex);
+static DEFINE_SPINLOCK(ext_devt_lock);
 static DEFINE_IDR(ext_devt_idr);
 
 static struct device_type disk_type;
@@ -421,13 +421,13 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
 	do {
 		if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL))
 			return -ENOMEM;
-		mutex_lock(&ext_devt_mutex);
+		spin_lock(&ext_devt_lock);
 		rc = idr_get_new(&ext_devt_idr, part, &idx);
 		if (!rc && idx >= NR_EXT_DEVT) {
 			idr_remove(&ext_devt_idr, idx);
 			rc = -EBUSY;
 		}
-		mutex_unlock(&ext_devt_mutex);
+		spin_unlock(&ext_devt_lock);
 	} while (rc == -EAGAIN);
 
 	if (rc)
@@ -454,9 +454,9 @@ void blk_free_devt(dev_t devt)
 		return;
 
 	if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
-		mutex_lock(&ext_devt_mutex);
+		spin_lock(&ext_devt_lock);
 		idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
-		mutex_unlock(&ext_devt_mutex);
+		spin_unlock(&ext_devt_lock);
 	}
 }
 
@@ -663,7 +663,6 @@ void del_gendisk(struct gendisk *disk)
 	if (!sysfs_deprecated)
 		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 	device_del(disk_to_dev(disk));
-	blk_free_devt(disk_to_dev(disk)->devt);
 }
 EXPORT_SYMBOL(del_gendisk);
 
@@ -688,13 +687,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 	} else {
 		struct hd_struct *part;
 
-		mutex_lock(&ext_devt_mutex);
+		spin_lock(&ext_devt_lock);
 		part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 		if (part && get_disk(part_to_disk(part))) {
 			*partno = part->partno;
 			disk = part_to_disk(part);
 		}
-		mutex_unlock(&ext_devt_mutex);
+		spin_unlock(&ext_devt_lock);
 	}
 
 	return disk;
@@ -1102,6 +1101,7 @@ static void disk_release(struct device *dev)
 {
 	struct gendisk *disk = dev_to_disk(dev);
 
+	blk_free_devt(dev->devt);
 	disk_release_events(disk);
 	kfree(disk->random);
 	disk_replace_part_tbl(disk, NULL);
-- 
cgit v1.1


From 607530bf21828c96c2880282c844088baf6ae3c4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 16 Sep 2014 13:38:51 -0600
Subject: genhd: fix leftover might_sleep() in blk_free_devt()

commit 46f341ffcfb5d8530f7d1e60f3be06cce6661b62 upstream.

Commit 2da78092 changed the locking from a mutex to a spinlock,
so we now longer sleep in this context. But there was a leftover
might_sleep() in there, which now triggers since we do the final
free from an RCU callback. Get rid of it.

Reported-by: Pontus Fuchs <pontus.fuchs@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/genhd.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 35415810..41b0435 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -448,8 +448,6 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
  */
 void blk_free_devt(dev_t devt)
 {
-	might_sleep();
-
 	if (devt == MKDEV(0, 0))
 		return;
 
-- 
cgit v1.1


From 5c250851e1f7850dfe765ff8f6d04ef4ce6e19a7 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 8 Oct 2014 18:26:13 -0400
Subject: block: fix alignment_offset math that assumes io_min is a power-of-2

commit b8839b8c55f3fdd60dc36abcda7e0266aff7985c upstream.

The math in both blk_stack_limits() and queue_limit_alignment_offset()
assume that a block device's io_min (aka minimum_io_size) is always a
power-of-2.  Fix the math such that it works for non-power-of-2 io_min.

This issue (of alignment_offset != 0) became apparent when testing
dm-thinp with a thinp blocksize that matches a RAID6 stripesize of
1280K.  Commit fdfb4c8c1 ("dm thin: set minimum_io_size to pool's data
block size") unlocked the potential for alignment_offset != 0 due to
the dm-thin-pool's io_min possibly being a non-power-of-2.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/blk-settings.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index fa1eb04..d55a3e4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -521,7 +521,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		bottom = max(b->physical_block_size, b->io_min) + alignment;
 
 		/* Verify that top and bottom intervals line up */
-		if (max(top, bottom) & (min(top, bottom) - 1)) {
+		if (max(top, bottom) % min(top, bottom)) {
 			t->misaligned = 1;
 			ret = -1;
 		}
@@ -562,7 +562,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	/* Find lowest common alignment_offset */
 	t->alignment_offset = lcm(t->alignment_offset, alignment)
-		& (max(t->physical_block_size, t->io_min) - 1);
+		% max(t->physical_block_size, t->io_min);
 
 	/* Verify that new alignment_offset is on a logical block boundary */
 	if (t->alignment_offset & (t->logical_block_size - 1)) {
-- 
cgit v1.1


From d73b032b63e8967462e1cf5763858ed89e97880f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Oct 2014 20:13:39 -0600
Subject: scsi: Fix error handling in SCSI_IOCTL_SEND_COMMAND

commit 84ce0f0e94ac97217398b3b69c21c7a62ebeed05 upstream.

When sg_scsi_ioctl() fails to prepare request to submit in
blk_rq_map_kern() we jump to a label where we just end up copying
(luckily zeroed-out) kernel buffer to userspace instead of reporting
error. Fix the problem by jumping to the right label.

CC: Jens Axboe <axboe@kernel.dk>
CC: linux-scsi@vger.kernel.org
Coverity-id: 1226871
Signed-off-by: Jan Kara <jack@suse.cz>

Fixed up the, now unused, out label.

Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/scsi_ioctl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 9e76a32..f124268 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -505,7 +505,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
 		err = DRIVER_ERROR << 24;
-		goto out;
+		goto error;
 	}
 
 	memset(sense, 0, sizeof(sense));
@@ -515,7 +515,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	blk_execute_rq(q, disk, rq, 0);
 
-out:
 	err = rq->errors & 0xff;	/* only 8 bit SCSI status */
 	if (err) {
 		if (rq->sense_len && rq->sense) {
-- 
cgit v1.1


From 44cee4aa677c9cc2a48d2bdde8928115e06cb9a0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 19 Nov 2014 13:06:22 -0700
Subject: genhd: check for int overflow in disk_expand_part_tbl()

commit 5fabcb4c33fe11c7e3afdf805fde26c1a54d0953 upstream.

We can get here from blkdev_ioctl() -> blkpg_ioctl() -> add_partition()
with a user passed in partno value. If we pass in 0x7fffffff, the
new target in disk_expand_part_tbl() overflows the 'int' and we
access beyond the end of ptbl->part[] and even write to it when we
do the rcu_assign_pointer() to assign the new partition.

Reported-by: David Ramos <daramos@stanford.edu>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 block/genhd.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 41b0435..424d1fa 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
 	struct disk_part_tbl *new_ptbl;
 	int len = old_ptbl ? old_ptbl->len : 0;
-	int target = partno + 1;
+	int i, target;
 	size_t size;
-	int i;
+
+	/*
+	 * check for int overflow, since we can get here from blkpg_ioctl()
+	 * with a user passed 'partno'.
+	 */
+	target = partno + 1;
+	if (target < 0)
+		return -EINVAL;
 
 	/* disk_max_parts() is zero during initialization, ignore if so */
 	if (disk_max_parts(disk) && target > disk_max_parts(disk))
-- 
cgit v1.1