aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/DAC960.c14
-rw-r--r--drivers/block/Kconfig18
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/amiflop.c79
-rw-r--r--drivers/block/aoe/aoeblk.c16
-rw-r--r--drivers/block/aoe/aoechr.c10
-rw-r--r--drivers/block/aoe/aoecmd.c6
-rw-r--r--drivers/block/aoe/aoedev.c4
-rw-r--r--drivers/block/ataflop.c65
-rw-r--r--drivers/block/brd.c8
-rw-r--r--drivers/block/cciss.c1027
-rw-r--r--drivers/block/cciss.h4
-rw-r--r--drivers/block/cciss_scsi.c8
-rw-r--r--drivers/block/cpqarray.c15
-rw-r--r--drivers/block/drbd/drbd_actlog.c83
-rw-r--r--drivers/block/drbd/drbd_bitmap.c2
-rw-r--r--drivers/block/drbd/drbd_int.h271
-rw-r--r--drivers/block/drbd/drbd_main.c677
-rw-r--r--drivers/block/drbd/drbd_nl.c396
-rw-r--r--drivers/block/drbd/drbd_proc.c35
-rw-r--r--drivers/block/drbd/drbd_receiver.c1169
-rw-r--r--drivers/block/drbd/drbd_req.c203
-rw-r--r--drivers/block/drbd/drbd_req.h65
-rw-r--r--drivers/block/drbd/drbd_worker.c308
-rw-r--r--drivers/block/floppy.c86
-rw-r--r--drivers/block/loop.c138
-rw-r--r--drivers/block/nbd.c7
-rw-r--r--drivers/block/osdblk.c5
-rw-r--r--drivers/block/paride/pcd.c15
-rw-r--r--drivers/block/paride/pd.c15
-rw-r--r--drivers/block/paride/pf.c17
-rw-r--r--drivers/block/paride/pg.c8
-rw-r--r--drivers/block/paride/pt.c20
-rw-r--r--drivers/block/pktcdvd.c40
-rw-r--r--drivers/block/ps3disk.c2
-rw-r--r--drivers/block/rbd.c2046
-rw-r--r--drivers/block/rbd_types.h73
-rw-r--r--drivers/block/swim.c15
-rw-r--r--drivers/block/swim3.c15
-rw-r--r--drivers/block/ub.c17
-rw-r--r--drivers/block/viodasd.c11
-rw-r--r--drivers/block/virtio_blk.c54
-rw-r--r--drivers/block/xd.c7
-rw-r--r--drivers/block/xen-blkfront.c114
-rw-r--r--drivers/block/xsysace.c14
-rw-r--r--drivers/block/z2ram.c21
46 files changed, 5070 insertions, 2154 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 4e2c367..1f286ab 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -36,7 +36,7 @@
#include <linux/ioport.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/reboot.h>
@@ -54,6 +54,7 @@
#define DAC960_GAM_MINOR 252
+static DEFINE_MUTEX(DAC960_mutex);
static DAC960_Controller_T *DAC960_Controllers[DAC960_MaxControllers];
static int DAC960_ControllerCount;
static struct proc_dir_entry *DAC960_ProcDirectoryEntry;
@@ -81,7 +82,7 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode)
int drive_nr = (long)disk->private_data;
int ret = -ENXIO;
- lock_kernel();
+ mutex_lock(&DAC960_mutex);
if (p->FirmwareType == DAC960_V1_Controller) {
if (p->V1.LogicalDriveInformation[drive_nr].
LogicalDriveState == DAC960_V1_LogicalDrive_Offline)
@@ -99,7 +100,7 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode)
goto out;
ret = 0;
out:
- unlock_kernel();
+ mutex_unlock(&DAC960_mutex);
return ret;
}
@@ -6625,7 +6626,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
long ErrorCode = 0;
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
- lock_kernel();
+ mutex_lock(&DAC960_mutex);
switch (Request)
{
case DAC960_IOCTL_GET_CONTROLLER_COUNT:
@@ -7056,13 +7057,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
default:
ErrorCode = -ENOTTY;
}
- unlock_kernel();
+ mutex_unlock(&DAC960_mutex);
return ErrorCode;
}
static const struct file_operations DAC960_gam_fops = {
.owner = THIS_MODULE,
- .unlocked_ioctl = DAC960_gam_ioctl
+ .unlocked_ioctl = DAC960_gam_ioctl,
+ .llseek = noop_llseek,
};
static struct miscdevice DAC960_gam_dev = {
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index de27768..83c32cb 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -464,6 +464,7 @@ config XEN_BLKDEV_FRONTEND
tristate "Xen virtual block device support"
depends on XEN
default y
+ select XEN_XENBUS_FRONTEND
help
This driver implements the front-end of the Xen virtual
block device driver. It communicates with a back-end driver
@@ -488,4 +489,21 @@ config BLK_DEV_HD
If unsure, say N.
+config BLK_DEV_RBD
+ tristate "Rados block device (RBD)"
+ depends on INET && EXPERIMENTAL && BLOCK
+ select CEPH_LIB
+ select LIBCRC32C
+ select CRYPTO_AES
+ select CRYPTO
+ default n
+ help
+ Say Y here if you want include the Rados block device, which stripes
+ a block device over objects stored in the Ceph distributed object
+ store.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index aff5ac9..d7f463d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
+obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 76f114f..7888501 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -60,7 +60,7 @@
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/init.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/amifdreg.h>
#include <linux/amifd.h>
#include <linux/buffer_head.h>
@@ -109,13 +109,12 @@
#define FD_HD_3 0x55555555 /* high-density 3.5" (1760K) drive */
#define FD_DD_5 0xaaaaaaaa /* double-density 5.25" (440K) drive */
+static DEFINE_MUTEX(amiflop_mutex);
static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */
module_param(fd_def_df0, ulong, 0);
MODULE_LICENSE("GPL");
-static struct request_queue *floppy_queue;
-
/*
* Macros
*/
@@ -164,6 +163,7 @@ static volatile int selected = -1; /* currently selected drive */
static int writepending;
static int writefromint;
static char *raw_buf;
+static int fdc_queue;
static DEFINE_SPINLOCK(amiflop_lock);
@@ -1334,6 +1334,42 @@ static int get_track(int drive, int track)
return -1;
}
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static struct request *set_next_request(void)
+{
+ struct request_queue *q;
+ int cnt = FD_MAX_UNITS;
+ struct request *rq = NULL;
+
+ /* Find next queue we can dispatch from */
+ fdc_queue = fdc_queue + 1;
+ if (fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+
+ for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) {
+
+ if (unit[fdc_queue].type->code == FD_NODRIVE) {
+ if (++fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+ continue;
+ }
+
+ q = unit[fdc_queue].gendisk->queue;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ break;
+ }
+
+ if (++fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+ }
+
+ return rq;
+}
+
static void redo_fd_request(void)
{
struct request *rq;
@@ -1345,7 +1381,7 @@ static void redo_fd_request(void)
int err;
next_req:
- rq = blk_fetch_request(floppy_queue);
+ rq = set_next_request();
if (!rq) {
/* Nothing left to do */
return;
@@ -1506,9 +1542,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&amiflop_mutex);
ret = fd_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return ret;
}
@@ -1555,11 +1591,11 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
int old_dev;
unsigned long flags;
- lock_kernel();
+ mutex_lock(&amiflop_mutex);
old_dev = fd_device[drive];
if (fd_ref[drive] && old_dev != system) {
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return -EBUSY;
}
@@ -1575,7 +1611,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
rel_fdc();
if (wrprot) {
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return -EROFS;
}
}
@@ -1594,7 +1630,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive,
unit[drive].type->name, data_types[system].name);
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return 0;
}
@@ -1603,7 +1639,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
struct amiga_floppy_struct *p = disk->private_data;
int drive = p - unit;
- lock_kernel();
+ mutex_lock(&amiflop_mutex);
if (unit[drive].dirty == 1) {
del_timer (flush_track_timer + drive);
non_int_flush_track (drive);
@@ -1617,7 +1653,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
/* the mod_use counter is handled this way */
floppy_off (drive | 0x40000000);
#endif
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return 0;
}
@@ -1682,6 +1718,13 @@ static int __init fd_probe_drives(void)
continue;
}
unit[drive].gendisk = disk;
+
+ disk->queue = blk_init_queue(do_fd_request, &amiflop_lock);
+ if (!disk->queue) {
+ unit[drive].type->code = FD_NODRIVE;
+ continue;
+ }
+
drives++;
if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) {
printk("no mem for ");
@@ -1695,7 +1738,6 @@ static int __init fd_probe_drives(void)
disk->fops = &floppy_fops;
sprintf(disk->disk_name, "fd%d", drive);
disk->private_data = &unit[drive];
- disk->queue = floppy_queue;
set_capacity(disk, 880*2);
add_disk(disk);
}
@@ -1743,11 +1785,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
goto out_irq2;
}
- ret = -ENOMEM;
- floppy_queue = blk_init_queue(do_fd_request, &amiflop_lock);
- if (!floppy_queue)
- goto out_queue;
-
ret = -ENODEV;
if (fd_probe_drives() < 1) /* No usable drives */
goto out_probe;
@@ -1791,8 +1828,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
return 0;
out_probe:
- blk_cleanup_queue(floppy_queue);
-out_queue:
free_irq(IRQ_AMIGA_CIAA_TB, NULL);
out_irq2:
free_irq(IRQ_AMIGA_DSKBLK, NULL);
@@ -1810,9 +1845,12 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
for( i = 0; i < FD_MAX_UNITS; i++) {
if (unit[i].type->code != FD_NODRIVE) {
+ struct request_queue *q = unit[i].gendisk->queue;
del_gendisk(unit[i].gendisk);
put_disk(unit[i].gendisk);
kfree(unit[i].trackbuf);
+ if (q)
+ blk_cleanup_queue(q);
}
}
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
@@ -1820,7 +1858,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
free_irq(IRQ_AMIGA_DSKBLK, NULL);
custom.dmacon = DMAF_DISK; /* disable DMA */
amiga_chip_free(raw_buf);
- blk_cleanup_queue(floppy_queue);
unregister_blkdev(FLOPPY_MAJOR, "fd");
}
#endif
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index a946929..528f631 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -4,17 +4,20 @@
* block device routines
*/
+#include <linux/kernel.h>
#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/ioctl.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include <linux/genhd.h>
#include <linux/netdevice.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include "aoe.h"
+static DEFINE_MUTEX(aoeblk_mutex);
static struct kmem_cache *buf_pool_cache;
static ssize_t aoedisk_show_state(struct device *dev,
@@ -125,16 +128,16 @@ aoeblk_open(struct block_device *bdev, fmode_t mode)
struct aoedev *d = bdev->bd_disk->private_data;
ulong flags;
- lock_kernel();
+ mutex_lock(&aoeblk_mutex);
spin_lock_irqsave(&d->lock, flags);
if (d->flags & DEVFL_UP) {
d->nopen++;
spin_unlock_irqrestore(&d->lock, flags);
- unlock_kernel();
+ mutex_unlock(&aoeblk_mutex);
return 0;
}
spin_unlock_irqrestore(&d->lock, flags);
- unlock_kernel();
+ mutex_unlock(&aoeblk_mutex);
return -ENODEV;
}
@@ -177,9 +180,6 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
BUG();
bio_endio(bio, -ENXIO);
return 0;
- } else if (bio->bi_rw & REQ_HARDBARRIER) {
- bio_endio(bio, -EOPNOTSUPP);
- return 0;
} else if (bio->bi_io_vec == NULL) {
printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
BUG();
@@ -206,7 +206,7 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
spin_lock_irqsave(&d->lock, flags);
if ((d->flags & DEVFL_UP) == 0) {
- printk(KERN_INFO "aoe: device %ld.%d is not up\n",
+ pr_info_ratelimited("aoe: device %ld.%d is not up\n",
d->aoemajor, d->aoeminor);
spin_unlock_irqrestore(&d->lock, flags);
mempool_free(buf, d->bufpool);
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 4a1b9e74..146296c 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -9,7 +9,7 @@
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/skbuff.h>
#include "aoe.h"
@@ -37,6 +37,7 @@ struct ErrMsg {
char *msg;
};
+static DEFINE_MUTEX(aoechr_mutex);
static struct ErrMsg emsgs[NMSG];
static int emsgs_head_idx, emsgs_tail_idx;
static struct completion emsgs_comp;
@@ -183,16 +184,16 @@ aoechr_open(struct inode *inode, struct file *filp)
{
int n, i;
- lock_kernel();
+ mutex_lock(&aoechr_mutex);
n = iminor(inode);
filp->private_data = (void *) (unsigned long) n;
for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
if (chardevs[i].minor == n) {
- unlock_kernel();
+ mutex_unlock(&aoechr_mutex);
return 0;
}
- unlock_kernel();
+ mutex_unlock(&aoechr_mutex);
return -EINVAL;
}
@@ -265,6 +266,7 @@ static const struct file_operations aoe_fops = {
.open = aoechr_open,
.release = aoechr_rel,
.owner = THIS_MODULE,
+ .llseek = noop_llseek,
};
static char *aoe_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 5674bd0..de0435e 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -297,8 +297,8 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
struct sk_buff *skb;
struct net_device *ifp;
- read_lock(&dev_base_lock);
- for_each_netdev(&init_net, ifp) {
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, ifp) {
dev_hold(ifp);
if (!is_aoe_netif(ifp))
goto cont;
@@ -325,7 +325,7 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
cont:
dev_put(ifp);
}
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static void
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 0849280..6b5110a 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -102,6 +102,7 @@ aoedev_freedev(struct aoedev *d)
{
struct aoetgt **t, **e;
+ cancel_work_sync(&d->work);
if (d->gd) {
aoedisk_rm_sysfs(d);
del_gendisk(d->gd);
@@ -135,7 +136,6 @@ aoedev_flush(const char __user *str, size_t cnt)
all = !strncmp(buf, "all", 3);
}
- flush_scheduled_work();
spin_lock_irqsave(&devlist_lock, flags);
dd = &devlist;
while ((d = *dd)) {
@@ -257,8 +257,6 @@ aoedev_exit(void)
struct aoedev *d;
ulong flags;
- flush_scheduled_work();
-
while ((d = devlist)) {
devlist = d->next;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index aceb964..605a67e 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -67,7 +67,7 @@
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/atafd.h>
#include <asm/atafdreg.h>
@@ -79,8 +79,9 @@
#undef DEBUG
-static struct request_queue *floppy_queue;
+static DEFINE_MUTEX(ataflop_mutex);
static struct request *fd_request;
+static int fdc_queue;
/* Disk types: DD, HD, ED */
static struct atari_disk_type {
@@ -1391,6 +1392,29 @@ static void setup_req_params( int drive )
ReqTrack, ReqSector, (unsigned long)ReqData ));
}
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static struct request *set_next_request(void)
+{
+ struct request_queue *q;
+ int old_pos = fdc_queue;
+ struct request *rq = NULL;
+
+ do {
+ q = unit[fdc_queue].disk->queue;
+ if (++fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ break;
+ }
+ } while (fdc_queue != old_pos);
+
+ return rq;
+}
+
static void redo_fd_request(void)
{
@@ -1405,7 +1429,7 @@ static void redo_fd_request(void)
repeat:
if (!fd_request) {
- fd_request = blk_fetch_request(floppy_queue);
+ fd_request = set_next_request();
if (!fd_request)
goto the_end;
}
@@ -1671,9 +1695,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&ataflop_mutex);
ret = fd_locked_ioctl(bdev, mode, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&ataflop_mutex);
return ret;
}
@@ -1854,9 +1878,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&ataflop_mutex);
ret = floppy_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&ataflop_mutex);
return ret;
}
@@ -1864,14 +1888,14 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
static int floppy_release(struct gendisk *disk, fmode_t mode)
{
struct atari_floppy_struct *p = disk->private_data;
- lock_kernel();
+ mutex_lock(&ataflop_mutex);
if (p->ref < 0)
p->ref = 0;
else if (!p->ref--) {
printk(KERN_ERR "floppy_release with fd_ref == 0");
p->ref = 0;
}
- unlock_kernel();
+ mutex_unlock(&ataflop_mutex);
return 0;
}
@@ -1932,10 +1956,6 @@ static int __init atari_floppy_init (void)
PhysTrackBuffer = virt_to_phys(TrackBuffer);
BufferDrive = BufferSide = BufferTrack = -1;
- floppy_queue = blk_init_queue(do_fd_request, &ataflop_lock);
- if (!floppy_queue)
- goto Enomem;
-
for (i = 0; i < FD_MAX_UNITS; i++) {
unit[i].track = -1;
unit[i].flags = 0;
@@ -1944,7 +1964,10 @@ static int __init atari_floppy_init (void)
sprintf(unit[i].disk->disk_name, "fd%d", i);
unit[i].disk->fops = &floppy_fops;
unit[i].disk->private_data = &unit[i];
- unit[i].disk->queue = floppy_queue;
+ unit[i].disk->queue = blk_init_queue(do_fd_request,
+ &ataflop_lock);
+ if (!unit[i].disk->queue)
+ goto Enomem;
set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
add_disk(unit[i].disk);
}
@@ -1959,10 +1982,14 @@ static int __init atari_floppy_init (void)
return 0;
Enomem:
- while (i--)
+ while (i--) {
+ struct request_queue *q = unit[i].disk->queue;
+
put_disk(unit[i].disk);
- if (floppy_queue)
- blk_cleanup_queue(floppy_queue);
+ if (q)
+ blk_cleanup_queue(q);
+ }
+
unregister_blkdev(FLOPPY_MAJOR, "fd");
return -ENOMEM;
}
@@ -2011,12 +2038,14 @@ static void __exit atari_floppy_exit(void)
int i;
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
for (i = 0; i < FD_MAX_UNITS; i++) {
+ struct request_queue *q = unit[i].disk->queue;
+
del_gendisk(unit[i].disk);
put_disk(unit[i].disk);
+ blk_cleanup_queue(q);
}
unregister_blkdev(FLOPPY_MAJOR, "fd");
- blk_cleanup_queue(floppy_queue);
del_timer_sync(&fd_timer);
atari_stram_free( DMABuffer );
}
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 1c7f637..b7f51e4 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -15,7 +15,7 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
#include <linux/slab.h>
@@ -55,6 +55,7 @@ struct brd_device {
/*
* Look up and return a brd's page for a given sector.
*/
+static DEFINE_MUTEX(brd_mutex);
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
pgoff_t idx;
@@ -402,7 +403,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
* ram device BLKFLSBUF has special semantics, we want to actually
* release and destroy the ramdisk data.
*/
- lock_kernel();
+ mutex_lock(&brd_mutex);
mutex_lock(&bdev->bd_mutex);
error = -EBUSY;
if (bdev->bd_openers <= 1) {
@@ -419,7 +420,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
error = 0;
}
mutex_unlock(&bdev->bd_mutex);
- unlock_kernel();
+ mutex_unlock(&brd_mutex);
return error;
}
@@ -482,7 +483,6 @@ static struct brd_device *brd_alloc(int i)
if (!brd->brd_queue)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
- blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index a54f7f4..516d5bb 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -26,7 +26,6 @@
#include <linux/pci.h>
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/delay.h>
#include <linux/major.h>
#include <linux/fs.h>
@@ -66,11 +65,8 @@ MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
MODULE_VERSION("3.6.26");
MODULE_LICENSE("GPL");
-static int cciss_allow_hpsa;
-module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(cciss_allow_hpsa,
- "Prevent cciss driver from accessing hardware known to be "
- " supported by the hpsa driver");
+static DEFINE_MUTEX(cciss_mutex);
+static struct proc_dir_entry *proc_cciss;
#include "cciss_cmd.h"
#include "cciss.h"
@@ -98,18 +94,6 @@ static const struct pci_device_id cciss_pci_device_id[] = {
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3215},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3237},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x323D},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3241},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3243},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3245},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3247},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324A},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324B},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3250},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3251},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3252},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3253},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3254},
{0,}
};
@@ -130,6 +114,8 @@ static struct board_type products[] = {
{0x409D0E11, "Smart Array 6400 EM", &SA5_access},
{0x40910E11, "Smart Array 6i", &SA5_access},
{0x3225103C, "Smart Array P600", &SA5_access},
+ {0x3223103C, "Smart Array P800", &SA5_access},
+ {0x3234103C, "Smart Array P400", &SA5_access},
{0x3235103C, "Smart Array P400i", &SA5_access},
{0x3211103C, "Smart Array E200i", &SA5_access},
{0x3212103C, "Smart Array E200", &SA5_access},
@@ -137,23 +123,9 @@ static struct board_type products[] = {
{0x3214103C, "Smart Array E200i", &SA5_access},
{0x3215103C, "Smart Array E200i", &SA5_access},
{0x3237103C, "Smart Array E500", &SA5_access},
-/* controllers below this line are also supported by the hpsa driver. */
-#define HPSA_BOUNDARY 0x3223103C
{0x3223103C, "Smart Array P800", &SA5_access},
{0x3234103C, "Smart Array P400", &SA5_access},
{0x323D103C, "Smart Array P700m", &SA5_access},
- {0x3241103C, "Smart Array P212", &SA5_access},
- {0x3243103C, "Smart Array P410", &SA5_access},
- {0x3245103C, "Smart Array P410i", &SA5_access},
- {0x3247103C, "Smart Array P411", &SA5_access},
- {0x3249103C, "Smart Array P812", &SA5_access},
- {0x324A103C, "Smart Array P712m", &SA5_access},
- {0x324B103C, "Smart Array P711m", &SA5_access},
- {0x3250103C, "Smart Array", &SA5_access},
- {0x3251103C, "Smart Array", &SA5_access},
- {0x3252103C, "Smart Array", &SA5_access},
- {0x3253103C, "Smart Array", &SA5_access},
- {0x3254103C, "Smart Array", &SA5_access},
};
/* How long to wait (in milliseconds) for board to go into simple mode */
@@ -392,8 +364,6 @@ static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG",
#define ENG_GIG_FACTOR (ENG_GIG/512)
#define ENGAGE_SCSI "engage scsi"
-static struct proc_dir_entry *proc_cciss;
-
static void cciss_seq_show_header(struct seq_file *seq)
{
ctlr_info_t *h = seq->private;
@@ -1059,9 +1029,9 @@ static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&cciss_mutex);
ret = cciss_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&cciss_mutex);
return ret;
}
@@ -1074,13 +1044,13 @@ static int cciss_release(struct gendisk *disk, fmode_t mode)
ctlr_info_t *h;
drive_info_struct *drv;
- lock_kernel();
+ mutex_lock(&cciss_mutex);
h = get_host(disk);
drv = get_drv(disk);
dev_dbg(&h->pdev->dev, "cciss_release %s\n", disk->disk_name);
drv->usage_count--;
h->usage_count--;
- unlock_kernel();
+ mutex_unlock(&cciss_mutex);
return 0;
}
@@ -1088,9 +1058,9 @@ static int do_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
int ret;
- lock_kernel();
+ mutex_lock(&cciss_mutex);
ret = cciss_ioctl(bdev, mode, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&cciss_mutex);
return ret;
}
@@ -1182,6 +1152,7 @@ static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode,
int err;
u32 cp;
+ memset(&arg64, 0, sizeof(arg64));
err = 0;
err |=
copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
@@ -1232,470 +1203,452 @@ static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c)
c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION)
(void)check_for_unit_attention(h, c);
}
-/*
- * ioctl
- */
-static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+
+static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp)
{
- struct gendisk *disk = bdev->bd_disk;
- ctlr_info_t *h = get_host(disk);
- drive_info_struct *drv = get_drv(disk);
- void __user *argp = (void __user *)arg;
+ cciss_pci_info_struct pciinfo;
- dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
- cmd, arg);
- switch (cmd) {
- case CCISS_GETPCIINFO:
- {
- cciss_pci_info_struct pciinfo;
-
- if (!arg)
- return -EINVAL;
- pciinfo.domain = pci_domain_nr(h->pdev->bus);
- pciinfo.bus = h->pdev->bus->number;
- pciinfo.dev_fn = h->pdev->devfn;
- pciinfo.board_id = h->board_id;
- if (copy_to_user
- (argp, &pciinfo, sizeof(cciss_pci_info_struct)))
- return -EFAULT;
- return 0;
- }
- case CCISS_GETINTINFO:
- {
- cciss_coalint_struct intinfo;
- if (!arg)
- return -EINVAL;
- intinfo.delay =
- readl(&h->cfgtable->HostWrite.CoalIntDelay);
- intinfo.count =
- readl(&h->cfgtable->HostWrite.CoalIntCount);
- if (copy_to_user
- (argp, &intinfo, sizeof(cciss_coalint_struct)))
- return -EFAULT;
- return 0;
- }
- case CCISS_SETINTINFO:
- {
- cciss_coalint_struct intinfo;
- unsigned long flags;
- int i;
-
- if (!arg)
- return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (copy_from_user
- (&intinfo, argp, sizeof(cciss_coalint_struct)))
- return -EFAULT;
- if ((intinfo.delay == 0) && (intinfo.count == 0))
- return -EINVAL;
- spin_lock_irqsave(&h->lock, flags);
- /* Update the field, and then ring the doorbell */
- writel(intinfo.delay,
- &(h->cfgtable->HostWrite.CoalIntDelay));
- writel(intinfo.count,
- &(h->cfgtable->HostWrite.CoalIntCount));
- writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
-
- for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
- if (!(readl(h->vaddr + SA5_DOORBELL)
- & CFGTBL_ChangeReq))
- break;
- /* delay and try again */
- udelay(1000);
- }
- spin_unlock_irqrestore(&h->lock, flags);
- if (i >= MAX_IOCTL_CONFIG_WAIT)
- return -EAGAIN;
- return 0;
- }
- case CCISS_GETNODENAME:
- {
- NodeName_type NodeName;
- int i;
-
- if (!arg)
- return -EINVAL;
- for (i = 0; i < 16; i++)
- NodeName[i] =
- readb(&h->cfgtable->ServerName[i]);
- if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
- return -EFAULT;
- return 0;
- }
- case CCISS_SETNODENAME:
- {
- NodeName_type NodeName;
- unsigned long flags;
- int i;
+ if (!argp)
+ return -EINVAL;
+ pciinfo.domain = pci_domain_nr(h->pdev->bus);
+ pciinfo.bus = h->pdev->bus->number;
+ pciinfo.dev_fn = h->pdev->devfn;
+ pciinfo.board_id = h->board_id;
+ if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct)))
+ return -EFAULT;
+ return 0;
+}
- if (!arg)
- return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+static int cciss_getintinfo(ctlr_info_t *h, void __user *argp)
+{
+ cciss_coalint_struct intinfo;
- if (copy_from_user
- (NodeName, argp, sizeof(NodeName_type)))
- return -EFAULT;
+ if (!argp)
+ return -EINVAL;
+ intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay);
+ intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount);
+ if (copy_to_user
+ (argp, &intinfo, sizeof(cciss_coalint_struct)))
+ return -EFAULT;
+ return 0;
+}
- spin_lock_irqsave(&h->lock, flags);
+static int cciss_setintinfo(ctlr_info_t *h, void __user *argp)
+{
+ cciss_coalint_struct intinfo;
+ unsigned long flags;
+ int i;
- /* Update the field, and then ring the doorbell */
- for (i = 0; i < 16; i++)
- writeb(NodeName[i],
- &h->cfgtable->ServerName[i]);
+ if (!argp)
+ return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&intinfo, argp, sizeof(intinfo)))
+ return -EFAULT;
+ if ((intinfo.delay == 0) && (intinfo.count == 0))
+ return -EINVAL;
+ spin_lock_irqsave(&h->lock, flags);
+ /* Update the field, and then ring the doorbell */
+ writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay));
+ writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount));
+ writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
- writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+ for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+ if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+ break;
+ udelay(1000); /* delay and try again */
+ }
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (i >= MAX_IOCTL_CONFIG_WAIT)
+ return -EAGAIN;
+ return 0;
+}
- for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
- if (!(readl(h->vaddr + SA5_DOORBELL)
- & CFGTBL_ChangeReq))
- break;
- /* delay and try again */
- udelay(1000);
- }
- spin_unlock_irqrestore(&h->lock, flags);
- if (i >= MAX_IOCTL_CONFIG_WAIT)
- return -EAGAIN;
- return 0;
- }
+static int cciss_getnodename(ctlr_info_t *h, void __user *argp)
+{
+ NodeName_type NodeName;
+ int i;
- case CCISS_GETHEARTBEAT:
- {
- Heartbeat_type heartbeat;
-
- if (!arg)
- return -EINVAL;
- heartbeat = readl(&h->cfgtable->HeartBeat);
- if (copy_to_user
- (argp, &heartbeat, sizeof(Heartbeat_type)))
- return -EFAULT;
- return 0;
- }
- case CCISS_GETBUSTYPES:
- {
- BusTypes_type BusTypes;
-
- if (!arg)
- return -EINVAL;
- BusTypes = readl(&h->cfgtable->BusTypes);
- if (copy_to_user
- (argp, &BusTypes, sizeof(BusTypes_type)))
- return -EFAULT;
- return 0;
- }
- case CCISS_GETFIRMVER:
- {
- FirmwareVer_type firmware;
+ if (!argp)
+ return -EINVAL;
+ for (i = 0; i < 16; i++)
+ NodeName[i] = readb(&h->cfgtable->ServerName[i]);
+ if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
+ return -EFAULT;
+ return 0;
+}
- if (!arg)
- return -EINVAL;
- memcpy(firmware, h->firm_ver, 4);
+static int cciss_setnodename(ctlr_info_t *h, void __user *argp)
+{
+ NodeName_type NodeName;
+ unsigned long flags;
+ int i;
- if (copy_to_user
- (argp, firmware, sizeof(FirmwareVer_type)))
- return -EFAULT;
- return 0;
- }
- case CCISS_GETDRIVVER:
- {
- DriverVer_type DriverVer = DRIVER_VERSION;
+ if (!argp)
+ return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(NodeName, argp, sizeof(NodeName_type)))
+ return -EFAULT;
+ spin_lock_irqsave(&h->lock, flags);
+ /* Update the field, and then ring the doorbell */
+ for (i = 0; i < 16; i++)
+ writeb(NodeName[i], &h->cfgtable->ServerName[i]);
+ writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+ for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+ if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+ break;
+ udelay(1000); /* delay and try again */
+ }
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (i >= MAX_IOCTL_CONFIG_WAIT)
+ return -EAGAIN;
+ return 0;
+}
- if (!arg)
- return -EINVAL;
+static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp)
+{
+ Heartbeat_type heartbeat;
- if (copy_to_user
- (argp, &DriverVer, sizeof(DriverVer_type)))
- return -EFAULT;
- return 0;
- }
+ if (!argp)
+ return -EINVAL;
+ heartbeat = readl(&h->cfgtable->HeartBeat);
+ if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type)))
+ return -EFAULT;
+ return 0;
+}
- case CCISS_DEREGDISK:
- case CCISS_REGNEWD:
- case CCISS_REVALIDVOLS:
- return rebuild_lun_table(h, 0, 1);
+static int cciss_getbustypes(ctlr_info_t *h, void __user *argp)
+{
+ BusTypes_type BusTypes;
- case CCISS_GETLUNINFO:{
- LogvolInfo_struct luninfo;
+ if (!argp)
+ return -EINVAL;
+ BusTypes = readl(&h->cfgtable->BusTypes);
+ if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type)))
+ return -EFAULT;
+ return 0;
+}
- memcpy(&luninfo.LunID, drv->LunID,
- sizeof(luninfo.LunID));
- luninfo.num_opens = drv->usage_count;
- luninfo.num_parts = 0;
- if (copy_to_user(argp, &luninfo,
- sizeof(LogvolInfo_struct)))
- return -EFAULT;
- return 0;
+static int cciss_getfirmver(ctlr_info_t *h, void __user *argp)
+{
+ FirmwareVer_type firmware;
+
+ if (!argp)
+ return -EINVAL;
+ memcpy(firmware, h->firm_ver, 4);
+
+ if (copy_to_user
+ (argp, firmware, sizeof(FirmwareVer_type)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_getdrivver(ctlr_info_t *h, void __user *argp)
+{
+ DriverVer_type DriverVer = DRIVER_VERSION;
+
+ if (!argp)
+ return -EINVAL;
+ if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_getluninfo(ctlr_info_t *h,
+ struct gendisk *disk, void __user *argp)
+{
+ LogvolInfo_struct luninfo;
+ drive_info_struct *drv = get_drv(disk);
+
+ if (!argp)
+ return -EINVAL;
+ memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID));
+ luninfo.num_opens = drv->usage_count;
+ luninfo.num_parts = 0;
+ if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_passthru(ctlr_info_t *h, void __user *argp)
+{
+ IOCTL_Command_struct iocommand;
+ CommandList_struct *c;
+ char *buff = NULL;
+ u64bit temp64;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ if (!argp)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ if (copy_from_user
+ (&iocommand, argp, sizeof(IOCTL_Command_struct)))
+ return -EFAULT;
+ if ((iocommand.buf_size < 1) &&
+ (iocommand.Request.Type.Direction != XFER_NONE)) {
+ return -EINVAL;
+ }
+ if (iocommand.buf_size > 0) {
+ buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
+ if (buff == NULL)
+ return -EFAULT;
+ }
+ if (iocommand.Request.Type.Direction == XFER_WRITE) {
+ /* Copy the data into the buffer we created */
+ if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) {
+ kfree(buff);
+ return -EFAULT;
}
- case CCISS_PASSTHRU:
- {
- IOCTL_Command_struct iocommand;
- CommandList_struct *c;
- char *buff = NULL;
- u64bit temp64;
- DECLARE_COMPLETION_ONSTACK(wait);
-
- if (!arg)
- return -EINVAL;
-
- if (!capable(CAP_SYS_RAWIO))
- return -EPERM;
-
- if (copy_from_user
- (&iocommand, argp, sizeof(IOCTL_Command_struct)))
- return -EFAULT;
- if ((iocommand.buf_size < 1) &&
- (iocommand.Request.Type.Direction != XFER_NONE)) {
- return -EINVAL;
- }
-#if 0 /* 'buf_size' member is 16-bits, and always smaller than kmalloc limit */
- /* Check kmalloc limits */
- if (iocommand.buf_size > 128000)
- return -EINVAL;
-#endif
- if (iocommand.buf_size > 0) {
- buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
- if (buff == NULL)
- return -EFAULT;
- }
- if (iocommand.Request.Type.Direction == XFER_WRITE) {
- /* Copy the data into the buffer we created */
- if (copy_from_user
- (buff, iocommand.buf, iocommand.buf_size)) {
- kfree(buff);
- return -EFAULT;
- }
- } else {
- memset(buff, 0, iocommand.buf_size);
- }
- c = cmd_special_alloc(h);
- if (!c) {
- kfree(buff);
- return -ENOMEM;
- }
- /* Fill in the command type */
- c->cmd_type = CMD_IOCTL_PEND;
- /* Fill in Command Header */
- c->Header.ReplyQueue = 0; /* unused in simple mode */
- if (iocommand.buf_size > 0) /* buffer to fill */
- {
- c->Header.SGList = 1;
- c->Header.SGTotal = 1;
- } else /* no buffers to fill */
- {
- c->Header.SGList = 0;
- c->Header.SGTotal = 0;
- }
- c->Header.LUN = iocommand.LUN_info;
- /* use the kernel address the cmd block for tag */
- c->Header.Tag.lower = c->busaddr;
-
- /* Fill in Request block */
- c->Request = iocommand.Request;
-
- /* Fill in the scatter gather information */
- if (iocommand.buf_size > 0) {
- temp64.val = pci_map_single(h->pdev, buff,
- iocommand.buf_size,
- PCI_DMA_BIDIRECTIONAL);
- c->SG[0].Addr.lower = temp64.val32.lower;
- c->SG[0].Addr.upper = temp64.val32.upper;
- c->SG[0].Len = iocommand.buf_size;
- c->SG[0].Ext = 0; /* we are not chaining */
- }
- c->waiting = &wait;
+ } else {
+ memset(buff, 0, iocommand.buf_size);
+ }
+ c = cmd_special_alloc(h);
+ if (!c) {
+ kfree(buff);
+ return -ENOMEM;
+ }
+ /* Fill in the command type */
+ c->cmd_type = CMD_IOCTL_PEND;
+ /* Fill in Command Header */
+ c->Header.ReplyQueue = 0; /* unused in simple mode */
+ if (iocommand.buf_size > 0) { /* buffer to fill */
+ c->Header.SGList = 1;
+ c->Header.SGTotal = 1;
+ } else { /* no buffers to fill */
+ c->Header.SGList = 0;
+ c->Header.SGTotal = 0;
+ }
+ c->Header.LUN = iocommand.LUN_info;
+ /* use the kernel address the cmd block for tag */
+ c->Header.Tag.lower = c->busaddr;
- enqueue_cmd_and_start_io(h, c);
- wait_for_completion(&wait);
+ /* Fill in Request block */
+ c->Request = iocommand.Request;
- /* unlock the buffers from DMA */
- temp64.val32.lower = c->SG[0].Addr.lower;
- temp64.val32.upper = c->SG[0].Addr.upper;
- pci_unmap_single(h->pdev, (dma_addr_t) temp64.val,
- iocommand.buf_size,
- PCI_DMA_BIDIRECTIONAL);
+ /* Fill in the scatter gather information */
+ if (iocommand.buf_size > 0) {
+ temp64.val = pci_map_single(h->pdev, buff,
+ iocommand.buf_size, PCI_DMA_BIDIRECTIONAL);
+ c->SG[0].Addr.lower = temp64.val32.lower;
+ c->SG[0].Addr.upper = temp64.val32.upper;
+ c->SG[0].Len = iocommand.buf_size;
+ c->SG[0].Ext = 0; /* we are not chaining */
+ }
+ c->waiting = &wait;
- check_ioctl_unit_attention(h, c);
+ enqueue_cmd_and_start_io(h, c);
+ wait_for_completion(&wait);
- /* Copy the error information out */
- iocommand.error_info = *(c->err_info);
- if (copy_to_user
- (argp, &iocommand, sizeof(IOCTL_Command_struct))) {
- kfree(buff);
- cmd_special_free(h, c);
- return -EFAULT;
- }
+ /* unlock the buffers from DMA */
+ temp64.val32.lower = c->SG[0].Addr.lower;
+ temp64.val32.upper = c->SG[0].Addr.upper;
+ pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size,
+ PCI_DMA_BIDIRECTIONAL);
+ check_ioctl_unit_attention(h, c);
+
+ /* Copy the error information out */
+ iocommand.error_info = *(c->err_info);
+ if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) {
+ kfree(buff);
+ cmd_special_free(h, c);
+ return -EFAULT;
+ }
- if (iocommand.Request.Type.Direction == XFER_READ) {
- /* Copy the data out of the buffer we created */
- if (copy_to_user
- (iocommand.buf, buff, iocommand.buf_size)) {
- kfree(buff);
- cmd_special_free(h, c);
- return -EFAULT;
- }
- }
+ if (iocommand.Request.Type.Direction == XFER_READ) {
+ /* Copy the data out of the buffer we created */
+ if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) {
kfree(buff);
cmd_special_free(h, c);
- return 0;
+ return -EFAULT;
}
- case CCISS_BIG_PASSTHRU:{
- BIG_IOCTL_Command_struct *ioc;
- CommandList_struct *c;
- unsigned char **buff = NULL;
- int *buff_size = NULL;
- u64bit temp64;
- BYTE sg_used = 0;
- int status = 0;
- int i;
- DECLARE_COMPLETION_ONSTACK(wait);
- __u32 left;
- __u32 sz;
- BYTE __user *data_ptr;
-
- if (!arg)
- return -EINVAL;
- if (!capable(CAP_SYS_RAWIO))
- return -EPERM;
- ioc = (BIG_IOCTL_Command_struct *)
- kmalloc(sizeof(*ioc), GFP_KERNEL);
- if (!ioc) {
- status = -ENOMEM;
- goto cleanup1;
- }
- if (copy_from_user(ioc, argp, sizeof(*ioc))) {
+ }
+ kfree(buff);
+ cmd_special_free(h, c);
+ return 0;
+}
+
+static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
+{
+ BIG_IOCTL_Command_struct *ioc;
+ CommandList_struct *c;
+ unsigned char **buff = NULL;
+ int *buff_size = NULL;
+ u64bit temp64;
+ BYTE sg_used = 0;
+ int status = 0;
+ int i;
+ DECLARE_COMPLETION_ONSTACK(wait);
+ __u32 left;
+ __u32 sz;
+ BYTE __user *data_ptr;
+
+ if (!argp)
+ return -EINVAL;
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ ioc = (BIG_IOCTL_Command_struct *)
+ kmalloc(sizeof(*ioc), GFP_KERNEL);
+ if (!ioc) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ if (copy_from_user(ioc, argp, sizeof(*ioc))) {
+ status = -EFAULT;
+ goto cleanup1;
+ }
+ if ((ioc->buf_size < 1) &&
+ (ioc->Request.Type.Direction != XFER_NONE)) {
+ status = -EINVAL;
+ goto cleanup1;
+ }
+ /* Check kmalloc limits using all SGs */
+ if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
+ status = -EINVAL;
+ goto cleanup1;
+ }
+ if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
+ status = -EINVAL;
+ goto cleanup1;
+ }
+ buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
+ if (!buff) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL);
+ if (!buff_size) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ left = ioc->buf_size;
+ data_ptr = ioc->buf;
+ while (left) {
+ sz = (left > ioc->malloc_size) ? ioc->malloc_size : left;
+ buff_size[sg_used] = sz;
+ buff[sg_used] = kmalloc(sz, GFP_KERNEL);
+ if (buff[sg_used] == NULL) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ if (ioc->Request.Type.Direction == XFER_WRITE) {
+ if (copy_from_user(buff[sg_used], data_ptr, sz)) {
status = -EFAULT;
goto cleanup1;
}
- if ((ioc->buf_size < 1) &&
- (ioc->Request.Type.Direction != XFER_NONE)) {
- status = -EINVAL;
- goto cleanup1;
- }
- /* Check kmalloc limits using all SGs */
- if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
- status = -EINVAL;
- goto cleanup1;
- }
- if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
- status = -EINVAL;
- goto cleanup1;
- }
- buff =
- kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
- if (!buff) {
- status = -ENOMEM;
- goto cleanup1;
- }
- buff_size = kmalloc(MAXSGENTRIES * sizeof(int),
- GFP_KERNEL);
- if (!buff_size) {
- status = -ENOMEM;
- goto cleanup1;
- }
- left = ioc->buf_size;
- data_ptr = ioc->buf;
- while (left) {
- sz = (left >
- ioc->malloc_size) ? ioc->
- malloc_size : left;
- buff_size[sg_used] = sz;
- buff[sg_used] = kmalloc(sz, GFP_KERNEL);
- if (buff[sg_used] == NULL) {
- status = -ENOMEM;
- goto cleanup1;
- }
- if (ioc->Request.Type.Direction == XFER_WRITE) {
- if (copy_from_user
- (buff[sg_used], data_ptr, sz)) {
- status = -EFAULT;
- goto cleanup1;
- }
- } else {
- memset(buff[sg_used], 0, sz);
- }
- left -= sz;
- data_ptr += sz;
- sg_used++;
- }
- c = cmd_special_alloc(h);
- if (!c) {
- status = -ENOMEM;
- goto cleanup1;
- }
- c->cmd_type = CMD_IOCTL_PEND;
- c->Header.ReplyQueue = 0;
+ } else {
+ memset(buff[sg_used], 0, sz);
+ }
+ left -= sz;
+ data_ptr += sz;
+ sg_used++;
+ }
+ c = cmd_special_alloc(h);
+ if (!c) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ c->cmd_type = CMD_IOCTL_PEND;
+ c->Header.ReplyQueue = 0;
+ c->Header.SGList = sg_used;
+ c->Header.SGTotal = sg_used;
+ c->Header.LUN = ioc->LUN_info;
+ c->Header.Tag.lower = c->busaddr;
- if (ioc->buf_size > 0) {
- c->Header.SGList = sg_used;
- c->Header.SGTotal = sg_used;
- } else {
- c->Header.SGList = 0;
- c->Header.SGTotal = 0;
- }
- c->Header.LUN = ioc->LUN_info;
- c->Header.Tag.lower = c->busaddr;
-
- c->Request = ioc->Request;
- if (ioc->buf_size > 0) {
- for (i = 0; i < sg_used; i++) {
- temp64.val =
- pci_map_single(h->pdev, buff[i],
- buff_size[i],
- PCI_DMA_BIDIRECTIONAL);
- c->SG[i].Addr.lower =
- temp64.val32.lower;
- c->SG[i].Addr.upper =
- temp64.val32.upper;
- c->SG[i].Len = buff_size[i];
- c->SG[i].Ext = 0; /* we are not chaining */
- }
- }
- c->waiting = &wait;
- enqueue_cmd_and_start_io(h, c);
- wait_for_completion(&wait);
- /* unlock the buffers from DMA */
- for (i = 0; i < sg_used; i++) {
- temp64.val32.lower = c->SG[i].Addr.lower;
- temp64.val32.upper = c->SG[i].Addr.upper;
- pci_unmap_single(h->pdev,
- (dma_addr_t) temp64.val, buff_size[i],
- PCI_DMA_BIDIRECTIONAL);
- }
- check_ioctl_unit_attention(h, c);
- /* Copy the error information out */
- ioc->error_info = *(c->err_info);
- if (copy_to_user(argp, ioc, sizeof(*ioc))) {
+ c->Request = ioc->Request;
+ for (i = 0; i < sg_used; i++) {
+ temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i],
+ PCI_DMA_BIDIRECTIONAL);
+ c->SG[i].Addr.lower = temp64.val32.lower;
+ c->SG[i].Addr.upper = temp64.val32.upper;
+ c->SG[i].Len = buff_size[i];
+ c->SG[i].Ext = 0; /* we are not chaining */
+ }
+ c->waiting = &wait;
+ enqueue_cmd_and_start_io(h, c);
+ wait_for_completion(&wait);
+ /* unlock the buffers from DMA */
+ for (i = 0; i < sg_used; i++) {
+ temp64.val32.lower = c->SG[i].Addr.lower;
+ temp64.val32.upper = c->SG[i].Addr.upper;
+ pci_unmap_single(h->pdev,
+ (dma_addr_t) temp64.val, buff_size[i],
+ PCI_DMA_BIDIRECTIONAL);
+ }
+ check_ioctl_unit_attention(h, c);
+ /* Copy the error information out */
+ ioc->error_info = *(c->err_info);
+ if (copy_to_user(argp, ioc, sizeof(*ioc))) {
+ cmd_special_free(h, c);
+ status = -EFAULT;
+ goto cleanup1;
+ }
+ if (ioc->Request.Type.Direction == XFER_READ) {
+ /* Copy the data out of the buffer we created */
+ BYTE __user *ptr = ioc->buf;
+ for (i = 0; i < sg_used; i++) {
+ if (copy_to_user(ptr, buff[i], buff_size[i])) {
cmd_special_free(h, c);
status = -EFAULT;
goto cleanup1;
}
- if (ioc->Request.Type.Direction == XFER_READ) {
- /* Copy the data out of the buffer we created */
- BYTE __user *ptr = ioc->buf;
- for (i = 0; i < sg_used; i++) {
- if (copy_to_user
- (ptr, buff[i], buff_size[i])) {
- cmd_special_free(h, c);
- status = -EFAULT;
- goto cleanup1;
- }
- ptr += buff_size[i];
- }
- }
- cmd_special_free(h, c);
- status = 0;
- cleanup1:
- if (buff) {
- for (i = 0; i < sg_used; i++)
- kfree(buff[i]);
- kfree(buff);
- }
- kfree(buff_size);
- kfree(ioc);
- return status;
+ ptr += buff_size[i];
}
+ }
+ cmd_special_free(h, c);
+ status = 0;
+cleanup1:
+ if (buff) {
+ for (i = 0; i < sg_used; i++)
+ kfree(buff[i]);
+ kfree(buff);
+ }
+ kfree(buff_size);
+ kfree(ioc);
+ return status;
+}
+
+static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ ctlr_info_t *h = get_host(disk);
+ void __user *argp = (void __user *)arg;
+
+ dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
+ cmd, arg);
+ switch (cmd) {
+ case CCISS_GETPCIINFO:
+ return cciss_getpciinfo(h, argp);
+ case CCISS_GETINTINFO:
+ return cciss_getintinfo(h, argp);
+ case CCISS_SETINTINFO:
+ return cciss_setintinfo(h, argp);
+ case CCISS_GETNODENAME:
+ return cciss_getnodename(h, argp);
+ case CCISS_SETNODENAME:
+ return cciss_setnodename(h, argp);
+ case CCISS_GETHEARTBEAT:
+ return cciss_getheartbeat(h, argp);
+ case CCISS_GETBUSTYPES:
+ return cciss_getbustypes(h, argp);
+ case CCISS_GETFIRMVER:
+ return cciss_getfirmver(h, argp);
+ case CCISS_GETDRIVVER:
+ return cciss_getdrivver(h, argp);
+ case CCISS_DEREGDISK:
+ case CCISS_REGNEWD:
+ case CCISS_REVALIDVOLS:
+ return rebuild_lun_table(h, 0, 1);
+ case CCISS_GETLUNINFO:
+ return cciss_getluninfo(h, disk, argp);
+ case CCISS_PASSTHRU:
+ return cciss_passthru(h, argp);
+ case CCISS_BIG_PASSTHRU:
+ return cciss_bigpassthru(h, argp);
/* scsi_cmd_ioctl handles these, below, though some are not */
/* very meaningful for cciss. SG_IO is the main one people want. */
@@ -2881,6 +2834,8 @@ static int cciss_revalidate(struct gendisk *disk)
InquiryData_struct *inq_buff = NULL;
for (logvol = 0; logvol < CISS_MAX_LUN; logvol++) {
+ if (!h->drv[logvol])
+ continue;
if (memcmp(h->drv[logvol]->LunID, drv->LunID,
sizeof(drv->LunID)) == 0) {
FOUND = 1;
@@ -3800,7 +3755,7 @@ static void __devinit cciss_wait_for_mode_change_ack(ctlr_info_t *h)
for (i = 0; i < MAX_CONFIG_WAIT; i++) {
if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
break;
- msleep(10);
+ usleep_range(10000, 20000);
}
}
@@ -3984,13 +3939,9 @@ static int __devinit cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
*board_id = ((subsystem_device_id << 16) & 0xffff0000) |
subsystem_vendor_id;
- for (i = 0; i < ARRAY_SIZE(products); i++) {
- /* Stand aside for hpsa driver on request */
- if (cciss_allow_hpsa && products[i].board_id == HPSA_BOUNDARY)
- return -ENODEV;
+ for (i = 0; i < ARRAY_SIZE(products); i++)
if (*board_id == products[i].board_id)
return i;
- }
dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n",
*board_id);
return -ENODEV;
@@ -4021,18 +3972,31 @@ static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev,
return -ENODEV;
}
-static int __devinit cciss_wait_for_board_ready(ctlr_info_t *h)
+static int __devinit cciss_wait_for_board_state(struct pci_dev *pdev,
+ void __iomem *vaddr, int wait_for_ready)
+#define BOARD_READY 1
+#define BOARD_NOT_READY 0
{
- int i;
+ int i, iterations;
u32 scratchpad;
- for (i = 0; i < CCISS_BOARD_READY_ITERATIONS; i++) {
- scratchpad = readl(h->vaddr + SA5_SCRATCHPAD_OFFSET);
- if (scratchpad == CCISS_FIRMWARE_READY)
- return 0;
+ if (wait_for_ready)
+ iterations = CCISS_BOARD_READY_ITERATIONS;
+ else
+ iterations = CCISS_BOARD_NOT_READY_ITERATIONS;
+
+ for (i = 0; i < iterations; i++) {
+ scratchpad = readl(vaddr + SA5_SCRATCHPAD_OFFSET);
+ if (wait_for_ready) {
+ if (scratchpad == CCISS_FIRMWARE_READY)
+ return 0;
+ } else {
+ if (scratchpad != CCISS_FIRMWARE_READY)
+ return 0;
+ }
msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS);
}
- dev_warn(&h->pdev->dev, "board not ready, timed out.\n");
+ dev_warn(&pdev->dev, "board not ready, timed out.\n");
return -ENODEV;
}
@@ -4081,6 +4045,11 @@ static int __devinit cciss_find_cfgtables(ctlr_info_t *h)
static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h)
{
h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands));
+
+ /* Limit commands in memory limited kdump scenario. */
+ if (reset_devices && h->max_commands > 32)
+ h->max_commands = 32;
+
if (h->max_commands < 16) {
dev_warn(&h->pdev->dev, "Controller reports "
"max supported commands of %d, an obvious lie. "
@@ -4198,7 +4167,7 @@ static int __devinit cciss_pci_init(ctlr_info_t *h)
err = -ENOMEM;
goto err_out_free_res;
}
- err = cciss_wait_for_board_ready(h);
+ err = cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY);
if (err)
goto err_out_free_res;
err = cciss_find_cfgtables(h);
@@ -4363,36 +4332,6 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0)
#define cciss_noop(p) cciss_message(p, 3, 0)
-static __devinit int cciss_reset_msi(struct pci_dev *pdev)
-{
-/* the #defines are stolen from drivers/pci/msi.h. */
-#define msi_control_reg(base) (base + PCI_MSI_FLAGS)
-#define PCI_MSIX_FLAGS_ENABLE (1 << 15)
-
- int pos;
- u16 control = 0;
-
- pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
- if (pos) {
- pci_read_config_word(pdev, msi_control_reg(pos), &control);
- if (control & PCI_MSI_FLAGS_ENABLE) {
- dev_info(&pdev->dev, "resetting MSI\n");
- pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSI_FLAGS_ENABLE);
- }
- }
-
- pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
- if (pos) {
- pci_read_config_word(pdev, msi_control_reg(pos), &control);
- if (control & PCI_MSIX_FLAGS_ENABLE) {
- dev_info(&pdev->dev, "resetting MSI-X\n");
- pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSIX_FLAGS_ENABLE);
- }
- }
-
- return 0;
-}
-
static int cciss_controller_hard_reset(struct pci_dev *pdev,
void * __iomem vaddr, bool use_doorbell)
{
@@ -4447,17 +4386,17 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev,
* states or using the doorbell register. */
static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
{
- u16 saved_config_space[32];
u64 cfg_offset;
u32 cfg_base_addr;
u64 cfg_base_addr_index;
void __iomem *vaddr;
unsigned long paddr;
u32 misc_fw_support, active_transport;
- int rc, i;
+ int rc;
CfgTable_struct __iomem *cfgtable;
bool use_doorbell;
u32 board_id;
+ u16 command_register;
/* For controllers as old a the p600, this is very nearly
* the same thing as
@@ -4467,14 +4406,6 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
* pci_set_power_state(pci_dev, PCI_D0);
* pci_restore_state(pci_dev);
*
- * but we can't use these nice canned kernel routines on
- * kexec, because they also check the MSI/MSI-X state in PCI
- * configuration space and do the wrong thing when it is
- * set/cleared. Also, the pci_save/restore_state functions
- * violate the ordering requirements for restoring the
- * configuration space from the CCISS document (see the
- * comment below). So we roll our own ....
- *
* For controllers newer than the P600, the pci power state
* method of resetting doesn't work so we have another way
* using the doorbell register.
@@ -4493,8 +4424,13 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
return -ENODEV;
}
- for (i = 0; i < 32; i++)
- pci_read_config_word(pdev, 2*i, &saved_config_space[i]);
+ /* Save the PCI command register */
+ pci_read_config_word(pdev, 4, &command_register);
+ /* Turn the board off. This is so that later pci_restore_state()
+ * won't turn the board on before the rest of config space is ready.
+ */
+ pci_disable_device(pdev);
+ pci_save_state(pdev);
/* find the first memory BAR, so we can find the cfg table */
rc = cciss_pci_find_memory_BAR(pdev, &paddr);
@@ -4529,26 +4465,32 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell);
if (rc)
goto unmap_cfgtable;
-
- /* Restore the PCI configuration space. The Open CISS
- * Specification says, "Restore the PCI Configuration
- * Registers, offsets 00h through 60h. It is important to
- * restore the command register, 16-bits at offset 04h,
- * last. Do not restore the configuration status register,
- * 16-bits at offset 06h." Note that the offset is 2*i.
- */
- for (i = 0; i < 32; i++) {
- if (i == 2 || i == 3)
- continue;
- pci_write_config_word(pdev, 2*i, saved_config_space[i]);
+ pci_restore_state(pdev);
+ rc = pci_enable_device(pdev);
+ if (rc) {
+ dev_warn(&pdev->dev, "failed to enable device.\n");
+ goto unmap_cfgtable;
}
- wmb();
- pci_write_config_word(pdev, 4, saved_config_space[2]);
+ pci_write_config_word(pdev, 4, command_register);
/* Some devices (notably the HP Smart Array 5i Controller)
need a little pause here */
msleep(CCISS_POST_RESET_PAUSE_MSECS);
+ /* Wait for board to become not ready, then ready. */
+ dev_info(&pdev->dev, "Waiting for board to become ready.\n");
+ rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
+ if (rc) /* Don't bail, might be E500, etc. which can't be reset */
+ dev_warn(&pdev->dev,
+ "failed waiting for board to become not ready\n");
+ rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY);
+ if (rc) {
+ dev_warn(&pdev->dev,
+ "failed waiting for board to become ready\n");
+ goto unmap_cfgtable;
+ }
+ dev_info(&pdev->dev, "board ready.\n");
+
/* Controller should be in simple mode at this point. If it's not,
* It means we're on one of those controllers which doesn't support
* the doorbell reset method and on which the PCI power management reset
@@ -4589,8 +4531,6 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
return 0; /* just try to do the kdump anyhow. */
if (rc)
return -ENODEV;
- if (cciss_reset_msi(pdev))
- return -ENODEV;
/* Now try to get the controller to respond to a no-op */
for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) {
@@ -4986,7 +4926,8 @@ static void __exit cciss_cleanup(void)
}
}
kthread_stop(cciss_scan_thread);
- remove_proc_entry("driver/cciss", NULL);
+ if (proc_cciss)
+ remove_proc_entry("driver/cciss", NULL);
bus_unregister(&cciss_bus_type);
}
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 7915dd4..579f749 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -200,10 +200,14 @@ struct ctlr_info
* the above.
*/
#define CCISS_BOARD_READY_WAIT_SECS (120)
+#define CCISS_BOARD_NOT_READY_WAIT_SECS (10)
#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100)
#define CCISS_BOARD_READY_ITERATIONS \
((CCISS_BOARD_READY_WAIT_SECS * 1000) / \
CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
+#define CCISS_BOARD_NOT_READY_ITERATIONS \
+ ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \
+ CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
#define CCISS_POST_RESET_PAUSE_MSECS (3000)
#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000)
#define CCISS_POST_RESET_NOOP_RETRIES (12)
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 575495f..727d022 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -62,8 +62,8 @@ static int cciss_scsi_proc_info(
int length, /* length of data in buffer */
int func); /* 0 == read, 1 == write */
-static int cciss_scsi_queue_command (struct scsi_cmnd *cmd,
- void (* done)(struct scsi_cmnd *));
+static int cciss_scsi_queue_command (struct Scsi_Host *h,
+ struct scsi_cmnd *cmd);
static int cciss_eh_device_reset_handler(struct scsi_cmnd *);
static int cciss_eh_abort_handler(struct scsi_cmnd *);
@@ -1406,7 +1406,7 @@ static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
static int
-cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd *))
+cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
{
ctlr_info_t *h;
int rc;
@@ -1504,6 +1504,8 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd
return 0;
}
+static DEF_SCSI_QCMD(cciss_scsi_queue_command)
+
static void cciss_unregister_scsi(ctlr_info_t *h)
{
struct cciss_scsi_adapter_data_t *sa;
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index d53b029..946dad4 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -35,7 +35,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/hdreg.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
@@ -68,6 +68,7 @@ MODULE_LICENSE("GPL");
#define CPQARRAY_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */
+static DEFINE_MUTEX(cpqarray_mutex);
static int nr_ctlr;
static ctlr_info_t *hba[MAX_CTLR];
@@ -845,9 +846,9 @@ static int ida_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&cpqarray_mutex);
ret = ida_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&cpqarray_mutex);
return ret;
}
@@ -859,10 +860,10 @@ static int ida_release(struct gendisk *disk, fmode_t mode)
{
ctlr_info_t *host;
- lock_kernel();
+ mutex_lock(&cpqarray_mutex);
host = get_host(disk);
host->usage_count--;
- unlock_kernel();
+ mutex_unlock(&cpqarray_mutex);
return 0;
}
@@ -1217,9 +1218,9 @@ static int ida_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&cpqarray_mutex);
ret = ida_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&cpqarray_mutex);
return ret;
}
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 9400845..ba95cba 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -78,11 +78,10 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
init_completion(&md_io.event);
md_io.error = 0;
- if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
- rw |= REQ_HARDBARRIER;
+ if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
+ rw |= REQ_FUA;
rw |= REQ_UNPLUG | REQ_SYNC;
- retry:
bio = bio_alloc(GFP_NOIO, 1);
bio->bi_bdev = bdev->md_bdev;
bio->bi_sector = sector;
@@ -100,17 +99,6 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
wait_for_completion(&md_io.event);
ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
- /* check for unsupported barrier op.
- * would rather check on EOPNOTSUPP, but that is not reliable.
- * don't try again for ANY return value != 0 */
- if (unlikely((bio->bi_rw & REQ_HARDBARRIER) && !ok)) {
- /* Try again with no barrier */
- dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
- set_bit(MD_NO_BARRIER, &mdev->flags);
- rw &= ~REQ_HARDBARRIER;
- bio_put(bio);
- goto retry;
- }
out:
bio_put(bio);
return ok;
@@ -284,18 +272,32 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
u32 xor_sum = 0;
if (!get_ldev(mdev)) {
- dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+ dev_err(DEV,
+ "disk is %s, cannot start al transaction (-%d +%d)\n",
+ drbd_disk_str(mdev->state.disk), evicted, new_enr);
complete(&((struct update_al_work *)w)->event);
return 1;
}
/* do we have to do a bitmap write, first?
* TODO reduce maximum latency:
* submit both bios, then wait for both,
- * instead of doing two synchronous sector writes. */
+ * instead of doing two synchronous sector writes.
+ * For now, we must not write the transaction,
+ * if we cannot write out the bitmap of the evicted extent. */
if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
- mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+ /* The bitmap write may have failed, causing a state change. */
+ if (mdev->state.disk < D_INCONSISTENT) {
+ dev_err(DEV,
+ "disk is %s, cannot write al transaction (-%d +%d)\n",
+ drbd_disk_str(mdev->state.disk), evicted, new_enr);
+ complete(&((struct update_al_work *)w)->event);
+ put_ldev(mdev);
+ return 1;
+ }
+
+ mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
buffer = (struct al_transaction *)page_address(mdev->md_io_page);
buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
@@ -739,7 +741,7 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
unsigned int enr;
unsigned long add = 0;
char ppb[10];
- int i;
+ int i, tmp;
wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
@@ -747,7 +749,9 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
enr = lc_element_by_index(mdev->act_log, i)->lc_number;
if (enr == LC_FREE)
continue;
- add += drbd_bm_ALe_set_all(mdev, enr);
+ tmp = drbd_bm_ALe_set_all(mdev, enr);
+ dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
+ add += tmp;
}
lc_unlock(mdev->act_log);
@@ -965,29 +969,30 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
- spin_lock_irqsave(&mdev->al_lock, flags);
count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
- if (count) {
- /* we need the lock for drbd_try_clear_on_disk_bm */
- if (jiffies - mdev->rs_mark_time > HZ*10) {
- /* should be rolling marks,
- * but we estimate only anyways. */
- if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
+ if (count && get_ldev(mdev)) {
+ unsigned long now = jiffies;
+ unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
+ int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+ if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+ unsigned long tw = drbd_bm_total_weight(mdev);
+ if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
mdev->state.conn != C_PAUSED_SYNC_T &&
mdev->state.conn != C_PAUSED_SYNC_S) {
- mdev->rs_mark_time = jiffies;
- mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+ mdev->rs_mark_time[next] = now;
+ mdev->rs_mark_left[next] = tw;
+ mdev->rs_last_mark = next;
}
}
- if (get_ldev(mdev)) {
- drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
- put_ldev(mdev);
- }
+ spin_lock_irqsave(&mdev->al_lock, flags);
+ drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
+ put_ldev(mdev);
}
- spin_unlock_irqrestore(&mdev->al_lock, flags);
if (wake_up)
wake_up(&mdev->al_wait);
}
@@ -1118,7 +1123,7 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
* @mdev: DRBD device.
* @sector: The sector number.
*
- * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
+ * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
*/
int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
@@ -1129,10 +1134,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
sig = wait_event_interruptible(mdev->al_wait,
(bm_ext = _bme_get(mdev, enr)));
if (sig)
- return 0;
+ return -EINTR;
if (test_bit(BME_LOCKED, &bm_ext->flags))
- return 1;
+ return 0;
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
sig = wait_event_interruptible(mdev->al_wait,
@@ -1145,13 +1150,11 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
wake_up(&mdev->al_wait);
}
spin_unlock_irq(&mdev->al_lock);
- return 0;
+ return -EINTR;
}
}
-
set_bit(BME_LOCKED, &bm_ext->flags);
-
- return 1;
+ return 0;
}
/**
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index e3f88d6..fd42832 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -569,7 +569,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
*
* maybe bm_set should be atomic_t ?
*/
-static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
+unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long s;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 352441b..3803a03 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -114,11 +114,11 @@ struct drbd_conf;
#define D_ASSERT(exp) if (!(exp)) \
dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
-#define ERR_IF(exp) if (({ \
- int _b = (exp) != 0; \
- if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \
- __func__, #exp, __FILE__, __LINE__); \
- _b; \
+#define ERR_IF(exp) if (({ \
+ int _b = (exp) != 0; \
+ if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \
+ __func__, #exp, __FILE__, __LINE__); \
+ _b; \
}))
/* Defines to control fault insertion */
@@ -337,13 +337,25 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
* NOTE that the payload starts at a long aligned offset,
* regardless of 32 or 64 bit arch!
*/
-struct p_header {
+struct p_header80 {
u32 magic;
u16 command;
u16 length; /* bytes of data after this header */
u8 payload[0];
} __packed;
-/* 8 bytes. packet FIXED for the next century! */
+
+/* Header for big packets, Used for data packets exceeding 64kB */
+struct p_header95 {
+ u16 magic; /* use DRBD_MAGIC_BIG here */
+ u16 command;
+ u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */
+ u8 payload[0];
+} __packed;
+
+union p_header {
+ struct p_header80 h80;
+ struct p_header95 h95;
+};
/*
* short commands, packets without payload, plain p_header:
@@ -362,12 +374,16 @@ struct p_header {
*/
/* these defines must not be changed without changing the protocol version */
-#define DP_HARDBARRIER 1
-#define DP_RW_SYNC 2
+#define DP_HARDBARRIER 1 /* depricated */
+#define DP_RW_SYNC 2 /* equals REQ_SYNC */
#define DP_MAY_SET_IN_SYNC 4
+#define DP_UNPLUG 8 /* equals REQ_UNPLUG */
+#define DP_FUA 16 /* equals REQ_FUA */
+#define DP_FLUSH 32 /* equals REQ_FLUSH */
+#define DP_DISCARD 64 /* equals REQ_DISCARD */
struct p_data {
- struct p_header head;
+ union p_header head;
u64 sector; /* 64 bits sector number */
u64 block_id; /* to identify the request in protocol B&C */
u32 seq_num;
@@ -383,7 +399,7 @@ struct p_data {
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
struct p_block_ack {
- struct p_header head;
+ struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
@@ -392,7 +408,7 @@ struct p_block_ack {
struct p_block_req {
- struct p_header head;
+ struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
@@ -409,7 +425,7 @@ struct p_block_req {
*/
struct p_handshake {
- struct p_header head; /* 8 bytes */
+ struct p_header80 head; /* 8 bytes */
u32 protocol_min;
u32 feature_flags;
u32 protocol_max;
@@ -424,19 +440,19 @@ struct p_handshake {
/* 80 bytes, FIXED for the next century */
struct p_barrier {
- struct p_header head;
+ struct p_header80 head;
u32 barrier; /* barrier number _handle_ only */
u32 pad; /* to multiple of 8 Byte */
} __packed;
struct p_barrier_ack {
- struct p_header head;
+ struct p_header80 head;
u32 barrier;
u32 set_size;
} __packed;
struct p_rs_param {
- struct p_header head;
+ struct p_header80 head;
u32 rate;
/* Since protocol version 88 and higher. */
@@ -444,20 +460,31 @@ struct p_rs_param {
} __packed;
struct p_rs_param_89 {
- struct p_header head;
+ struct p_header80 head;
u32 rate;
/* protocol version 89: */
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
} __packed;
+struct p_rs_param_95 {
+ struct p_header80 head;
+ u32 rate;
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+ u32 c_plan_ahead;
+ u32 c_delay_target;
+ u32 c_fill_target;
+ u32 c_max_rate;
+} __packed;
+
enum drbd_conn_flags {
CF_WANT_LOSE = 1,
CF_DRY_RUN = 2,
};
struct p_protocol {
- struct p_header head;
+ struct p_header80 head;
u32 protocol;
u32 after_sb_0p;
u32 after_sb_1p;
@@ -471,17 +498,17 @@ struct p_protocol {
} __packed;
struct p_uuids {
- struct p_header head;
+ struct p_header80 head;
u64 uuid[UI_EXTENDED_SIZE];
} __packed;
struct p_rs_uuid {
- struct p_header head;
+ struct p_header80 head;
u64 uuid;
} __packed;
struct p_sizes {
- struct p_header head;
+ struct p_header80 head;
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
@@ -491,18 +518,18 @@ struct p_sizes {
} __packed;
struct p_state {
- struct p_header head;
+ struct p_header80 head;
u32 state;
} __packed;
struct p_req_state {
- struct p_header head;
+ struct p_header80 head;
u32 mask;
u32 val;
} __packed;
struct p_req_state_reply {
- struct p_header head;
+ struct p_header80 head;
u32 retcode;
} __packed;
@@ -517,7 +544,7 @@ struct p_drbd06_param {
} __packed;
struct p_discard {
- struct p_header head;
+ struct p_header80 head;
u64 block_id;
u32 seq_num;
u32 pad;
@@ -533,7 +560,7 @@ enum drbd_bitmap_code {
};
struct p_compressed_bm {
- struct p_header head;
+ struct p_header80 head;
/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
* (encoding & 0x80): polarity (set/unset) of first runlength
* ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
@@ -544,10 +571,10 @@ struct p_compressed_bm {
u8 code[0];
} __packed;
-struct p_delay_probe {
- struct p_header head;
- u32 seq_num; /* sequence number to match the two probe packets */
- u32 offset; /* usecs the probe got sent after the reference time point */
+struct p_delay_probe93 {
+ struct p_header80 head;
+ u32 seq_num; /* sequence number to match the two probe packets */
+ u32 offset; /* usecs the probe got sent after the reference time point */
} __packed;
/* DCBP: Drbd Compressed Bitmap Packet ... */
@@ -594,7 +621,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
* so we need to use the fixed size 4KiB page size
* most architechtures have used for a long time.
*/
-#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
+#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
#if (PAGE_SIZE < 4096)
@@ -603,13 +630,14 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
#endif
union p_polymorph {
- struct p_header header;
+ union p_header header;
struct p_handshake handshake;
struct p_data data;
struct p_block_ack block_ack;
struct p_barrier barrier;
struct p_barrier_ack barrier_ack;
struct p_rs_param_89 rs_param_89;
+ struct p_rs_param_95 rs_param_95;
struct p_protocol protocol;
struct p_sizes sizes;
struct p_uuids uuids;
@@ -617,6 +645,8 @@ union p_polymorph {
struct p_req_state req_state;
struct p_req_state_reply req_state_reply;
struct p_block_req block_req;
+ struct p_delay_probe93 delay_probe93;
+ struct p_rs_uuid rs_uuid;
} __packed;
/**********************************************************************/
@@ -697,7 +727,7 @@ struct drbd_tl_epoch {
struct list_head requests; /* requests before */
struct drbd_tl_epoch *next; /* pointer to the next barrier */
unsigned int br_number; /* the barriers identifier. */
- int n_req; /* number of requests attached before this barrier */
+ int n_writes; /* number of requests attached before this barrier */
};
struct drbd_request;
@@ -719,17 +749,12 @@ struct drbd_epoch {
/* drbd_epoch flag bits */
enum {
- DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
- DE_BARRIER_IN_NEXT_EPOCH_DONE,
- DE_CONTAINS_A_BARRIER,
DE_HAVE_BARRIER_NUMBER,
- DE_IS_FINISHING,
};
enum epoch_event {
EV_PUT,
EV_GOT_BARRIER_NR,
- EV_BARRIER_DONE,
EV_BECAME_LAST,
EV_CLEANUP = 32, /* used as flag */
};
@@ -747,7 +772,7 @@ struct digest_info {
struct drbd_epoch_entry {
struct drbd_work w;
struct hlist_node colision;
- struct drbd_epoch *epoch;
+ struct drbd_epoch *epoch; /* for writes */
struct drbd_conf *mdev;
struct page *pages;
atomic_t pending_bios;
@@ -755,7 +780,10 @@ struct drbd_epoch_entry {
/* see comments on ee flag bits below */
unsigned long flags;
sector_t sector;
- u64 block_id;
+ union {
+ u64 block_id;
+ struct digest_info *digest;
+ };
};
/* ee flag bits.
@@ -768,11 +796,6 @@ enum {
__EE_CALL_AL_COMPLETE_IO,
__EE_MAY_SET_IN_SYNC,
- /* This epoch entry closes an epoch using a barrier.
- * On sucessful completion, the epoch is released,
- * and the P_BARRIER_ACK send. */
- __EE_IS_BARRIER,
-
/* In case a barrier failed,
* we need to resubmit without the barrier flag. */
__EE_RESUBMITTED,
@@ -781,12 +804,15 @@ enum {
* if any of those fail, we set this flag atomically
* from the endio callback */
__EE_WAS_ERROR,
+
+ /* This ee has a pointer to a digest instead of a block id */
+ __EE_HAS_DIGEST,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
-#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
+#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
/* global flag bits */
enum {
@@ -794,7 +820,6 @@ enum {
SIGNAL_ASENDER, /* whether asender wants to be interrupted */
SEND_PING, /* whether asender should send a ping asap */
- STOP_SYNC_TIMER, /* tell timer to cancel itself */
UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
@@ -807,15 +832,15 @@ enum {
* Gets cleared when the state.conn
* goes into C_CONNECTED state. */
WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */
- NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
CONSIDER_RESYNC,
- MD_NO_BARRIER, /* meta data device does not support barriers,
- so don't even try */
+ MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
SUSPEND_IO, /* suspend application io */
BITMAP_IO, /* suspend application io;
once no more io in flight, start bitmap io */
BITMAP_IO_QUEUED, /* Started bitmap IO */
+ GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
+ WAS_IO_ERROR, /* Local disk failed returned IO error */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
NET_CONGESTED, /* The data socket is congested */
@@ -829,6 +854,8 @@ enum {
* the peer, if it changed there as well. */
CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
+ NEW_CUR_UUID, /* Create new current UUID when thawing IO */
+ AL_SUSPENDED, /* Activity logging is currently suspended. */
};
struct drbd_bitmap; /* opaque for drbd_conf */
@@ -838,10 +865,6 @@ struct drbd_bitmap; /* opaque for drbd_conf */
/* THINK maybe we actually want to use the default "event/%s" worker threads
* or similar in linux 2.6, which uses per cpu data and threads.
- *
- * To be general, this might need a spin_lock member.
- * For now, please use the mdev->req_lock to protect list_head,
- * see drbd_queue_work below.
*/
struct drbd_work_queue {
struct list_head q;
@@ -888,8 +911,6 @@ struct drbd_md {
struct drbd_backing_dev {
struct block_device *backing_bdev;
struct block_device *md_bdev;
- struct file *lo_file;
- struct file *md_file;
struct drbd_md md;
struct disk_conf dc; /* The user provided config... */
sector_t known_size; /* last known size of that backing device */
@@ -912,7 +933,12 @@ enum write_ordering_e {
WO_none,
WO_drain_io,
WO_bdev_flush,
- WO_bio_barrier
+};
+
+struct fifo_buffer {
+ int *values;
+ unsigned int head_index;
+ unsigned int size;
};
struct drbd_conf {
@@ -936,9 +962,16 @@ struct drbd_conf {
unsigned int ko_count;
struct drbd_work resync_work,
unplug_work,
+ go_diskless,
md_sync_work;
struct timer_list resync_timer;
struct timer_list md_sync_timer;
+#ifdef DRBD_DEBUG_MD_SYNC
+ struct {
+ unsigned int line;
+ const char* func;
+ } last_md_mark_dirty;
+#endif
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
@@ -946,6 +979,7 @@ struct drbd_conf {
union drbd_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
+ wait_queue_head_t net_cnt_wait;
unsigned int send_cnt;
unsigned int recv_cnt;
unsigned int read_cnt;
@@ -974,12 +1008,16 @@ struct drbd_conf {
unsigned long rs_start;
/* cumulated time in PausedSyncX state [unit jiffies] */
unsigned long rs_paused;
+ /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+ unsigned long rs_same_csum;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
- unsigned long rs_mark_left;
+ unsigned long rs_mark_left[DRBD_SYNC_MARKS];
/* marks's time [unit jiffies] */
- unsigned long rs_mark_time;
- /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
- unsigned long rs_same_csum;
+ unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+ /* current index into rs_mark_{left,time} */
+ int rs_last_mark;
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
@@ -1012,10 +1050,10 @@ struct drbd_conf {
spinlock_t epoch_lock;
unsigned int epochs;
enum write_ordering_e write_ordering;
- struct list_head active_ee; /* IO in progress */
- struct list_head sync_ee; /* IO in progress */
+ struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
+ struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee; /* send ack */
- struct list_head read_ee; /* IO in progress */
+ struct list_head read_ee; /* IO in progress (any read) */
struct list_head net_ee; /* zero-copy network send in progress */
struct hlist_head *ee_hash; /* is proteced by req_lock! */
unsigned int ee_hash_s;
@@ -1026,7 +1064,8 @@ struct drbd_conf {
int next_barrier_nr;
struct hlist_head *app_reads_hash; /* is proteced by req_lock */
struct list_head resync_reads;
- atomic_t pp_in_use;
+ atomic_t pp_in_use; /* allocated from page pool */
+ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
struct page *md_io_tmpp; /* for logical_block_size != 512 */
@@ -1054,6 +1093,15 @@ struct drbd_conf {
u64 ed_uuid; /* UUID of the exposed data */
struct mutex state_mutex;
char congestion_reason; /* Why we where congested... */
+ atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+ atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+ int rs_last_sect_ev; /* counter to compare with */
+ int rs_last_events; /* counter of read or write "events" (unit sectors)
+ * on the lower level device when we last looked. */
+ int c_sync_rate; /* current resync rate after syncer throttle magic */
+ struct fifo_buffer rs_plan_s; /* correction values of resync planer */
+ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+ int rs_planed; /* resync sectors already planed */
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1138,6 +1186,8 @@ extern void drbd_free_resources(struct drbd_conf *mdev);
extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
unsigned int set_size);
extern void tl_clear(struct drbd_conf *mdev);
+enum drbd_req_event;
+extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
extern void drbd_free_sock(struct drbd_conf *mdev);
extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
@@ -1150,12 +1200,12 @@ extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_f
extern int _drbd_send_state(struct drbd_conf *mdev);
extern int drbd_send_state(struct drbd_conf *mdev);
extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header *h,
+ enum drbd_packets cmd, struct p_header80 *h,
size_t size, unsigned msg_flags);
#define USE_DATA_SOCKET 1
#define USE_META_SOCKET 0
extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header *h,
+ enum drbd_packets cmd, struct p_header80 *h,
size_t size);
extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
char *data, size_t size);
@@ -1167,7 +1217,7 @@ extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_block_req *rp);
extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp);
+ struct p_data *dp, int data_size);
extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
sector_t sector, int blksize, u64 block_id);
extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
@@ -1201,7 +1251,13 @@ extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+#ifndef DRBD_DEBUG_MD_SYNC
extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
+#else
+#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
+extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
+ unsigned int line, const char *func);
+#endif
extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
void (*done)(struct drbd_conf *, int),
@@ -1209,6 +1265,8 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
+extern void drbd_go_diskless(struct drbd_conf *mdev);
+extern void drbd_ldev_destroy(struct drbd_conf *mdev);
/* Meta data layout
@@ -1264,6 +1322,8 @@ struct bm_extent {
* Bit 1 ==> local node thinks this block needs to be synced.
*/
+#define SLEEP_TIME (HZ/10)
+
#define BM_BLOCK_SHIFT 12 /* 4k per bit */
#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
@@ -1335,11 +1395,13 @@ struct bm_extent {
#endif
/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
- * With a value of 6 all IO in one 32K block make it to the same slot of the
+ * With a value of 8 all IO in one 128K block make it to the same slot of the
* hash table. */
-#define HT_SHIFT 6
+#define HT_SHIFT 8
#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
+#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
+
/* Number of elements in the app_reads_hash */
#define APP_R_HSIZE 15
@@ -1369,6 +1431,7 @@ extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_
/* bm_find_next variants for use while you hold drbd_bm_lock() */
extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
extern int drbd_bm_rs_done(struct drbd_conf *mdev);
/* for receive_bitmap */
@@ -1421,7 +1484,8 @@ extern void resync_after_online_grow(struct drbd_conf *);
extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
int force);
-enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
/* drbd_worker.c */
@@ -1467,10 +1531,12 @@ extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
+extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
const unsigned rw, const int fault_type);
extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
@@ -1479,7 +1545,10 @@ extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
sector_t sector,
unsigned int data_size,
gfp_t gfp_mask) __must_hold(local);
-extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
+extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+ int is_net);
+#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
+#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
struct list_head *head);
extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
@@ -1487,6 +1556,7 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
extern void drbd_flush_workqueue(struct drbd_conf *mdev);
+extern void drbd_free_tl_hash(struct drbd_conf *mdev);
/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
* mess with get_fs/set_fs, we know we are KERNEL_DS always. */
@@ -1600,6 +1670,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
@@ -1712,17 +1784,17 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
case EP_PASS_ON:
if (!forcedetach) {
if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Local IO failed in %s."
- "Passing error on...\n", where);
+ dev_err(DEV, "Local IO failed in %s.\n", where);
break;
}
/* NOTE fall through to detach case if forcedetach set */
case EP_DETACH:
case EP_CALL_HELPER:
+ set_bit(WAS_IO_ERROR, &mdev->flags);
if (mdev->state.disk > D_FAILED) {
_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
- dev_err(DEV, "Local IO failed in %s."
- "Detaching...\n", where);
+ dev_err(DEV,
+ "Local IO failed in %s. Detaching...\n", where);
}
break;
}
@@ -1788,7 +1860,7 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
static inline sector_t drbd_get_capacity(struct block_device *bdev)
{
/* return bdev ? get_capacity(bdev->bd_disk) : 0; */
- return bdev ? bdev->bd_inode->i_size >> 9 : 0;
+ return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0;
}
/**
@@ -1856,13 +1928,6 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
}
static inline void
-_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
-{
- list_add_tail(&w->list, &q->q);
- up(&q->s);
-}
-
-static inline void
drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
{
unsigned long flags;
@@ -1899,19 +1964,19 @@ static inline void request_ping(struct drbd_conf *mdev)
static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
enum drbd_packets cmd)
{
- struct p_header h;
+ struct p_header80 h;
return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
}
static inline int drbd_send_ping(struct drbd_conf *mdev)
{
- struct p_header h;
+ struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
}
static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
{
- struct p_header h;
+ struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
}
@@ -2013,7 +2078,7 @@ static inline void inc_unacked(struct drbd_conf *mdev)
static inline void put_net_conf(struct drbd_conf *mdev)
{
if (atomic_dec_and_test(&mdev->net_cnt))
- wake_up(&mdev->misc_wait);
+ wake_up(&mdev->net_cnt_wait);
}
/**
@@ -2044,10 +2109,18 @@ static inline int get_net_conf(struct drbd_conf *mdev)
static inline void put_ldev(struct drbd_conf *mdev)
{
+ int i = atomic_dec_return(&mdev->local_cnt);
__release(local);
- if (atomic_dec_and_test(&mdev->local_cnt))
+ D_ASSERT(i >= 0);
+ if (i == 0) {
+ if (mdev->state.disk == D_DISKLESS)
+ /* even internal references gone, safe to destroy */
+ drbd_ldev_destroy(mdev);
+ if (mdev->state.disk == D_FAILED)
+ /* all application IO references gone. */
+ drbd_go_diskless(mdev);
wake_up(&mdev->misc_wait);
- D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
+ }
}
#ifndef __CHECKER__
@@ -2055,6 +2128,10 @@ static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_stat
{
int io_allowed;
+ /* never get a reference while D_DISKLESS */
+ if (mdev->state.disk == D_DISKLESS)
+ return 0;
+
atomic_inc(&mdev->local_cnt);
io_allowed = (mdev->state.disk >= mins);
if (!io_allowed)
@@ -2179,11 +2256,16 @@ static inline int drbd_state_is_stable(union drbd_state s)
return 1;
}
+static inline int is_susp(union drbd_state s)
+{
+ return s.susp || s.susp_nod || s.susp_fen;
+}
+
static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
{
int mxb = drbd_get_max_buffers(mdev);
- if (mdev->state.susp)
+ if (is_susp(mdev->state))
return 0;
if (test_bit(SUSPEND_IO, &mdev->flags))
return 0;
@@ -2318,13 +2400,12 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
{
int r;
- if (test_bit(MD_NO_BARRIER, &mdev->flags))
+ if (test_bit(MD_NO_FUA, &mdev->flags))
return;
- r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
- BLKDEV_IFL_WAIT);
+ r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
if (r) {
- set_bit(MD_NO_BARRIER, &mdev->flags);
+ set_bit(MD_NO_FUA, &mdev->flags);
dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
}
}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index fa650dd..29cd0dc 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -32,7 +32,7 @@
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
@@ -64,6 +64,7 @@ struct after_state_chg_work {
struct completion *done;
};
+static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);
@@ -77,6 +78,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
"Lars Ellenberg <lars@linbit.com>");
@@ -199,7 +201,7 @@ static int tl_init(struct drbd_conf *mdev)
INIT_LIST_HEAD(&b->w.list);
b->next = NULL;
b->br_number = 4711;
- b->n_req = 0;
+ b->n_writes = 0;
b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
mdev->oldest_tle = b;
@@ -240,7 +242,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
INIT_LIST_HEAD(&new->w.list);
new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
new->next = NULL;
- new->n_req = 0;
+ new->n_writes = 0;
newest_before = mdev->newest_tle;
/* never send a barrier number == 0, because that is special-cased
@@ -284,9 +286,9 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
barrier_nr, b->br_number);
goto bail;
}
- if (b->n_req != set_size) {
- dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
- barrier_nr, set_size, b->n_req);
+ if (b->n_writes != set_size) {
+ dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+ barrier_nr, set_size, b->n_writes);
goto bail;
}
@@ -333,6 +335,82 @@ bail:
drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
+/**
+ * _tl_restart() - Walks the transfer log, and applies an action to all requests
+ * @mdev: DRBD device.
+ * @what: The action/event to perform with all request objects
+ *
+ * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
+ * restart_frozen_disk_io.
+ */
+static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
+{
+ struct drbd_tl_epoch *b, *tmp, **pn;
+ struct list_head *le, *tle, carry_reads;
+ struct drbd_request *req;
+ int rv, n_writes, n_reads;
+
+ b = mdev->oldest_tle;
+ pn = &mdev->oldest_tle;
+ while (b) {
+ n_writes = 0;
+ n_reads = 0;
+ INIT_LIST_HEAD(&carry_reads);
+ list_for_each_safe(le, tle, &b->requests) {
+ req = list_entry(le, struct drbd_request, tl_requests);
+ rv = _req_mod(req, what);
+
+ n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
+ n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
+ }
+ tmp = b->next;
+
+ if (n_writes) {
+ if (what == resend) {
+ b->n_writes = n_writes;
+ if (b->w.cb == NULL) {
+ b->w.cb = w_send_barrier;
+ inc_ap_pending(mdev);
+ set_bit(CREATE_BARRIER, &mdev->flags);
+ }
+
+ drbd_queue_work(&mdev->data.work, &b->w);
+ }
+ pn = &b->next;
+ } else {
+ if (n_reads)
+ list_add(&carry_reads, &b->requests);
+ /* there could still be requests on that ring list,
+ * in case local io is still pending */
+ list_del(&b->requests);
+
+ /* dec_ap_pending corresponding to queue_barrier.
+ * the newest barrier may not have been queued yet,
+ * in which case w.cb is still NULL. */
+ if (b->w.cb != NULL)
+ dec_ap_pending(mdev);
+
+ if (b == mdev->newest_tle) {
+ /* recycle, but reinit! */
+ D_ASSERT(tmp == NULL);
+ INIT_LIST_HEAD(&b->requests);
+ list_splice(&carry_reads, &b->requests);
+ INIT_LIST_HEAD(&b->w.list);
+ b->w.cb = NULL;
+ b->br_number = net_random();
+ b->n_writes = 0;
+
+ *pn = b;
+ break;
+ }
+ *pn = tmp;
+ kfree(b);
+ }
+ b = tmp;
+ list_splice(&carry_reads, &b->requests);
+ }
+}
+
/**
* tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
@@ -344,48 +422,12 @@ bail:
*/
void tl_clear(struct drbd_conf *mdev)
{
- struct drbd_tl_epoch *b, *tmp;
struct list_head *le, *tle;
struct drbd_request *r;
- int new_initial_bnr = net_random();
spin_lock_irq(&mdev->req_lock);
- b = mdev->oldest_tle;
- while (b) {
- list_for_each_safe(le, tle, &b->requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(r, connection_lost_while_pending);
- }
- tmp = b->next;
-
- /* there could still be requests on that ring list,
- * in case local io is still pending */
- list_del(&b->requests);
-
- /* dec_ap_pending corresponding to queue_barrier.
- * the newest barrier may not have been queued yet,
- * in which case w.cb is still NULL. */
- if (b->w.cb != NULL)
- dec_ap_pending(mdev);
-
- if (b == mdev->newest_tle) {
- /* recycle, but reinit! */
- D_ASSERT(tmp == NULL);
- INIT_LIST_HEAD(&b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->w.cb = NULL;
- b->br_number = new_initial_bnr;
- b->n_req = 0;
-
- mdev->oldest_tle = b;
- break;
- }
- kfree(b);
- b = tmp;
- }
+ _tl_restart(mdev, connection_lost_while_pending);
/* we expect this list to be empty. */
D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
@@ -401,6 +443,15 @@ void tl_clear(struct drbd_conf *mdev)
/* ensure bit indicating barrier is required is clear */
clear_bit(CREATE_BARRIER, &mdev->flags);
+ memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
+
+ spin_unlock_irq(&mdev->req_lock);
+}
+
+void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
+{
+ spin_lock_irq(&mdev->req_lock);
+ _tl_restart(mdev, what);
spin_unlock_irq(&mdev->req_lock);
}
@@ -455,7 +506,7 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *,
union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, int *warn_sync_abort);
+ union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
union drbd_state, union drbd_state);
@@ -605,7 +656,7 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
drbd_role_str(ns.peer),
drbd_disk_str(ns.disk),
drbd_disk_str(ns.pdsk),
- ns.susp ? 's' : 'r',
+ is_susp(ns) ? 's' : 'r',
ns.aftr_isp ? 'a' : '-',
ns.peer_isp ? 'p' : '-',
ns.user_isp ? 'u' : '-'
@@ -763,7 +814,7 @@ static int is_valid_state_transition(struct drbd_conf *mdev,
* to D_UNKNOWN. This rule and many more along those lines are in this function.
*/
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, int *warn_sync_abort)
+ union drbd_state ns, const char **warn_sync_abort)
{
enum drbd_fencing_p fp;
@@ -778,11 +829,21 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
os.conn <= C_DISCONNECTING)
ns.conn = os.conn;
- /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
+ /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
+ * If you try to go into some Sync* state, that shall fail (elsewhere). */
if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
- ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
+ ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
ns.conn = os.conn;
+ /* we cannot fail (again) if we already detached */
+ if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
+ ns.disk = D_DISKLESS;
+
+ /* if we are only D_ATTACHING yet,
+ * we can (and should) go directly to D_DISKLESS. */
+ if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
+ ns.disk = D_DISKLESS;
+
/* After C_DISCONNECTING only C_STANDALONE may follow */
if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
ns.conn = os.conn;
@@ -798,14 +859,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
ns.aftr_isp = 0;
- if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
- ns.pdsk = D_UNKNOWN;
-
/* Abort resync if a disk fails/detaches */
if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
(ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
if (warn_sync_abort)
- *warn_sync_abort = 1;
+ *warn_sync_abort =
+ os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
+ "Online-verify" : "Resync";
ns.conn = C_CONNECTED;
}
@@ -876,7 +936,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
if (fp == FP_STONITH &&
(ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
!(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
- ns.susp = 1;
+ ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+ if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+ !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
+ ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
if (ns.conn == C_SYNC_SOURCE)
@@ -912,6 +977,12 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
}
}
+static void drbd_resume_al(struct drbd_conf *mdev)
+{
+ if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
+ dev_info(DEV, "Resumed AL updates\n");
+}
+
/**
* __drbd_set_state() - Set a new DRBD state
* @mdev: DRBD device.
@@ -927,7 +998,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
{
union drbd_state os;
int rv = SS_SUCCESS;
- int warn_sync_abort = 0;
+ const char *warn_sync_abort = NULL;
struct after_state_chg_work *ascw;
os = mdev->state;
@@ -946,14 +1017,8 @@ int __drbd_set_state(struct drbd_conf *mdev,
/* If the old state was illegal as well, then let
this happen...*/
- if (is_valid_state(mdev, os) == rv) {
- dev_err(DEV, "Considering state change from bad state. "
- "Error would be: '%s'\n",
- drbd_set_st_err_str(rv));
- print_st(mdev, "old", os);
- print_st(mdev, "new", ns);
+ if (is_valid_state(mdev, os) == rv)
rv = is_valid_state_transition(mdev, ns, os);
- }
} else
rv = is_valid_state_transition(mdev, ns, os);
}
@@ -965,7 +1030,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
}
if (warn_sync_abort)
- dev_warn(DEV, "Resync aborted.\n");
+ dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
{
char *pbp, pb[300];
@@ -976,7 +1041,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
PSC(conn);
PSC(disk);
PSC(pdsk);
- PSC(susp);
+ if (is_susp(ns) != is_susp(os))
+ pbp += sprintf(pbp, "susp( %s -> %s ) ",
+ drbd_susp_str(is_susp(os)),
+ drbd_susp_str(is_susp(ns)));
PSC(aftr_isp);
PSC(peer_isp);
PSC(user_isp);
@@ -997,16 +1065,18 @@ int __drbd_set_state(struct drbd_conf *mdev,
!test_and_set_bit(CONFIG_PENDING, &mdev->flags))
set_bit(DEVICE_DYING, &mdev->flags);
- mdev->state.i = ns.i;
+ /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+ * on the ldev here, to be sure the transition -> D_DISKLESS resp.
+ * drbd_ldev_destroy() won't happen before our corresponding
+ * after_state_ch works run, where we put_ldev again. */
+ if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
+ (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
+ atomic_inc(&mdev->local_cnt);
+
+ mdev->state = ns;
wake_up(&mdev->misc_wait);
wake_up(&mdev->state_wait);
- /* post-state-change actions */
- if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
- set_bit(STOP_SYNC_TIMER, &mdev->flags);
- mod_timer(&mdev->resync_timer, jiffies);
- }
-
/* aborted verify run. log the last position */
if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
ns.conn < C_CONNECTED) {
@@ -1019,41 +1089,42 @@ int __drbd_set_state(struct drbd_conf *mdev,
if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
(ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
dev_info(DEV, "Syncer continues.\n");
- mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
- if (ns.conn == C_SYNC_TARGET) {
- if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
- mod_timer(&mdev->resync_timer, jiffies);
- /* This if (!test_bit) is only needed for the case
- that a device that has ceased to used its timer,
- i.e. it is already in drbd_resync_finished() gets
- paused and resumed. */
- }
+ mdev->rs_paused += (long)jiffies
+ -(long)mdev->rs_mark_time[mdev->rs_last_mark];
+ if (ns.conn == C_SYNC_TARGET)
+ mod_timer(&mdev->resync_timer, jiffies);
}
if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
(ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
dev_info(DEV, "Resync suspended\n");
- mdev->rs_mark_time = jiffies;
- if (ns.conn == C_PAUSED_SYNC_T)
- set_bit(STOP_SYNC_TIMER, &mdev->flags);
+ mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
}
if (os.conn == C_CONNECTED &&
(ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+ unsigned long now = jiffies;
+ int i;
+
mdev->ov_position = 0;
- mdev->rs_total =
- mdev->rs_mark_left = drbd_bm_bits(mdev);
+ mdev->rs_total = drbd_bm_bits(mdev);
if (mdev->agreed_pro_version >= 90)
set_ov_position(mdev, ns.conn);
else
mdev->ov_start_sector = 0;
mdev->ov_left = mdev->rs_total
- BM_SECT_TO_BIT(mdev->ov_position);
- mdev->rs_start =
- mdev->rs_mark_time = jiffies;
+ mdev->rs_start = now;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
mdev->ov_last_oos_size = 0;
mdev->ov_last_oos_start = 0;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ mdev->rs_mark_left[i] = mdev->rs_total;
+ mdev->rs_mark_time[i] = now;
+ }
+
if (ns.conn == C_VERIFY_S) {
dev_info(DEV, "Starting Online Verify from sector %llu\n",
(unsigned long long)mdev->ov_position);
@@ -1106,6 +1177,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
drbd_thread_restart_nowait(&mdev->receiver);
+ /* Resume AL writing if we get a connection */
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
+ drbd_resume_al(mdev);
+
ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
if (ascw) {
ascw->os = os;
@@ -1164,6 +1239,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
union drbd_state ns, enum chg_state_flags flags)
{
enum drbd_fencing_p fp;
+ enum drbd_req_event what = nothing;
+ union drbd_state nsm = (union drbd_state){ .i = -1 };
if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
clear_bit(CRASHED_PRIMARY, &mdev->flags);
@@ -1187,17 +1264,48 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
/* Here we have the actions that are performed after a
state change. This function might sleep */
- if (fp == FP_STONITH && ns.susp) {
- /* case1: The outdate peer handler is successful:
- * case2: The connection was established again: */
- if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
- (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
+ nsm.i = -1;
+ if (ns.susp_nod) {
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+ if (ns.conn == C_CONNECTED)
+ what = resend, nsm.susp_nod = 0;
+ else /* ns.conn > C_CONNECTED */
+ dev_err(DEV, "Unexpected Resynd going on!\n");
+ }
+
+ if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
+ what = restart_frozen_disk_io, nsm.susp_nod = 0;
+
+ }
+
+ if (ns.susp_fen) {
+ /* case1: The outdate peer handler is successful: */
+ if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
tl_clear(mdev);
+ if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ drbd_uuid_new_current(mdev);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ }
spin_lock_irq(&mdev->req_lock);
- _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+ _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
}
+ /* case2: The connection was established again: */
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ what = resend;
+ nsm.susp_fen = 0;
+ }
}
+
+ if (what != nothing) {
+ spin_lock_irq(&mdev->req_lock);
+ _tl_restart(mdev, what);
+ nsm.i &= mdev->state.i;
+ _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
+ spin_unlock_irq(&mdev->req_lock);
+ }
+
/* Do not change the order of the if above and the two below... */
if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
drbd_send_uuids(mdev);
@@ -1216,16 +1324,22 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
if (get_ldev(mdev)) {
if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
+ if (is_susp(mdev->state)) {
+ set_bit(NEW_CUR_UUID, &mdev->flags);
+ } else {
+ drbd_uuid_new_current(mdev);
+ drbd_send_uuids(mdev);
+ }
}
put_ldev(mdev);
}
}
if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
- if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
+ if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
drbd_uuid_new_current(mdev);
+ drbd_send_uuids(mdev);
+ }
/* D_DISKLESS Peer becomes secondary */
if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1267,52 +1381,64 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
- if (os.disk > D_FAILED && ns.disk == D_FAILED) {
+ /* first half of local IO error, failure to attach,
+ * or administrative detach */
+ if (os.disk != D_FAILED && ns.disk == D_FAILED) {
enum drbd_io_error_p eh;
-
- eh = EP_PASS_ON;
- if (get_ldev_if_state(mdev, D_FAILED)) {
- eh = mdev->ldev->dc.on_io_error;
- put_ldev(mdev);
- }
+ int was_io_error;
+ /* corresponding get_ldev was in __drbd_set_state, to serialize
+ * our cleanup here with the transition to D_DISKLESS,
+ * so it is safe to dreference ldev here. */
+ eh = mdev->ldev->dc.on_io_error;
+ was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+ /* current state still has to be D_FAILED,
+ * there is only one way out: to D_DISKLESS,
+ * and that may only happen after our put_ldev below. */
+ if (mdev->state.disk != D_FAILED)
+ dev_err(DEV,
+ "ASSERT FAILED: disk is %s during detach\n",
+ drbd_disk_str(mdev->state.disk));
+
+ if (drbd_send_state(mdev))
+ dev_warn(DEV, "Notified peer that I am detaching my disk\n");
+ else
+ dev_err(DEV, "Sending state for detaching disk failed\n");
drbd_rs_cancel_all(mdev);
- /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
- and it is D_DISKLESS here, local_cnt can only go down, it can
- not increase... It will reach zero */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
- mdev->rs_total = 0;
- mdev->rs_failed = 0;
- atomic_set(&mdev->rs_pending_cnt, 0);
- spin_lock_irq(&mdev->req_lock);
- _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ /* In case we want to get something to stable storage still,
+ * this may be the last chance.
+ * Following put_ldev may transition to D_DISKLESS. */
+ drbd_md_sync(mdev);
+ put_ldev(mdev);
- if (eh == EP_CALL_HELPER)
+ if (was_io_error && eh == EP_CALL_HELPER)
drbd_khelper(mdev, "local-io-error");
}
- if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
+ /* second half of local IO error, failure to attach,
+ * or administrative detach,
+ * after local_cnt references have reached zero again */
+ if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
+ /* We must still be diskless,
+ * re-attach has to be serialized with this! */
+ if (mdev->state.disk != D_DISKLESS)
+ dev_err(DEV,
+ "ASSERT FAILED: disk is %s while going diskless\n",
+ drbd_disk_str(mdev->state.disk));
- if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
- if (drbd_send_state(mdev))
- dev_warn(DEV, "Notified peer that my disk is broken.\n");
- else
- dev_err(DEV, "Sending state in drbd_io_error() failed\n");
- }
+ mdev->rs_total = 0;
+ mdev->rs_failed = 0;
+ atomic_set(&mdev->rs_pending_cnt, 0);
- wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
- lc_destroy(mdev->resync);
- mdev->resync = NULL;
- lc_destroy(mdev->act_log);
- mdev->act_log = NULL;
- __no_warn(local,
- drbd_free_bc(mdev->ldev);
- mdev->ldev = NULL;);
-
- if (mdev->md_io_tmpp)
- __free_page(mdev->md_io_tmpp);
+ if (drbd_send_state(mdev))
+ dev_warn(DEV, "Notified peer that I'm now diskless.\n");
+ else
+ dev_err(DEV, "Sending state for being diskless failed\n");
+ /* corresponding get_ldev in __drbd_set_state
+ * this may finaly trigger drbd_ldev_destroy. */
+ put_ldev(mdev);
}
/* Disks got bigger while they were detached */
@@ -1328,6 +1454,15 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
(os.user_isp && !ns.user_isp))
resume_next_sg(mdev);
+ /* sync target done with resync. Explicitly notify peer, even though
+ * it should (at least for non-empty resyncs) already know itself. */
+ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+ drbd_send_state(mdev);
+
+ /* free tl_hash if we Got thawed and are C_STANDALONE */
+ if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
+ drbd_free_tl_hash(mdev);
+
/* Upon network connection, we need to start the receiver */
if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
drbd_thread_start(&mdev->receiver);
@@ -1554,7 +1689,7 @@ void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
/* the appropriate socket mutex must be held already */
int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header *h,
+ enum drbd_packets cmd, struct p_header80 *h,
size_t size, unsigned msg_flags)
{
int sent, ok;
@@ -1564,7 +1699,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
h->magic = BE_DRBD_MAGIC;
h->command = cpu_to_be16(cmd);
- h->length = cpu_to_be16(size-sizeof(struct p_header));
+ h->length = cpu_to_be16(size-sizeof(struct p_header80));
sent = drbd_send(mdev, sock, h, size, msg_flags);
@@ -1579,7 +1714,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
* when we hold the appropriate socket mutex.
*/
int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header *h, size_t size)
+ enum drbd_packets cmd, struct p_header80 *h, size_t size)
{
int ok = 0;
struct socket *sock;
@@ -1607,7 +1742,7 @@ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
size_t size)
{
- struct p_header h;
+ struct p_header80 h;
int ok;
h.magic = BE_DRBD_MAGIC;
@@ -1629,7 +1764,7 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
{
- struct p_rs_param_89 *p;
+ struct p_rs_param_95 *p;
struct socket *sock;
int size, rv;
const int apv = mdev->agreed_pro_version;
@@ -1637,7 +1772,8 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
size = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
+ strlen(mdev->sync_conf.verify_alg) + 1
- : /* 89 */ sizeof(struct p_rs_param_89);
+ : apv <= 94 ? sizeof(struct p_rs_param_89)
+ : /* apv >= 95 */ sizeof(struct p_rs_param_95);
/* used from admin command context and receiver/worker context.
* to avoid kmalloc, grab the socket right here,
@@ -1648,12 +1784,16 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
if (likely(sock != NULL)) {
enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
- p = &mdev->data.sbuf.rs_param_89;
+ p = &mdev->data.sbuf.rs_param_95;
/* initialize verify_alg and csums_alg */
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
p->rate = cpu_to_be32(sc->rate);
+ p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
+ p->c_delay_target = cpu_to_be32(sc->c_delay_target);
+ p->c_fill_target = cpu_to_be32(sc->c_fill_target);
+ p->c_max_rate = cpu_to_be32(sc->c_max_rate);
if (apv >= 88)
strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
@@ -1709,7 +1849,7 @@ int drbd_send_protocol(struct drbd_conf *mdev)
strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
- (struct p_header *)p, size);
+ (struct p_header80 *)p, size);
kfree(p);
return rv;
}
@@ -1735,7 +1875,7 @@ int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
put_ldev(mdev);
return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
}
int drbd_send_uuids(struct drbd_conf *mdev)
@@ -1756,7 +1896,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
p.uuid = cpu_to_be64(val);
return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
}
int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
@@ -1786,7 +1926,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
p.dds_flags = cpu_to_be16(flags);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
@@ -1811,7 +1951,7 @@ int drbd_send_state(struct drbd_conf *mdev)
if (likely(sock != NULL)) {
ok = _drbd_send_cmd(mdev, sock, P_STATE,
- (struct p_header *)&p, sizeof(p), 0);
+ (struct p_header80 *)&p, sizeof(p), 0);
}
mutex_unlock(&mdev->data.mutex);
@@ -1829,7 +1969,7 @@ int drbd_send_state_req(struct drbd_conf *mdev,
p.val = cpu_to_be32(val.i);
return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
}
int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
@@ -1839,7 +1979,7 @@ int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
p.retcode = cpu_to_be32(retcode);
return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
}
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
@@ -1938,7 +2078,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
enum { OK, FAILED, DONE }
send_bitmap_rle_or_plain(struct drbd_conf *mdev,
- struct p_header *h, struct bm_xfer_ctx *c)
+ struct p_header80 *h, struct bm_xfer_ctx *c)
{
struct p_compressed_bm *p = (void*)h;
unsigned long num_words;
@@ -1968,12 +2108,12 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
if (len)
drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
- h, sizeof(struct p_header) + len, 0);
+ h, sizeof(struct p_header80) + len, 0);
c->word_offset += num_words;
c->bit_offset = c->word_offset * BITS_PER_LONG;
c->packets[1]++;
- c->bytes[1] += sizeof(struct p_header) + len;
+ c->bytes[1] += sizeof(struct p_header80) + len;
if (c->bit_offset > c->bm_bits)
c->bit_offset = c->bm_bits;
@@ -1989,14 +2129,14 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
int _drbd_send_bitmap(struct drbd_conf *mdev)
{
struct bm_xfer_ctx c;
- struct p_header *p;
+ struct p_header80 *p;
int ret;
ERR_IF(!mdev->bitmap) return FALSE;
/* maybe we should use some per thread scratch page,
* and allocate that during initial device creation? */
- p = (struct p_header *) __get_free_page(GFP_NOIO);
+ p = (struct p_header80 *) __get_free_page(GFP_NOIO);
if (!p) {
dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
return FALSE;
@@ -2054,7 +2194,7 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
if (mdev->state.conn < C_CONNECTED)
return FALSE;
ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
@@ -2082,17 +2222,18 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
return FALSE;
ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
+/* dp->sector and dp->block_id already/still in network byte order,
+ * data_size is payload size according to dp->head,
+ * and may need to be corrected for digest size. */
int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp)
+ struct p_data *dp, int data_size)
{
- const int header_size = sizeof(struct p_data)
- - sizeof(struct p_header);
- int data_size = ((struct p_header *)dp)->length - header_size;
-
+ data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+ crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
dp->block_id);
}
@@ -2140,7 +2281,7 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
p.blksize = cpu_to_be32(size);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
@@ -2158,7 +2299,7 @@ int drbd_send_drequest_csum(struct drbd_conf *mdev,
p.head.magic = BE_DRBD_MAGIC;
p.head.command = cpu_to_be16(cmd);
- p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
+ p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
mutex_lock(&mdev->data.mutex);
@@ -2180,7 +2321,7 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
p.blksize = cpu_to_be32(size);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
@@ -2332,6 +2473,18 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
return 1;
}
+static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
+{
+ if (mdev->agreed_pro_version >= 95)
+ return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
+ (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
+ (bi_rw & REQ_FUA ? DP_FUA : 0) |
+ (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
+ (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
+ else
+ return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
+}
+
/* Used to send write requests
* R_PRIMARY -> Peer (P_DATA)
*/
@@ -2349,30 +2502,25 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
- p.head.magic = BE_DRBD_MAGIC;
- p.head.command = cpu_to_be16(P_DATA);
- p.head.length =
- cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
+ if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
+ p.head.h80.magic = BE_DRBD_MAGIC;
+ p.head.h80.command = cpu_to_be16(P_DATA);
+ p.head.h80.length =
+ cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
+ } else {
+ p.head.h95.magic = BE_DRBD_MAGIC_BIG;
+ p.head.h95.command = cpu_to_be16(P_DATA);
+ p.head.h95.length =
+ cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
+ }
p.sector = cpu_to_be64(req->sector);
p.block_id = (unsigned long)req;
p.seq_num = cpu_to_be32(req->seq_num =
atomic_add_return(1, &mdev->packet_seq));
- dp_flags = 0;
- /* NOTE: no need to check if barriers supported here as we would
- * not pass the test in make_request_common in that case
- */
- if (req->master_bio->bi_rw & REQ_HARDBARRIER) {
- dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
- /* dp_flags |= DP_HARDBARRIER; */
- }
- if (req->master_bio->bi_rw & REQ_SYNC)
- dp_flags |= DP_RW_SYNC;
- /* for now handle SYNCIO and UNPLUG
- * as if they still were one and the same flag */
- if (req->master_bio->bi_rw & REQ_UNPLUG)
- dp_flags |= DP_RW_SYNC;
+ dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
+
if (mdev->state.conn >= C_SYNC_SOURCE &&
mdev->state.conn <= C_PAUSED_SYNC_T)
dp_flags |= DP_MAY_SET_IN_SYNC;
@@ -2413,10 +2561,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
- p.head.magic = BE_DRBD_MAGIC;
- p.head.command = cpu_to_be16(cmd);
- p.head.length =
- cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
+ if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
+ p.head.h80.magic = BE_DRBD_MAGIC;
+ p.head.h80.command = cpu_to_be16(cmd);
+ p.head.h80.length =
+ cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
+ } else {
+ p.head.h95.magic = BE_DRBD_MAGIC_BIG;
+ p.head.h95.command = cpu_to_be16(cmd);
+ p.head.h95.length =
+ cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
+ }
p.sector = cpu_to_be64(e->sector);
p.block_id = e->block_id;
@@ -2429,8 +2584,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
if (!drbd_get_data_sock(mdev))
return 0;
- ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
- sizeof(p), dgs ? MSG_MORE : 0);
+ ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
if (ok && dgs) {
dgb = mdev->int_dig_out;
drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
@@ -2536,7 +2690,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
unsigned long flags;
int rv = 0;
- lock_kernel();
+ mutex_lock(&drbd_main_mutex);
spin_lock_irqsave(&mdev->req_lock, flags);
/* to have a stable mdev->state.role
* and no race with updating open_cnt */
@@ -2551,7 +2705,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
if (!rv)
mdev->open_cnt++;
spin_unlock_irqrestore(&mdev->req_lock, flags);
- unlock_kernel();
+ mutex_unlock(&drbd_main_mutex);
return rv;
}
@@ -2559,9 +2713,9 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
static int drbd_release(struct gendisk *gd, fmode_t mode)
{
struct drbd_conf *mdev = gd->private_data;
- lock_kernel();
+ mutex_lock(&drbd_main_mutex);
mdev->open_cnt--;
- unlock_kernel();
+ mutex_unlock(&drbd_main_mutex);
return 0;
}
@@ -2605,7 +2759,13 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
/* .verify_alg = */ {}, 0,
/* .cpu_mask = */ {}, 0,
/* .csums_alg = */ {}, 0,
- /* .use_rle = */ 0
+ /* .use_rle = */ 0,
+ /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
+ /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
+ /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
+ /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
+ /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
+ /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
};
/* Have to use that way, because the layout differs between
@@ -2616,7 +2776,9 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
.conn = C_STANDALONE,
.disk = D_DISKLESS,
.pdsk = D_UNKNOWN,
- .susp = 0
+ .susp = 0,
+ .susp_nod = 0,
+ .susp_fen = 0
} };
}
@@ -2627,11 +2789,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
drbd_set_defaults(mdev);
- /* for now, we do NOT yet support it,
- * even though we start some framework
- * to eventually support barriers */
- set_bit(NO_BARRIER_SUPP, &mdev->flags);
-
atomic_set(&mdev->ap_bio_cnt, 0);
atomic_set(&mdev->ap_pending_cnt, 0);
atomic_set(&mdev->rs_pending_cnt, 0);
@@ -2640,6 +2797,9 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
atomic_set(&mdev->net_cnt, 0);
atomic_set(&mdev->packet_seq, 0);
atomic_set(&mdev->pp_in_use, 0);
+ atomic_set(&mdev->pp_in_use_by_net, 0);
+ atomic_set(&mdev->rs_sect_in, 0);
+ atomic_set(&mdev->rs_sect_ev, 0);
mutex_init(&mdev->md_io_mutex);
mutex_init(&mdev->data.mutex);
@@ -2666,11 +2826,13 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
INIT_LIST_HEAD(&mdev->meta.work.q);
INIT_LIST_HEAD(&mdev->resync_work.list);
INIT_LIST_HEAD(&mdev->unplug_work.list);
+ INIT_LIST_HEAD(&mdev->go_diskless.list);
INIT_LIST_HEAD(&mdev->md_sync_work.list);
INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
mdev->resync_work.cb = w_resync_inactive;
mdev->unplug_work.cb = w_send_write_hint;
+ mdev->go_diskless.cb = w_go_diskless;
mdev->md_sync_work.cb = w_md_sync;
mdev->bm_io_work.w.cb = w_bitmap_io;
init_timer(&mdev->resync_timer);
@@ -2682,6 +2844,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
init_waitqueue_head(&mdev->misc_wait);
init_waitqueue_head(&mdev->state_wait);
+ init_waitqueue_head(&mdev->net_cnt_wait);
init_waitqueue_head(&mdev->ee_wait);
init_waitqueue_head(&mdev->al_wait);
init_waitqueue_head(&mdev->seq_wait);
@@ -2691,12 +2854,13 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
drbd_thread_init(mdev, &mdev->asender, drbd_asender);
mdev->agreed_pro_version = PRO_VERSION_MAX;
- mdev->write_ordering = WO_bio_barrier;
+ mdev->write_ordering = WO_bdev_flush;
mdev->resync_wenr = LC_FREE;
}
void drbd_mdev_cleanup(struct drbd_conf *mdev)
{
+ int i;
if (mdev->receiver.t_state != None)
dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
mdev->receiver.t_state);
@@ -2713,9 +2877,13 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
mdev->p_size =
mdev->rs_start =
mdev->rs_total =
- mdev->rs_failed =
- mdev->rs_mark_left =
- mdev->rs_mark_time = 0;
+ mdev->rs_failed = 0;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ mdev->rs_mark_left[i] = 0;
+ mdev->rs_mark_time[i] = 0;
+ }
D_ASSERT(mdev->net_conf == NULL);
drbd_set_my_capacity(mdev, 0);
@@ -2726,6 +2894,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
}
drbd_free_resources(mdev);
+ clear_bit(AL_SUSPENDED, &mdev->flags);
/*
* currently we drbd_init_ee only on module load, so
@@ -2741,7 +2910,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
D_ASSERT(list_empty(&mdev->meta.work.q));
D_ASSERT(list_empty(&mdev->resync_work.list));
D_ASSERT(list_empty(&mdev->unplug_work.list));
-
+ D_ASSERT(list_empty(&mdev->go_diskless.list));
}
@@ -2824,7 +2993,7 @@ static int drbd_create_mempools(void)
drbd_ee_mempool = mempool_create(number,
mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
- if (drbd_request_mempool == NULL)
+ if (drbd_ee_mempool == NULL)
goto Enomem;
/* drbd's page pool */
@@ -3203,11 +3372,8 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
if (ldev == NULL)
return;
- bd_release(ldev->backing_bdev);
- bd_release(ldev->md_bdev);
-
- fput(ldev->lo_file);
- fput(ldev->md_file);
+ blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
kfree(ldev);
}
@@ -3280,9 +3446,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
sector_t sector;
int i;
+ del_timer(&mdev->md_sync_timer);
+ /* timer may be rearmed by drbd_md_mark_dirty() now. */
if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
return;
- del_timer(&mdev->md_sync_timer);
/* We use here D_FAILED and not D_ATTACHING because we try to write
* metadata even if we detach due to a disk failure! */
@@ -3310,12 +3477,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
sector = mdev->ldev->md.md_offset;
- if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
- clear_bit(MD_DIRTY, &mdev->flags);
- } else {
+ if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
/* this was a try anyways ... */
dev_err(DEV, "meta data update failed!\n");
-
drbd_chk_io_error(mdev, 1, TRUE);
}
@@ -3402,6 +3566,28 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
return rv;
}
+static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
+{
+ static char *uuid_str[UI_EXTENDED_SIZE] = {
+ [UI_CURRENT] = "CURRENT",
+ [UI_BITMAP] = "BITMAP",
+ [UI_HISTORY_START] = "HISTORY_START",
+ [UI_HISTORY_END] = "HISTORY_END",
+ [UI_SIZE] = "SIZE",
+ [UI_FLAGS] = "FLAGS",
+ };
+
+ if (index >= UI_EXTENDED_SIZE) {
+ dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
+ return;
+ }
+
+ dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
+ uuid_str[index],
+ (unsigned long long)mdev->ldev->md.uuid[index]);
+}
+
+
/**
* drbd_md_mark_dirty() - Mark meta data super block as dirty
* @mdev: DRBD device.
@@ -3410,19 +3596,31 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
* the meta-data super block. This function sets MD_DIRTY, and starts a
* timer that ensures that within five seconds you have to call drbd_md_sync().
*/
+#ifdef DEBUG
+void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
+{
+ if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
+ mod_timer(&mdev->md_sync_timer, jiffies + HZ);
+ mdev->last_md_mark_dirty.line = line;
+ mdev->last_md_mark_dirty.func = func;
+ }
+}
+#else
void drbd_md_mark_dirty(struct drbd_conf *mdev)
{
- set_bit(MD_DIRTY, &mdev->flags);
- mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
+ if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
+ mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
}
-
+#endif
static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
{
int i;
- for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+ for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
+ debug_drbd_uuid(mdev, i+1);
+ }
}
void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
@@ -3437,6 +3635,7 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
}
mdev->ldev->md.uuid[idx] = val;
+ debug_drbd_uuid(mdev, idx);
drbd_md_mark_dirty(mdev);
}
@@ -3446,6 +3645,7 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
if (mdev->ldev->md.uuid[idx]) {
drbd_uuid_move_history(mdev);
mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
+ debug_drbd_uuid(mdev, UI_HISTORY_START);
}
_drbd_uuid_set(mdev, idx, val);
}
@@ -3464,9 +3664,12 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
dev_info(DEV, "Creating new current UUID\n");
D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
+ debug_drbd_uuid(mdev, UI_BITMAP);
get_random_bytes(&val, sizeof(u64));
_drbd_uuid_set(mdev, UI_CURRENT, val);
+ /* get it to stable storage _now_ */
+ drbd_md_sync(mdev);
}
void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
@@ -3478,6 +3681,8 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
drbd_uuid_move_history(mdev);
mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
mdev->ldev->md.uuid[UI_BITMAP] = 0;
+ debug_drbd_uuid(mdev, UI_HISTORY_START);
+ debug_drbd_uuid(mdev, UI_BITMAP);
} else {
if (mdev->ldev->md.uuid[UI_BITMAP])
dev_warn(DEV, "bm UUID already set");
@@ -3485,6 +3690,7 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
mdev->ldev->md.uuid[UI_BITMAP] = val;
mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
+ debug_drbd_uuid(mdev, UI_BITMAP);
}
drbd_md_mark_dirty(mdev);
}
@@ -3527,6 +3733,7 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
{
int rv = -EIO;
+ drbd_resume_al(mdev);
if (get_ldev_if_state(mdev, D_ATTACHING)) {
drbd_bm_clear_all(mdev);
rv = drbd_bm_write(mdev);
@@ -3559,6 +3766,41 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
return 1;
}
+void drbd_ldev_destroy(struct drbd_conf *mdev)
+{
+ lc_destroy(mdev->resync);
+ mdev->resync = NULL;
+ lc_destroy(mdev->act_log);
+ mdev->act_log = NULL;
+ __no_warn(local,
+ drbd_free_bc(mdev->ldev);
+ mdev->ldev = NULL;);
+
+ if (mdev->md_io_tmpp) {
+ __free_page(mdev->md_io_tmpp);
+ mdev->md_io_tmpp = NULL;
+ }
+ clear_bit(GO_DISKLESS, &mdev->flags);
+}
+
+static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ D_ASSERT(mdev->state.disk == D_FAILED);
+ /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+ * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+ * the protected members anymore, though, so once put_ldev reaches zero
+ * again, it will be safe to free them. */
+ drbd_force_state(mdev, NS(disk, D_DISKLESS));
+ return 1;
+}
+
+void drbd_go_diskless(struct drbd_conf *mdev)
+{
+ D_ASSERT(mdev->state.disk == D_FAILED);
+ if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+ drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
+}
+
/**
* drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
* @mdev: DRBD device.
@@ -3655,8 +3897,11 @@ static void md_sync_timer_fn(unsigned long data)
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+#ifdef DEBUG
+ dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
+ mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
+#endif
drbd_md_sync(mdev);
-
return 1;
}
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 73131c5..8cbfaa6 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -33,10 +33,13 @@
#include <linux/blkpg.h>
#include <linux/cpumask.h>
#include "drbd_int.h"
+#include "drbd_req.h"
#include "drbd_wrappers.h"
#include <asm/unaligned.h>
#include <linux/drbd_tag_magic.h>
#include <linux/drbd_limits.h>
+#include <linux/compiler.h>
+#include <linux/kthread.h>
static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
@@ -169,6 +172,10 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
put_net_conf(mdev);
}
+ /* The helper may take some time.
+ * write out any unsynced meta data changes now */
+ drbd_md_sync(mdev);
+
dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
drbd_bcast_ev_helper(mdev, cmd);
@@ -202,12 +209,10 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
put_ldev(mdev);
} else {
dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
- return mdev->state.pdsk;
+ nps = mdev->state.pdsk;
+ goto out;
}
- if (fp == FP_STONITH)
- _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
-
r = drbd_khelper(mdev, "fence-peer");
switch ((r>>8) & 0xff) {
@@ -252,9 +257,36 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
dev_info(DEV, "fence-peer helper returned %d (%s)\n",
(r>>8) & 0xff, ex_to_string);
+
+out:
+ if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
+ /* The handler was not successful... unfreeze here, the
+ state engine can not unfreeze... */
+ _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
+ }
+
return nps;
}
+static int _try_outdate_peer_async(void *data)
+{
+ struct drbd_conf *mdev = (struct drbd_conf *)data;
+ enum drbd_disk_state nps;
+
+ nps = drbd_try_outdate_peer(mdev);
+ drbd_request_state(mdev, NS(pdsk, nps));
+
+ return 0;
+}
+
+void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
+{
+ struct task_struct *opa;
+
+ opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
+ if (IS_ERR(opa))
+ dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
+}
int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
{
@@ -394,6 +426,39 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
return r;
}
+static struct drbd_conf *ensure_mdev(int minor, int create)
+{
+ struct drbd_conf *mdev;
+
+ if (minor >= minor_count)
+ return NULL;
+
+ mdev = minor_to_mdev(minor);
+
+ if (!mdev && create) {
+ struct gendisk *disk = NULL;
+ mdev = drbd_new_device(minor);
+
+ spin_lock_irq(&drbd_pp_lock);
+ if (minor_table[minor] == NULL) {
+ minor_table[minor] = mdev;
+ disk = mdev->vdisk;
+ mdev = NULL;
+ } /* else: we lost the race */
+ spin_unlock_irq(&drbd_pp_lock);
+
+ if (disk) /* we won the race above */
+ /* in case we ever add a drbd_delete_device(),
+ * don't forget the del_gendisk! */
+ add_disk(disk);
+ else /* we lost the race above */
+ drbd_free_mdev(mdev);
+
+ mdev = minor_to_mdev(minor);
+ }
+
+ return mdev;
+}
static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
@@ -494,6 +559,8 @@ char *ppsize(char *buf, unsigned long long size)
void drbd_suspend_io(struct drbd_conf *mdev)
{
set_bit(SUSPEND_IO, &mdev->flags);
+ if (is_susp(mdev->state))
+ return;
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
}
@@ -713,9 +780,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
blk_queue_segment_boundary(q, PAGE_SIZE-1);
blk_stack_limits(&q->limits, &b->limits, 0);
- if (b->merge_bvec_fn)
- dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
- b->merge_bvec_fn);
dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
@@ -729,14 +793,16 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
/* serialize deconfig (worker exiting, doing cleanup)
* and reconfig (drbdsetup disk, drbdsetup net)
*
- * wait for a potentially exiting worker, then restart it,
- * or start a new one.
+ * Wait for a potentially exiting worker, then restart it,
+ * or start a new one. Flush any pending work, there may still be an
+ * after_state_change queued.
*/
static void drbd_reconfig_start(struct drbd_conf *mdev)
{
wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
drbd_thread_start(&mdev->worker);
+ drbd_flush_workqueue(mdev);
}
/* if still unconfigured, stops worker again.
@@ -756,6 +822,29 @@ static void drbd_reconfig_done(struct drbd_conf *mdev)
wake_up(&mdev->state_wait);
}
+/* Make sure IO is suspended before calling this function(). */
+static void drbd_suspend_al(struct drbd_conf *mdev)
+{
+ int s = 0;
+
+ if (lc_try_lock(mdev->act_log)) {
+ drbd_al_shrink(mdev);
+ lc_unlock(mdev->act_log);
+ } else {
+ dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
+ return;
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ if (mdev->state.conn < C_CONNECTED)
+ s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
+
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (s)
+ dev_info(DEV, "Suspended AL updates\n");
+}
+
/* does always return 0;
* interesting return code is in reply->ret_code */
static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -766,9 +855,10 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
sector_t max_possible_sectors;
sector_t min_md_device_sectors;
struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
- struct inode *inode, *inode2;
+ struct block_device *bdev;
struct lru_cache *resync_lru = NULL;
union drbd_state ns, os;
+ unsigned int max_seg_s;
int rv;
int cp_discovered = 0;
int logical_block_size;
@@ -780,6 +870,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
retcode = ERR_DISK_CONFIGURED;
goto fail;
}
+ /* It may just now have detached because of IO error. Make sure
+ * drbd_ldev_destroy is done already, we may end up here very fast,
+ * e.g. if someone calls attach from the on-io-error handler,
+ * to realize a "hot spare" feature (not that I'd recommend that) */
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
/* allocation not in the IO path, cqueue thread context */
nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
@@ -803,46 +898,49 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
goto fail;
}
- nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
- if (IS_ERR(nbc->lo_file)) {
- dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
- PTR_ERR(nbc->lo_file));
- nbc->lo_file = NULL;
- retcode = ERR_OPEN_DISK;
- goto fail;
+ if (get_net_conf(mdev)) {
+ int prot = mdev->net_conf->wire_protocol;
+ put_net_conf(mdev);
+ if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
+ retcode = ERR_STONITH_AND_PROT_A;
+ goto fail;
+ }
}
- inode = nbc->lo_file->f_dentry->d_inode;
-
- if (!S_ISBLK(inode->i_mode)) {
- retcode = ERR_DISK_NOT_BDEV;
+ bdev = blkdev_get_by_path(nbc->dc.backing_dev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
+ if (IS_ERR(bdev)) {
+ dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
+ PTR_ERR(bdev));
+ retcode = ERR_OPEN_DISK;
goto fail;
}
+ nbc->backing_bdev = bdev;
- nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
- if (IS_ERR(nbc->md_file)) {
+ /*
+ * meta_dev_idx >= 0: external fixed size, possibly multiple
+ * drbd sharing one meta device. TODO in that case, paranoia
+ * check that [md_bdev, meta_dev_idx] is not yet used by some
+ * other drbd minor! (if you use drbd.conf + drbdadm, that
+ * should check it for you already; but if you don't, or
+ * someone fooled it, we need to double check here)
+ */
+ bdev = blkdev_get_by_path(nbc->dc.meta_dev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ (nbc->dc.meta_dev_idx < 0) ?
+ (void *)mdev : (void *)drbd_m_holder);
+ if (IS_ERR(bdev)) {
dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
- PTR_ERR(nbc->md_file));
- nbc->md_file = NULL;
+ PTR_ERR(bdev));
retcode = ERR_OPEN_MD_DISK;
goto fail;
}
+ nbc->md_bdev = bdev;
- inode2 = nbc->md_file->f_dentry->d_inode;
-
- if (!S_ISBLK(inode2->i_mode)) {
- retcode = ERR_MD_NOT_BDEV;
- goto fail;
- }
-
- nbc->backing_bdev = inode->i_bdev;
- if (bd_claim(nbc->backing_bdev, mdev)) {
- printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
- nbc->backing_bdev, mdev,
- nbc->backing_bdev->bd_holder,
- nbc->backing_bdev->bd_contains->bd_holder,
- nbc->backing_bdev->bd_holders);
- retcode = ERR_BDCLAIM_DISK;
+ if ((nbc->backing_bdev == nbc->md_bdev) !=
+ (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+ retcode = ERR_MD_IDX_INVALID;
goto fail;
}
@@ -851,28 +949,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
offsetof(struct bm_extent, lce));
if (!resync_lru) {
retcode = ERR_NOMEM;
- goto release_bdev_fail;
- }
-
- /* meta_dev_idx >= 0: external fixed size,
- * possibly multiple drbd sharing one meta device.
- * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
- * not yet used by some other drbd minor!
- * (if you use drbd.conf + drbdadm,
- * that should check it for you already; but if you don't, or someone
- * fooled it, we need to double check here) */
- nbc->md_bdev = inode2->i_bdev;
- if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
- : (void *) drbd_m_holder)) {
- retcode = ERR_BDCLAIM_MD_DISK;
- goto release_bdev_fail;
- }
-
- if ((nbc->backing_bdev == nbc->md_bdev) !=
- (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
- nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
- retcode = ERR_MD_IDX_INVALID;
- goto release_bdev2_fail;
+ goto fail;
}
/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
@@ -883,7 +960,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
(unsigned long long) drbd_get_max_capacity(nbc),
(unsigned long long) nbc->dc.disk_size);
retcode = ERR_DISK_TO_SMALL;
- goto release_bdev2_fail;
+ goto fail;
}
if (nbc->dc.meta_dev_idx < 0) {
@@ -900,7 +977,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
dev_warn(DEV, "refusing attach: md-device too small, "
"at least %llu sectors needed for this meta-disk type\n",
(unsigned long long) min_md_device_sectors);
- goto release_bdev2_fail;
+ goto fail;
}
/* Make sure the new disk is big enough
@@ -908,7 +985,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
if (drbd_get_max_capacity(nbc) <
drbd_get_capacity(mdev->this_bdev)) {
retcode = ERR_DISK_TO_SMALL;
- goto release_bdev2_fail;
+ goto fail;
}
nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
@@ -924,14 +1001,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
drbd_suspend_io(mdev);
/* also wait for the last barrier ack. */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
/* and for any other previously queued work */
drbd_flush_workqueue(mdev);
retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
drbd_resume_io(mdev);
if (retcode < SS_SUCCESS)
- goto release_bdev2_fail;
+ goto fail;
if (!get_ldev_if_state(mdev, D_ATTACHING))
goto force_diskless;
@@ -999,9 +1076,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
/* Reset the "barriers don't work" bits here, then force meta data to
* be written, to ensure we determine if barriers are supported. */
if (nbc->dc.no_md_flush)
- set_bit(MD_NO_BARRIER, &mdev->flags);
+ set_bit(MD_NO_FUA, &mdev->flags);
else
- clear_bit(MD_NO_BARRIER, &mdev->flags);
+ clear_bit(MD_NO_FUA, &mdev->flags);
/* Point of no return reached.
* Devices and memory are no longer released by error cleanup below.
@@ -1013,15 +1090,16 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
nbc = NULL;
resync_lru = NULL;
- mdev->write_ordering = WO_bio_barrier;
- drbd_bump_write_ordering(mdev, WO_bio_barrier);
+ mdev->write_ordering = WO_bdev_flush;
+ drbd_bump_write_ordering(mdev, WO_bdev_flush);
if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
set_bit(CRASHED_PRIMARY, &mdev->flags);
else
clear_bit(CRASHED_PRIMARY, &mdev->flags);
- if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
+ if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
+ !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
set_bit(CRASHED_PRIMARY, &mdev->flags);
cp_discovered = 1;
}
@@ -1031,7 +1109,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
mdev->read_cnt = 0;
mdev->writ_cnt = 0;
- drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
+ max_seg_s = DRBD_MAX_SEGMENT_SIZE;
+ if (mdev->state.conn == C_CONNECTED) {
+ /* We are Primary, Connected, and now attach a new local
+ * backing store. We must not increase the user visible maximum
+ * bio size on this device to something the peer may not be
+ * able to handle. */
+ if (mdev->agreed_pro_version < 94)
+ max_seg_s = queue_max_segment_size(mdev->rq_queue);
+ else if (mdev->agreed_pro_version == 94)
+ max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
+ /* else: drbd 8.3.9 and later, stay with default */
+ }
+
+ drbd_setup_queue_param(mdev, max_seg_s);
/* If I am currently not R_PRIMARY,
* but meta data primary indicator is set,
@@ -1079,6 +1170,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
drbd_al_to_on_disk_bm(mdev);
}
+ if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
+ drbd_suspend_al(mdev); /* IO is still suspended here... */
+
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
ns.i = os.i;
@@ -1146,20 +1240,16 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
force_diskless_dec:
put_ldev(mdev);
force_diskless:
- drbd_force_state(mdev, NS(disk, D_DISKLESS));
+ drbd_force_state(mdev, NS(disk, D_FAILED));
drbd_md_sync(mdev);
- release_bdev2_fail:
- if (nbc)
- bd_release(nbc->md_bdev);
- release_bdev_fail:
- if (nbc)
- bd_release(nbc->backing_bdev);
fail:
if (nbc) {
- if (nbc->lo_file)
- fput(nbc->lo_file);
- if (nbc->md_file)
- fput(nbc->md_file);
+ if (nbc->backing_bdev)
+ blkdev_put(nbc->backing_bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ if (nbc->md_bdev)
+ blkdev_put(nbc->md_bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL);
kfree(nbc);
}
lc_destroy(resync_lru);
@@ -1169,10 +1259,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
return 0;
}
+/* Detaching the disk is a process in multiple stages. First we need to lock
+ * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+ * internal references as well.
+ * Only then we have finally detached. */
static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
+ drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
+ if (mdev->state.disk == D_DISKLESS)
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+ drbd_resume_io(mdev);
return 0;
}
@@ -1235,7 +1334,16 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
&& (new_conf->wire_protocol != DRBD_PROT_C)) {
retcode = ERR_NOT_PROTO_C;
goto fail;
- };
+ }
+
+ if (get_ldev(mdev)) {
+ enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
+ put_ldev(mdev);
+ if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
+ retcode = ERR_STONITH_AND_PROT_A;
+ goto fail;
+ }
+ }
if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
retcode = ERR_DISCARD;
@@ -1350,6 +1458,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
}
}
+ drbd_flush_workqueue(mdev);
spin_lock_irq(&mdev->req_lock);
if (mdev->net_conf != NULL) {
retcode = ERR_NET_CONFIGURED;
@@ -1388,10 +1497,9 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
mdev->int_dig_out=int_dig_out;
mdev->int_dig_in=int_dig_in;
mdev->int_dig_vv=int_dig_vv;
+ retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
- retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
-
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
reply->ret_code = retcode;
drbd_reconfig_done(mdev);
@@ -1546,6 +1654,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
struct crypto_hash *csums_tfm = NULL;
struct syncer_conf sc;
cpumask_var_t new_cpu_mask;
+ int *rs_plan_s = NULL;
+ int fifo_size;
if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
retcode = ERR_NOMEM;
@@ -1557,6 +1667,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
sc.rate = DRBD_RATE_DEF;
sc.after = DRBD_AFTER_DEF;
sc.al_extents = DRBD_AL_EXTENTS_DEF;
+ sc.on_no_data = DRBD_ON_NO_DATA_DEF;
+ sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
+ sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
+ sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
+ sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
+ sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
} else
memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
@@ -1634,6 +1750,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
}
#undef AL_MAX
+ /* to avoid spurious errors when configuring minors before configuring
+ * the minors they depend on: if necessary, first create the minor we
+ * depend on */
+ if (sc.after >= 0)
+ ensure_mdev(sc.after, 1);
+
/* most sanity checks done, try to assign the new sync-after
* dependency. need to hold the global lock in there,
* to avoid a race in the dependency loop check. */
@@ -1641,6 +1763,16 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
if (retcode != NO_ERROR)
goto fail;
+ fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
+ rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
+ if (!rs_plan_s) {
+ dev_err(DEV, "kmalloc of fifo_buffer failed");
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ }
+
/* ok, assign the rest of it as well.
* lock against receive_SyncParam() */
spin_lock(&mdev->peer_seq_lock);
@@ -1657,6 +1789,15 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
mdev->verify_tfm = verify_tfm;
verify_tfm = NULL;
}
+
+ if (fifo_size != mdev->rs_plan_s.size) {
+ kfree(mdev->rs_plan_s.values);
+ mdev->rs_plan_s.values = rs_plan_s;
+ mdev->rs_plan_s.size = fifo_size;
+ mdev->rs_planed = 0;
+ rs_plan_s = NULL;
+ }
+
spin_unlock(&mdev->peer_seq_lock);
if (get_ldev(mdev)) {
@@ -1688,6 +1829,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
fail:
+ kfree(rs_plan_s);
free_cpumask_var(new_cpu_mask);
crypto_free_hash(csums_tfm);
crypto_free_hash(verify_tfm);
@@ -1721,12 +1863,38 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
return 0;
}
+static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
+{
+ int rv;
+
+ rv = drbd_bmio_set_n_write(mdev);
+ drbd_suspend_al(mdev);
+ return rv;
+}
+
static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
+ int retcode;
- reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+ retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
+
+ if (retcode < SS_SUCCESS) {
+ if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
+ /* The peer will get a resync upon connect anyways. Just make that
+ into a full resync. */
+ retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ /* open coded drbd_bitmap_io() */
+ if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
+ "set_n_write from invalidate_peer"))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
+ retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+ }
+ reply->ret_code = retcode;
return 0;
}
@@ -1765,7 +1933,20 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
- reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
+ if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ drbd_uuid_new_current(mdev);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ }
+ drbd_suspend_io(mdev);
+ reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+ if (reply->ret_code == SS_SUCCESS) {
+ if (mdev->state.conn < C_CONNECTED)
+ tl_clear(mdev);
+ if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
+ tl_restart(mdev, fail_frozen_disk_io);
+ }
+ drbd_resume_io(mdev);
+
return 0;
}
@@ -1941,40 +2122,6 @@ out:
return 0;
}
-static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
-{
- struct drbd_conf *mdev;
-
- if (nlp->drbd_minor >= minor_count)
- return NULL;
-
- mdev = minor_to_mdev(nlp->drbd_minor);
-
- if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
- struct gendisk *disk = NULL;
- mdev = drbd_new_device(nlp->drbd_minor);
-
- spin_lock_irq(&drbd_pp_lock);
- if (minor_table[nlp->drbd_minor] == NULL) {
- minor_table[nlp->drbd_minor] = mdev;
- disk = mdev->vdisk;
- mdev = NULL;
- } /* else: we lost the race */
- spin_unlock_irq(&drbd_pp_lock);
-
- if (disk) /* we won the race above */
- /* in case we ever add a drbd_delete_device(),
- * don't forget the del_gendisk! */
- add_disk(disk);
- else /* we lost the race above */
- drbd_free_mdev(mdev);
-
- mdev = minor_to_mdev(nlp->drbd_minor);
- }
-
- return mdev;
-}
-
struct cn_handler_struct {
int (*function)(struct drbd_conf *,
struct drbd_nl_cfg_req *,
@@ -2035,7 +2182,8 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
goto fail;
}
- mdev = ensure_mdev(nlp);
+ mdev = ensure_mdev(nlp->drbd_minor,
+ (nlp->flags & DRBD_NL_CREATE_DEVICE));
if (!mdev) {
retcode = ERR_MINOR_INVALID;
goto fail;
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index be3374b..7e6ac30 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -57,6 +57,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
unsigned long db, dt, dbdt, rt, rs_left;
unsigned int res;
int i, x, y;
+ int stalled = 0;
drbd_get_syncer_progress(mdev, &rs_left, &res);
@@ -90,18 +91,17 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
* db: blocks written from mark until now
* rt: remaining time
*/
- dt = (jiffies - mdev->rs_mark_time) / HZ;
-
- if (dt > 20) {
- /* if we made no update to rs_mark_time for too long,
- * we are stalled. show that. */
- seq_printf(seq, "stalled\n");
- return;
- }
+ /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
+ * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
+ * least DRBD_SYNC_MARK_STEP time before it will be modified. */
+ i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
+ dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
+ if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
+ stalled = 1;
if (!dt)
dt++;
- db = mdev->rs_mark_left - rs_left;
+ db = mdev->rs_mark_left[i] - rs_left;
rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
seq_printf(seq, "finish: %lu:%02lu:%02lu",
@@ -118,7 +118,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
/* mean speed since syncer started
* we do account for PausedSync periods */
dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
- if (dt <= 0)
+ if (dt == 0)
dt = 1;
db = mdev->rs_total - rs_left;
dbdt = Bit2KB(db/dt);
@@ -128,7 +128,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
else
seq_printf(seq, " (%ld)", dbdt);
- seq_printf(seq, " K/sec\n");
+ if (mdev->state.conn == C_SYNC_TARGET) {
+ if (mdev->c_sync_rate > 1000)
+ seq_printf(seq, " want: %d,%03d",
+ mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
+ else
+ seq_printf(seq, " want: %d", mdev->c_sync_rate);
+ }
+ seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
}
static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
@@ -151,7 +158,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
[WO_none] = 'n',
[WO_drain_io] = 'd',
[WO_bdev_flush] = 'f',
- [WO_bio_barrier] = 'b',
};
seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
@@ -196,7 +202,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%2d: cs:Unconfigured\n", i);
} else {
seq_printf(seq,
- "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
+ "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
"lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
i, sn,
@@ -206,11 +212,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
drbd_disk_str(mdev->state.pdsk),
(mdev->net_conf == NULL ? ' ' :
(mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
- mdev->state.susp ? 's' : 'r',
+ is_susp(mdev->state) ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
mdev->congestion_reason ?: '-',
+ test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
mdev->send_cnt/2,
mdev->recv_cnt/2,
mdev->writ_cnt/2,
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d..24487d4 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -36,7 +36,6 @@
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/pkt_sched.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
@@ -49,11 +48,6 @@
#include "drbd_vli.h"
-struct flush_work {
- struct drbd_work w;
- struct drbd_epoch *epoch;
-};
-
enum finish_epoch {
FE_STILL_LIVE,
FE_DESTROYED,
@@ -66,16 +60,6 @@ static int drbd_do_auth(struct drbd_conf *mdev);
static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
-static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
-{
- struct drbd_epoch *prev;
- spin_lock(&mdev->epoch_lock);
- prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
- if (prev == epoch || prev == mdev->current_epoch)
- prev = NULL;
- spin_unlock(&mdev->epoch_lock);
- return prev;
-}
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
@@ -241,7 +225,7 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_ee(mdev, e);
+ drbd_free_net_ee(mdev, e);
}
/**
@@ -298,9 +282,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
* Is also used from inside an other spin_lock_irq(&mdev->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
-static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
+static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
{
+ atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
int i;
+
if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
i = page_chain_free(page);
else {
@@ -311,10 +297,10 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
drbd_pp_vacant += i;
spin_unlock(&drbd_pp_lock);
}
- atomic_sub(i, &mdev->pp_in_use);
- i = atomic_read(&mdev->pp_in_use);
+ i = atomic_sub_return(i, a);
if (i < 0)
- dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
+ dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
+ is_net ? "pp_in_use_by_net" : "pp_in_use", i);
wake_up(&drbd_pp_wait);
}
@@ -365,7 +351,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
e->size = data_size;
e->flags = 0;
e->sector = sector;
- e->sector = sector;
e->block_id = id;
return e;
@@ -375,9 +360,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
return NULL;
}
-void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
{
- drbd_pp_free(mdev, e->pages);
+ if (e->flags & EE_HAS_DIGEST)
+ kfree(e->digest);
+ drbd_pp_free(mdev, e->pages, is_net);
D_ASSERT(atomic_read(&e->pending_bios) == 0);
D_ASSERT(hlist_unhashed(&e->colision));
mempool_free(e, drbd_ee_mempool);
@@ -388,13 +375,14 @@ int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
LIST_HEAD(work_list);
struct drbd_epoch_entry *e, *t;
int count = 0;
+ int is_net = list == &mdev->net_ee;
spin_lock_irq(&mdev->req_lock);
list_splice_init(list, &work_list);
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &work_list, w.list) {
- drbd_free_ee(mdev, e);
+ drbd_free_some_ee(mdev, e, is_net);
count++;
}
return count;
@@ -423,7 +411,7 @@ static int drbd_process_done_ee(struct drbd_conf *mdev)
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_ee(mdev, e);
+ drbd_free_net_ee(mdev, e);
/* possible callbacks here:
* e_end_block, and e_end_resync_block, e_send_discard_ack.
@@ -719,14 +707,14 @@ out:
static int drbd_send_fp(struct drbd_conf *mdev,
struct socket *sock, enum drbd_packets cmd)
{
- struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+ struct p_header80 *h = &mdev->data.sbuf.header.h80;
return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
}
static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
{
- struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+ struct p_header80 *h = &mdev->data.rbuf.header.h80;
int rr;
rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
@@ -776,9 +764,6 @@ static int drbd_connect(struct drbd_conf *mdev)
D_ASSERT(!mdev->data.socket);
- if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
- dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
-
if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
return -2;
@@ -927,6 +912,11 @@ retry:
drbd_thread_start(&mdev->asender);
+ if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
+ drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
+ put_ldev(mdev);
+ }
+
if (!drbd_send_protocol(mdev))
return -1;
drbd_send_sync_param(mdev, &mdev->sync_conf);
@@ -946,22 +936,28 @@ out_release_sockets:
return -1;
}
-static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
+static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
{
+ union p_header *h = &mdev->data.rbuf.header;
int r;
r = drbd_recv(mdev, h, sizeof(*h));
-
if (unlikely(r != sizeof(*h))) {
dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
return FALSE;
- };
- h->command = be16_to_cpu(h->command);
- h->length = be16_to_cpu(h->length);
- if (unlikely(h->magic != BE_DRBD_MAGIC)) {
- dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
- (long)be32_to_cpu(h->magic),
- h->command, h->length);
+ }
+
+ if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
+ *cmd = be16_to_cpu(h->h80.command);
+ *packet_size = be16_to_cpu(h->h80.length);
+ } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
+ *cmd = be16_to_cpu(h->h95.command);
+ *packet_size = be32_to_cpu(h->h95.length);
+ } else {
+ dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
+ be32_to_cpu(h->h80.magic),
+ be16_to_cpu(h->h80.command),
+ be16_to_cpu(h->h80.length));
return FALSE;
}
mdev->last_received = jiffies;
@@ -969,13 +965,13 @@ static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+static void drbd_flush(struct drbd_conf *mdev)
{
int rv;
if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
- NULL, BLKDEV_IFL_WAIT);
+ NULL);
if (rv) {
dev_err(DEV, "local disk flush failed with status %d\n", rv);
/* would rather check on EOPNOTSUPP, but that is not reliable.
@@ -985,24 +981,6 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
}
put_ldev(mdev);
}
-
- return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
-}
-
-static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
- struct flush_work *fw = (struct flush_work *)w;
- struct drbd_epoch *epoch = fw->epoch;
-
- kfree(w);
-
- if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
- drbd_flush_after_epoch(mdev, epoch);
-
- drbd_may_finish_epoch(mdev, epoch, EV_PUT |
- (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
-
- return 1;
}
/**
@@ -1015,15 +993,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
struct drbd_epoch *epoch,
enum epoch_event ev)
{
- int finish, epoch_size;
+ int epoch_size;
struct drbd_epoch *next_epoch;
- int schedule_flush = 0;
enum finish_epoch rv = FE_STILL_LIVE;
spin_lock(&mdev->epoch_lock);
do {
next_epoch = NULL;
- finish = 0;
epoch_size = atomic_read(&epoch->epoch_size);
@@ -1033,16 +1009,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
break;
case EV_GOT_BARRIER_NR:
set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
-
- /* Special case: If we just switched from WO_bio_barrier to
- WO_bdev_flush we should not finish the current epoch */
- if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
- mdev->write_ordering != WO_bio_barrier &&
- epoch == mdev->current_epoch)
- clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
- break;
- case EV_BARRIER_DONE:
- set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
break;
case EV_BECAME_LAST:
/* nothing to do*/
@@ -1051,23 +1017,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
if (epoch_size != 0 &&
atomic_read(&epoch->active) == 0 &&
- test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
- epoch->list.prev == &mdev->current_epoch->list &&
- !test_bit(DE_IS_FINISHING, &epoch->flags)) {
- /* Nearly all conditions are met to finish that epoch... */
- if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
- mdev->write_ordering == WO_none ||
- (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
- ev & EV_CLEANUP) {
- finish = 1;
- set_bit(DE_IS_FINISHING, &epoch->flags);
- } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
- mdev->write_ordering == WO_bio_barrier) {
- atomic_inc(&epoch->active);
- schedule_flush = 1;
- }
- }
- if (finish) {
+ test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
if (!(ev & EV_CLEANUP)) {
spin_unlock(&mdev->epoch_lock);
drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
@@ -1090,6 +1040,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
/* atomic_set(&epoch->active, 0); is already zero */
if (rv == FE_STILL_LIVE)
rv = FE_RECYCLED;
+ wake_up(&mdev->ee_wait);
}
}
@@ -1101,22 +1052,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
spin_unlock(&mdev->epoch_lock);
- if (schedule_flush) {
- struct flush_work *fw;
- fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
- if (fw) {
- fw->w.cb = w_flush;
- fw->epoch = epoch;
- drbd_queue_work(&mdev->data.work, &fw->w);
- } else {
- dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
- set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
- /* That is not a recursion, only one level */
- drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
- drbd_may_finish_epoch(mdev, epoch, EV_PUT);
- }
- }
-
return rv;
}
@@ -1132,19 +1067,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
[WO_none] = "none",
[WO_drain_io] = "drain",
[WO_bdev_flush] = "flush",
- [WO_bio_barrier] = "barrier",
};
pwo = mdev->write_ordering;
wo = min(pwo, wo);
- if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
- wo = WO_bdev_flush;
if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
wo = WO_drain_io;
if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
wo = WO_none;
mdev->write_ordering = wo;
- if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
+ if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
}
@@ -1180,7 +1112,7 @@ next_bio:
bio->bi_sector = sector;
bio->bi_bdev = mdev->ldev->backing_bdev;
/* we special case some flags in the multi-bio case, see below
- * (REQ_UNPLUG, REQ_HARDBARRIER) */
+ * (REQ_UNPLUG) */
bio->bi_rw = rw;
bio->bi_private = e;
bio->bi_end_io = drbd_endio_sec;
@@ -1214,11 +1146,6 @@ next_bio:
bio->bi_rw &= ~REQ_UNPLUG;
drbd_generic_make_request(mdev, fault_type, bio);
-
- /* strip off REQ_HARDBARRIER,
- * unless it is the first or last bio */
- if (bios && bios->bi_next)
- bios->bi_rw &= ~REQ_HARDBARRIER;
} while (bios);
maybe_kick_lo(mdev);
return 0;
@@ -1232,53 +1159,12 @@ fail:
return -ENOMEM;
}
-/**
- * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
- * @mdev: DRBD device.
- * @w: work object.
- * @cancel: The connection will be closed anyways (unused in this callback)
- */
-int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
-{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
- (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
- so that we can finish that epoch in drbd_may_finish_epoch().
- That is necessary if we already have a long chain of Epochs, before
- we realize that REQ_HARDBARRIER is actually not supported */
-
- /* As long as the -ENOTSUPP on the barrier is reported immediately
- that will never trigger. If it is reported late, we will just
- print that warning and continue correctly for all future requests
- with WO_bdev_flush */
- if (previous_epoch(mdev, e->epoch))
- dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
-
- /* we still have a local reference,
- * get_ldev was done in receive_Data. */
-
- e->w.cb = e_end_block;
- if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
- /* drbd_submit_ee fails for one reason only:
- * if was not able to allocate sufficient bios.
- * requeue, try again later. */
- e->w.cb = w_e_reissue;
- drbd_queue_work(&mdev->data.work, &e->w);
- }
- return 1;
-}
-
-static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
+static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- int rv, issue_flush;
- struct p_barrier *p = (struct p_barrier *)h;
+ int rv;
+ struct p_barrier *p = &mdev->data.rbuf.barrier;
struct drbd_epoch *epoch;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
-
- rv = drbd_recv(mdev, h->payload, h->length);
- ERR_IF(rv != h->length) return FALSE;
-
inc_unacked(mdev);
if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
@@ -1293,44 +1179,40 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
* Therefore we must send the barrier_ack after the barrier request was
* completed. */
switch (mdev->write_ordering) {
- case WO_bio_barrier:
case WO_none:
if (rv == FE_RECYCLED)
return TRUE;
- break;
+
+ /* receiver context, in the writeout path of the other node.
+ * avoid potential distributed deadlock */
+ epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+ if (epoch)
+ break;
+ else
+ dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+ /* Fall through */
case WO_bdev_flush:
case WO_drain_io:
- if (rv == FE_STILL_LIVE) {
- set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
- drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
- rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
- }
- if (rv == FE_RECYCLED)
- return TRUE;
-
- /* The asender will send all the ACKs and barrier ACKs out, since
- all EEs moved from the active_ee to the done_ee. We need to
- provide a new epoch object for the EEs that come in soon */
- break;
- }
-
- /* receiver context, in the writeout path of the other node.
- * avoid potential distributed deadlock */
- epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
- if (!epoch) {
- dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
- issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
- if (issue_flush) {
- rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
- if (rv == FE_RECYCLED)
- return TRUE;
+ drbd_flush(mdev);
+
+ if (atomic_read(&mdev->current_epoch->epoch_size)) {
+ epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+ if (epoch)
+ break;
}
- drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
+ epoch = mdev->current_epoch;
+ wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
+
+ D_ASSERT(atomic_read(&epoch->active) == 0);
+ D_ASSERT(epoch->flags == 0);
return TRUE;
+ default:
+ dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
+ return FALSE;
}
epoch->flags = 0;
@@ -1457,7 +1339,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
data_size -= rr;
}
kunmap(page);
- drbd_pp_free(mdev, page);
+ drbd_pp_free(mdev, page, 0);
return rv;
}
@@ -1562,30 +1444,29 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
list_add(&e->w.list, &mdev->sync_ee);
spin_unlock_irq(&mdev->req_lock);
+ atomic_add(data_size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
return TRUE;
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ spin_unlock_irq(&mdev->req_lock);
+
drbd_free_ee(mdev, e);
fail:
put_ldev(mdev);
return FALSE;
}
-static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
+static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct drbd_request *req;
sector_t sector;
- unsigned int header_size, data_size;
int ok;
- struct p_data *p = (struct p_data *)h;
-
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- ERR_IF(data_size == 0) return FALSE;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
+ struct p_data *p = &mdev->data.rbuf.data;
sector = be64_to_cpu(p->sector);
@@ -1611,20 +1492,11 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
return ok;
}
-static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
+static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
sector_t sector;
- unsigned int header_size, data_size;
int ok;
- struct p_data *p = (struct p_data *)h;
-
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- ERR_IF(data_size == 0) return FALSE;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
+ struct p_data *p = &mdev->data.rbuf.data;
sector = be64_to_cpu(p->sector);
D_ASSERT(p->block_id == ID_SYNCER);
@@ -1640,9 +1512,11 @@ static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
ok = drbd_drain_block(mdev, data_size);
- drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
}
+ atomic_add(data_size >> 9, &mdev->rs_sect_in);
+
return ok;
}
@@ -1653,15 +1527,8 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
sector_t sector = e->sector;
- struct drbd_epoch *epoch;
int ok = 1, pcmd;
- if (e->flags & EE_IS_BARRIER) {
- epoch = previous_epoch(mdev, e->epoch);
- if (epoch)
- drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
- }
-
if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
@@ -1765,24 +1632,27 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
return ret;
}
+static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
+{
+ if (mdev->agreed_pro_version >= 95)
+ return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+ (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
+ (dpf & DP_FUA ? REQ_FUA : 0) |
+ (dpf & DP_FLUSH ? REQ_FUA : 0) |
+ (dpf & DP_DISCARD ? REQ_DISCARD : 0);
+ else
+ return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
+}
+
/* mirrored write */
-static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
+static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
sector_t sector;
struct drbd_epoch_entry *e;
- struct p_data *p = (struct p_data *)h;
- int header_size, data_size;
+ struct p_data *p = &mdev->data.rbuf.data;
int rw = WRITE;
u32 dp_flags;
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- ERR_IF(data_size == 0) return FALSE;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
-
if (!get_ldev(mdev)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not write mirrored data block "
@@ -1792,7 +1662,7 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
mdev->peer_seq++;
spin_unlock(&mdev->peer_seq_lock);
- drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
atomic_inc(&mdev->current_epoch->epoch_size);
return drbd_drain_block(mdev, data_size);
}
@@ -1815,36 +1685,11 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
e->epoch = mdev->current_epoch;
atomic_inc(&e->epoch->epoch_size);
atomic_inc(&e->epoch->active);
-
- if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
- struct drbd_epoch *epoch;
- /* Issue a barrier if we start a new epoch, and the previous epoch
- was not a epoch containing a single request which already was
- a Barrier. */
- epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
- if (epoch == e->epoch) {
- set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
- rw |= REQ_HARDBARRIER;
- e->flags |= EE_IS_BARRIER;
- } else {
- if (atomic_read(&epoch->epoch_size) > 1 ||
- !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
- set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
- set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
- rw |= REQ_HARDBARRIER;
- e->flags |= EE_IS_BARRIER;
- }
- }
- }
spin_unlock(&mdev->epoch_lock);
dp_flags = be32_to_cpu(p->dp_flags);
- if (dp_flags & DP_HARDBARRIER) {
- dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
- /* rw |= REQ_HARDBARRIER; */
- }
- if (dp_flags & DP_RW_SYNC)
- rw |= REQ_SYNC | REQ_UNPLUG;
+ rw |= write_flags_to_bio(mdev, dp_flags);
+
if (dp_flags & DP_MAY_SET_IN_SYNC)
e->flags |= EE_MAY_SET_IN_SYNC;
@@ -1997,16 +1842,27 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
break;
}
- if (mdev->state.pdsk == D_DISKLESS) {
+ if (mdev->state.pdsk < D_INCONSISTENT) {
/* In case we have the only disk of the cluster, */
drbd_set_out_of_sync(mdev, e->sector, e->size);
e->flags |= EE_CALL_AL_COMPLETE_IO;
+ e->flags &= ~EE_MAY_SET_IN_SYNC;
drbd_al_begin_io(mdev, e->sector);
}
if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
return TRUE;
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ hlist_del_init(&e->colision);
+ spin_unlock_irq(&mdev->req_lock);
+ if (e->flags & EE_CALL_AL_COMPLETE_IO)
+ drbd_al_complete_io(mdev, e->sector);
+
out_interrupted:
/* yes, the epoch_size now is imbalanced.
* but we drop the connection anyways, so we don't have a chance to
@@ -2016,20 +1872,64 @@ out_interrupted:
return FALSE;
}
-static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+{
+ struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
+ unsigned long db, dt, dbdt;
+ int curr_events;
+ int throttle = 0;
+
+ /* feature disabled? */
+ if (mdev->sync_conf.c_min_rate == 0)
+ return 0;
+
+ curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+ (int)part_stat_read(&disk->part0, sectors[1]) -
+ atomic_read(&mdev->rs_sect_ev);
+ if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
+ unsigned long rs_left;
+ int i;
+
+ mdev->rs_last_events = curr_events;
+
+ /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+ * approx. */
+ i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
+ rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+
+ dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
+ if (!dt)
+ dt++;
+ db = mdev->rs_mark_left[i] - rs_left;
+ dbdt = Bit2KB(db/dt);
+
+ if (dbdt > mdev->sync_conf.c_min_rate)
+ throttle = 1;
+ }
+ return throttle;
+}
+
+
+static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
{
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
struct drbd_epoch_entry *e;
struct digest_info *di = NULL;
- int size, digest_size;
+ int size, verb;
unsigned int fault_type;
- struct p_block_req *p =
- (struct p_block_req *)h;
- const int brps = sizeof(*p)-sizeof(*h);
-
- if (drbd_recv(mdev, h->payload, brps) != brps)
- return FALSE;
+ struct p_block_req *p = &mdev->data.rbuf.block_req;
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
@@ -2046,12 +1946,31 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
}
if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
- if (__ratelimit(&drbd_ratelimit_state))
+ verb = 1;
+ switch (cmd) {
+ case P_DATA_REQUEST:
+ drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
+ break;
+ case P_RS_DATA_REQUEST:
+ case P_CSUM_RS_REQUEST:
+ case P_OV_REQUEST:
+ drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
+ break;
+ case P_OV_REPLY:
+ verb = 0;
+ dec_rs_pending(mdev);
+ drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
+ break;
+ default:
+ dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+ cmdname(cmd));
+ }
+ if (verb && __ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not satisfy peer's read request, "
"no local data.\n");
- drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
- P_NEG_RS_DREPLY , p);
- return drbd_drain_block(mdev, h->length - brps);
+
+ /* drain possibly payload */
+ return drbd_drain_block(mdev, digest_size);
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -2063,31 +1982,21 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
return FALSE;
}
- switch (h->command) {
+ switch (cmd) {
case P_DATA_REQUEST:
e->w.cb = w_e_end_data_req;
fault_type = DRBD_FAULT_DT_RD;
- break;
+ /* application IO, don't drbd_rs_begin_io */
+ goto submit;
+
case P_RS_DATA_REQUEST:
e->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
- /* Eventually this should become asynchronously. Currently it
- * blocks the whole receiver just to delay the reading of a
- * resync data block.
- * the drbd_work_queue mechanism is made for this...
- */
- if (!drbd_rs_begin_io(mdev, sector)) {
- /* we have been interrupted,
- * probably connection lost! */
- D_ASSERT(signal_pending(current));
- goto out_free_e;
- }
break;
case P_OV_REPLY:
case P_CSUM_RS_REQUEST:
fault_type = DRBD_FAULT_RS_RD;
- digest_size = h->length - brps ;
di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
if (!di)
goto out_free_e;
@@ -2095,31 +2004,25 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
di->digest_size = digest_size;
di->digest = (((char *)di)+sizeof(struct digest_info));
+ e->digest = di;
+ e->flags |= EE_HAS_DIGEST;
+
if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
goto out_free_e;
- e->block_id = (u64)(unsigned long)di;
- if (h->command == P_CSUM_RS_REQUEST) {
+ if (cmd == P_CSUM_RS_REQUEST) {
D_ASSERT(mdev->agreed_pro_version >= 89);
e->w.cb = w_e_end_csum_rs_req;
- } else if (h->command == P_OV_REPLY) {
+ } else if (cmd == P_OV_REPLY) {
e->w.cb = w_e_end_ov_reply;
dec_rs_pending(mdev);
- break;
- }
-
- if (!drbd_rs_begin_io(mdev, sector)) {
- /* we have been interrupted, probably connection lost! */
- D_ASSERT(signal_pending(current));
- goto out_free_e;
+ /* drbd_rs_begin_io done when we sent this request,
+ * but accounting still needs to be done. */
+ goto submit_for_resync;
}
break;
case P_OV_REQUEST:
- if (mdev->state.conn >= C_CONNECTED &&
- mdev->state.conn != C_VERIFY_T)
- dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
- drbd_conn_str(mdev->state.conn));
if (mdev->ov_start_sector == ~(sector_t)0 &&
mdev->agreed_pro_version >= 90) {
mdev->ov_start_sector = sector;
@@ -2130,37 +2033,63 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
}
e->w.cb = w_e_end_ov_req;
fault_type = DRBD_FAULT_RS_RD;
- /* Eventually this should become asynchronous. Currently it
- * blocks the whole receiver just to delay the reading of a
- * resync data block.
- * the drbd_work_queue mechanism is made for this...
- */
- if (!drbd_rs_begin_io(mdev, sector)) {
- /* we have been interrupted,
- * probably connection lost! */
- D_ASSERT(signal_pending(current));
- goto out_free_e;
- }
break;
-
default:
dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
- cmdname(h->command));
+ cmdname(cmd));
fault_type = DRBD_FAULT_MAX;
+ goto out_free_e;
}
- spin_lock_irq(&mdev->req_lock);
- list_add(&e->w.list, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ /* Throttle, drbd_rs_begin_io and submit should become asynchronous
+ * wrt the receiver, but it is not as straightforward as it may seem.
+ * Various places in the resync start and stop logic assume resync
+ * requests are processed in order, requeuing this on the worker thread
+ * introduces a bunch of new code for synchronization between threads.
+ *
+ * Unlimited throttling before drbd_rs_begin_io may stall the resync
+ * "forever", throttling after drbd_rs_begin_io will lock that extent
+ * for application writes for the same time. For now, just throttle
+ * here, where the rest of the code expects the receiver to sleep for
+ * a while, anyways.
+ */
+
+ /* Throttle before drbd_rs_begin_io, as that locks out application IO;
+ * this defers syncer requests for some time, before letting at least
+ * on request through. The resync controller on the receiving side
+ * will adapt to the incoming rate accordingly.
+ *
+ * We cannot throttle here if remote is Primary/SyncTarget:
+ * we would also throttle its application reads.
+ * In that case, throttling is done on the SyncTarget only.
+ */
+ if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
+ msleep(100);
+ if (drbd_rs_begin_io(mdev, e->sector))
+ goto out_free_e;
+submit_for_resync:
+ atomic_add(size >> 9, &mdev->rs_sect_ev);
+
+submit:
inc_unacked(mdev);
+ spin_lock_irq(&mdev->req_lock);
+ list_add_tail(&e->w.list, &mdev->read_ee);
+ spin_unlock_irq(&mdev->req_lock);
if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
return TRUE;
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ spin_unlock_irq(&mdev->req_lock);
+ /* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
out_free_e:
- kfree(di);
put_ldev(mdev);
drbd_free_ee(mdev, e);
return FALSE;
@@ -2699,20 +2628,13 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
return 1;
}
-static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
+static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_protocol *p = (struct p_protocol *)h;
- int header_size, data_size;
+ struct p_protocol *p = &mdev->data.rbuf.protocol;
int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
int p_want_lose, p_two_primaries, cf;
char p_integrity_alg[SHARED_SECRET_MAX] = "";
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
-
p_proto = be32_to_cpu(p->protocol);
p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
@@ -2805,39 +2727,46 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
return tfm;
}
-static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
+static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
{
int ok = TRUE;
- struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
+ struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
unsigned int header_size, data_size, exp_max_sz;
struct crypto_hash *verify_tfm = NULL;
struct crypto_hash *csums_tfm = NULL;
const int apv = mdev->agreed_pro_version;
+ int *rs_plan_s = NULL;
+ int fifo_size = 0;
exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
+ SHARED_SECRET_MAX
- : /* 89 */ sizeof(struct p_rs_param_89);
+ : apv <= 94 ? sizeof(struct p_rs_param_89)
+ : /* apv >= 95 */ sizeof(struct p_rs_param_95);
- if (h->length > exp_max_sz) {
+ if (packet_size > exp_max_sz) {
dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
- h->length, exp_max_sz);
+ packet_size, exp_max_sz);
return FALSE;
}
if (apv <= 88) {
- header_size = sizeof(struct p_rs_param) - sizeof(*h);
- data_size = h->length - header_size;
- } else /* apv >= 89 */ {
- header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
- data_size = h->length - header_size;
+ header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
+ data_size = packet_size - header_size;
+ } else if (apv <= 94) {
+ header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
+ data_size = packet_size - header_size;
+ D_ASSERT(data_size == 0);
+ } else {
+ header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
+ data_size = packet_size - header_size;
D_ASSERT(data_size == 0);
}
/* initialize verify_alg and csums_alg */
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
+ if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
return FALSE;
mdev->sync_conf.rate = be32_to_cpu(p->rate);
@@ -2896,6 +2825,22 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
}
}
+ if (apv > 94) {
+ mdev->sync_conf.rate = be32_to_cpu(p->rate);
+ mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+ mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
+ mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
+ mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
+
+ fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
+ rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
+ if (!rs_plan_s) {
+ dev_err(DEV, "kmalloc of fifo_buffer failed");
+ goto disconnect;
+ }
+ }
+ }
spin_lock(&mdev->peer_seq_lock);
/* lock against drbd_nl_syncer_conf() */
@@ -2913,6 +2858,12 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
mdev->csums_tfm = csums_tfm;
dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
}
+ if (fifo_size != mdev->rs_plan_s.size) {
+ kfree(mdev->rs_plan_s.values);
+ mdev->rs_plan_s.values = rs_plan_s;
+ mdev->rs_plan_s.size = fifo_size;
+ mdev->rs_planed = 0;
+ }
spin_unlock(&mdev->peer_seq_lock);
}
@@ -2946,19 +2897,15 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
(unsigned long long)a, (unsigned long long)b);
}
-static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
+static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_sizes *p = (struct p_sizes *)h;
+ struct p_sizes *p = &mdev->data.rbuf.sizes;
enum determine_dev_size dd = unchanged;
unsigned int max_seg_s;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
p_size = be64_to_cpu(p->d_size);
p_usize = be64_to_cpu(p->u_size);
@@ -2972,7 +2919,6 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
* we still need to figure out whether we accept that. */
mdev->p_size = p_size;
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
if (get_ldev(mdev)) {
warn_if_differ_considerably(mdev, "lower level device sizes",
p_size, drbd_get_max_capacity(mdev->ldev));
@@ -3029,6 +2975,8 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
if (mdev->agreed_pro_version < 94)
max_seg_s = be32_to_cpu(p->max_segment_size);
+ else if (mdev->agreed_pro_version == 94)
+ max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
else /* drbd 8.3.8 onwards */
max_seg_s = DRBD_MAX_SEGMENT_SIZE;
@@ -3062,16 +3010,12 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
+static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_uuids *p = (struct p_uuids *)h;
+ struct p_uuids *p = &mdev->data.rbuf.uuids;
u64 *p_uuid;
int i;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
@@ -3107,6 +3051,11 @@ static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
drbd_md_sync(mdev);
}
put_ldev(mdev);
+ } else if (mdev->state.disk < D_INCONSISTENT &&
+ mdev->state.role == R_PRIMARY) {
+ /* I am a diskless primary, the peer just created a new current UUID
+ for me. */
+ drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
}
/* Before we test for the disk state, we should wait until an eventually
@@ -3150,16 +3099,12 @@ static union drbd_state convert_state(union drbd_state ps)
return ms;
}
-static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
+static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_req_state *p = (struct p_req_state *)h;
+ struct p_req_state *p = &mdev->data.rbuf.req_state;
union drbd_state mask, val;
int rv;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
mask.i = be32_to_cpu(p->mask);
val.i = be32_to_cpu(p->val);
@@ -3180,20 +3125,14 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int receive_state(struct drbd_conf *mdev, struct p_header *h)
+static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_state *p = (struct p_state *)h;
- enum drbd_conns nconn, oconn;
- union drbd_state ns, peer_state;
+ struct p_state *p = &mdev->data.rbuf.state;
+ union drbd_state os, ns, peer_state;
enum drbd_disk_state real_peer_disk;
+ enum chg_state_flags cs_flags;
int rv;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
- return FALSE;
-
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
peer_state.i = be32_to_cpu(p->state);
real_peer_disk = peer_state.disk;
@@ -3204,40 +3143,74 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
spin_lock_irq(&mdev->req_lock);
retry:
- oconn = nconn = mdev->state.conn;
+ os = ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
- if (nconn == C_WF_REPORT_PARAMS)
- nconn = C_CONNECTED;
+ /* peer says his disk is uptodate, while we think it is inconsistent,
+ * and this happens while we think we have a sync going on. */
+ if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+ os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+ /* If we are (becoming) SyncSource, but peer is still in sync
+ * preparation, ignore its uptodate-ness to avoid flapping, it
+ * will change to inconsistent once the peer reaches active
+ * syncing states.
+ * It may have changed syncer-paused flags, however, so we
+ * cannot ignore this completely. */
+ if (peer_state.conn > C_CONNECTED &&
+ peer_state.conn < C_SYNC_SOURCE)
+ real_peer_disk = D_INCONSISTENT;
+
+ /* if peer_state changes to connected at the same time,
+ * it explicitly notifies us that it finished resync.
+ * Maybe we should finish it up, too? */
+ else if (os.conn >= C_SYNC_SOURCE &&
+ peer_state.conn == C_CONNECTED) {
+ if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
+ drbd_resync_finished(mdev);
+ return TRUE;
+ }
+ }
+
+ /* peer says his disk is inconsistent, while we think it is uptodate,
+ * and this happens while the peer still thinks we have a sync going on,
+ * but we think we are already done with the sync.
+ * We ignore this to avoid flapping pdsk.
+ * This should not happen, if the peer is a recent version of drbd. */
+ if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+ os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+ real_peer_disk = D_UP_TO_DATE;
+
+ if (ns.conn == C_WF_REPORT_PARAMS)
+ ns.conn = C_CONNECTED;
if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
get_ldev_if_state(mdev, D_NEGOTIATING)) {
int cr; /* consider resync */
/* if we established a new connection */
- cr = (oconn < C_CONNECTED);
+ cr = (os.conn < C_CONNECTED);
/* if we had an established connection
* and one of the nodes newly attaches a disk */
- cr |= (oconn == C_CONNECTED &&
+ cr |= (os.conn == C_CONNECTED &&
(peer_state.disk == D_NEGOTIATING ||
- mdev->state.disk == D_NEGOTIATING));
+ os.disk == D_NEGOTIATING));
/* if we have both been inconsistent, and the peer has been
* forced to be UpToDate with --overwrite-data */
cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
/* if we had been plain connected, and the admin requested to
* start a sync by "invalidate" or "invalidate-remote" */
- cr |= (oconn == C_CONNECTED &&
+ cr |= (os.conn == C_CONNECTED &&
(peer_state.conn >= C_STARTING_SYNC_S &&
peer_state.conn <= C_WF_BITMAP_T));
if (cr)
- nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+ ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
put_ldev(mdev);
- if (nconn == C_MASK) {
- nconn = C_CONNECTED;
+ if (ns.conn == C_MASK) {
+ ns.conn = C_CONNECTED;
if (mdev->state.disk == D_NEGOTIATING) {
- drbd_force_state(mdev, NS(disk, D_DISKLESS));
+ drbd_force_state(mdev, NS(disk, D_FAILED));
} else if (peer_state.disk == D_NEGOTIATING) {
dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
peer_state.disk = D_DISKLESS;
@@ -3245,7 +3218,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
} else {
if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
return FALSE;
- D_ASSERT(oconn == C_WF_REPORT_PARAMS);
+ D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
return FALSE;
}
@@ -3253,18 +3226,28 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
}
spin_lock_irq(&mdev->req_lock);
- if (mdev->state.conn != oconn)
+ if (mdev->state.i != os.i)
goto retry;
clear_bit(CONSIDER_RESYNC, &mdev->flags);
- ns.i = mdev->state.i;
- ns.conn = nconn;
ns.peer = peer_state.role;
ns.pdsk = real_peer_disk;
ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
- if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+ if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
ns.disk = mdev->new_state_tmp.disk;
-
- rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
+ cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+ if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+ test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
+ for temporal network outages! */
+ spin_unlock_irq(&mdev->req_lock);
+ dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+ tl_clear(mdev);
+ drbd_uuid_new_current(mdev);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
+ return FALSE;
+ }
+ rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
@@ -3273,8 +3256,8 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
return FALSE;
}
- if (oconn > C_WF_REPORT_PARAMS) {
- if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+ if (os.conn > C_WF_REPORT_PARAMS) {
+ if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
peer_state.disk != D_NEGOTIATING ) {
/* we want resync, peer has not yet decided to sync... */
/* Nowadays only used when forcing a node into primary role and
@@ -3291,9 +3274,9 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
+static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_rs_uuid *p = (struct p_rs_uuid *)h;
+ struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
wait_event(mdev->misc_wait,
mdev->state.conn == C_WF_SYNC_UUID ||
@@ -3302,10 +3285,6 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
/* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
/* Here the _drbd_uuid_ functions are right, current should
_not_ be rotated into the history */
if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -3324,14 +3303,14 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
enum receive_bitmap_ret { OK, DONE, FAILED };
static enum receive_bitmap_ret
-receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
- unsigned long *buffer, struct bm_xfer_ctx *c)
+receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
+ unsigned long *buffer, struct bm_xfer_ctx *c)
{
unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
unsigned want = num_words * sizeof(long);
- if (want != h->length) {
- dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
+ if (want != data_size) {
+ dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
return FAILED;
}
if (want == 0)
@@ -3360,7 +3339,7 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
u64 tmp;
unsigned long s = c->bit_offset;
unsigned long e;
- int len = p->head.length - (sizeof(*p) - sizeof(p->head));
+ int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
int toggle = DCBP_get_start(p);
int have;
int bits;
@@ -3429,7 +3408,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
const char *direction, struct bm_xfer_ctx *c)
{
/* what would it take to transfer it "plaintext" */
- unsigned plain = sizeof(struct p_header) *
+ unsigned plain = sizeof(struct p_header80) *
((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
+ c->bm_words * sizeof(long);
unsigned total = c->bytes[0] + c->bytes[1];
@@ -3467,12 +3446,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
in order to be agnostic to the 32 vs 64 bits issue.
returns 0 on failure, 1 if we successfully received it. */
-static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
+static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct bm_xfer_ctx c;
void *buffer;
enum receive_bitmap_ret ret;
int ok = FALSE;
+ struct p_header80 *h = &mdev->data.rbuf.header.h80;
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
@@ -3492,39 +3472,39 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
};
do {
- if (h->command == P_BITMAP) {
- ret = receive_bitmap_plain(mdev, h, buffer, &c);
- } else if (h->command == P_COMPRESSED_BITMAP) {
+ if (cmd == P_BITMAP) {
+ ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
+ } else if (cmd == P_COMPRESSED_BITMAP) {
/* MAYBE: sanity check that we speak proto >= 90,
* and the feature is enabled! */
struct p_compressed_bm *p;
- if (h->length > BM_PACKET_PAYLOAD_BYTES) {
+ if (data_size > BM_PACKET_PAYLOAD_BYTES) {
dev_err(DEV, "ReportCBitmap packet too large\n");
goto out;
}
/* use the page buff */
p = buffer;
memcpy(p, h, sizeof(*h));
- if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
+ if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
goto out;
- if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
- dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
+ if (data_size <= (sizeof(*p) - sizeof(p->head))) {
+ dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
return FAILED;
}
ret = decode_bitmap_c(mdev, p, &c);
} else {
- dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
+ dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
goto out;
}
- c.packets[h->command == P_BITMAP]++;
- c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
+ c.packets[cmd == P_BITMAP]++;
+ c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
if (ret != OK)
break;
- if (!drbd_recv_header(mdev, h))
+ if (!drbd_recv_header(mdev, &cmd, &data_size))
goto out;
} while (ret == OK);
if (ret == FAILED)
@@ -3555,17 +3535,16 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
return ok;
}
-static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
+static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
/* TODO zero copy sink :) */
static char sink[128];
int size, want, r;
- if (!silent)
- dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
- h->command, h->length);
+ dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
+ cmd, data_size);
- size = h->length;
+ size = data_size;
while (size > 0) {
want = min_t(int, size, sizeof(sink));
r = drbd_recv(mdev, sink, want);
@@ -3575,17 +3554,7 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
return size == 0;
}
-static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
-{
- return receive_skip_(mdev, h, 0);
-}
-
-static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
-{
- return receive_skip_(mdev, h, 1);
-}
-
-static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
+static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
if (mdev->state.disk >= D_INCONSISTENT)
drbd_kick_lo(mdev);
@@ -3597,108 +3566,96 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
-
-static drbd_cmd_handler_f drbd_default_handler[] = {
- [P_DATA] = receive_Data,
- [P_DATA_REPLY] = receive_DataReply,
- [P_RS_DATA_REPLY] = receive_RSDataReply,
- [P_BARRIER] = receive_Barrier,
- [P_BITMAP] = receive_bitmap,
- [P_COMPRESSED_BITMAP] = receive_bitmap,
- [P_UNPLUG_REMOTE] = receive_UnplugRemote,
- [P_DATA_REQUEST] = receive_DataRequest,
- [P_RS_DATA_REQUEST] = receive_DataRequest,
- [P_SYNC_PARAM] = receive_SyncParam,
- [P_SYNC_PARAM89] = receive_SyncParam,
- [P_PROTOCOL] = receive_protocol,
- [P_UUIDS] = receive_uuids,
- [P_SIZES] = receive_sizes,
- [P_STATE] = receive_state,
- [P_STATE_CHG_REQ] = receive_req_state,
- [P_SYNC_UUID] = receive_sync_uuid,
- [P_OV_REQUEST] = receive_DataRequest,
- [P_OV_REPLY] = receive_DataRequest,
- [P_CSUM_RS_REQUEST] = receive_DataRequest,
- [P_DELAY_PROBE] = receive_skip_silent,
+typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
+
+struct data_cmd {
+ int expect_payload;
+ size_t pkt_size;
+ drbd_cmd_handler_f function;
+};
+
+static struct data_cmd drbd_cmd_handler[] = {
+ [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
+ [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
+ [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
+ [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
+ [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
+ [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
+ [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
+ [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
+ [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
+ [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
+ [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
+ [P_STATE] = { 0, sizeof(struct p_state), receive_state },
+ [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
+ [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
+ [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
/* anything missing from this table is in
* the asender_tbl, see get_asender_cmd */
- [P_MAX_CMD] = NULL,
+ [P_MAX_CMD] = { 0, 0, NULL },
};
-static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
-static drbd_cmd_handler_f *drbd_opt_cmd_handler;
+/* All handler functions that expect a sub-header get that sub-heder in
+ mdev->data.rbuf.header.head.payload.
+
+ Usually in mdev->data.rbuf.header.head the callback can find the usual
+ p_header, but they may not rely on that. Since there is also p_header95 !
+ */
static void drbdd(struct drbd_conf *mdev)
{
- drbd_cmd_handler_f handler;
- struct p_header *header = &mdev->data.rbuf.header;
+ union p_header *header = &mdev->data.rbuf.header;
+ unsigned int packet_size;
+ enum drbd_packets cmd;
+ size_t shs; /* sub header size */
+ int rv;
while (get_t_state(&mdev->receiver) == Running) {
drbd_thread_current_set_cpu(mdev);
- if (!drbd_recv_header(mdev, header)) {
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- break;
- }
+ if (!drbd_recv_header(mdev, &cmd, &packet_size))
+ goto err_out;
- if (header->command < P_MAX_CMD)
- handler = drbd_cmd_handler[header->command];
- else if (P_MAY_IGNORE < header->command
- && header->command < P_MAX_OPT_CMD)
- handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
- else if (header->command > P_MAX_OPT_CMD)
- handler = receive_skip;
- else
- handler = NULL;
+ if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
+ dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
+ goto err_out;
+ }
- if (unlikely(!handler)) {
- dev_err(DEV, "unknown packet type %d, l: %d!\n",
- header->command, header->length);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- break;
+ shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
+ if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
+ dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
+ goto err_out;
}
- if (unlikely(!handler(mdev, header))) {
- dev_err(DEV, "error receiving %s, l: %d!\n",
- cmdname(header->command), header->length);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- break;
+
+ if (shs) {
+ rv = drbd_recv(mdev, &header->h80.payload, shs);
+ if (unlikely(rv != shs)) {
+ dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
+ goto err_out;
+ }
}
- }
-}
-static void drbd_fail_pending_reads(struct drbd_conf *mdev)
-{
- struct hlist_head *slot;
- struct hlist_node *pos;
- struct hlist_node *tmp;
- struct drbd_request *req;
- int i;
+ rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
- /*
- * Application READ requests
- */
- spin_lock_irq(&mdev->req_lock);
- for (i = 0; i < APP_R_HSIZE; i++) {
- slot = mdev->app_reads_hash+i;
- hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
- /* it may (but should not any longer!)
- * be on the work queue; if that assert triggers,
- * we need to also grab the
- * spin_lock_irq(&mdev->data.work.q_lock);
- * and list_del_init here. */
- D_ASSERT(list_empty(&req->w.list));
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(req, connection_lost_while_pending);
+ if (unlikely(!rv)) {
+ dev_err(DEV, "error receiving %s, l: %d!\n",
+ cmdname(cmd), packet_size);
+ goto err_out;
}
}
- for (i = 0; i < APP_R_HSIZE; i++)
- if (!hlist_empty(mdev->app_reads_hash+i))
- dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
- "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
- memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
- spin_unlock_irq(&mdev->req_lock);
+ if (0) {
+ err_out:
+ drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ }
+ /* If we leave here, we probably want to update at least the
+ * "Connected" indicator on stable storage. Do so explicitly here. */
+ drbd_md_sync(mdev);
}
void drbd_flush_workqueue(struct drbd_conf *mdev)
@@ -3711,6 +3668,36 @@ void drbd_flush_workqueue(struct drbd_conf *mdev)
wait_for_completion(&barr.done);
}
+void drbd_free_tl_hash(struct drbd_conf *mdev)
+{
+ struct hlist_head *h;
+
+ spin_lock_irq(&mdev->req_lock);
+
+ if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
+ spin_unlock_irq(&mdev->req_lock);
+ return;
+ }
+ /* paranoia code */
+ for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
+ if (h->first)
+ dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
+ (int)(h - mdev->ee_hash), h->first);
+ kfree(mdev->ee_hash);
+ mdev->ee_hash = NULL;
+ mdev->ee_hash_s = 0;
+
+ /* paranoia code */
+ for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
+ if (h->first)
+ dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
+ (int)(h - mdev->tl_hash), h->first);
+ kfree(mdev->tl_hash);
+ mdev->tl_hash = NULL;
+ mdev->tl_hash_s = 0;
+ spin_unlock_irq(&mdev->req_lock);
+}
+
static void drbd_disconnect(struct drbd_conf *mdev)
{
enum drbd_fencing_p fp;
@@ -3728,6 +3715,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
drbd_thread_stop(&mdev->asender);
drbd_free_sock(mdev);
+ /* wait for current activity to cease. */
spin_lock_irq(&mdev->req_lock);
_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
@@ -3752,7 +3740,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
/* make sure syncer is stopped and w_resume_next_sg queued */
del_timer_sync(&mdev->resync_timer);
- set_bit(STOP_SYNC_TIMER, &mdev->flags);
resync_timer_fn((unsigned long)mdev);
/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
@@ -3767,11 +3754,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
kfree(mdev->p_uuid);
mdev->p_uuid = NULL;
- if (!mdev->state.susp)
+ if (!is_susp(mdev->state))
tl_clear(mdev);
- drbd_fail_pending_reads(mdev);
-
dev_info(DEV, "Connection closed\n");
drbd_md_sync(mdev);
@@ -3782,12 +3767,8 @@ static void drbd_disconnect(struct drbd_conf *mdev)
put_ldev(mdev);
}
- if (mdev->state.role == R_PRIMARY) {
- if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
- enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
- drbd_request_state(mdev, NS(pdsk, nps));
- }
- }
+ if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
+ drbd_try_outdate_peer_async(mdev);
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
@@ -3800,32 +3781,14 @@ static void drbd_disconnect(struct drbd_conf *mdev)
spin_unlock_irq(&mdev->req_lock);
if (os.conn == C_DISCONNECTING) {
- struct hlist_head *h;
- wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
-
- /* we must not free the tl_hash
- * while application io is still on the fly */
- wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
+ wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
- spin_lock_irq(&mdev->req_lock);
- /* paranoia code */
- for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
- (int)(h - mdev->ee_hash), h->first);
- kfree(mdev->ee_hash);
- mdev->ee_hash = NULL;
- mdev->ee_hash_s = 0;
-
- /* paranoia code */
- for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
- (int)(h - mdev->tl_hash), h->first);
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
- spin_unlock_irq(&mdev->req_lock);
+ if (!is_susp(mdev->state)) {
+ /* we must not free the tl_hash
+ * while application io is still on the fly */
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+ drbd_free_tl_hash(mdev);
+ }
crypto_free_hash(mdev->cram_hmac_tfm);
mdev->cram_hmac_tfm = NULL;
@@ -3845,6 +3808,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
i = drbd_release_ee(mdev, &mdev->net_ee);
if (i)
dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
+ i = atomic_read(&mdev->pp_in_use_by_net);
+ if (i)
+ dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
i = atomic_read(&mdev->pp_in_use);
if (i)
dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
@@ -3888,7 +3854,7 @@ static int drbd_send_handshake(struct drbd_conf *mdev)
p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
- (struct p_header *)p, sizeof(*p), 0 );
+ (struct p_header80 *)p, sizeof(*p), 0 );
mutex_unlock(&mdev->data.mutex);
return ok;
}
@@ -3904,27 +3870,28 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
{
/* ASSERT current == mdev->receiver ... */
struct p_handshake *p = &mdev->data.rbuf.handshake;
- const int expect = sizeof(struct p_handshake)
- -sizeof(struct p_header);
+ const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
+ unsigned int length;
+ enum drbd_packets cmd;
int rv;
rv = drbd_send_handshake(mdev);
if (!rv)
return 0;
- rv = drbd_recv_header(mdev, &p->head);
+ rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
return 0;
- if (p->head.command != P_HAND_SHAKE) {
+ if (cmd != P_HAND_SHAKE) {
dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
- cmdname(p->head.command), p->head.command);
+ cmdname(cmd), cmd);
return -1;
}
- if (p->head.length != expect) {
+ if (length != expect) {
dev_err(DEV, "expected HandShake length: %u, received: %u\n",
- expect, p->head.length);
+ expect, length);
return -1;
}
@@ -3982,10 +3949,11 @@ static int drbd_do_auth(struct drbd_conf *mdev)
char *response = NULL;
char *right_response = NULL;
char *peers_ch = NULL;
- struct p_header p;
unsigned int key_len = strlen(mdev->net_conf->shared_secret);
unsigned int resp_size;
struct hash_desc desc;
+ enum drbd_packets cmd;
+ unsigned int length;
int rv;
desc.tfm = mdev->cram_hmac_tfm;
@@ -4005,33 +3973,33 @@ static int drbd_do_auth(struct drbd_conf *mdev)
if (!rv)
goto fail;
- rv = drbd_recv_header(mdev, &p);
+ rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
goto fail;
- if (p.command != P_AUTH_CHALLENGE) {
+ if (cmd != P_AUTH_CHALLENGE) {
dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
- cmdname(p.command), p.command);
+ cmdname(cmd), cmd);
rv = 0;
goto fail;
}
- if (p.length > CHALLENGE_LEN*2) {
+ if (length > CHALLENGE_LEN * 2) {
dev_err(DEV, "expected AuthChallenge payload too big.\n");
rv = -1;
goto fail;
}
- peers_ch = kmalloc(p.length, GFP_NOIO);
+ peers_ch = kmalloc(length, GFP_NOIO);
if (peers_ch == NULL) {
dev_err(DEV, "kmalloc of peers_ch failed\n");
rv = -1;
goto fail;
}
- rv = drbd_recv(mdev, peers_ch, p.length);
+ rv = drbd_recv(mdev, peers_ch, length);
- if (rv != p.length) {
+ if (rv != length) {
dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
rv = 0;
goto fail;
@@ -4046,7 +4014,7 @@ static int drbd_do_auth(struct drbd_conf *mdev)
}
sg_init_table(&sg, 1);
- sg_set_buf(&sg, peers_ch, p.length);
+ sg_set_buf(&sg, peers_ch, length);
rv = crypto_hash_digest(&desc, &sg, sg.length, response);
if (rv) {
@@ -4059,18 +4027,18 @@ static int drbd_do_auth(struct drbd_conf *mdev)
if (!rv)
goto fail;
- rv = drbd_recv_header(mdev, &p);
+ rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
goto fail;
- if (p.command != P_AUTH_RESPONSE) {
+ if (cmd != P_AUTH_RESPONSE) {
dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
- cmdname(p.command), p.command);
+ cmdname(cmd), cmd);
rv = 0;
goto fail;
}
- if (p.length != resp_size) {
+ if (length != resp_size) {
dev_err(DEV, "expected AuthResponse payload of wrong size\n");
rv = 0;
goto fail;
@@ -4155,7 +4123,7 @@ int drbdd_init(struct drbd_thread *thi)
/* ********* acknowledge sender ******** */
-static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_req_state_reply *p = (struct p_req_state_reply *)h;
@@ -4173,13 +4141,13 @@ static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
+static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
{
return drbd_send_ping_ack(mdev);
}
-static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
{
/* restore idle timeout */
mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
@@ -4189,7 +4157,7 @@ static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
+static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
@@ -4199,11 +4167,15 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
- drbd_rs_complete_io(mdev, sector);
- drbd_set_in_sync(mdev, sector, blksize);
- /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
- mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ if (get_ldev(mdev)) {
+ drbd_rs_complete_io(mdev, sector);
+ drbd_set_in_sync(mdev, sector, blksize);
+ /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+ mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ put_ldev(mdev);
+ }
dec_rs_pending(mdev);
+ atomic_add(blksize >> 9, &mdev->rs_sect_in);
return TRUE;
}
@@ -4259,7 +4231,7 @@ static int validate_req_change_req_state(struct drbd_conf *mdev,
return TRUE;
}
-static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
@@ -4299,7 +4271,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
_ack_id_to_req, __func__ , what);
}
-static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
@@ -4319,7 +4291,7 @@ static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
_ack_id_to_req, __func__ , neg_acked);
}
-static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
@@ -4332,7 +4304,7 @@ static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
_ar_id_to_req, __func__ , neg_acked);
}
-static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
{
sector_t sector;
int size;
@@ -4354,7 +4326,7 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_barrier_ack *p = (struct p_barrier_ack *)h;
@@ -4363,7 +4335,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
+static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
struct drbd_work *w;
@@ -4380,6 +4352,9 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
else
ov_oos_print(mdev);
+ if (!get_ldev(mdev))
+ return TRUE;
+
drbd_rs_complete_io(mdev, sector);
dec_rs_pending(mdev);
@@ -4394,18 +4369,18 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
drbd_resync_finished(mdev);
}
}
+ put_ldev(mdev);
return TRUE;
}
-static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
+static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
{
- /* IGNORE */
return TRUE;
}
struct asender_cmd {
size_t pkt_size;
- int (*process)(struct drbd_conf *mdev, struct p_header *h);
+ int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
};
static struct asender_cmd *get_asender_cmd(int cmd)
@@ -4414,8 +4389,8 @@ static struct asender_cmd *get_asender_cmd(int cmd)
/* anything missing from this table is in
* the drbd_cmd_handler (drbd_default_handler) table,
* see the beginning of drbdd() */
- [P_PING] = { sizeof(struct p_header), got_Ping },
- [P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
+ [P_PING] = { sizeof(struct p_header80), got_Ping },
+ [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
[P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
@@ -4427,7 +4402,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
[P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
[P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
- [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_something_to_ignore_m },
+ [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
[P_MAX_CMD] = { 0, NULL },
};
if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
@@ -4438,13 +4413,13 @@ static struct asender_cmd *get_asender_cmd(int cmd)
int drbd_asender(struct drbd_thread *thi)
{
struct drbd_conf *mdev = thi->mdev;
- struct p_header *h = &mdev->meta.rbuf.header;
+ struct p_header80 *h = &mdev->meta.rbuf.header.h80;
struct asender_cmd *cmd = NULL;
int rv, len;
void *buf = h;
int received = 0;
- int expect = sizeof(struct p_header);
+ int expect = sizeof(struct p_header80);
int empty;
sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
@@ -4468,10 +4443,8 @@ int drbd_asender(struct drbd_thread *thi)
while (1) {
clear_bit(SIGNAL_ASENDER, &mdev->flags);
flush_signals(current);
- if (!drbd_process_done_ee(mdev)) {
- dev_err(DEV, "process_done_ee() = NOT_OK\n");
+ if (!drbd_process_done_ee(mdev))
goto reconnect;
- }
/* to avoid race with newly queued ACKs */
set_bit(SIGNAL_ASENDER, &mdev->flags);
spin_lock_irq(&mdev->req_lock);
@@ -4530,21 +4503,23 @@ int drbd_asender(struct drbd_thread *thi)
if (received == expect && cmd == NULL) {
if (unlikely(h->magic != BE_DRBD_MAGIC)) {
- dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
- (long)be32_to_cpu(h->magic),
- h->command, h->length);
+ dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
+ be32_to_cpu(h->magic),
+ be16_to_cpu(h->command),
+ be16_to_cpu(h->length));
goto reconnect;
}
cmd = get_asender_cmd(be16_to_cpu(h->command));
len = be16_to_cpu(h->length);
if (unlikely(cmd == NULL)) {
- dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
- (long)be32_to_cpu(h->magic),
- h->command, h->length);
+ dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
+ be32_to_cpu(h->magic),
+ be16_to_cpu(h->command),
+ be16_to_cpu(h->length));
goto disconnect;
}
expect = cmd->pkt_size;
- ERR_IF(len != expect-sizeof(struct p_header))
+ ERR_IF(len != expect-sizeof(struct p_header80))
goto reconnect;
}
if (received == expect) {
@@ -4554,7 +4529,7 @@ int drbd_asender(struct drbd_thread *thi)
buf = h;
received = 0;
- expect = sizeof(struct p_header);
+ expect = sizeof(struct p_header80);
cmd = NULL;
}
}
@@ -4562,10 +4537,12 @@ int drbd_asender(struct drbd_thread *thi)
if (0) {
reconnect:
drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+ drbd_md_sync(mdev);
}
if (0) {
disconnect:
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ drbd_md_sync(mdev);
}
clear_bit(SIGNAL_ASENDER, &mdev->flags);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index f761d98..11a75d3 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -59,17 +59,19 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
{
const unsigned long s = req->rq_state;
+
+ /* remove it from the transfer log.
+ * well, only if it had been there in the first
+ * place... if it had not (local only or conflicting
+ * and never sent), it should still be "empty" as
+ * initialized in drbd_req_new(), so we can list_del() it
+ * here unconditionally */
+ list_del(&req->tl_requests);
+
/* if it was a write, we may have to set the corresponding
* bit(s) out-of-sync first. If it had a local part, we need to
* release the reference to the activity log. */
if (rw == WRITE) {
- /* remove it from the transfer log.
- * well, only if it had been there in the first
- * place... if it had not (local only or conflicting
- * and never sent), it should still be "empty" as
- * initialized in drbd_req_new(), so we can list_del() it
- * here unconditionally */
- list_del(&req->tl_requests);
/* Set out-of-sync unless both OK flags are set
* (local only or remote failed).
* Other places where we set out-of-sync:
@@ -92,7 +94,8 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
*/
if (s & RQ_LOCAL_MASK) {
if (get_ldev_if_state(mdev, D_FAILED)) {
- drbd_al_complete_io(mdev, req->sector);
+ if (s & RQ_IN_ACT_LOG)
+ drbd_al_complete_io(mdev, req->sector);
put_ldev(mdev);
} else if (__ratelimit(&drbd_ratelimit_state)) {
dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
@@ -255,7 +258,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
if (!hlist_unhashed(&req->colision))
hlist_del(&req->colision);
else
- D_ASSERT((s & RQ_NET_MASK) == 0);
+ D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
/* for writes we need to do some extra housekeeping */
if (rw == WRITE)
@@ -280,6 +283,14 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
* protocol A or B, barrier ack still pending... */
}
+static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
+{
+ struct drbd_conf *mdev = req->mdev;
+
+ if (!is_susp(mdev->state))
+ _req_may_be_done(req, m);
+}
+
/*
* checks whether there was an overlapping request
* or ee already registered.
@@ -380,10 +391,11 @@ out_conflict:
* and it enforces that we have to think in a very structured manner
* about the "events" that may happen to a request during its life time ...
*/
-void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m)
{
struct drbd_conf *mdev = req->mdev;
+ int rv = 0;
m->bio = NULL;
switch (what) {
@@ -420,7 +432,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
req->rq_state &= ~RQ_LOCAL_PENDING;
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
put_ldev(mdev);
break;
@@ -429,7 +441,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~RQ_LOCAL_PENDING;
__drbd_chk_io_error(mdev, FALSE);
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
put_ldev(mdev);
break;
@@ -437,7 +449,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* it is legal to fail READA */
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
put_ldev(mdev);
break;
@@ -455,7 +467,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* no point in retrying if there is no good remote data,
* or we have no connection. */
if (mdev->state.pdsk != D_UP_TO_DATE) {
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
}
@@ -517,11 +529,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
req->epoch = mdev->newest_tle->br_number;
- list_add_tail(&req->tl_requests,
- &mdev->newest_tle->requests);
/* increment size of current epoch */
- mdev->newest_tle->n_req++;
+ mdev->newest_tle->n_writes++;
/* queue work item to send data */
D_ASSERT(req->rq_state & RQ_NET_PENDING);
@@ -530,7 +540,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
drbd_queue_work(&mdev->data.work, &req->w);
/* close the epoch, in case it outgrew the limit */
- if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
+ if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
queue_barrier(mdev);
break;
@@ -543,7 +553,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~RQ_NET_QUEUED;
/* if we did it right, tl_clear should be scheduled only after
* this, so this should not be necessary! */
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
case handed_over_to_network:
@@ -568,7 +578,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
* "completed_ok" events came in, once we return from
* _drbd_send_zc_bio (drbd_send_dblock), we have to check
* whether it is done already, and end it. */
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
case read_retry_remote_canceled:
@@ -584,7 +594,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* if it is still queued, we may not complete it here.
* it will be canceled soon. */
if (!(req->rq_state & RQ_NET_QUEUED))
- _req_may_be_done(req, m);
+ _req_may_be_done(req, m); /* Allowed while state.susp */
break;
case write_acked_by_peer_and_sis:
@@ -619,7 +629,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
D_ASSERT(req->rq_state & RQ_NET_PENDING);
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
case neg_acked:
@@ -629,11 +639,50 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
req->rq_state |= RQ_NET_DONE;
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
/* else: done by handed_over_to_network */
break;
+ case fail_frozen_disk_io:
+ if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+ break;
+
+ _req_may_be_done(req, m); /* Allowed while state.susp */
+ break;
+
+ case restart_frozen_disk_io:
+ if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+ break;
+
+ req->rq_state &= ~RQ_LOCAL_COMPLETED;
+
+ rv = MR_READ;
+ if (bio_data_dir(req->master_bio) == WRITE)
+ rv = MR_WRITE;
+
+ get_ldev(mdev);
+ req->w.cb = w_restart_disk_io;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ break;
+
+ case resend:
+ /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
+ before the connection loss (B&C only); only P_BARRIER_ACK was missing.
+ Trowing them out of the TL here by pretending we got a BARRIER_ACK
+ We ensure that the peer was not rebooted */
+ if (!(req->rq_state & RQ_NET_OK)) {
+ if (req->w.cb) {
+ drbd_queue_work(&mdev->data.work, &req->w);
+ rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
+ }
+ break;
+ }
+ /* else, fall through to barrier_acked */
+
case barrier_acked:
+ if (!(req->rq_state & RQ_WRITE))
+ break;
+
if (req->rq_state & RQ_NET_PENDING) {
/* barrier came in before all requests have been acked.
* this is bad, because if the connection is lost now,
@@ -643,7 +692,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
}
D_ASSERT(req->rq_state & RQ_NET_SENT);
req->rq_state |= RQ_NET_DONE;
- _req_may_be_done(req, m);
+ _req_may_be_done(req, m); /* Allowed while state.susp */
break;
case data_received:
@@ -651,9 +700,11 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
};
+
+ return rv;
}
/* we may do a local read if:
@@ -752,15 +803,18 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
* resync extent to finish, and, if necessary, pulls in the target
* extent into the activity log, which involves further disk io because
* of transactional on-disk meta data updates. */
- if (rw == WRITE && local)
+ if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+ req->rq_state |= RQ_IN_ACT_LOG;
drbd_al_begin_io(mdev, sector);
+ }
remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
(mdev->state.pdsk == D_INCONSISTENT &&
mdev->state.conn >= C_CONNECTED));
- if (!(local || remote) && !mdev->state.susp) {
- dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+ if (!(local || remote) && !is_susp(mdev->state)) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
goto fail_free_complete;
}
@@ -785,7 +839,7 @@ allocate_barrier:
/* GOOD, everything prepared, grab the spin_lock */
spin_lock_irq(&mdev->req_lock);
- if (mdev->state.susp) {
+ if (is_susp(mdev->state)) {
/* If we got suspended, use the retry mechanism of
generic_make_request() to restart processing of this
bio. In the next call to drbd_make_request_26
@@ -867,30 +921,10 @@ allocate_barrier:
/* check this request on the collision detection hash tables.
* if we have a conflict, just complete it here.
* THINK do we want to check reads, too? (I don't think so...) */
- if (rw == WRITE && _req_conflicts(req)) {
- /* this is a conflicting request.
- * even though it may have been only _partially_
- * overlapping with one of the currently pending requests,
- * without even submitting or sending it, we will
- * pretend that it was successfully served right now.
- */
- if (local) {
- bio_put(req->private_bio);
- req->private_bio = NULL;
- drbd_al_complete_io(mdev, req->sector);
- put_ldev(mdev);
- local = 0;
- }
- if (remote)
- dec_ap_pending(mdev);
- _drbd_end_io_acct(mdev, req);
- /* THINK: do we want to fail it (-EIO), or pretend success? */
- bio_endio(req->master_bio, 0);
- req->master_bio = NULL;
- dec_ap_bio(mdev);
- drbd_req_free(req);
- remote = 0;
- }
+ if (rw == WRITE && _req_conflicts(req))
+ goto fail_conflicting;
+
+ list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
/* NOTE remote first: to get the concurrent write detection right,
* we must register the request before start of local IO. */
@@ -909,12 +943,21 @@ allocate_barrier:
if (local) {
req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
- if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
- : rw == READ ? DRBD_FAULT_DT_RD
- : DRBD_FAULT_DT_RA))
+ /* State may have changed since we grabbed our reference on the
+ * mdev->ldev member. Double check, and short-circuit to endio.
+ * In case the last activity log transaction failed to get on
+ * stable storage, and this is a WRITE, we may not even submit
+ * this bio. */
+ if (get_ldev(mdev)) {
+ if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
+ : rw == READ ? DRBD_FAULT_DT_RD
+ : DRBD_FAULT_DT_RA))
+ bio_endio(req->private_bio, -EIO);
+ else
+ generic_make_request(req->private_bio);
+ put_ldev(mdev);
+ } else
bio_endio(req->private_bio, -EIO);
- else
- generic_make_request(req->private_bio);
}
/* we need to plug ALWAYS since we possibly need to kick lo_dev.
@@ -923,6 +966,21 @@ allocate_barrier:
return 0;
+fail_conflicting:
+ /* this is a conflicting request.
+ * even though it may have been only _partially_
+ * overlapping with one of the currently pending requests,
+ * without even submitting or sending it, we will
+ * pretend that it was successfully served right now.
+ */
+ _drbd_end_io_acct(mdev, req);
+ spin_unlock_irq(&mdev->req_lock);
+ if (remote)
+ dec_ap_pending(mdev);
+ /* THINK: do we want to fail it (-EIO), or pretend success?
+ * this pretends success. */
+ err = 0;
+
fail_free_complete:
if (rw == WRITE && local)
drbd_al_complete_io(mdev, sector);
@@ -961,21 +1019,6 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
return 1;
}
- /*
- * Paranoia: we might have been primary, but sync target, or
- * even diskless, then lost the connection.
- * This should have been handled (panic? suspend?) somewhere
- * else. But maybe it was not, so check again here.
- * Caution: as long as we do not have a read/write lock on mdev,
- * to serialize state changes, this is racy, since we may lose
- * the connection *after* we test for the cstate.
- */
- if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
- if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
- return 1;
- }
-
return 0;
}
@@ -989,20 +1032,6 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
return 0;
}
- /* Reject barrier requests if we know the underlying device does
- * not support them.
- * XXX: Need to get this info from peer as well some how so we
- * XXX: reject if EITHER side/data/metadata area does not support them.
- *
- * because of those XXX, this is not yet enabled,
- * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
- */
- if (unlikely(bio->bi_rw & REQ_HARDBARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags)) {
- /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
- bio_endio(bio, -EOPNOTSUPP);
- return 0;
- }
-
/*
* what we "blindly" assume:
*/
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 02d575d..ab2bd09 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -104,6 +104,9 @@ enum drbd_req_event {
read_ahead_completed_with_error,
write_completed_with_error,
completed_ok,
+ resend,
+ fail_frozen_disk_io,
+ restart_frozen_disk_io,
nothing, /* for tracing only */
};
@@ -183,6 +186,12 @@ enum drbd_req_state_bits {
/* keep this last, its for the RQ_NET_MASK */
__RQ_NET_MAX,
+
+ /* Set when this is a write, clear for a read */
+ __RQ_WRITE,
+
+ /* Should call drbd_al_complete_io() for this request... */
+ __RQ_IN_ACT_LOG,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
@@ -201,6 +210,16 @@ enum drbd_req_state_bits {
/* 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+#define RQ_WRITE (1UL << __RQ_WRITE)
+#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
+
+/* For waking up the frozen transfer log mod_req() has to return if the request
+ should be counted in the epoch object*/
+#define MR_WRITE_SHIFT 0
+#define MR_WRITE (1 << MR_WRITE_SHIFT)
+#define MR_READ_SHIFT 1
+#define MR_READ (1 << MR_READ_SHIFT)
+
/* epoch entries */
static inline
struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
@@ -244,30 +263,36 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
return NULL;
}
+static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
+{
+ struct bio *bio;
+ bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+ req->private_bio = bio;
+
+ bio->bi_private = req;
+ bio->bi_end_io = drbd_endio_pri;
+ bio->bi_next = NULL;
+}
+
static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
struct bio *bio_src)
{
- struct bio *bio;
struct drbd_request *req =
mempool_alloc(drbd_request_mempool, GFP_NOIO);
if (likely(req)) {
- bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+ drbd_req_make_private_bio(req, bio_src);
- req->rq_state = 0;
+ req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
req->mdev = mdev;
req->master_bio = bio_src;
- req->private_bio = bio;
req->epoch = 0;
- req->sector = bio->bi_sector;
- req->size = bio->bi_size;
+ req->sector = bio_src->bi_sector;
+ req->size = bio_src->bi_size;
req->start_time = jiffies;
INIT_HLIST_NODE(&req->colision);
INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list);
-
- bio->bi_private = req;
- bio->bi_end_io = drbd_endio_pri;
- bio->bi_next = NULL;
}
return req;
}
@@ -292,36 +317,44 @@ struct bio_and_error {
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
-extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m);
extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
-static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
+static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
+ int rv;
/* __req_mod possibly frees req, do not touch req after that! */
- __req_mod(req, what, &m);
+ rv = __req_mod(req, what, &m);
if (m.bio)
complete_master_bio(mdev, &m);
+
+ return rv;
}
/* completion of master bio is outside of spinlock.
- * If you need it irqsave, do it your self! */
-static inline void req_mod(struct drbd_request *req,
+ * If you need it irqsave, do it your self!
+ * Which means: don't use from bio endio callback. */
+static inline int req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
+ int rv;
+
spin_lock_irq(&mdev->req_lock);
- __req_mod(req, what, &m);
+ rv = __req_mod(req, what, &m);
spin_unlock_irq(&mdev->req_lock);
if (m.bio)
complete_master_bio(mdev, &m);
+
+ return rv;
}
#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index ca4a16c..34f224b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -26,7 +26,6 @@
#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
-#include <linux/smp_lock.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
@@ -39,8 +38,6 @@
#include "drbd_int.h"
#include "drbd_req.h"
-#define SLEEP_TIME (HZ/10)
-
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
@@ -104,12 +101,6 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
put_ldev(mdev);
}
-static int is_failed_barrier(int ee_flags)
-{
- return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
- == (EE_IS_BARRIER|EE_WAS_ERROR);
-}
-
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver, final stage. */
static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
@@ -121,21 +112,6 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
int is_syncer_req;
int do_al_complete_io;
- /* if this is a failed barrier request, disable use of barriers,
- * and schedule for resubmission */
- if (is_failed_barrier(e->flags)) {
- drbd_bump_write_ordering(mdev, WO_bdev_flush);
- spin_lock_irqsave(&mdev->req_lock, flags);
- list_del(&e->w.list);
- e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
- e->w.cb = w_e_reissue;
- /* put_ldev actually happens below, once we come here again. */
- __release(local);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
- drbd_queue_work(&mdev->data.work, &e->w);
- return;
- }
-
D_ASSERT(e->block_id != ID_VACANT);
/* after we moved e to done_ee,
@@ -246,6 +222,7 @@ void drbd_endio_pri(struct bio *bio, int error)
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
+ /* not req_mod(), we need irqsave here! */
spin_lock_irqsave(&mdev->req_lock, flags);
__req_mod(req, what, &m);
spin_unlock_irqrestore(&mdev->req_lock, flags);
@@ -376,54 +353,145 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
struct drbd_epoch_entry *e;
if (!get_ldev(mdev))
- return 0;
+ return -EIO;
+
+ if (drbd_rs_should_slow_down(mdev))
+ goto defer;
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
if (!e)
- goto fail;
+ goto defer;
+ e->w.cb = w_e_send_csum;
spin_lock_irq(&mdev->req_lock);
list_add(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
- e->w.cb = w_e_send_csum;
+ atomic_add(size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
- return 1;
+ return 0;
+
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ spin_unlock_irq(&mdev->req_lock);
drbd_free_ee(mdev, e);
-fail:
+defer:
put_ldev(mdev);
- return 2;
+ return -EAGAIN;
}
void resync_timer_fn(unsigned long data)
{
- unsigned long flags;
struct drbd_conf *mdev = (struct drbd_conf *) data;
int queue;
- spin_lock_irqsave(&mdev->req_lock, flags);
-
- if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
- queue = 1;
- if (mdev->state.conn == C_VERIFY_S)
- mdev->resync_work.cb = w_make_ov_request;
- else
- mdev->resync_work.cb = w_make_resync_request;
- } else {
+ queue = 1;
+ switch (mdev->state.conn) {
+ case C_VERIFY_S:
+ mdev->resync_work.cb = w_make_ov_request;
+ break;
+ case C_SYNC_TARGET:
+ mdev->resync_work.cb = w_make_resync_request;
+ break;
+ default:
queue = 0;
mdev->resync_work.cb = w_resync_inactive;
}
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
/* harmless race: list_empty outside data.work.q_lock */
if (list_empty(&mdev->resync_work.list) && queue)
drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+ int i;
+
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] = value;
+}
+
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+ int ov;
+
+ ov = fb->values[fb->head_index];
+ fb->values[fb->head_index++] = value;
+
+ if (fb->head_index >= fb->size)
+ fb->head_index = 0;
+
+ return ov;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+ int i;
+
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] += value;
+}
+
+int drbd_rs_controller(struct drbd_conf *mdev)
+{
+ unsigned int sect_in; /* Number of sectors that came in since the last turn */
+ unsigned int want; /* The number of sectors we want in the proxy */
+ int req_sect; /* Number of sectors to request in this turn */
+ int correction; /* Number of sectors more we need in the proxy*/
+ int cps; /* correction per invocation of drbd_rs_controller() */
+ int steps; /* Number of time steps to plan ahead */
+ int curr_corr;
+ int max_sect;
+
+ sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
+ mdev->rs_in_flight -= sect_in;
+
+ spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
+
+ steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+
+ if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
+ want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
+ } else { /* normal path */
+ want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
+ sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
+ }
+
+ correction = want - mdev->rs_in_flight - mdev->rs_planed;
+
+ /* Plan ahead */
+ cps = correction / steps;
+ fifo_add_val(&mdev->rs_plan_s, cps);
+ mdev->rs_planed += cps * steps;
+
+ /* What we do in this step */
+ curr_corr = fifo_push(&mdev->rs_plan_s, 0);
+ spin_unlock(&mdev->peer_seq_lock);
+ mdev->rs_planed -= curr_corr;
+
+ req_sect = sect_in + curr_corr;
+ if (req_sect < 0)
+ req_sect = 0;
+
+ max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
+ if (req_sect > max_sect)
+ req_sect = max_sect;
+
+ /*
+ dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+ sect_in, mdev->rs_in_flight, want, correction,
+ steps, cps, mdev->rs_planed, curr_corr, req_sect);
+ */
+
+ return req_sect;
+}
+
int w_make_resync_request(struct drbd_conf *mdev,
struct drbd_work *w, int cancel)
{
@@ -431,8 +499,9 @@ int w_make_resync_request(struct drbd_conf *mdev,
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
int max_segment_size;
- int number, i, size, pe, mx;
+ int number, rollback_i, size, pe, mx;
int align, queued, sndbuf;
+ int i = 0;
if (unlikely(cancel))
return 1;
@@ -446,6 +515,12 @@ int w_make_resync_request(struct drbd_conf *mdev,
dev_err(DEV, "%s in w_make_resync_request\n",
drbd_conn_str(mdev->state.conn));
+ if (mdev->rs_total == 0) {
+ /* empty resync? */
+ drbd_resync_finished(mdev);
+ return 1;
+ }
+
if (!get_ldev(mdev)) {
/* Since we only need to access mdev->rsync a
get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
@@ -458,11 +533,25 @@ int w_make_resync_request(struct drbd_conf *mdev,
/* starting with drbd 8.3.8, we can handle multi-bio EEs,
* if it should be necessary */
- max_segment_size = mdev->agreed_pro_version < 94 ?
- queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
+ max_segment_size =
+ mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) :
+ mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE;
- number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE / 1024) * HZ);
- pe = atomic_read(&mdev->rs_pending_cnt);
+ if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
+ number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
+ mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+ } else {
+ mdev->c_sync_rate = mdev->sync_conf.rate;
+ number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
+ }
+
+ /* Throttle resync on lower level disk activity, which may also be
+ * caused by application IO on Primary/SyncTarget.
+ * Keep this after the call to drbd_rs_controller, as that assumes
+ * to be called as precisely as possible every SLEEP_TIME,
+ * and would be confused otherwise. */
+ if (drbd_rs_should_slow_down(mdev))
+ goto requeue;
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket)
@@ -476,6 +565,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
mx = number;
/* Limit the number of pending RS requests to no more than the peer's receive buffer */
+ pe = atomic_read(&mdev->rs_pending_cnt);
if ((pe + number) > mx) {
number = mx - pe;
}
@@ -526,6 +616,7 @@ next_sector:
* be prepared for all stripe sizes of software RAIDs.
*/
align = 1;
+ rollback_i = i;
for (;;) {
if (size + BM_BLOCK_SIZE > max_segment_size)
break;
@@ -561,14 +652,19 @@ next_sector:
size = (capacity-sector)<<9;
if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
switch (read_for_csum(mdev, sector, size)) {
- case 0: /* Disk failure*/
+ case -EIO: /* Disk failure */
put_ldev(mdev);
return 0;
- case 2: /* Allocation failed */
+ case -EAGAIN: /* allocation failed, or ldev busy */
drbd_rs_complete_io(mdev, sector);
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ i = rollback_i;
goto requeue;
- /* case 1: everything ok */
+ case 0:
+ /* everything ok */
+ break;
+ default:
+ BUG();
}
} else {
inc_rs_pending(mdev);
@@ -595,6 +691,7 @@ next_sector:
}
requeue:
+ mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
put_ldev(mdev);
return 1;
@@ -670,6 +767,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
return 1;
}
+static void ping_peer(struct drbd_conf *mdev)
+{
+ clear_bit(GOT_PING_ACK, &mdev->flags);
+ request_ping(mdev);
+ wait_event(mdev->misc_wait,
+ test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
+}
+
int drbd_resync_finished(struct drbd_conf *mdev)
{
unsigned long db, dt, dbdt;
@@ -709,6 +814,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
if (!get_ldev(mdev))
goto out;
+ ping_peer(mdev);
+
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
@@ -801,8 +908,10 @@ out:
mdev->rs_paused = 0;
mdev->ov_start_sector = 0;
+ drbd_md_sync(mdev);
+
if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
- dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
+ dev_info(DEV, "Writing the whole bitmap\n");
drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
}
@@ -817,9 +926,13 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
{
if (drbd_ee_has_active_page(e)) {
/* This might happen if sendpage() has not finished */
+ int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ atomic_add(i, &mdev->pp_in_use_by_net);
+ atomic_sub(i, &mdev->pp_in_use);
spin_lock_irq(&mdev->req_lock);
list_add_tail(&e->w.list, &mdev->net_ee);
spin_unlock_irq(&mdev->req_lock);
+ wake_up(&drbd_pp_wait);
} else
drbd_free_ee(mdev, e);
}
@@ -926,9 +1039,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
return 1;
}
- drbd_rs_complete_io(mdev, e->sector);
+ if (get_ldev(mdev)) {
+ drbd_rs_complete_io(mdev, e->sector);
+ put_ldev(mdev);
+ }
- di = (struct digest_info *)(unsigned long)e->block_id;
+ di = e->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
/* quick hack to try to avoid a race against reconfiguration.
@@ -952,7 +1068,9 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
} else {
inc_rs_pending(mdev);
- e->block_id = ID_SYNCER;
+ e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+ e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
+ kfree(di);
ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
}
} else {
@@ -962,9 +1080,6 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
dec_unacked(mdev);
-
- kfree(di);
-
move_to_net_ee_or_free(mdev, e);
if (unlikely(!ok))
@@ -1034,9 +1149,12 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
* the resync lru has been cleaned up already */
- drbd_rs_complete_io(mdev, e->sector);
+ if (get_ldev(mdev)) {
+ drbd_rs_complete_io(mdev, e->sector);
+ put_ldev(mdev);
+ }
- di = (struct digest_info *)(unsigned long)e->block_id;
+ di = e->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
@@ -1055,9 +1173,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
dec_unacked(mdev);
-
- kfree(di);
-
if (!eq)
drbd_ov_oos_found(mdev, e->sector, e->size);
else
@@ -1108,7 +1223,7 @@ int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* dec_ap_pending will be done in got_BarrierAck
* or (on connection loss) in w_clear_epoch. */
ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
- (struct p_header *)p, sizeof(*p), 0);
+ (struct p_header80 *)p, sizeof(*p), 0);
drbd_put_data_sock(mdev);
return ok;
@@ -1173,6 +1288,24 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
return ok;
}
+int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+ if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
+ drbd_al_begin_io(mdev, req->sector);
+ /* Calling drbd_al_begin_io() out of the worker might deadlocks
+ theoretically. Practically it can not deadlock, since this is
+ only used when unfreezing IOs. All the extents of the requests
+ that made it into the TL are already active */
+
+ drbd_req_make_private_bio(req, req->master_bio);
+ req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+ generic_make_request(req->private_bio);
+
+ return 1;
+}
+
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
struct drbd_conf *odev = mdev;
@@ -1298,14 +1431,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
return retcode;
}
-static void ping_peer(struct drbd_conf *mdev)
-{
- clear_bit(GOT_PING_ACK, &mdev->flags);
- request_ping(mdev);
- wait_event(mdev->misc_wait,
- test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
-}
-
/**
* drbd_start_resync() - Start the resync process
* @mdev: DRBD device.
@@ -1379,13 +1504,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
r = SS_UNKNOWN_ERROR;
if (r == SS_SUCCESS) {
- mdev->rs_total =
- mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+ unsigned long tw = drbd_bm_total_weight(mdev);
+ unsigned long now = jiffies;
+ int i;
+
mdev->rs_failed = 0;
mdev->rs_paused = 0;
- mdev->rs_start =
- mdev->rs_mark_time = jiffies;
mdev->rs_same_csum = 0;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
+ mdev->rs_total = tw;
+ mdev->rs_start = now;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ mdev->rs_mark_left[i] = tw;
+ mdev->rs_mark_time[i] = now;
+ }
_drbd_pause_after(mdev);
}
write_unlock_irq(&global_state_lock);
@@ -1397,12 +1530,31 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
(unsigned long) mdev->rs_total);
- if (mdev->rs_total == 0) {
- /* Peer still reachable? Beware of failing before-resync-target handlers! */
- ping_peer(mdev);
+ if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
+ /* This still has a race (about when exactly the peers
+ * detect connection loss) that can lead to a full sync
+ * on next handshake. In 8.3.9 we fixed this with explicit
+ * resync-finished notifications, but the fix
+ * introduces a protocol change. Sleeping for some
+ * time longer than the ping interval + timeout on the
+ * SyncSource, to give the SyncTarget the chance to
+ * detect connection loss, then waiting for a ping
+ * response (implicit in drbd_resync_finished) reduces
+ * the race considerably, but does not solve it. */
+ if (side == C_SYNC_SOURCE)
+ schedule_timeout_interruptible(
+ mdev->net_conf->ping_int * HZ +
+ mdev->net_conf->ping_timeo*HZ/9);
drbd_resync_finished(mdev);
}
+ atomic_set(&mdev->rs_sect_in, 0);
+ atomic_set(&mdev->rs_sect_ev, 0);
+ mdev->rs_in_flight = 0;
+ mdev->rs_planed = 0;
+ spin_lock(&mdev->peer_seq_lock);
+ fifo_set(&mdev->rs_plan_s, 0);
+ spin_unlock(&mdev->peer_seq_lock);
/* ns.conn may already be != mdev->state.conn,
* we may have been paused in between, or become paused until
* the timer triggers.
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 709e69c..b9ba04f 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -178,7 +178,6 @@ static int print_unex = 1;
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/bio.h>
-#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/jiffies.h>
#include <linux/fcntl.h>
@@ -199,6 +198,7 @@ static int print_unex = 1;
* It's been recommended that take about 1/4 of the default speed
* in some more extreme cases.
*/
+static DEFINE_MUTEX(floppy_mutex);
static int slow_floppy;
#include <asm/dma.h>
@@ -258,8 +258,8 @@ static int irqdma_allocated;
#include <linux/completion.h>
static struct request *current_req;
-static struct request_queue *floppy_queue;
static void do_fd_request(struct request_queue *q);
+static int set_next_request(void);
#ifndef fd_get_dma_residue
#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
@@ -413,6 +413,7 @@ static struct gendisk *disks[N_DRIVE];
static struct block_device *opened_bdev[N_DRIVE];
static DEFINE_MUTEX(open_lock);
static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
+static int fdc_queue;
/*
* This struct defines the different floppy types.
@@ -895,8 +896,8 @@ static void unlock_fdc(void)
del_timer(&fd_timeout);
cont = NULL;
clear_bit(0, &fdc_busy);
- if (current_req || blk_peek_request(floppy_queue))
- do_fd_request(floppy_queue);
+ if (current_req || set_next_request())
+ do_fd_request(current_req->q);
spin_unlock_irqrestore(&floppy_lock, flags);
wake_up(&fdc_wait);
}
@@ -2248,8 +2249,8 @@ static void floppy_end_request(struct request *req, int error)
* logical buffer */
static void request_done(int uptodate)
{
- struct request_queue *q = floppy_queue;
struct request *req = current_req;
+ struct request_queue *q;
unsigned long flags;
int block;
char msg[sizeof("request done ") + sizeof(int) * 3];
@@ -2263,6 +2264,8 @@ static void request_done(int uptodate)
return;
}
+ q = req->q;
+
if (uptodate) {
/* maintain values for invalidation on geometry
* change */
@@ -2816,6 +2819,28 @@ static int make_raw_rw_request(void)
return 2;
}
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static int set_next_request(void)
+{
+ struct request_queue *q;
+ int old_pos = fdc_queue;
+
+ do {
+ q = disks[fdc_queue]->queue;
+ if (++fdc_queue == N_DRIVE)
+ fdc_queue = 0;
+ if (q) {
+ current_req = blk_fetch_request(q);
+ if (current_req)
+ break;
+ }
+ } while (fdc_queue != old_pos);
+
+ return current_req != NULL;
+}
+
static void redo_fd_request(void)
{
int drive;
@@ -2827,17 +2852,17 @@ static void redo_fd_request(void)
do_request:
if (!current_req) {
- struct request *req;
+ int pending;
+
+ spin_lock_irq(&floppy_lock);
+ pending = set_next_request();
+ spin_unlock_irq(&floppy_lock);
- spin_lock_irq(floppy_queue->queue_lock);
- req = blk_fetch_request(floppy_queue);
- spin_unlock_irq(floppy_queue->queue_lock);
- if (!req) {
+ if (!pending) {
do_floppy = NULL;
unlock_fdc();
return;
}
- current_req = req;
}
drive = (long)current_req->rq_disk->private_data;
set_fdc(drive);
@@ -3558,9 +3583,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&floppy_mutex);
ret = fd_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&floppy_mutex);
return ret;
}
@@ -3621,7 +3646,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
{
int drive = (long)disk->private_data;
- lock_kernel();
+ mutex_lock(&floppy_mutex);
mutex_lock(&open_lock);
if (UDRS->fd_ref < 0)
UDRS->fd_ref = 0;
@@ -3632,7 +3657,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
if (!UDRS->fd_ref)
opened_bdev[drive] = NULL;
mutex_unlock(&open_lock);
- unlock_kernel();
+ mutex_unlock(&floppy_mutex);
return 0;
}
@@ -3650,7 +3675,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
int res = -EBUSY;
char *tmp;
- lock_kernel();
+ mutex_lock(&floppy_mutex);
mutex_lock(&open_lock);
old_dev = UDRS->fd_device;
if (opened_bdev[drive] && opened_bdev[drive] != bdev)
@@ -3727,7 +3752,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
goto out;
}
mutex_unlock(&open_lock);
- unlock_kernel();
+ mutex_unlock(&floppy_mutex);
return 0;
out:
if (UDRS->fd_ref < 0)
@@ -3738,7 +3763,7 @@ out:
opened_bdev[drive] = NULL;
out2:
mutex_unlock(&open_lock);
- unlock_kernel();
+ mutex_unlock(&floppy_mutex);
return res;
}
@@ -4170,6 +4195,13 @@ static int __init floppy_init(void)
goto out_put_disk;
}
+ disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
+ if (!disks[dr]->queue) {
+ err = -ENOMEM;
+ goto out_put_disk;
+ }
+
+ blk_queue_max_hw_sectors(disks[dr]->queue, 64);
disks[dr]->major = FLOPPY_MAJOR;
disks[dr]->first_minor = TOMINOR(dr);
disks[dr]->fops = &floppy_fops;
@@ -4188,13 +4220,6 @@ static int __init floppy_init(void)
if (err)
goto out_unreg_blkdev;
- floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
- if (!floppy_queue) {
- err = -ENOMEM;
- goto out_unreg_driver;
- }
- blk_queue_max_hw_sectors(floppy_queue, 64);
-
blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
floppy_find, NULL, NULL);
@@ -4322,7 +4347,6 @@ static int __init floppy_init(void)
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
- disks[drive]->queue = floppy_queue;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
disks[drive]->driverfs_dev = &floppy_device[drive].dev;
add_disk(disks[drive]);
@@ -4333,19 +4357,19 @@ static int __init floppy_init(void)
out_unreg_platform_dev:
platform_device_unregister(&floppy_device[drive]);
out_flush_work:
- flush_scheduled_work();
+ flush_work_sync(&floppy_work);
if (atomic_read(&usage_count))
floppy_release_irq_and_dma();
out_unreg_region:
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
- blk_cleanup_queue(floppy_queue);
-out_unreg_driver:
platform_driver_unregister(&floppy_driver);
out_unreg_blkdev:
unregister_blkdev(FLOPPY_MAJOR, "fd");
out_put_disk:
while (dr--) {
del_timer(&motor_off_timer[dr]);
+ if (disks[dr]->queue)
+ blk_cleanup_queue(disks[dr]->queue);
put_disk(disks[dr]);
}
return err;
@@ -4403,7 +4427,7 @@ static int floppy_grab_irq_and_dma(void)
* We might have scheduled a free_irq(), wait it to
* drain first:
*/
- flush_scheduled_work();
+ flush_work_sync(&floppy_work);
if (fd_request_irq()) {
DPRINT("Unable to grab IRQ%d for the floppy driver\n",
@@ -4554,12 +4578,12 @@ static void __exit floppy_module_exit(void)
device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
+ blk_cleanup_queue(disks[drive]->queue);
put_disk(disks[drive]);
}
del_timer_sync(&fd_timeout);
del_timer_sync(&fd_timer);
- blk_cleanup_queue(floppy_queue);
if (atomic_read(&usage_count))
floppy_release_irq_and_dma();
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 91797bb..44e18c0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -67,16 +67,18 @@
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/freezer.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* for invalidate_bdev() */
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/splice.h>
+#include <linux/sysfs.h>
#include <asm/uaccess.h>
+static DEFINE_MUTEX(loop_mutex);
static LIST_HEAD(loop_devices);
static DEFINE_MUTEX(loop_devices_mutex);
@@ -99,8 +101,8 @@ static int transfer_none(struct loop_device *lo, int cmd,
else
memcpy(raw_buf, loop_buf, size);
- kunmap_atomic(raw_buf, KM_USER0);
kunmap_atomic(loop_buf, KM_USER1);
+ kunmap_atomic(raw_buf, KM_USER0);
cond_resched();
return 0;
}
@@ -128,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
for (i = 0; i < size; i++)
*out++ = *in++ ^ key[(i & 511) % keysize];
- kunmap_atomic(raw_buf, KM_USER0);
kunmap_atomic(loop_buf, KM_USER1);
+ kunmap_atomic(raw_buf, KM_USER0);
cond_resched();
return 0;
}
@@ -393,11 +395,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct loop_device *lo = p->lo;
struct page *page = buf->page;
sector_t IV;
- int size, ret;
-
- ret = buf->ops->confirm(pipe, buf);
- if (unlikely(ret))
- return ret;
+ int size;
IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
(buf->offset >> 9);
@@ -477,17 +475,11 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
if (bio_rw(bio) == WRITE) {
- bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
struct file *file = lo->lo_backing_file;
- if (barrier) {
- if (unlikely(!file->f_op->fsync)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
+ if (bio->bi_rw & REQ_FLUSH) {
ret = vfs_fsync(file, 0);
- if (unlikely(ret)) {
+ if (unlikely(ret && ret != -EINVAL)) {
ret = -EIO;
goto out;
}
@@ -495,9 +487,9 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
ret = lo_send(lo, bio, pos);
- if (barrier && !ret) {
+ if ((bio->bi_rw & REQ_FUA) && !ret) {
ret = vfs_fsync(file, 0);
- if (unlikely(ret))
+ if (unlikely(ret && ret != -EINVAL))
ret = -EIO;
}
} else
@@ -737,6 +729,103 @@ static inline int is_loop_device(struct file *file)
return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
}
+/* loop sysfs attributes */
+
+static ssize_t loop_attr_show(struct device *dev, char *page,
+ ssize_t (*callback)(struct loop_device *, char *))
+{
+ struct loop_device *l, *lo = NULL;
+
+ mutex_lock(&loop_devices_mutex);
+ list_for_each_entry(l, &loop_devices, lo_list)
+ if (disk_to_dev(l->lo_disk) == dev) {
+ lo = l;
+ break;
+ }
+ mutex_unlock(&loop_devices_mutex);
+
+ return lo ? callback(lo, page) : -EIO;
+}
+
+#define LOOP_ATTR_RO(_name) \
+static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \
+static ssize_t loop_attr_do_show_##_name(struct device *d, \
+ struct device_attribute *attr, char *b) \
+{ \
+ return loop_attr_show(d, b, loop_attr_##_name##_show); \
+} \
+static struct device_attribute loop_attr_##_name = \
+ __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
+
+static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
+{
+ ssize_t ret;
+ char *p = NULL;
+
+ mutex_lock(&lo->lo_ctl_mutex);
+ if (lo->lo_backing_file)
+ p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+ mutex_unlock(&lo->lo_ctl_mutex);
+
+ if (IS_ERR_OR_NULL(p))
+ ret = PTR_ERR(p);
+ else {
+ ret = strlen(p);
+ memmove(buf, p, ret);
+ buf[ret++] = '\n';
+ buf[ret] = 0;
+ }
+
+ return ret;
+}
+
+static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
+}
+
+static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
+}
+
+static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
+{
+ int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
+
+ return sprintf(buf, "%s\n", autoclear ? "1" : "0");
+}
+
+LOOP_ATTR_RO(backing_file);
+LOOP_ATTR_RO(offset);
+LOOP_ATTR_RO(sizelimit);
+LOOP_ATTR_RO(autoclear);
+
+static struct attribute *loop_attrs[] = {
+ &loop_attr_backing_file.attr,
+ &loop_attr_offset.attr,
+ &loop_attr_sizelimit.attr,
+ &loop_attr_autoclear.attr,
+ NULL,
+};
+
+static struct attribute_group loop_attribute_group = {
+ .name = "loop",
+ .attrs= loop_attrs,
+};
+
+static int loop_sysfs_init(struct loop_device *lo)
+{
+ return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
+}
+
+static void loop_sysfs_exit(struct loop_device *lo)
+{
+ sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
+}
+
static int loop_set_fd(struct loop_device *lo, fmode_t mode,
struct block_device *bdev, unsigned int arg)
{
@@ -832,10 +921,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
lo->lo_queue->unplug_fn = loop_unplug;
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
- blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN);
+ blk_queue_flush(lo->lo_queue, REQ_FLUSH);
set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
+ loop_sysfs_init(lo);
/* let user-space know about the new size */
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
@@ -854,6 +944,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
return 0;
out_clr:
+ loop_sysfs_exit(lo);
lo->lo_thread = NULL;
lo->lo_device = NULL;
lo->lo_backing_file = NULL;
@@ -948,6 +1039,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
if (bdev)
invalidate_bdev(bdev);
set_capacity(lo->lo_disk, 0);
+ loop_sysfs_exit(lo);
if (bdev) {
bd_set_size(bdev, 0);
/* let user-space know about this change */
@@ -1409,11 +1501,11 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
{
struct loop_device *lo = bdev->bd_disk->private_data;
- lock_kernel();
+ mutex_lock(&loop_mutex);
mutex_lock(&lo->lo_ctl_mutex);
lo->lo_refcnt++;
mutex_unlock(&lo->lo_ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&loop_mutex);
return 0;
}
@@ -1423,7 +1515,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
struct loop_device *lo = disk->private_data;
int err;
- lock_kernel();
+ mutex_lock(&loop_mutex);
mutex_lock(&lo->lo_ctl_mutex);
if (--lo->lo_refcnt)
@@ -1448,7 +1540,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
out:
mutex_unlock(&lo->lo_ctl_mutex);
out_unlocked:
- lock_kernel();
+ mutex_unlock(&loop_mutex);
return 0;
}
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 0daa422..a32fb41 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -24,7 +24,7 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
@@ -53,6 +53,7 @@
#define DBG_BLKDEV 0x0100
#define DBG_RX 0x0200
#define DBG_TX 0x0400
+static DEFINE_MUTEX(nbd_mutex);
static unsigned int debugflags;
#endif /* NDEBUG */
@@ -717,11 +718,11 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
- lock_kernel();
+ mutex_lock(&nbd_mutex);
mutex_lock(&lo->tx_lock);
error = __nbd_ioctl(bdev, lo, cmd, arg);
mutex_unlock(&lo->tx_lock);
- unlock_kernel();
+ mutex_unlock(&nbd_mutex);
return error;
}
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 2284b4f..87311eb 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -310,8 +310,7 @@ static void osdblk_rq_fn(struct request_queue *q)
break;
/* filter out block requests we don't understand */
- if (rq->cmd_type != REQ_TYPE_FS &&
- !(rq->cmd_flags & REQ_HARDBARRIER)) {
+ if (rq->cmd_type != REQ_TYPE_FS) {
blk_end_request_all(rq, 0);
continue;
}
@@ -439,7 +438,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
blk_queue_prep_rq(q, blk_queue_start_tag);
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
+ blk_queue_flush(q, REQ_FLUSH);
disk->queue = q;
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 76f8565..62cec6a 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -138,9 +138,10 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
#include <linux/cdrom.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
+static DEFINE_MUTEX(pcd_mutex);
static DEFINE_SPINLOCK(pcd_lock);
module_param(verbose, bool, 0644);
@@ -227,9 +228,9 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
struct pcd_unit *cd = bdev->bd_disk->private_data;
int ret;
- lock_kernel();
+ mutex_lock(&pcd_mutex);
ret = cdrom_open(&cd->info, bdev, mode);
- unlock_kernel();
+ mutex_unlock(&pcd_mutex);
return ret;
}
@@ -237,9 +238,9 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
static int pcd_block_release(struct gendisk *disk, fmode_t mode)
{
struct pcd_unit *cd = disk->private_data;
- lock_kernel();
+ mutex_lock(&pcd_mutex);
cdrom_release(&cd->info, mode);
- unlock_kernel();
+ mutex_unlock(&pcd_mutex);
return 0;
}
@@ -249,9 +250,9 @@ static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
struct pcd_unit *cd = bdev->bd_disk->private_data;
int ret;
- lock_kernel();
+ mutex_lock(&pcd_mutex);
ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&pcd_mutex);
return ret;
}
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 985f0d4..c0ee155 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -153,10 +153,11 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/kernel.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
#include <linux/workqueue.h>
+static DEFINE_MUTEX(pd_mutex);
static DEFINE_SPINLOCK(pd_lock);
module_param(verbose, bool, 0);
@@ -736,14 +737,14 @@ static int pd_open(struct block_device *bdev, fmode_t mode)
{
struct pd_unit *disk = bdev->bd_disk->private_data;
- lock_kernel();
+ mutex_lock(&pd_mutex);
disk->access++;
if (disk->removable) {
pd_special_command(disk, pd_media_check);
pd_special_command(disk, pd_door_lock);
}
- unlock_kernel();
+ mutex_unlock(&pd_mutex);
return 0;
}
@@ -771,10 +772,10 @@ static int pd_ioctl(struct block_device *bdev, fmode_t mode,
switch (cmd) {
case CDROMEJECT:
- lock_kernel();
+ mutex_lock(&pd_mutex);
if (disk->access == 1)
pd_special_command(disk, pd_eject);
- unlock_kernel();
+ mutex_unlock(&pd_mutex);
return 0;
default:
return -EINVAL;
@@ -785,10 +786,10 @@ static int pd_release(struct gendisk *p, fmode_t mode)
{
struct pd_unit *disk = p->private_data;
- lock_kernel();
+ mutex_lock(&pd_mutex);
if (!--disk->access && disk->removable)
pd_special_command(disk, pd_door_unlock);
- unlock_kernel();
+ mutex_unlock(&pd_mutex);
return 0;
}
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 4457b49..635f25d 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -152,9 +152,10 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY};
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
+static DEFINE_MUTEX(pf_mutex);
static DEFINE_SPINLOCK(pf_spin_lock);
module_param(verbose, bool, 0644);
@@ -302,7 +303,7 @@ static int pf_open(struct block_device *bdev, fmode_t mode)
struct pf_unit *pf = bdev->bd_disk->private_data;
int ret;
- lock_kernel();
+ mutex_lock(&pf_mutex);
pf_identify(pf);
ret = -ENODEV;
@@ -318,7 +319,7 @@ static int pf_open(struct block_device *bdev, fmode_t mode)
if (pf->removable)
pf_lock(pf, 1);
out:
- unlock_kernel();
+ mutex_unlock(&pf_mutex);
return ret;
}
@@ -349,9 +350,9 @@ static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
if (pf->access != 1)
return -EBUSY;
- lock_kernel();
+ mutex_lock(&pf_mutex);
pf_eject(pf);
- unlock_kernel();
+ mutex_unlock(&pf_mutex);
return 0;
}
@@ -360,9 +361,9 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
{
struct pf_unit *pf = disk->private_data;
- lock_kernel();
+ mutex_lock(&pf_mutex);
if (pf->access <= 0) {
- unlock_kernel();
+ mutex_unlock(&pf_mutex);
return -EINVAL;
}
@@ -371,7 +372,7 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
if (!pf->access && pf->removable)
pf_lock(pf, 0);
- unlock_kernel();
+ mutex_unlock(&pf_mutex);
return 0;
}
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index c397b3d..6b9a200 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -162,7 +162,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
#include <linux/pg.h>
#include <linux/device.h>
#include <linux/sched.h> /* current, TASK_* */
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/jiffies.h>
#include <asm/uaccess.h>
@@ -193,6 +193,7 @@ module_param_array(drive3, int, NULL, 0);
#define ATAPI_IDENTIFY 0x12
+static DEFINE_MUTEX(pg_mutex);
static int pg_open(struct inode *inode, struct file *file);
static int pg_release(struct inode *inode, struct file *file);
static ssize_t pg_read(struct file *filp, char __user *buf,
@@ -234,6 +235,7 @@ static const struct file_operations pg_fops = {
.write = pg_write,
.open = pg_open,
.release = pg_release,
+ .llseek = noop_llseek,
};
static void pg_init_units(void)
@@ -518,7 +520,7 @@ static int pg_open(struct inode *inode, struct file *file)
struct pg *dev = &devices[unit];
int ret = 0;
- lock_kernel();
+ mutex_lock(&pg_mutex);
if ((unit >= PG_UNITS) || (!dev->present)) {
ret = -ENODEV;
goto out;
@@ -547,7 +549,7 @@ static int pg_open(struct inode *inode, struct file *file)
file->private_data = dev;
out:
- unlock_kernel();
+ mutex_unlock(&pg_mutex);
return ret;
}
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index bc5825f..7179f79 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -146,7 +146,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
#include <linux/mtio.h>
#include <linux/device.h>
#include <linux/sched.h> /* current, TASK_*, schedule_timeout() */
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
@@ -189,6 +189,7 @@ module_param_array(drive3, int, NULL, 0);
#define ATAPI_MODE_SENSE 0x1a
#define ATAPI_LOG_SENSE 0x4d
+static DEFINE_MUTEX(pt_mutex);
static int pt_open(struct inode *inode, struct file *file);
static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
static int pt_release(struct inode *inode, struct file *file);
@@ -239,6 +240,7 @@ static const struct file_operations pt_fops = {
.unlocked_ioctl = pt_ioctl,
.open = pt_open,
.release = pt_release,
+ .llseek = noop_llseek,
};
/* sysfs class support */
@@ -650,9 +652,9 @@ static int pt_open(struct inode *inode, struct file *file)
struct pt_unit *tape = pt + unit;
int err;
- lock_kernel();
+ mutex_lock(&pt_mutex);
if (unit >= PT_UNITS || (!tape->present)) {
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return -ENODEV;
}
@@ -681,12 +683,12 @@ static int pt_open(struct inode *inode, struct file *file)
}
file->private_data = tape;
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return 0;
out:
atomic_inc(&tape->available);
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return err;
}
@@ -704,15 +706,15 @@ static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
switch (mtop.mt_op) {
case MTREW:
- lock_kernel();
+ mutex_lock(&pt_mutex);
pt_rewind(tape);
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return 0;
case MTWEOF:
- lock_kernel();
+ mutex_lock(&pt_mutex);
pt_write_fm(tape);
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return 0;
default:
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 37a2bb5..77d70ee 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -57,7 +57,6 @@
#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/freezer.h>
-#include <linux/smp_lock.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <scsi/scsi_cmnd.h>
@@ -86,6 +85,7 @@
#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1))
+static DEFINE_MUTEX(pktcdvd_mutex);
static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
static struct proc_dir_entry *pkt_proc;
static int pktdev_major;
@@ -753,7 +753,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
rq->timeout = 60*HZ;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
- rq->cmd_flags |= REQ_HARDBARRIER;
if (cgc->quiet)
rq->cmd_flags |= REQ_QUIET;
@@ -2297,15 +2296,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
* so bdget() can't fail.
*/
bdget(pd->bdev->bd_dev);
- if ((ret = blkdev_get(pd->bdev, FMODE_READ)))
+ if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
goto out;
- if ((ret = bd_claim(pd->bdev, pd)))
- goto out_putdev;
-
if ((ret = pkt_get_last_written(pd, &lba))) {
printk(DRIVER_NAME": pkt_get_last_written failed\n");
- goto out_unclaim;
+ goto out_putdev;
}
set_capacity(pd->disk, lba << 2);
@@ -2315,7 +2311,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
q = bdev_get_queue(pd->bdev);
if (write) {
if ((ret = pkt_open_write(pd)))
- goto out_unclaim;
+ goto out_putdev;
/*
* Some CDRW drives can not handle writes larger than one packet,
* even if the size is a multiple of the packet size.
@@ -2330,23 +2326,21 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
}
if ((ret = pkt_set_segment_merging(pd, q)))
- goto out_unclaim;
+ goto out_putdev;
if (write) {
if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
printk(DRIVER_NAME": not enough memory for buffers\n");
ret = -ENOMEM;
- goto out_unclaim;
+ goto out_putdev;
}
printk(DRIVER_NAME": %lukB available on disc\n", lba << 1);
}
return 0;
-out_unclaim:
- bd_release(pd->bdev);
out_putdev:
- blkdev_put(pd->bdev, FMODE_READ);
+ blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
out:
return ret;
}
@@ -2363,8 +2357,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
pkt_lock_door(pd, 0);
pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
- bd_release(pd->bdev);
- blkdev_put(pd->bdev, FMODE_READ);
+ blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
pkt_shrink_pktlist(pd);
}
@@ -2383,7 +2376,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
VPRINTK(DRIVER_NAME": entering open\n");
- lock_kernel();
+ mutex_lock(&pktcdvd_mutex);
mutex_lock(&ctl_mutex);
pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
if (!pd) {
@@ -2411,7 +2404,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
}
mutex_unlock(&ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return 0;
out_dec:
@@ -2419,7 +2412,7 @@ out_dec:
out:
VPRINTK(DRIVER_NAME": failed open (%d)\n", ret);
mutex_unlock(&ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return ret;
}
@@ -2428,7 +2421,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
struct pktcdvd_device *pd = disk->private_data;
int ret = 0;
- lock_kernel();
+ mutex_lock(&pktcdvd_mutex);
mutex_lock(&ctl_mutex);
pd->refcnt--;
BUG_ON(pd->refcnt < 0);
@@ -2437,7 +2430,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
pkt_release_dev(pd, flush);
}
mutex_unlock(&ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return ret;
}
@@ -2734,7 +2727,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
bdev = bdget(dev);
if (!bdev)
return -ENOMEM;
- ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY);
+ ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
if (ret)
return ret;
@@ -2773,7 +2766,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd,
MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
- lock_kernel();
+ mutex_lock(&pktcdvd_mutex);
switch (cmd) {
case CDROMEJECT:
/*
@@ -2798,7 +2791,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
ret = -ENOTTY;
}
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return ret;
}
@@ -3046,6 +3039,7 @@ static const struct file_operations pkt_ctl_fops = {
.compat_ioctl = pkt_ctl_compat_ioctl,
#endif
.owner = THIS_MODULE,
+ .llseek = no_llseek,
};
static struct miscdevice pkt_misc = {
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 03688c2..8e1ce2e 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
blk_queue_dma_alignment(queue, dev->blk_size-1);
blk_queue_logical_block_size(queue, dev->blk_size);
- blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
+ blk_queue_flush(queue, REQ_FLUSH);
blk_queue_max_segments(queue, -1);
blk_queue_max_segment_size(queue, dev->bounce_size);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 0000000..e1e38b1
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,2046 @@
+/*
+ rbd.c -- Export ceph rados objects as a Linux block device
+
+
+ based on drivers/block/osdblk.c:
+
+ Copyright 2009 Red Hat, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+ For usage instructions, please refer to:
+
+ Documentation/ABI/testing/sysfs-bus-rbd
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN 64
+#define RBD_MAX_SNAP_NAME_LEN 32
+#define RBD_MAX_OPT_LEN 1024
+
+#define RBD_SNAP_HEAD_NAME "-"
+
+#define DEV_NAME_LEN 32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+ u64 image_size;
+ char block_name[32];
+ __u8 obj_order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ struct rw_semaphore snap_rwsem;
+ struct ceph_snap_context *snapc;
+ size_t snap_names_len;
+ u64 snap_seq;
+ u32 total_snaps;
+
+ char *snap_names;
+ u64 *snap_sizes;
+};
+
+/*
+ * an instance of the client. multiple devices may share a client.
+ */
+struct rbd_client {
+ struct ceph_client *client;
+ struct kref kref;
+ struct list_head node;
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+ struct request *rq; /* blk layer request */
+ struct bio *bio; /* cloned bio */
+ struct page **pages; /* list of used pages */
+ u64 len;
+};
+
+struct rbd_snap {
+ struct device dev;
+ const char *name;
+ size_t size;
+ struct list_head node;
+ u64 id;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+ int id; /* blkdev unique id */
+
+ int major; /* blkdev assigned major */
+ struct gendisk *disk; /* blkdev's gendisk and rq */
+ struct request_queue *q;
+
+ struct ceph_client *client;
+ struct rbd_client *rbd_client;
+
+ char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+ spinlock_t lock; /* queue lock */
+
+ struct rbd_image_header header;
+ char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
+ int obj_len;
+ char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+ char pool_name[RBD_MAX_POOL_NAME_LEN];
+ int poolid;
+
+ char snap_name[RBD_MAX_SNAP_NAME_LEN];
+ u32 cur_snap; /* index+1 of current snapshot within snap context
+ 0 - for the head */
+ int read_only;
+
+ struct list_head node;
+
+ /* list of snapshots */
+ struct list_head snaps;
+
+ /* sysfs related */
+ struct device dev;
+};
+
+static struct bus_type rbd_bus_type = {
+ .name = "rbd",
+};
+
+static spinlock_t node_lock; /* protects client get/put */
+
+static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list); /* devices */
+static LIST_HEAD(rbd_client_list); /* clients */
+
+static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
+static void rbd_dev_release(struct device *dev);
+static ssize_t rbd_snap_rollback(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t size);
+static ssize_t rbd_snap_add(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count);
+static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
+ struct rbd_snap *snap);;
+
+
+static struct rbd_device *dev_to_rbd(struct device *dev)
+{
+ return container_of(dev, struct rbd_device, dev);
+}
+
+static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
+{
+ return get_device(&rbd_dev->dev);
+}
+
+static void rbd_put_dev(struct rbd_device *rbd_dev)
+{
+ put_device(&rbd_dev->dev);
+}
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct rbd_device *rbd_dev = disk->private_data;
+
+ rbd_get_dev(rbd_dev);
+
+ set_device_ro(bdev, rbd_dev->read_only);
+
+ if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+ return -EROFS;
+
+ return 0;
+}
+
+static int rbd_release(struct gendisk *disk, fmode_t mode)
+{
+ struct rbd_device *rbd_dev = disk->private_data;
+
+ rbd_put_dev(rbd_dev);
+
+ return 0;
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+ .owner = THIS_MODULE,
+ .open = rbd_open,
+ .release = rbd_release,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+ struct rbd_client *rbdc;
+ int ret = -ENOMEM;
+
+ dout("rbd_client_create\n");
+ rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+ if (!rbdc)
+ goto out_opt;
+
+ kref_init(&rbdc->kref);
+ INIT_LIST_HEAD(&rbdc->node);
+
+ rbdc->client = ceph_create_client(opt, rbdc);
+ if (IS_ERR(rbdc->client))
+ goto out_rbdc;
+ opt = NULL; /* Now rbdc->client is responsible for opt */
+
+ ret = ceph_open_session(rbdc->client);
+ if (ret < 0)
+ goto out_err;
+
+ spin_lock(&node_lock);
+ list_add_tail(&rbdc->node, &rbd_client_list);
+ spin_unlock(&node_lock);
+
+ dout("rbd_client_create created %p\n", rbdc);
+ return rbdc;
+
+out_err:
+ ceph_destroy_client(rbdc->client);
+out_rbdc:
+ kfree(rbdc);
+out_opt:
+ if (opt)
+ ceph_destroy_options(opt);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+ struct rbd_client *client_node;
+
+ if (opt->flags & CEPH_OPT_NOSHARE)
+ return NULL;
+
+ list_for_each_entry(client_node, &rbd_client_list, node)
+ if (ceph_compare_options(opt, client_node->client) == 0)
+ return client_node;
+ return NULL;
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+ char *options)
+{
+ struct rbd_client *rbdc;
+ struct ceph_options *opt;
+ int ret;
+
+ ret = ceph_parse_options(&opt, options, mon_addr,
+ mon_addr + strlen(mon_addr), NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ spin_lock(&node_lock);
+ rbdc = __rbd_client_find(opt);
+ if (rbdc) {
+ ceph_destroy_options(opt);
+
+ /* using an existing client */
+ kref_get(&rbdc->kref);
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ spin_unlock(&node_lock);
+ return 0;
+ }
+ spin_unlock(&node_lock);
+
+ rbdc = rbd_client_create(opt);
+ if (IS_ERR(rbdc))
+ return PTR_ERR(rbdc);
+
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ return 0;
+}
+
+/*
+ * Destroy ceph client
+ */
+static void rbd_client_release(struct kref *kref)
+{
+ struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+ dout("rbd_release_client %p\n", rbdc);
+ spin_lock(&node_lock);
+ list_del(&rbdc->node);
+ spin_unlock(&node_lock);
+
+ ceph_destroy_client(rbdc->client);
+ kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+ kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+ rbd_dev->rbd_client = NULL;
+ rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+ struct rbd_image_header_ondisk *ondisk,
+ int allocated_snaps,
+ gfp_t gfp_flags)
+{
+ int i;
+ u32 snap_count = le32_to_cpu(ondisk->snap_count);
+ int ret = -ENOMEM;
+
+ init_rwsem(&header->snap_rwsem);
+ header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+ header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+ snap_count *
+ sizeof(struct rbd_image_snap_ondisk),
+ gfp_flags);
+ if (!header->snapc)
+ return -ENOMEM;
+ if (snap_count) {
+ header->snap_names = kmalloc(header->snap_names_len,
+ GFP_KERNEL);
+ if (!header->snap_names)
+ goto err_snapc;
+ header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+ GFP_KERNEL);
+ if (!header->snap_sizes)
+ goto err_names;
+ } else {
+ header->snap_names = NULL;
+ header->snap_sizes = NULL;
+ }
+ memcpy(header->block_name, ondisk->block_name,
+ sizeof(ondisk->block_name));
+
+ header->image_size = le64_to_cpu(ondisk->image_size);
+ header->obj_order = ondisk->options.order;
+ header->crypt_type = ondisk->options.crypt_type;
+ header->comp_type = ondisk->options.comp_type;
+
+ atomic_set(&header->snapc->nref, 1);
+ header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+ header->snapc->num_snaps = snap_count;
+ header->total_snaps = snap_count;
+
+ if (snap_count &&
+ allocated_snaps == snap_count) {
+ for (i = 0; i < snap_count; i++) {
+ header->snapc->snaps[i] =
+ le64_to_cpu(ondisk->snaps[i].id);
+ header->snap_sizes[i] =
+ le64_to_cpu(ondisk->snaps[i].image_size);
+ }
+
+ /* copy snapshot names */
+ memcpy(header->snap_names, &ondisk->snaps[i],
+ header->snap_names_len);
+ }
+
+ return 0;
+
+err_names:
+ kfree(header->snap_names);
+err_snapc:
+ kfree(header->snapc);
+ return ret;
+}
+
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+ return header->total_snaps - snap_num;
+}
+
+static u64 cur_snap_id(struct rbd_device *rbd_dev)
+{
+ struct rbd_image_header *header = &rbd_dev->header;
+
+ if (!rbd_dev->cur_snap)
+ return 0;
+
+ return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+}
+
+static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
+ u64 *seq, u64 *size)
+{
+ int i;
+ char *p = header->snap_names;
+
+ for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+ if (strcmp(snap_name, p) == 0)
+ break;
+ }
+ if (i == header->total_snaps)
+ return -ENOENT;
+ if (seq)
+ *seq = header->snapc->snaps[i];
+
+ if (size)
+ *size = header->snap_sizes[i];
+
+ return i;
+}
+
+static int rbd_header_set_snap(struct rbd_device *dev,
+ const char *snap_name,
+ u64 *size)
+{
+ struct rbd_image_header *header = &dev->header;
+ struct ceph_snap_context *snapc = header->snapc;
+ int ret = -ENOENT;
+
+ down_write(&header->snap_rwsem);
+
+ if (!snap_name ||
+ !*snap_name ||
+ strcmp(snap_name, "-") == 0 ||
+ strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+ if (header->total_snaps)
+ snapc->seq = header->snap_seq;
+ else
+ snapc->seq = 0;
+ dev->cur_snap = 0;
+ dev->read_only = 0;
+ if (size)
+ *size = header->image_size;
+ } else {
+ ret = snap_by_name(header, snap_name, &snapc->seq, size);
+ if (ret < 0)
+ goto done;
+
+ dev->cur_snap = header->total_snaps - ret;
+ dev->read_only = 1;
+ }
+
+ ret = 0;
+done:
+ up_write(&header->snap_rwsem);
+ return ret;
+}
+
+static void rbd_header_free(struct rbd_image_header *header)
+{
+ kfree(header->snapc);
+ kfree(header->snap_names);
+ kfree(header->snap_sizes);
+}
+
+/*
+ * get the actual striped segment name, offset and length
+ */
+static u64 rbd_get_segment(struct rbd_image_header *header,
+ const char *block_name,
+ u64 ofs, u64 len,
+ char *seg_name, u64 *segofs)
+{
+ u64 seg = ofs >> header->obj_order;
+
+ if (seg_name)
+ snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
+ "%s.%012llx", block_name, seg);
+
+ ofs = ofs & ((1 << header->obj_order) - 1);
+ len = min_t(u64, len, (1 << header->obj_order) - ofs);
+
+ if (segofs)
+ *segofs = ofs;
+
+ return len;
+}
+
+/*
+ * bio helpers
+ */
+
+static void bio_chain_put(struct bio *chain)
+{
+ struct bio *tmp;
+
+ while (chain) {
+ tmp = chain;
+ chain = chain->bi_next;
+ bio_put(tmp);
+ }
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+ struct bio_vec *bv;
+ unsigned long flags;
+ void *buf;
+ int i;
+ int pos = 0;
+
+ while (chain) {
+ bio_for_each_segment(bv, chain, i) {
+ if (pos + bv->bv_len > start_ofs) {
+ int remainder = max(start_ofs - pos, 0);
+ buf = bvec_kmap_irq(bv, &flags);
+ memset(buf + remainder, 0,
+ bv->bv_len - remainder);
+ bvec_kunmap_irq(buf, &flags);
+ }
+ pos += bv->bv_len;
+ }
+
+ chain = chain->bi_next;
+ }
+}
+
+/*
+ * bio_chain_clone - clone a chain of bios up to a certain length.
+ * might return a bio_pair that will need to be released.
+ */
+static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
+ struct bio_pair **bp,
+ int len, gfp_t gfpmask)
+{
+ struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
+ int total = 0;
+
+ if (*bp) {
+ bio_pair_release(*bp);
+ *bp = NULL;
+ }
+
+ while (old_chain && (total < len)) {
+ tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+ if (!tmp)
+ goto err_out;
+
+ if (total + old_chain->bi_size > len) {
+ struct bio_pair *bp;
+
+ /*
+ * this split can only happen with a single paged bio,
+ * split_bio will BUG_ON if this is not the case
+ */
+ dout("bio_chain_clone split! total=%d remaining=%d"
+ "bi_size=%d\n",
+ (int)total, (int)len-total,
+ (int)old_chain->bi_size);
+
+ /* split the bio. We'll release it either in the next
+ call, or it will have to be released outside */
+ bp = bio_split(old_chain, (len - total) / 512ULL);
+ if (!bp)
+ goto err_out;
+
+ __bio_clone(tmp, &bp->bio1);
+
+ *next = &bp->bio2;
+ } else {
+ __bio_clone(tmp, old_chain);
+ *next = old_chain->bi_next;
+ }
+
+ tmp->bi_bdev = NULL;
+ gfpmask &= ~__GFP_WAIT;
+ tmp->bi_next = NULL;
+
+ if (!new_chain) {
+ new_chain = tail = tmp;
+ } else {
+ tail->bi_next = tmp;
+ tail = tmp;
+ }
+ old_chain = old_chain->bi_next;
+
+ total += tmp->bi_size;
+ }
+
+ BUG_ON(total < len);
+
+ if (tail)
+ tail->bi_next = NULL;
+
+ *old = old_chain;
+
+ return new_chain;
+
+err_out:
+ dout("bio_chain_clone with err\n");
+ bio_chain_put(new_chain);
+ return NULL;
+}
+
+/*
+ * helpers for osd request op vectors.
+ */
+static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
+ int num_ops,
+ int opcode,
+ u32 payload_len)
+{
+ *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
+ GFP_NOIO);
+ if (!*ops)
+ return -ENOMEM;
+ (*ops)[0].op = opcode;
+ /*
+ * op extent offset and length will be set later on
+ * in calc_raw_layout()
+ */
+ (*ops)[0].payload_len = payload_len;
+ return 0;
+}
+
+static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+{
+ kfree(ops);
+}
+
+/*
+ * Send ceph osd request
+ */
+static int rbd_do_request(struct request *rq,
+ struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ const char *obj, u64 ofs, u64 len,
+ struct bio *bio,
+ struct page **pages,
+ int num_pages,
+ int flags,
+ struct ceph_osd_req_op *ops,
+ int num_reply,
+ void (*rbd_cb)(struct ceph_osd_request *req,
+ struct ceph_msg *msg))
+{
+ struct ceph_osd_request *req;
+ struct ceph_file_layout *layout;
+ int ret;
+ u64 bno;
+ struct timespec mtime = CURRENT_TIME;
+ struct rbd_request *req_data;
+ struct ceph_osd_request_head *reqhead;
+ struct rbd_image_header *header = &dev->header;
+
+ ret = -ENOMEM;
+ req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
+ if (!req_data)
+ goto done;
+
+ dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
+
+ down_read(&header->snap_rwsem);
+
+ req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
+ snapc,
+ ops,
+ false,
+ GFP_NOIO, pages, bio);
+ if (IS_ERR(req)) {
+ up_read(&header->snap_rwsem);
+ ret = PTR_ERR(req);
+ goto done_pages;
+ }
+
+ req->r_callback = rbd_cb;
+
+ req_data->rq = rq;
+ req_data->bio = bio;
+ req_data->pages = pages;
+ req_data->len = len;
+
+ req->r_priv = req_data;
+
+ reqhead = req->r_request->front.iov_base;
+ reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+
+ strncpy(req->r_oid, obj, sizeof(req->r_oid));
+ req->r_oid_len = strlen(req->r_oid);
+
+ layout = &req->r_file_layout;
+ memset(layout, 0, sizeof(*layout));
+ layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ layout->fl_stripe_count = cpu_to_le32(1);
+ layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ layout->fl_pg_preferred = cpu_to_le32(-1);
+ layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+ ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
+ ofs, &len, &bno, req, ops);
+
+ ceph_osdc_build_request(req, ofs, &len,
+ ops,
+ snapc,
+ &mtime,
+ req->r_oid, req->r_oid_len);
+ up_read(&header->snap_rwsem);
+
+ ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+ if (ret < 0)
+ goto done_err;
+
+ if (!rbd_cb) {
+ ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+ ceph_osdc_put_request(req);
+ }
+ return ret;
+
+done_err:
+ bio_chain_put(req_data->bio);
+ ceph_osdc_put_request(req);
+done_pages:
+ kfree(req_data);
+done:
+ if (rq)
+ blk_end_request(rq, ret, len);
+ return ret;
+}
+
+/*
+ * Ceph osd op callback
+ */
+static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+ struct rbd_request *req_data = req->r_priv;
+ struct ceph_osd_reply_head *replyhead;
+ struct ceph_osd_op *op;
+ __s32 rc;
+ u64 bytes;
+ int read_op;
+
+ /* parse reply */
+ replyhead = msg->front.iov_base;
+ WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+ op = (void *)(replyhead + 1);
+ rc = le32_to_cpu(replyhead->result);
+ bytes = le64_to_cpu(op->extent.length);
+ read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+
+ dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+
+ if (rc == -ENOENT && read_op) {
+ zero_bio_chain(req_data->bio, 0);
+ rc = 0;
+ } else if (rc == 0 && read_op && bytes < req_data->len) {
+ zero_bio_chain(req_data->bio, bytes);
+ bytes = req_data->len;
+ }
+
+ blk_end_request(req_data->rq, rc, bytes);
+
+ if (req_data->bio)
+ bio_chain_put(req_data->bio);
+
+ ceph_osdc_put_request(req);
+ kfree(req_data);
+}
+
+/*
+ * Do a synchronous ceph osd operation
+ */
+static int rbd_req_sync_op(struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ int opcode,
+ int flags,
+ struct ceph_osd_req_op *orig_ops,
+ int num_reply,
+ const char *obj,
+ u64 ofs, u64 len,
+ char *buf)
+{
+ int ret;
+ struct page **pages;
+ int num_pages;
+ struct ceph_osd_req_op *ops = orig_ops;
+ u32 payload_len;
+
+ num_pages = calc_pages_for(ofs , len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ if (!orig_ops) {
+ payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
+ ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+ if (ret < 0)
+ goto done;
+
+ if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
+ ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
+ if (ret < 0)
+ goto done_ops;
+ }
+ }
+
+ ret = rbd_do_request(NULL, dev, snapc, snapid,
+ obj, ofs, len, NULL,
+ pages, num_pages,
+ flags,
+ ops,
+ 2,
+ NULL);
+ if (ret < 0)
+ goto done_ops;
+
+ if ((flags & CEPH_OSD_FLAG_READ) && buf)
+ ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
+
+done_ops:
+ if (!orig_ops)
+ rbd_destroy_ops(ops);
+done:
+ ceph_release_page_vector(pages, num_pages);
+ return ret;
+}
+
+/*
+ * Do an asynchronous ceph osd operation
+ */
+static int rbd_do_op(struct request *rq,
+ struct rbd_device *rbd_dev ,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ int opcode, int flags, int num_reply,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ char *seg_name;
+ u64 seg_ofs;
+ u64 seg_len;
+ int ret;
+ struct ceph_osd_req_op *ops;
+ u32 payload_len;
+
+ seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ if (!seg_name)
+ return -ENOMEM;
+
+ seg_len = rbd_get_segment(&rbd_dev->header,
+ rbd_dev->header.block_name,
+ ofs, len,
+ seg_name, &seg_ofs);
+
+ payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+
+ ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+ if (ret < 0)
+ goto done;
+
+ /* we've taken care of segment sizes earlier when we
+ cloned the bios. We should never have a segment
+ truncated at this point */
+ BUG_ON(seg_len < len);
+
+ ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
+ seg_name, seg_ofs, seg_len,
+ bio,
+ NULL, 0,
+ flags,
+ ops,
+ num_reply,
+ rbd_req_cb);
+done:
+ kfree(seg_name);
+ return ret;
+}
+
+/*
+ * Request async osd write
+ */
+static int rbd_req_write(struct request *rq,
+ struct rbd_device *rbd_dev,
+ struct ceph_snap_context *snapc,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ 2,
+ ofs, len, bio);
+}
+
+/*
+ * Request async osd read
+ */
+static int rbd_req_read(struct request *rq,
+ struct rbd_device *rbd_dev,
+ u64 snapid,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ return rbd_do_op(rq, rbd_dev, NULL,
+ (snapid ? snapid : CEPH_NOSNAP),
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
+ 2,
+ ofs, len, bio);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_read(struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ const char *obj,
+ u64 ofs, u64 len,
+ char *buf)
+{
+ return rbd_req_sync_op(dev, NULL,
+ (snapid ? snapid : CEPH_NOSNAP),
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
+ NULL,
+ 1, obj, ofs, len, buf);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
+ u64 snapid,
+ const char *obj)
+{
+ struct ceph_osd_req_op *ops;
+ int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
+ if (ret < 0)
+ return ret;
+
+ ops[0].snap.snapid = snapid;
+
+ ret = rbd_req_sync_op(dev, NULL,
+ CEPH_NOSNAP,
+ 0,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ ops,
+ 1, obj, 0, 0, NULL);
+
+ rbd_destroy_ops(ops);
+
+ if (ret < 0)
+ return ret;
+
+ return ret;
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_exec(struct rbd_device *dev,
+ const char *obj,
+ const char *cls,
+ const char *method,
+ const char *data,
+ int len)
+{
+ struct ceph_osd_req_op *ops;
+ int cls_len = strlen(cls);
+ int method_len = strlen(method);
+ int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
+ cls_len + method_len + len);
+ if (ret < 0)
+ return ret;
+
+ ops[0].cls.class_name = cls;
+ ops[0].cls.class_len = (__u8)cls_len;
+ ops[0].cls.method_name = method;
+ ops[0].cls.method_len = (__u8)method_len;
+ ops[0].cls.argc = 0;
+ ops[0].cls.indata = data;
+ ops[0].cls.indata_len = len;
+
+ ret = rbd_req_sync_op(dev, NULL,
+ CEPH_NOSNAP,
+ 0,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ ops,
+ 1, obj, 0, 0, NULL);
+
+ rbd_destroy_ops(ops);
+
+ dout("cls_exec returned %d\n", ret);
+ return ret;
+}
+
+/*
+ * block device queue callback
+ */
+static void rbd_rq_fn(struct request_queue *q)
+{
+ struct rbd_device *rbd_dev = q->queuedata;
+ struct request *rq;
+ struct bio_pair *bp = NULL;
+
+ rq = blk_fetch_request(q);
+
+ while (1) {
+ struct bio *bio;
+ struct bio *rq_bio, *next_bio = NULL;
+ bool do_write;
+ int size, op_size = 0;
+ u64 ofs;
+
+ /* peek at request from block layer */
+ if (!rq)
+ break;
+
+ dout("fetched request\n");
+
+ /* filter out block requests we don't understand */
+ if ((rq->cmd_type != REQ_TYPE_FS)) {
+ __blk_end_request_all(rq, 0);
+ goto next;
+ }
+
+ /* deduce our operation (read, write) */
+ do_write = (rq_data_dir(rq) == WRITE);
+
+ size = blk_rq_bytes(rq);
+ ofs = blk_rq_pos(rq) * 512ULL;
+ rq_bio = rq->bio;
+ if (do_write && rbd_dev->read_only) {
+ __blk_end_request_all(rq, -EROFS);
+ goto next;
+ }
+
+ spin_unlock_irq(q->queue_lock);
+
+ dout("%s 0x%x bytes at 0x%llx\n",
+ do_write ? "write" : "read",
+ size, blk_rq_pos(rq) * 512ULL);
+
+ do {
+ /* a bio clone to be passed down to OSD req */
+ dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+ op_size = rbd_get_segment(&rbd_dev->header,
+ rbd_dev->header.block_name,
+ ofs, size,
+ NULL, NULL);
+ bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
+ op_size, GFP_ATOMIC);
+ if (!bio) {
+ spin_lock_irq(q->queue_lock);
+ __blk_end_request_all(rq, -ENOMEM);
+ goto next;
+ }
+
+ /* init OSD command: write or read */
+ if (do_write)
+ rbd_req_write(rq, rbd_dev,
+ rbd_dev->header.snapc,
+ ofs,
+ op_size, bio);
+ else
+ rbd_req_read(rq, rbd_dev,
+ cur_snap_id(rbd_dev),
+ ofs,
+ op_size, bio);
+
+ size -= op_size;
+ ofs += op_size;
+
+ rq_bio = next_bio;
+ } while (size > 0);
+
+ if (bp)
+ bio_pair_release(bp);
+
+ spin_lock_irq(q->queue_lock);
+next:
+ rq = blk_fetch_request(q);
+ }
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects. One exception would be with a single page bios,
+ * which we handle later at bio_chain_clone
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+ struct bio_vec *bvec)
+{
+ struct rbd_device *rbd_dev = q->queuedata;
+ unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
+ sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+ unsigned int bio_sectors = bmd->bi_size >> 9;
+ int max;
+
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+ + bio_sectors)) << 9;
+ if (max < 0)
+ max = 0; /* bio_add cannot handle a negative return */
+ if (max <= bvec->bv_len && bio_sectors == 0)
+ return bvec->bv_len;
+ return max;
+}
+
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk = rbd_dev->disk;
+
+ if (!disk)
+ return;
+
+ rbd_header_free(&rbd_dev->header);
+
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+ put_disk(disk);
+}
+
+/*
+ * reload the ondisk the header
+ */
+static int rbd_read_header(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header)
+{
+ ssize_t rc;
+ struct rbd_image_header_ondisk *dh;
+ int snap_count = 0;
+ u64 snap_names_len = 0;
+
+ while (1) {
+ int len = sizeof(*dh) +
+ snap_count * sizeof(struct rbd_image_snap_ondisk) +
+ snap_names_len;
+
+ rc = -ENOMEM;
+ dh = kmalloc(len, GFP_KERNEL);
+ if (!dh)
+ return -ENOMEM;
+
+ rc = rbd_req_sync_read(rbd_dev,
+ NULL, CEPH_NOSNAP,
+ rbd_dev->obj_md_name,
+ 0, len,
+ (char *)dh);
+ if (rc < 0)
+ goto out_dh;
+
+ rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+ if (rc < 0)
+ goto out_dh;
+
+ if (snap_count != header->total_snaps) {
+ snap_count = header->total_snaps;
+ snap_names_len = header->snap_names_len;
+ rbd_header_free(header);
+ kfree(dh);
+ continue;
+ }
+ break;
+ }
+
+out_dh:
+ kfree(dh);
+ return rc;
+}
+
+/*
+ * create a snapshot
+ */
+static int rbd_header_add_snap(struct rbd_device *dev,
+ const char *snap_name,
+ gfp_t gfp_flags)
+{
+ int name_len = strlen(snap_name);
+ u64 new_snapid;
+ int ret;
+ void *data, *data_start, *data_end;
+
+ /* we should create a snapshot only if we're pointing at the head */
+ if (dev->cur_snap)
+ return -EINVAL;
+
+ ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
+ &new_snapid);
+ dout("created snapid=%lld\n", new_snapid);
+ if (ret < 0)
+ return ret;
+
+ data = kmalloc(name_len + 16, gfp_flags);
+ if (!data)
+ return -ENOMEM;
+
+ data_start = data;
+ data_end = data + name_len + 16;
+
+ ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
+ ceph_encode_64_safe(&data, data_end, new_snapid, bad);
+
+ ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+ data_start, data - data_start);
+
+ kfree(data_start);
+
+ if (ret < 0)
+ return ret;
+
+ dev->header.snapc->seq = new_snapid;
+
+ return 0;
+bad:
+ return -ERANGE;
+}
+
+static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
+{
+ struct rbd_snap *snap;
+
+ while (!list_empty(&rbd_dev->snaps)) {
+ snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
+ __rbd_remove_snap_dev(rbd_dev, snap);
+ }
+}
+
+/*
+ * only read the first part of the ondisk header, without the snaps info
+ */
+static int __rbd_update_snaps(struct rbd_device *rbd_dev)
+{
+ int ret;
+ struct rbd_image_header h;
+ u64 snap_seq;
+
+ ret = rbd_read_header(rbd_dev, &h);
+ if (ret < 0)
+ return ret;
+
+ down_write(&rbd_dev->header.snap_rwsem);
+
+ snap_seq = rbd_dev->header.snapc->seq;
+
+ kfree(rbd_dev->header.snapc);
+ kfree(rbd_dev->header.snap_names);
+ kfree(rbd_dev->header.snap_sizes);
+
+ rbd_dev->header.total_snaps = h.total_snaps;
+ rbd_dev->header.snapc = h.snapc;
+ rbd_dev->header.snap_names = h.snap_names;
+ rbd_dev->header.snap_names_len = h.snap_names_len;
+ rbd_dev->header.snap_sizes = h.snap_sizes;
+ rbd_dev->header.snapc->seq = snap_seq;
+
+ ret = __rbd_init_snaps_header(rbd_dev);
+
+ up_write(&rbd_dev->header.snap_rwsem);
+
+ return ret;
+}
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk;
+ struct request_queue *q;
+ int rc;
+ u64 total_size = 0;
+
+ /* contact OSD, request size info about the object being mapped */
+ rc = rbd_read_header(rbd_dev, &rbd_dev->header);
+ if (rc)
+ return rc;
+
+ /* no need to lock here, as rbd_dev is not registered yet */
+ rc = __rbd_init_snaps_header(rbd_dev);
+ if (rc)
+ return rc;
+
+ rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+ if (rc)
+ return rc;
+
+ /* create gendisk info */
+ rc = -ENOMEM;
+ disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+ if (!disk)
+ goto out;
+
+ sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
+ disk->major = rbd_dev->major;
+ disk->first_minor = 0;
+ disk->fops = &rbd_bd_ops;
+ disk->private_data = rbd_dev;
+
+ /* init rq */
+ rc = -ENOMEM;
+ q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+ if (!q)
+ goto out_disk;
+ blk_queue_merge_bvec(q, rbd_merge_bvec);
+ disk->queue = q;
+
+ q->queuedata = rbd_dev;
+
+ rbd_dev->disk = disk;
+ rbd_dev->q = q;
+
+ /* finally, announce the disk to the world */
+ set_capacity(disk, total_size / 512ULL);
+ add_disk(disk);
+
+ pr_info("%s: added with size 0x%llx\n",
+ disk->disk_name, (unsigned long long)total_size);
+ return 0;
+
+out_disk:
+ put_disk(disk);
+out:
+ return rc;
+}
+
+/*
+ sysfs
+*/
+
+static ssize_t rbd_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd(dev);
+
+ return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
+}
+
+static ssize_t rbd_major_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd(dev);
+
+ return sprintf(buf, "%d\n", rbd_dev->major);
+}
+
+static ssize_t rbd_client_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd(dev);
+
+ return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
+}
+
+static ssize_t rbd_pool_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd(dev);
+
+ return sprintf(buf, "%s\n", rbd_dev->pool_name);
+}
+
+static ssize_t rbd_name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd(dev);
+
+ return sprintf(buf, "%s\n", rbd_dev->obj);
+}
+
+static ssize_t rbd_snap_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd(dev);
+
+ return sprintf(buf, "%s\n", rbd_dev->snap_name);
+}
+
+static ssize_t rbd_image_refresh(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t size)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ int rc;
+ int ret = size;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rc = __rbd_update_snaps(rbd_dev);
+ if (rc < 0)
+ ret = rc;
+
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
+static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
+static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
+static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
+static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
+static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
+static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
+static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
+static DEVICE_ATTR(rollback_snap, S_IWUSR, NULL, rbd_snap_rollback);
+
+static struct attribute *rbd_attrs[] = {
+ &dev_attr_size.attr,
+ &dev_attr_major.attr,
+ &dev_attr_client_id.attr,
+ &dev_attr_pool.attr,
+ &dev_attr_name.attr,
+ &dev_attr_current_snap.attr,
+ &dev_attr_refresh.attr,
+ &dev_attr_create_snap.attr,
+ &dev_attr_rollback_snap.attr,
+ NULL
+};
+
+static struct attribute_group rbd_attr_group = {
+ .attrs = rbd_attrs,
+};
+
+static const struct attribute_group *rbd_attr_groups[] = {
+ &rbd_attr_group,
+ NULL
+};
+
+static void rbd_sysfs_dev_release(struct device *dev)
+{
+}
+
+static struct device_type rbd_device_type = {
+ .name = "rbd",
+ .groups = rbd_attr_groups,
+ .release = rbd_sysfs_dev_release,
+};
+
+
+/*
+ sysfs - snapshots
+*/
+
+static ssize_t rbd_snap_size_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
+
+ return sprintf(buf, "%lld\n", (long long)snap->size);
+}
+
+static ssize_t rbd_snap_id_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
+
+ return sprintf(buf, "%lld\n", (long long)snap->id);
+}
+
+static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
+static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
+
+static struct attribute *rbd_snap_attrs[] = {
+ &dev_attr_snap_size.attr,
+ &dev_attr_snap_id.attr,
+ NULL,
+};
+
+static struct attribute_group rbd_snap_attr_group = {
+ .attrs = rbd_snap_attrs,
+};
+
+static void rbd_snap_dev_release(struct device *dev)
+{
+ struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
+ kfree(snap->name);
+ kfree(snap);
+}
+
+static const struct attribute_group *rbd_snap_attr_groups[] = {
+ &rbd_snap_attr_group,
+ NULL
+};
+
+static struct device_type rbd_snap_device_type = {
+ .groups = rbd_snap_attr_groups,
+ .release = rbd_snap_dev_release,
+};
+
+static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
+ struct rbd_snap *snap)
+{
+ list_del(&snap->node);
+ device_unregister(&snap->dev);
+}
+
+static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
+ struct rbd_snap *snap,
+ struct device *parent)
+{
+ struct device *dev = &snap->dev;
+ int ret;
+
+ dev->type = &rbd_snap_device_type;
+ dev->parent = parent;
+ dev->release = rbd_snap_dev_release;
+ dev_set_name(dev, "snap_%s", snap->name);
+ ret = device_register(dev);
+
+ return ret;
+}
+
+static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
+ int i, const char *name,
+ struct rbd_snap **snapp)
+{
+ int ret;
+ struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
+ if (!snap)
+ return -ENOMEM;
+ snap->name = kstrdup(name, GFP_KERNEL);
+ snap->size = rbd_dev->header.snap_sizes[i];
+ snap->id = rbd_dev->header.snapc->snaps[i];
+ if (device_is_registered(&rbd_dev->dev)) {
+ ret = rbd_register_snap_dev(rbd_dev, snap,
+ &rbd_dev->dev);
+ if (ret < 0)
+ goto err;
+ }
+ *snapp = snap;
+ return 0;
+err:
+ kfree(snap->name);
+ kfree(snap);
+ return ret;
+}
+
+/*
+ * search for the previous snap in a null delimited string list
+ */
+const char *rbd_prev_snap_name(const char *name, const char *start)
+{
+ if (name < start + 2)
+ return NULL;
+
+ name -= 2;
+ while (*name) {
+ if (name == start)
+ return start;
+ name--;
+ }
+ return name + 1;
+}
+
+/*
+ * compare the old list of snapshots that we have to what's in the header
+ * and update it accordingly. Note that the header holds the snapshots
+ * in a reverse order (from newest to oldest) and we need to go from
+ * older to new so that we don't get a duplicate snap name when
+ * doing the process (e.g., removed snapshot and recreated a new
+ * one with the same name.
+ */
+static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
+{
+ const char *name, *first_name;
+ int i = rbd_dev->header.total_snaps;
+ struct rbd_snap *snap, *old_snap = NULL;
+ int ret;
+ struct list_head *p, *n;
+
+ first_name = rbd_dev->header.snap_names;
+ name = first_name + rbd_dev->header.snap_names_len;
+
+ list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
+ u64 cur_id;
+
+ old_snap = list_entry(p, struct rbd_snap, node);
+
+ if (i)
+ cur_id = rbd_dev->header.snapc->snaps[i - 1];
+
+ if (!i || old_snap->id < cur_id) {
+ /* old_snap->id was skipped, thus was removed */
+ __rbd_remove_snap_dev(rbd_dev, old_snap);
+ continue;
+ }
+ if (old_snap->id == cur_id) {
+ /* we have this snapshot already */
+ i--;
+ name = rbd_prev_snap_name(name, first_name);
+ continue;
+ }
+ for (; i > 0;
+ i--, name = rbd_prev_snap_name(name, first_name)) {
+ if (!name) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ cur_id = rbd_dev->header.snapc->snaps[i];
+ /* snapshot removal? handle it above */
+ if (cur_id >= old_snap->id)
+ break;
+ /* a new snapshot */
+ ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
+ if (ret < 0)
+ return ret;
+
+ /* note that we add it backward so using n and not p */
+ list_add(&snap->node, n);
+ p = &snap->node;
+ }
+ }
+ /* we're done going over the old snap list, just add what's left */
+ for (; i > 0; i--) {
+ name = rbd_prev_snap_name(name, first_name);
+ if (!name) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
+ if (ret < 0)
+ return ret;
+ list_add(&snap->node, &rbd_dev->snaps);
+ }
+
+ return 0;
+}
+
+
+static void rbd_root_dev_release(struct device *dev)
+{
+}
+
+static struct device rbd_root_dev = {
+ .init_name = "rbd",
+ .release = rbd_root_dev_release,
+};
+
+static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
+{
+ int ret = -ENOMEM;
+ struct device *dev;
+ struct rbd_snap *snap;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ dev = &rbd_dev->dev;
+
+ dev->bus = &rbd_bus_type;
+ dev->type = &rbd_device_type;
+ dev->parent = &rbd_root_dev;
+ dev->release = rbd_dev_release;
+ dev_set_name(dev, "%d", rbd_dev->id);
+ ret = device_register(dev);
+ if (ret < 0)
+ goto done_free;
+
+ list_for_each_entry(snap, &rbd_dev->snaps, node) {
+ ret = rbd_register_snap_dev(rbd_dev, snap,
+ &rbd_dev->dev);
+ if (ret < 0)
+ break;
+ }
+
+ mutex_unlock(&ctl_mutex);
+ return 0;
+done_free:
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
+{
+ device_unregister(&rbd_dev->dev);
+}
+
+static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count)
+{
+ struct ceph_osd_client *osdc;
+ struct rbd_device *rbd_dev;
+ ssize_t rc = -ENOMEM;
+ int irc, new_id = 0;
+ struct list_head *tmp;
+ char *mon_dev_name;
+ char *options;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+ if (!mon_dev_name)
+ goto err_out_mod;
+
+ options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+ if (!options)
+ goto err_mon_dev;
+
+ /* new rbd_device object */
+ rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+ if (!rbd_dev)
+ goto err_out_opt;
+
+ /* static rbd_device initialization */
+ spin_lock_init(&rbd_dev->lock);
+ INIT_LIST_HEAD(&rbd_dev->node);
+ INIT_LIST_HEAD(&rbd_dev->snaps);
+
+ /* generate unique id: find highest unique id, add one */
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ list_for_each(tmp, &rbd_dev_list) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_dev->id >= new_id)
+ new_id = rbd_dev->id + 1;
+ }
+
+ rbd_dev->id = new_id;
+
+ /* add to global list */
+ list_add_tail(&rbd_dev->node, &rbd_dev_list);
+
+ /* parse add command */
+ if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
+ "%" __stringify(RBD_MAX_OPT_LEN) "s "
+ "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
+ "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ mon_dev_name, options, rbd_dev->pool_name,
+ rbd_dev->obj, rbd_dev->snap_name) < 4) {
+ rc = -EINVAL;
+ goto err_out_slot;
+ }
+
+ if (rbd_dev->snap_name[0] == 0)
+ rbd_dev->snap_name[0] = '-';
+
+ rbd_dev->obj_len = strlen(rbd_dev->obj);
+ snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
+ rbd_dev->obj, RBD_SUFFIX);
+
+ /* initialize rest of new object */
+ snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
+ rc = rbd_get_client(rbd_dev, mon_dev_name, options);
+ if (rc < 0)
+ goto err_out_slot;
+
+ mutex_unlock(&ctl_mutex);
+
+ /* pick the pool */
+ osdc = &rbd_dev->client->osdc;
+ rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+ if (rc < 0)
+ goto err_out_client;
+ rbd_dev->poolid = rc;
+
+ /* register our block device */
+ irc = register_blkdev(0, rbd_dev->name);
+ if (irc < 0) {
+ rc = irc;
+ goto err_out_client;
+ }
+ rbd_dev->major = irc;
+
+ rc = rbd_bus_add_dev(rbd_dev);
+ if (rc)
+ goto err_out_blkdev;
+
+ /* set up and announce blkdev mapping */
+ rc = rbd_init_disk(rbd_dev);
+ if (rc)
+ goto err_out_bus;
+
+ return count;
+
+err_out_bus:
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ list_del_init(&rbd_dev->node);
+ mutex_unlock(&ctl_mutex);
+
+ /* this will also clean up rest of rbd_dev stuff */
+
+ rbd_bus_del_dev(rbd_dev);
+ kfree(options);
+ kfree(mon_dev_name);
+ return rc;
+
+err_out_blkdev:
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_client:
+ rbd_put_client(rbd_dev);
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+err_out_slot:
+ list_del_init(&rbd_dev->node);
+ mutex_unlock(&ctl_mutex);
+
+ kfree(rbd_dev);
+err_out_opt:
+ kfree(options);
+err_mon_dev:
+ kfree(mon_dev_name);
+err_out_mod:
+ dout("Error adding device %s\n", buf);
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static struct rbd_device *__rbd_get_dev(unsigned long id)
+{
+ struct list_head *tmp;
+ struct rbd_device *rbd_dev;
+
+ list_for_each(tmp, &rbd_dev_list) {
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_dev->id == id)
+ return rbd_dev;
+ }
+ return NULL;
+}
+
+static void rbd_dev_release(struct device *dev)
+{
+ struct rbd_device *rbd_dev =
+ container_of(dev, struct rbd_device, dev);
+
+ rbd_put_client(rbd_dev);
+
+ /* clean up and free blkdev */
+ rbd_free_disk(rbd_dev);
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+ kfree(rbd_dev);
+
+ /* release module ref */
+ module_put(THIS_MODULE);
+}
+
+static ssize_t rbd_remove(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, rc;
+ unsigned long ul;
+ int ret = count;
+
+ rc = strict_strtoul(buf, 10, &ul);
+ if (rc)
+ return rc;
+
+ /* convert to int; abort if we lost anything in the conversion */
+ target_id = (int) ul;
+ if (target_id != ul)
+ return -EINVAL;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done;
+ }
+
+ list_del_init(&rbd_dev->node);
+
+ __rbd_remove_all_snaps(rbd_dev);
+ rbd_bus_del_dev(rbd_dev);
+
+done:
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static ssize_t rbd_snap_add(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ int ret;
+ char *name = kmalloc(count + 1, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ snprintf(name, count, "%s", buf);
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ ret = rbd_header_add_snap(rbd_dev,
+ name, GFP_KERNEL);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = __rbd_update_snaps(rbd_dev);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = count;
+done_unlock:
+ mutex_unlock(&ctl_mutex);
+ kfree(name);
+ return ret;
+}
+
+static ssize_t rbd_snap_rollback(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ int ret;
+ u64 snapid;
+ u64 cur_ofs;
+ char *seg_name = NULL;
+ char *snap_name = kmalloc(count + 1, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!snap_name)
+ return ret;
+
+ /* parse snaps add command */
+ snprintf(snap_name, count, "%s", buf);
+ seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ if (!seg_name)
+ goto done;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
+ if (ret < 0)
+ goto done_unlock;
+
+ dout("snapid=%lld\n", snapid);
+
+ cur_ofs = 0;
+ while (cur_ofs < rbd_dev->header.image_size) {
+ cur_ofs += rbd_get_segment(&rbd_dev->header,
+ rbd_dev->obj,
+ cur_ofs, (u64)-1,
+ seg_name, NULL);
+ dout("seg_name=%s\n", seg_name);
+
+ ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
+ if (ret < 0)
+ pr_warning("could not roll back obj %s err=%d\n",
+ seg_name, ret);
+ }
+
+ ret = __rbd_update_snaps(rbd_dev);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = count;
+
+done_unlock:
+ mutex_unlock(&ctl_mutex);
+done:
+ kfree(seg_name);
+ kfree(snap_name);
+
+ return ret;
+}
+
+static struct bus_attribute rbd_bus_attrs[] = {
+ __ATTR(add, S_IWUSR, NULL, rbd_add),
+ __ATTR(remove, S_IWUSR, NULL, rbd_remove),
+ __ATTR_NULL
+};
+
+/*
+ * create control files in sysfs
+ * /sys/bus/rbd/...
+ */
+static int rbd_sysfs_init(void)
+{
+ int ret;
+
+ rbd_bus_type.bus_attrs = rbd_bus_attrs;
+
+ ret = bus_register(&rbd_bus_type);
+ if (ret < 0)
+ return ret;
+
+ ret = device_register(&rbd_root_dev);
+
+ return ret;
+}
+
+static void rbd_sysfs_cleanup(void)
+{
+ device_unregister(&rbd_root_dev);
+ bus_unregister(&rbd_bus_type);
+}
+
+int __init rbd_init(void)
+{
+ int rc;
+
+ rc = rbd_sysfs_init();
+ if (rc)
+ return rc;
+ spin_lock_init(&node_lock);
+ pr_info("loaded " DRV_NAME_LONG "\n");
+ return 0;
+}
+
+void __exit rbd_exit(void)
+{
+ rbd_sysfs_cleanup();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_DESCRIPTION("rados block device");
+
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 0000000..fc6c678
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * rbd image 'foo' consists of objects
+ * foo.rbd - image metadata
+ * foo.00000000
+ * foo.00000001
+ * ... - data
+ */
+
+#define RBD_SUFFIX ".rbd"
+#define RBD_DIRECTORY "rbd_directory"
+#define RBD_INFO "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */
+#define RBD_MIN_OBJ_ORDER 16
+#define RBD_MAX_OBJ_ORDER 30
+
+#define RBD_MAX_OBJ_NAME_LEN 96
+#define RBD_MAX_SEG_NAME_LEN 128
+
+#define RBD_COMP_NONE 0
+#define RBD_CRYPT_NONE 0
+
+#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE "RBD"
+#define RBD_HEADER_VERSION "001.005"
+
+struct rbd_info {
+ __le64 max_id;
+} __attribute__ ((packed));
+
+struct rbd_image_snap_ondisk {
+ __le64 id;
+ __le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk {
+ char text[40];
+ char block_name[24];
+ char signature[4];
+ char version[8];
+ struct {
+ __u8 order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ __u8 unused;
+ } __attribute__((packed)) options;
+ __le64 image_size;
+ __le64 snap_seq;
+ __le32 snap_count;
+ __le32 reserved;
+ __le64 snap_names_len;
+ struct rbd_image_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+
+#endif
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 2e46815..75333d0 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -20,7 +20,7 @@
#include <linux/fd.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/delay.h>
@@ -222,6 +222,7 @@ extern int swim_read_sector_header(struct swim __iomem *base,
extern int swim_read_sector_data(struct swim __iomem *base,
unsigned char *data);
+static DEFINE_MUTEX(swim_mutex);
static inline void set_swim_mode(struct swim __iomem *base, int enable)
{
struct iwm __iomem *iwm_base;
@@ -666,9 +667,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&swim_mutex);
ret = floppy_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&swim_mutex);
return ret;
}
@@ -678,7 +679,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
struct floppy_state *fs = disk->private_data;
struct swim __iomem *base = fs->swd->base;
- lock_kernel();
+ mutex_lock(&swim_mutex);
if (fs->ref_count < 0)
fs->ref_count = 0;
else if (fs->ref_count > 0)
@@ -686,7 +687,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
if (fs->ref_count == 0)
swim_motor(base, OFF);
- unlock_kernel();
+ mutex_unlock(&swim_mutex);
return 0;
}
@@ -704,9 +705,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
case FDEJECT:
if (fs->ref_count != 1)
return -EBUSY;
- lock_kernel();
+ mutex_lock(&swim_mutex);
err = floppy_eject(fs);
- unlock_kernel();
+ mutex_unlock(&swim_mutex);
return err;
case FDGETPRM:
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index cc6a386..bf3a5b8 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -25,7 +25,7 @@
#include <linux/ioctl.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <asm/io.h>
@@ -36,6 +36,7 @@
#include <asm/machdep.h>
#include <asm/pmac_feature.h>
+static DEFINE_MUTEX(swim3_mutex);
static struct request_queue *swim3_queue;
static struct gendisk *disks[2];
static struct request *fd_req;
@@ -873,9 +874,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&swim3_mutex);
ret = floppy_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&swim3_mutex);
return ret;
}
@@ -953,9 +954,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&swim3_mutex);
ret = floppy_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&swim3_mutex);
return ret;
}
@@ -964,13 +965,13 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
{
struct floppy_state *fs = disk->private_data;
struct swim3 __iomem *sw = fs->swim3;
- lock_kernel();
+ mutex_lock(&swim3_mutex);
if (fs->ref_count > 0 && --fs->ref_count == 0) {
swim3_action(fs, MOTOR_OFF);
out_8(&sw->control_bic, 0xff);
swim3_select(fs, RELAX);
}
- unlock_kernel();
+ mutex_unlock(&swim3_mutex);
return 0;
}
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index c48e148..9ae3bb7 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -28,7 +28,7 @@
#include <linux/timer.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <scsi/scsi.h>
#define DRV_NAME "ub"
@@ -248,6 +248,7 @@ struct ub_completion {
spinlock_t lock;
};
+static DEFINE_MUTEX(ub_mutex);
static inline void ub_init_completion(struct ub_completion *x)
{
x->done = 0;
@@ -396,7 +397,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum);
#else
static const struct usb_device_id ub_usb_ids[] = {
- { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) },
+ { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, USB_SC_SCSI, USB_PR_BULK) },
{ }
};
@@ -1715,9 +1716,9 @@ static int ub_bd_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&ub_mutex);
ret = ub_bd_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&ub_mutex);
return ret;
}
@@ -1730,9 +1731,9 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode)
struct ub_lun *lun = disk->private_data;
struct ub_dev *sc = lun->udev;
- lock_kernel();
+ mutex_lock(&ub_mutex);
ub_put(sc);
- unlock_kernel();
+ mutex_unlock(&ub_mutex);
return 0;
}
@@ -1747,9 +1748,9 @@ static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
void __user *usermem = (void __user *) arg;
int ret;
- lock_kernel();
+ mutex_lock(&ub_mutex);
ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
- unlock_kernel();
+ mutex_unlock(&ub_mutex);
return ret;
}
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index f651e51..e2ff697 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -41,7 +41,7 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/dma-mapping.h>
#include <linux/completion.h>
#include <linux/device.h>
@@ -73,6 +73,7 @@ enum {
MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name)
};
+static DEFINE_MUTEX(viodasd_mutex);
static DEFINE_SPINLOCK(viodasd_spinlock);
#define VIOMAXREQ 16
@@ -180,9 +181,9 @@ static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&viodasd_mutex);
ret = viodasd_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&viodasd_mutex);
return ret;
}
@@ -196,7 +197,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode)
struct viodasd_device *d = disk->private_data;
HvLpEvent_Rc hvrc;
- lock_kernel();
+ mutex_lock(&viodasd_mutex);
/* Send the event to OS/400. We DON'T expect a response */
hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
HvLpEvent_Type_VirtualIo,
@@ -210,7 +211,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode)
if (hvrc != 0)
pr_warning("HV close call failed %d\n", (int)hvrc);
- unlock_kernel();
+ mutex_unlock(&viodasd_mutex);
return 0;
}
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1101e25..6ecf89c 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
#include <linux/hdreg.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
@@ -128,9 +127,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
}
}
- if (vbr->req->cmd_flags & REQ_HARDBARRIER)
- vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
-
sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
/*
@@ -222,8 +218,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
return err;
}
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long data)
{
struct gendisk *disk = bdev->bd_disk;
struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +234,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
(void __user *)data);
}
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long param)
-{
- int ret;
-
- lock_kernel();
- ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
-
- return ret;
-}
-
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
@@ -392,31 +376,9 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
vblk->disk->driverfs_dev = &vdev->dev;
index++;
- if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) {
- /*
- * If the FLUSH feature is supported we do have support for
- * flushing a volatile write cache on the host. Use that
- * to implement write barrier support.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
- } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
- /*
- * If the BARRIER feature is supported the host expects us
- * to order request by tags. This implies there is not
- * volatile write cache on the host, and that the host
- * never re-orders outstanding I/O. This feature is not
- * useful for real life scenarious and deprecated.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_TAG);
- } else {
- /*
- * If the FLUSH feature is not supported we must assume that
- * the host does not perform any kind of volatile write
- * caching. We still need to drain the queue to provider
- * proper barrier semantics.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
- }
+ /* configure queue flush support */
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+ blk_queue_flush(q, REQ_FLUSH);
/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@ -535,9 +497,9 @@ static const struct virtio_device_id id_table[] = {
};
static unsigned int features[] = {
- VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
- VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
- VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
+ VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+ VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
+ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
};
/*
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index d5a3cd7..4abd2bc 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -46,7 +46,7 @@
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/blkpg.h>
#include <linux/delay.h>
#include <linux/io.h>
@@ -58,6 +58,7 @@
#include "xd.h"
+static DEFINE_MUTEX(xd_mutex);
static void __init do_xd_setup (int *integers);
#ifdef MODULE
static int xd[5] = { -1,-1,-1,-1, };
@@ -381,9 +382,9 @@ static int xd_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&xd_mutex);
ret = xd_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&xd_mutex);
return ret;
}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index ab735a6..d7aa39e 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -41,7 +41,7 @@
#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <xen/xen.h>
@@ -65,13 +65,14 @@ enum blkif_state {
struct blk_shadow {
struct blkif_request req;
- unsigned long request;
+ struct request *request;
unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
+static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;
-#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
/*
* We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -95,7 +96,7 @@ struct blkfront_info
struct gnttab_free_callback callback;
struct blk_shadow shadow[BLK_RING_SIZE];
unsigned long shadow_free;
- int feature_barrier;
+ unsigned int feature_flush;
int is_ready;
};
@@ -135,7 +136,7 @@ static void add_id_to_freelist(struct blkfront_info *info,
unsigned long id)
{
info->shadow[id].req.id = info->shadow_free;
- info->shadow[id].request = 0;
+ info->shadow[id].request = NULL;
info->shadow_free = id;
}
@@ -244,14 +245,11 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
}
/*
- * blkif_queue_request
+ * Generate a Xen blkfront IO request from a blk layer request. Reads
+ * and writes are handled as expected. Since we lack a loose flush
+ * request, we map flushes into a full ordered barrier.
*
- * request block io
- *
- * id: for guest use only.
- * operation: BLKIF_OP_{READ,WRITE,PROBE}
- * buffer: buffer to read/write into. this should be a
- * virtual address in the guest os.
+ * @req: a request struct
*/
static int blkif_queue_request(struct request *req)
{
@@ -280,7 +278,7 @@ static int blkif_queue_request(struct request *req)
/* Fill out a communications ring structure. */
ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
id = get_id_from_freelist(info);
- info->shadow[id].request = (unsigned long)req;
+ info->shadow[id].request = req;
ring_req->id = id;
ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
@@ -288,8 +286,18 @@ static int blkif_queue_request(struct request *req)
ring_req->operation = rq_data_dir(req) ?
BLKIF_OP_WRITE : BLKIF_OP_READ;
- if (req->cmd_flags & REQ_HARDBARRIER)
+
+ if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+ /*
+ * Ideally we could just do an unordered
+ * flush-to-disk, but all we have is a full write
+ * barrier at the moment. However, a barrier write is
+ * a superset of FUA, so we can implement it the same
+ * way. (It's also a FLUSH+FUA, since it is
+ * guaranteed ordered WRT previous writes.)
+ */
ring_req->operation = BLKIF_OP_WRITE_BARRIER;
+ }
ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
@@ -418,26 +426,12 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
}
-static int xlvbd_barrier(struct blkfront_info *info)
+static void xlvbd_flush(struct blkfront_info *info)
{
- int err;
- const char *barrier;
-
- switch (info->feature_barrier) {
- case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break;
- case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break;
- case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
- default: return -EINVAL;
- }
-
- err = blk_queue_ordered(info->rq, info->feature_barrier);
-
- if (err)
- return err;
-
+ blk_queue_flush(info->rq, info->feature_flush);
printk(KERN_INFO "blkfront: %s: barriers %s\n",
- info->gd->disk_name, barrier);
- return 0;
+ info->gd->disk_name,
+ info->feature_flush ? "enabled" : "disabled");
}
@@ -516,7 +510,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
info->rq = gd->queue;
info->gd = gd;
- xlvbd_barrier(info);
+ xlvbd_flush(info);
if (vdisk_info & VDISK_READONLY)
set_disk_ro(gd, 1);
@@ -553,7 +547,7 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
spin_unlock_irqrestore(&blkif_io_lock, flags);
/* Flush gnttab callback work. Must be done with no locks held. */
- flush_scheduled_work();
+ flush_work_sync(&info->work);
del_gendisk(info->gd);
@@ -602,7 +596,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
spin_unlock_irq(&blkif_io_lock);
/* Flush gnttab callback work. Must be done with no locks held. */
- flush_scheduled_work();
+ flush_work_sync(&info->work);
/* Free resources associated with old device channel. */
if (info->ring_ref != GRANT_INVALID_REF) {
@@ -649,7 +643,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
bret = RING_GET_RESPONSE(&info->ring, i);
id = bret->id;
- req = (struct request *)info->shadow[id].request;
+ req = info->shadow[id].request;
blkif_completion(&info->shadow[id]);
@@ -662,8 +656,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
info->gd->disk_name);
error = -EOPNOTSUPP;
- info->feature_barrier = QUEUE_ORDERED_NONE;
- xlvbd_barrier(info);
+ }
+ if (unlikely(bret->status == BLKIF_RSP_ERROR &&
+ info->shadow[id].req.nr_segments == 0)) {
+ printk(KERN_WARNING "blkfront: %s: empty write barrier op failed\n",
+ info->gd->disk_name);
+ error = -EOPNOTSUPP;
+ }
+ if (unlikely(error)) {
+ if (error == -EOPNOTSUPP)
+ error = 0;
+ info->feature_flush = 0;
+ xlvbd_flush(info);
}
/* fall through */
case BLKIF_OP_READ:
@@ -914,7 +918,7 @@ static int blkif_recover(struct blkfront_info *info)
/* Stage 3: Find pending requests and requeue them. */
for (i = 0; i < BLK_RING_SIZE; i++) {
/* Not in use? */
- if (copy[i].request == 0)
+ if (!copy[i].request)
continue;
/* Grab a request slot and copy shadow state into it. */
@@ -931,9 +935,7 @@ static int blkif_recover(struct blkfront_info *info)
req->seg[j].gref,
info->xbdev->otherend_id,
pfn_to_mfn(info->shadow[req->id].frame[j]),
- rq_data_dir(
- (struct request *)
- info->shadow[req->id].request));
+ rq_data_dir(info->shadow[req->id].request));
info->shadow[req->id].req = *req;
info->ring.req_prod_pvt++;
@@ -1076,20 +1078,14 @@ static void blkfront_connect(struct blkfront_info *info)
/*
* If there's no "feature-barrier" defined, then it means
* we're dealing with a very old backend which writes
- * synchronously; draining will do what needs to get done.
- *
- * If there are barriers, then we can do full queued writes
- * with tagged barriers.
+ * synchronously; nothing to do.
*
- * If barriers are not supported, then there's no much we can
- * do, so just set ordering to NONE.
+ * If there are barriers, then we use flush.
*/
- if (err)
- info->feature_barrier = QUEUE_ORDERED_DRAIN;
- else if (barrier)
- info->feature_barrier = QUEUE_ORDERED_TAG;
- else
- info->feature_barrier = QUEUE_ORDERED_NONE;
+ info->feature_flush = 0;
+
+ if (!err && barrier)
+ info->feature_flush = REQ_FLUSH | REQ_FUA;
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
if (err) {
@@ -1125,6 +1121,8 @@ static void blkback_changed(struct xenbus_device *dev,
case XenbusStateInitialising:
case XenbusStateInitWait:
case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
case XenbusStateUnknown:
case XenbusStateClosed:
break;
@@ -1201,7 +1199,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
struct blkfront_info *info;
int err = 0;
- lock_kernel();
+ mutex_lock(&blkfront_mutex);
info = disk->private_data;
if (!info) {
@@ -1219,7 +1217,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
mutex_unlock(&info->mutex);
out:
- unlock_kernel();
+ mutex_unlock(&blkfront_mutex);
return err;
}
@@ -1229,7 +1227,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
struct block_device *bdev;
struct xenbus_device *xbdev;
- lock_kernel();
+ mutex_lock(&blkfront_mutex);
bdev = bdget_disk(disk, 0);
bdput(bdev);
@@ -1263,7 +1261,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
}
out:
- unlock_kernel();
+ mutex_unlock(&blkfront_mutex);
return 0;
}
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 057413b..829161e 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -89,7 +89,7 @@
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/ata.h>
#include <linux/hdreg.h>
#include <linux/platform_device.h>
@@ -214,6 +214,7 @@ struct ace_device {
u16 cf_id[ATA_ID_WORDS];
};
+static DEFINE_MUTEX(xsysace_mutex);
static int ace_major;
/* ---------------------------------------------------------------------
@@ -903,13 +904,13 @@ static int ace_open(struct block_device *bdev, fmode_t mode)
dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1);
- lock_kernel();
+ mutex_lock(&xsysace_mutex);
spin_lock_irqsave(&ace->lock, flags);
ace->users++;
spin_unlock_irqrestore(&ace->lock, flags);
check_disk_change(bdev);
- unlock_kernel();
+ mutex_unlock(&xsysace_mutex);
return 0;
}
@@ -922,7 +923,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1);
- lock_kernel();
+ mutex_lock(&xsysace_mutex);
spin_lock_irqsave(&ace->lock, flags);
ace->users--;
if (ace->users == 0) {
@@ -930,7 +931,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ);
}
spin_unlock_irqrestore(&ace->lock, flags);
- unlock_kernel();
+ mutex_unlock(&xsysace_mutex);
return 0;
}
@@ -1224,7 +1225,8 @@ ace_of_probe(struct platform_device *op, const struct of_device_id *match)
bus_width = ACE_BUS_WIDTH_8;
/* Call the bus-independant setup code */
- return ace_alloc(&op->dev, id ? *id : 0, physaddr, irq, bus_width);
+ return ace_alloc(&op->dev, id ? be32_to_cpup(id) : 0,
+ physaddr, irq, bus_width);
}
static int __devexit ace_of_remove(struct platform_device *op)
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index d75b2bb..a22e3f8 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -33,7 +33,7 @@
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/slab.h>
#include <asm/setup.h>
@@ -57,6 +57,7 @@ extern struct mem_info m68k_memory[NUM_MEMINFO];
#define Z2RAM_CHUNK1024 ( Z2RAM_CHUNKSIZE >> 10 )
+static DEFINE_MUTEX(z2ram_mutex);
static u_long *z2ram_map = NULL;
static u_long z2ram_size = 0;
static int z2_count = 0;
@@ -79,8 +80,10 @@ static void do_z2_request(struct request_queue *q)
int err = 0;
if (start + len > z2ram_size) {
- printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
- blk_rq_pos(req), blk_rq_cur_sectors(req));
+ pr_err(DEVICE_NAME ": bad access: block=%llu, "
+ "count=%u\n",
+ (unsigned long long)blk_rq_pos(req),
+ blk_rq_cur_sectors(req));
err = -EIO;
goto done;
}
@@ -154,7 +157,7 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
device = MINOR(bdev->bd_dev);
- lock_kernel();
+ mutex_lock(&z2ram_mutex);
if ( current_device != -1 && current_device != device )
{
rc = -EBUSY;
@@ -296,25 +299,25 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
set_capacity(z2ram_gendisk, z2ram_size >> 9);
}
- unlock_kernel();
+ mutex_unlock(&z2ram_mutex);
return 0;
err_out_kfree:
kfree(z2ram_map);
err_out:
- unlock_kernel();
+ mutex_unlock(&z2ram_mutex);
return rc;
}
static int
z2_release(struct gendisk *disk, fmode_t mode)
{
- lock_kernel();
+ mutex_lock(&z2ram_mutex);
if ( current_device == -1 ) {
- unlock_kernel();
+ mutex_unlock(&z2ram_mutex);
return 0;
}
- unlock_kernel();
+ mutex_unlock(&z2ram_mutex);
/*
* FIXME: unmap memory
*/