202 files changed, 23921 insertions, 2028 deletions
diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
index f2df6e2..676f08b 100644
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -3,7 +3,7 @@
 #
 
 menuconfig ATA
-	tristate "Serial ATA (prod) and Parallel ATA (experimental) drivers"
+	tristate "Serial ATA and Parallel ATA drivers"
 	depends on HAS_IOMEM
 	depends on BLOCK
 	depends on !(M32R || M68K) || BROKEN
@@ -374,8 +374,8 @@ config PATA_HPT366
 	  If unsure, say N.
 
 config PATA_HPT37X
-	tristate "HPT 370/370A/371/372/374/302 PATA support (Experimental)"
-	depends on PCI && EXPERIMENTAL
+	tristate "HPT 370/370A/371/372/374/302 PATA support"
+	depends on PCI
 	help
 	  This option enables support for the majority of the later HPT
 	  PATA controllers via the new ATA layer.
@@ -383,8 +383,8 @@ config PATA_HPT37X
 	  If unsure, say N.
 
 config PATA_HPT3X2N
-	tristate "HPT 372N/302N PATA support (Experimental)"
-	depends on PCI && EXPERIMENTAL
+	tristate "HPT 372N/302N PATA support"
+	depends on PCI
 	help
 	  This option enables support for the N variant HPT PATA
 	  controllers via the new ATA layer
@@ -401,7 +401,7 @@ config PATA_HPT3X3
 	  If unsure, say N.
 
 config PATA_HPT3X3_DMA
-	bool "HPT 343/363 DMA support (Experimental)"
+	bool "HPT 343/363 DMA support"
 	depends on PATA_HPT3X3
 	help
 	  This option enables DMA support for the HPT343/363
@@ -510,8 +510,8 @@ config PATA_NETCELL
 	  If unsure, say N.
 
 config PATA_NINJA32
-	tristate "Ninja32/Delkin Cardbus ATA support (Experimental)"
-	depends on PCI && EXPERIMENTAL
+	tristate "Ninja32/Delkin Cardbus ATA support"
+	depends on PCI
 	help
 	  This option enables support for the Ninja32, Delkin and
 	  possibly other brands of Cardbus ATA adapter
@@ -573,6 +573,14 @@ config PATA_PCMCIA
 
 	  If unsure, say N.
 
+config PATA_PDC2027X
+	tristate "Promise PATA 2027x support"
+	depends on PCI
+	help
+	  This option enables support for Promise PATA pdc20268 to pdc20277 host adapters.
+
+	  If unsure, say N.
+
 config PATA_PDC_OLD
 	tristate "Older Promise PATA controller support"
 	depends on PCI
@@ -643,14 +651,6 @@ config PATA_SERVERWORKS
 
 	  If unsure, say N.
 
-config PATA_PDC2027X
-	tristate "Promise PATA 2027x support"
-	depends on PCI
-	help
-	  This option enables support for Promise PATA pdc20268 to pdc20277 host adapters.
-
-	  If unsure, say N.
-
 config PATA_SIL680
 	tristate "CMD / Silicon Image 680 PATA support"
 	depends on PCI
@@ -667,6 +667,15 @@ config PATA_SIS
 
 	  If unsure, say N.
 
+config PATA_TOSHIBA
+	tristate "Toshiba Piccolo support (Experimental)"
+	depends on PCI && EXPERIMENTAL
+	help
+	  Support for the Toshiba Piccolo controllers. Currently only the
+	  primary channel is supported by this driver.
+
+	  If unsure, say N.
+
 config PATA_VIA
 	tristate "VIA PATA support"
 	depends on PCI
diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile
index 01e126f..d909435 100644
--- a/drivers/ata/Makefile
+++ b/drivers/ata/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_PATA_RZ1000)	+= pata_rz1000.o
 obj-$(CONFIG_PATA_SC1200)	+= pata_sc1200.o
 obj-$(CONFIG_PATA_SERVERWORKS)	+= pata_serverworks.o
 obj-$(CONFIG_PATA_SIL680)	+= pata_sil680.o
+obj-$(CONFIG_PATA_TOSHIBA)	+= pata_piccolo.o
 obj-$(CONFIG_PATA_VIA)		+= pata_via.o
 obj-$(CONFIG_PATA_WINBOND)	+= pata_sl82c105.o
 obj-$(CONFIG_PATA_WINBOND_VLB)	+= pata_winbond.o
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index a3241a1..b8bea10 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -113,6 +113,7 @@ enum {
 	board_ahci_mcp65	= 6,
 	board_ahci_nopmp	= 7,
 	board_ahci_yesncq	= 8,
+	board_ahci_nosntf	= 9,
 
 	/* global controller registers */
 	HOST_CAP		= 0x00, /* host capabilities */
@@ -235,6 +236,7 @@ enum {
 	AHCI_HFLAG_NO_SUSPEND		= (1 << 10), /* don't suspend */
 	AHCI_HFLAG_SRST_TOUT_IS_OFFLINE	= (1 << 11), /* treat SRST timeout as
 							link offline */
+	AHCI_HFLAG_NO_SNTF		= (1 << 12), /* no sntf */
 
 	/* ap->flags bits */
 
@@ -508,7 +510,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
-	/* board_ahci_yesncq */
+	[board_ahci_yesncq] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_YES_NCQ),
 		.flags		= AHCI_FLAG_COMMON,
@@ -516,6 +518,14 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
+	[board_ahci_nosntf] =
+	{
+		AHCI_HFLAGS	(AHCI_HFLAG_NO_SNTF),
+		.flags		= AHCI_FLAG_COMMON,
+		.pio_mask	= ATA_PIO4,
+		.udma_mask	= ATA_UDMA6,
+		.port_ops	= &ahci_ops,
+	},
 };
 
 static const struct pci_device_id ahci_pci_tbl[] = {
@@ -531,7 +541,7 @@ static const struct pci_device_id ahci_pci_tbl[] = {
 	{ PCI_VDEVICE(INTEL, 0x2683), board_ahci }, /* ESB2 */
 	{ PCI_VDEVICE(INTEL, 0x27c6), board_ahci }, /* ICH7-M DH */
 	{ PCI_VDEVICE(INTEL, 0x2821), board_ahci }, /* ICH8 */
-	{ PCI_VDEVICE(INTEL, 0x2822), board_ahci }, /* ICH8 */
+	{ PCI_VDEVICE(INTEL, 0x2822), board_ahci_nosntf }, /* ICH8 */
 	{ PCI_VDEVICE(INTEL, 0x2824), board_ahci }, /* ICH8 */
 	{ PCI_VDEVICE(INTEL, 0x2829), board_ahci }, /* ICH8M */
 	{ PCI_VDEVICE(INTEL, 0x282a), board_ahci }, /* ICH8M */
@@ -849,6 +859,12 @@ static void ahci_save_initial_config(struct pci_dev *pdev,
 		cap &= ~HOST_CAP_PMP;
 	}
 
+	if ((cap & HOST_CAP_SNTF) && (hpriv->flags & AHCI_HFLAG_NO_SNTF)) {
+		dev_printk(KERN_INFO, &pdev->dev,
+			   "controller can't do SNTF, turning off CAP_SNTF\n");
+		cap &= ~HOST_CAP_SNTF;
+	}
+
 	if (pdev->vendor == PCI_VENDOR_ID_JMICRON && pdev->device == 0x2361 &&
 	    port_map != 1) {
 		dev_printk(KERN_INFO, &pdev->dev,
@@ -2988,6 +3004,14 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (pdev->vendor == PCI_VENDOR_ID_MARVELL && !marvell_enable)
 		return -ENODEV;
 
+	/* Promise's PDC42819 is a SAS/SATA controller that has an AHCI mode.
+	 * At the moment, we can only use the AHCI mode. Let the users know
+	 * that for SAS drives they're out of luck.
+	 */
+	if (pdev->vendor == PCI_VENDOR_ID_PROMISE)
+		dev_printk(KERN_INFO, &pdev->dev, "PDC42819 "
+			   "can only drive SATA devices with this driver\n");
+
 	/* acquire resources */
 	rc = pcim_enable_device(pdev);
 	if (rc)
diff --git a/drivers/ata/ata_generic.c b/drivers/ata/ata_generic.c
index ecfd22b..12e26c3 100644
--- a/drivers/ata/ata_generic.c
+++ b/drivers/ata/ata_generic.c
@@ -168,9 +168,12 @@ static struct pci_device_id ata_generic[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_VIA,    PCI_DEVICE_ID_VIA_82C561), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_OPTI,   PCI_DEVICE_ID_OPTI_82C558), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_CENATEK,PCI_DEVICE_ID_CENATEK_IDE), },
-	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO), },
+#if !defined(CONFIG_PATA_TOSHIBA) && !defined(CONFIG_PATA_TOSHIBA_MODULE)
 	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_1), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_2),  },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_3),  },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_5),  },
+#endif	
 	/* Must come last. If you add entries adjust this table appropriately */
 	{ PCI_ANY_ID,		PCI_ANY_ID,			   PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_STORAGE_IDE << 8, 0xFFFFFF00UL, 1},
 	{ 0, },
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 9ac4e37..0c6155f 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -869,10 +869,10 @@ static void do_pata_set_dmamode(struct ata_port *ap, struct ata_device *adev, in
 				(timings[pio][1] << 8);
 		}
 
-		if (ap->udma_mask) {
+		if (ap->udma_mask)
 			udma_enable &= ~(1 << devid);
-			pci_write_config_word(dev, master_port, master_data);
-		}
+
+		pci_write_config_word(dev, master_port, master_data);
 	}
 	/* Don't scribble on 0x48 if the controller does not support UDMA */
 	if (ap->udma_mask)
diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index b0882cd..1245838 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -807,12 +807,11 @@ static int ata_acpi_exec_tfs(struct ata_device *dev, int *nr_executed)
  * EH context.
  *
  * RETURNS:
- * 0 on success, -errno on failure.
+ * 0 on success, -ENOENT if _SDD doesn't exist, -errno on failure.
  */
 static int ata_acpi_push_id(struct ata_device *dev)
 {
 	struct ata_port *ap = dev->link->ap;
-	int err;
 	acpi_status status;
 	struct acpi_object_list input;
 	union acpi_object in_params[1];
@@ -835,12 +834,16 @@ static int ata_acpi_push_id(struct ata_device *dev)
 	status = acpi_evaluate_object(dev->acpi_handle, "_SDD", &input, NULL);
 	swap_buf_le16(dev->id, ATA_ID_WORDS);
 
-	err = ACPI_FAILURE(status) ? -EIO : 0;
-	if (err < 0)
+	if (status == AE_NOT_FOUND)
+		return -ENOENT;
+
+	if (ACPI_FAILURE(status)) {
 		ata_dev_printk(dev, KERN_WARNING,
 			       "ACPI _SDD failed (AE 0x%x)\n", status);
+		return -EIO;
+	}
 
-	return err;
+	return 0;
 }
 
 /**
@@ -971,7 +974,7 @@ int ata_acpi_on_devcfg(struct ata_device *dev)
 	/* do _SDD if SATA */
 	if (acpi_sata) {
 		rc = ata_acpi_push_id(dev);
-		if (rc)
+		if (rc && rc != -ENOENT)
 			goto acpi_err;
 	}
 
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index dc72690..22ff51b 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -6616,6 +6616,13 @@ static int __init ata_init(void)
 {
 	ata_parse_force_param();
 
+	/*
+	 * FIXME: In UP case, there is only one workqueue thread and if you
+	 * have more than one PIO device, latency is bloody awful, with
+	 * occasional multi-second "hiccups" as one PIO device waits for
+	 * another.  It's an ugly wart that users DO occasionally complain
+	 * about; luckily most users have at most one PIO polled device.
+	 */
 	ata_wq = create_workqueue("ata");
 	if (!ata_wq)
 		goto free_force_tbl;
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index bba2ae5..0ea97c9 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -110,6 +110,13 @@ static const unsigned long ata_eh_identify_timeouts[] = {
 	ULONG_MAX,
 };
 
+static const unsigned long ata_eh_flush_timeouts[] = {
+	15000,	/* be generous with flush */
+	15000,  /* ditto */
+	30000,	/* and even more generous */
+	ULONG_MAX,
+};
+
 static const unsigned long ata_eh_other_timeouts[] = {
 	 5000,	/* same rationale as identify timeout */
 	10000,	/* ditto */
@@ -147,6 +154,8 @@ ata_eh_cmd_timeout_table[ATA_EH_CMD_TIMEOUT_TABLE_SIZE] = {
 	  .timeouts = ata_eh_other_timeouts, },
 	{ .commands = CMDS(ATA_CMD_INIT_DEV_PARAMS),
 	  .timeouts = ata_eh_other_timeouts, },
+	{ .commands = CMDS(ATA_CMD_FLUSH, ATA_CMD_FLUSH_EXT),
+	  .timeouts = ata_eh_flush_timeouts },
 };
 #undef CMDS
 
@@ -3112,6 +3121,82 @@ static int atapi_eh_clear_ua(struct ata_device *dev)
 	return 0;
 }
 
+/**
+ *	ata_eh_maybe_retry_flush - Retry FLUSH if necessary
+ *	@dev: ATA device which may need FLUSH retry
+ *
+ *	If @dev failed FLUSH, it needs to be reported upper layer
+ *	immediately as it means that @dev failed to remap and already
+ *	lost at least a sector and further FLUSH retrials won't make
+ *	any difference to the lost sector.  However, if FLUSH failed
+ *	for other reasons, for example transmission error, FLUSH needs
+ *	to be retried.
+ *
+ *	This function determines whether FLUSH failure retry is
+ *	necessary and performs it if so.
+ *
+ *	RETURNS:
+ *	0 if EH can continue, -errno if EH needs to be repeated.
+ */
+static int ata_eh_maybe_retry_flush(struct ata_device *dev)
+{
+	struct ata_link *link = dev->link;
+	struct ata_port *ap = link->ap;
+	struct ata_queued_cmd *qc;
+	struct ata_taskfile tf;
+	unsigned int err_mask;
+	int rc = 0;
+
+	/* did flush fail for this device? */
+	if (!ata_tag_valid(link->active_tag))
+		return 0;
+
+	qc = __ata_qc_from_tag(ap, link->active_tag);
+	if (qc->dev != dev || (qc->tf.command != ATA_CMD_FLUSH_EXT &&
+			       qc->tf.command != ATA_CMD_FLUSH))
+		return 0;
+
+	/* if the device failed it, it should be reported to upper layers */
+	if (qc->err_mask & AC_ERR_DEV)
+		return 0;
+
+	/* flush failed for some other reason, give it another shot */
+	ata_tf_init(dev, &tf);
+
+	tf.command = qc->tf.command;
+	tf.flags |= ATA_TFLAG_DEVICE;
+	tf.protocol = ATA_PROT_NODATA;
+
+	ata_dev_printk(dev, KERN_WARNING, "retrying FLUSH 0x%x Emask 0x%x\n",
+		       tf.command, qc->err_mask);
+
+	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
+	if (!err_mask) {
+		/*
+		 * FLUSH is complete but there's no way to
+		 * successfully complete a failed command from EH.
+		 * Making sure retry is allowed at least once and
+		 * retrying it should do the trick - whatever was in
+		 * the cache is already on the platter and this won't
+		 * cause infinite loop.
+		 */
+		qc->scsicmd->allowed = max(qc->scsicmd->allowed, 1);
+	} else {
+		ata_dev_printk(dev, KERN_WARNING, "FLUSH failed Emask 0x%x\n",
+			       err_mask);
+		rc = -EIO;
+
+		/* if device failed it, report it to upper layers */
+		if (err_mask & AC_ERR_DEV) {
+			qc->err_mask |= AC_ERR_DEV;
+			qc->result_tf = tf;
+			if (!(ap->pflags & ATA_PFLAG_FROZEN))
+				rc = 0;
+		}
+	}
+	return rc;
+}
+
 static int ata_link_nr_enabled(struct ata_link *link)
 {
 	struct ata_device *dev;
@@ -3455,6 +3540,15 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 			}
 		}
 
+		/* retry flush if necessary */
+		ata_for_each_dev(dev, link, ALL) {
+			if (dev->class != ATA_DEV_ATA)
+				continue;
+			rc = ata_eh_maybe_retry_flush(dev);
+			if (rc)
+				goto dev_fail;
+		}
+
 		/* configure link power saving */
 		if (ehc->i.action & ATA_EH_LPM)
 			ata_for_each_dev(dev, link, ALL)
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index b4ee28d..62e6b9e 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -47,6 +47,7 @@
 #include <linux/hdreg.h>
 #include <linux/uaccess.h>
 #include <linux/suspend.h>
+#include <asm/unaligned.h>
 
 #include "libata.h"
 
@@ -154,8 +155,7 @@ static ssize_t ata_scsi_lpm_put(struct device *dev,
 	 */
 	for (i = 1; i < ARRAY_SIZE(link_pm_policy); i++) {
 		const int len = strlen(link_pm_policy[i].name);
-		if (strncmp(link_pm_policy[i].name, buf, len) == 0 &&
-		   buf[len] == '\n') {
+		if (strncmp(link_pm_policy[i].name, buf, len) == 0) {
 			policy = link_pm_policy[i].value;
 			break;
 		}
@@ -1964,6 +1964,7 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
 		0x80,	/* page 0x80, unit serial no page */
 		0x83,	/* page 0x83, device ident page */
 		0x89,	/* page 0x89, ata info page */
+		0xb0,	/* page 0xb0, block limits page */
 		0xb1,	/* page 0xb1, block device characteristics page */
 	};
 
@@ -2085,6 +2086,43 @@ static unsigned int ata_scsiop_inq_89(struct ata_scsi_args *args, u8 *rbuf)
 	return 0;
 }
 
+static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf)
+{
+	u32 min_io_sectors;
+
+	rbuf[1] = 0xb0;
+	rbuf[3] = 0x3c;		/* required VPD size with unmap support */
+
+	/*
+	 * Optimal transfer length granularity.
+	 *
+	 * This is always one physical block, but for disks with a smaller
+	 * logical than physical sector size we need to figure out what the
+	 * latter is.
+	 */
+	if (ata_id_has_large_logical_sectors(args->id))
+		min_io_sectors = ata_id_logical_per_physical_sectors(args->id);
+	else
+		min_io_sectors = 1;
+	put_unaligned_be16(min_io_sectors, &rbuf[6]);
+
+	/*
+	 * Optimal unmap granularity.
+	 *
+	 * The ATA spec doesn't even know about a granularity or alignment
+	 * for the TRIM command.  We can leave away most of the unmap related
+	 * VPD page entries, but we have specifify a granularity to signal
+	 * that we support some form of unmap - in thise case via WRITE SAME
+	 * with the unmap bit set.
+	 */
+	if (ata_id_has_trim(args->id)) {
+		put_unaligned_be32(65535 * 512 / 8, &rbuf[20]);
+		put_unaligned_be32(1, &rbuf[28]);
+	}
+
+	return 0;
+}
+
 static unsigned int ata_scsiop_inq_b1(struct ata_scsi_args *args, u8 *rbuf)
 {
 	int form_factor = ata_id_form_factor(args->id);
@@ -2374,6 +2412,13 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
 		rbuf[13] = log_per_phys;
 		rbuf[14] = (lowest_aligned >> 8) & 0x3f;
 		rbuf[15] = lowest_aligned;
+
+		if (ata_id_has_trim(args->id)) {
+			rbuf[14] |= 0x80; /* TPE */
+
+			if (ata_id_has_zero_after_trim(args->id))
+				rbuf[14] |= 0x40; /* TPRZ */
+		}
 	}
 
 	return 0;
@@ -2896,6 +2941,58 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
 	return 1;
 }
 
+static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
+{
+	struct ata_taskfile *tf = &qc->tf;
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	struct ata_device *dev = qc->dev;
+	const u8 *cdb = scmd->cmnd;
+	u64 block;
+	u32 n_block;
+	u32 size;
+	void *buf;
+
+	/* we may not issue DMA commands if no DMA mode is set */
+	if (unlikely(!dev->dma_mode))
+		goto invalid_fld;
+
+	if (unlikely(scmd->cmd_len < 16))
+		goto invalid_fld;
+	scsi_16_lba_len(cdb, &block, &n_block);
+
+	/* for now we only support WRITE SAME with the unmap bit set */
+	if (unlikely(!(cdb[1] & 0x8)))
+		goto invalid_fld;
+
+	/*
+	 * WRITE SAME always has a sector sized buffer as payload, this
+	 * should never be a multiple entry S/G list.
+	 */
+	if (!scsi_sg_count(scmd))
+		goto invalid_fld;
+
+	buf = page_address(sg_page(scsi_sglist(scmd)));
+	size = ata_set_lba_range_entries(buf, 512, block, n_block);
+
+	tf->protocol = ATA_PROT_DMA;
+	tf->hob_feature = 0;
+	tf->feature = ATA_DSM_TRIM;
+	tf->hob_nsect = (size / 512) >> 8;
+	tf->nsect = size / 512;
+	tf->command = ATA_CMD_DSM;
+	tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48 |
+		     ATA_TFLAG_WRITE;
+
+	ata_qc_set_pc_nbytes(qc);
+
+	return 0;
+
+ invalid_fld:
+	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x00);
+	/* "Invalid field in cdb" */
+	return 1;
+}
+
 /**
  *	ata_get_xlat_func - check if SCSI to ATA translation is possible
  *	@dev: ATA device
@@ -2920,6 +3017,9 @@ static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd)
 	case WRITE_16:
 		return ata_scsi_rw_xlat;
 
+	case 0x93 /*WRITE_SAME_16*/:
+		return ata_scsi_write_same_xlat;
+
 	case SYNCHRONIZE_CACHE:
 		if (ata_try_flush_cache(dev))
 			return ata_scsi_flush_xlat;
@@ -3109,6 +3209,9 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd,
 		case 0x89:
 			ata_scsi_rbuf_fill(&args, ata_scsiop_inq_89);
 			break;
+		case 0xb0:
+			ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b0);
+			break;
 		case 0xb1:
 			ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b1);
 			break;
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index bbbb1fa..51eb1e2 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -2384,7 +2384,7 @@ void ata_sff_post_internal_cmd(struct ata_queued_cmd *qc)
 	ap->hsm_task_state = HSM_ST_IDLE;
 
 	if (ap->ioaddr.bmdma_addr)
-		ata_bmdma_stop(qc);
+		ap->ops->bmdma_stop(qc);
 
 	spin_unlock_irqrestore(ap->lock, flags);
 }
diff --git a/drivers/ata/pata_ali.c b/drivers/ata/pata_ali.c
index 1432dc9..9434114 100644
--- a/drivers/ata/pata_ali.c
+++ b/drivers/ata/pata_ali.c
@@ -453,7 +453,9 @@ static void ali_init_chipset(struct pci_dev *pdev)
 			/* Clear CD-ROM DMA write bit */
 			tmp &= 0x7F;
 		/* Cable and UDMA */
-		pci_write_config_byte(pdev, 0x4B, tmp | 0x09);
+		if (pdev->revision >= 0xc2)
+			tmp |= 0x01;
+		pci_write_config_byte(pdev, 0x4B, tmp | 0x08);
 		/*
 		 * CD_ROM DMA on (0x53 bit 0). Enable this even if we want
 		 * to use PIO. 0x53 bit 1 (rev 20 only) - enable FIFO control
diff --git a/drivers/ata/pata_cmd64x.c b/drivers/ata/pata_cmd64x.c
index f98dffe..dadfc35 100644
--- a/drivers/ata/pata_cmd64x.c
+++ b/drivers/ata/pata_cmd64x.c
@@ -31,7 +31,7 @@
 #include <linux/libata.h>
 
 #define DRV_NAME "pata_cmd64x"
-#define DRV_VERSION "0.2.5"
+#define DRV_VERSION "0.3.1"
 
 /*
  * CMD64x specific registers definition.
@@ -254,17 +254,109 @@ static void cmd648_bmdma_stop(struct ata_queued_cmd *qc)
 }
 
 /**
- *	cmd646r1_dma_stop	-	DMA stop callback
+ *	cmd64x_bmdma_stop	-	DMA stop callback
  *	@qc: Command in progress
  *
- *	Stub for now while investigating the r1 quirk in the old driver.
+ *	Track the completion of live DMA commands and clear the
+ *	host->private_data DMA tracking flag as we do.
  */
 
-static void cmd646r1_bmdma_stop(struct ata_queued_cmd *qc)
+static void cmd64x_bmdma_stop(struct ata_queued_cmd *qc)
 {
+	struct ata_port *ap = qc->ap;
 	ata_bmdma_stop(qc);
+	WARN_ON(ap->host->private_data != ap);
+	ap->host->private_data = NULL;
 }
 
+/**
+ *	cmd64x_qc_defer		-	Defer logic for chip limits
+ *	@qc: queued command
+ *
+ *	Decide whether we can issue the command. Called under the host lock.
+ */
+
+static int cmd64x_qc_defer(struct ata_queued_cmd *qc)
+{
+	struct ata_host *host = qc->ap->host;
+	struct ata_port *alt = host->ports[1 ^ qc->ap->port_no];
+	int rc;
+	int dma = 0;
+
+	/* Apply the ATA rules first */
+	rc = ata_std_qc_defer(qc);
+	if (rc)
+		return rc;
+
+	if (qc->tf.protocol == ATAPI_PROT_DMA ||
+			qc->tf.protocol == ATA_PROT_DMA)
+		dma = 1;
+
+	/* If the other port is not live then issue the command */
+	if (alt == NULL || !alt->qc_active) {
+		if (dma)
+			host->private_data = qc->ap;
+		return 0;
+	}
+	/* If there is a live DMA command then wait */
+	if (host->private_data != NULL)
+		return 	ATA_DEFER_PORT;
+	if (dma)
+		/* Cannot overlap our DMA command */
+		return ATA_DEFER_PORT;
+	return 0;
+}
+
+/**
+ *	cmd64x_interrupt - ATA host interrupt handler
+ *	@irq: irq line (unused)
+ *	@dev_instance: pointer to our ata_host information structure
+ *
+ *	Our interrupt handler for PCI IDE devices.  Calls
+ *	ata_sff_host_intr() for each port that is flagging an IRQ. We cannot
+ *	use the defaults as we need to avoid touching status/altstatus during
+ *	a DMA.
+ *
+ *	LOCKING:
+ *	Obtains host lock during operation.
+ *
+ *	RETURNS:
+ *	IRQ_NONE or IRQ_HANDLED.
+ */
+irqreturn_t cmd64x_interrupt(int irq, void *dev_instance)
+{
+	struct ata_host *host = dev_instance;
+	struct pci_dev *pdev = to_pci_dev(host->dev);
+	unsigned int i;
+	unsigned int handled = 0;
+	unsigned long flags;
+	static const u8 irq_reg[2] = { CFR, ARTTIM23 };
+	static const u8 irq_mask[2] = { 1 << 2, 1 << 4 };
+
+	/* TODO: make _irqsave conditional on x86 PCI IDE legacy mode */
+	spin_lock_irqsave(&host->lock, flags);
+
+	for (i = 0; i < host->n_ports; i++) {
+		struct ata_port *ap;
+		u8 reg;
+
+		pci_read_config_byte(pdev, irq_reg[i], &reg);
+		ap = host->ports[i];
+		if (ap && (reg & irq_mask[i]) &&
+		    !(ap->flags & ATA_FLAG_DISABLED)) {
+			struct ata_queued_cmd *qc;
+
+			qc = ata_qc_from_tag(ap, ap->link.active_tag);
+			if (qc && (!(qc->tf.flags & ATA_TFLAG_POLLING)) &&
+			    (qc->flags & ATA_QCFLAG_ACTIVE))
+				handled |= ata_sff_host_intr(ap, qc);
+		}
+	}
+
+	spin_unlock_irqrestore(&host->lock, flags);
+
+	return IRQ_RETVAL(handled);
+}
 static struct scsi_host_template cmd64x_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
@@ -273,6 +365,8 @@ static const struct ata_port_operations cmd64x_base_ops = {
 	.inherits	= &ata_bmdma_port_ops,
 	.set_piomode	= cmd64x_set_piomode,
 	.set_dmamode	= cmd64x_set_dmamode,
+	.bmdma_stop	= cmd64x_bmdma_stop,
+	.qc_defer	= cmd64x_qc_defer,
 };
 
 static struct ata_port_operations cmd64x_port_ops = {
@@ -282,7 +376,6 @@ static struct ata_port_operations cmd64x_port_ops = {
 
 static struct ata_port_operations cmd646r1_port_ops = {
 	.inherits	= &cmd64x_base_ops,
-	.bmdma_stop	= cmd646r1_bmdma_stop,
 	.cable_detect	= ata_cable_40wire,
 };
 
@@ -290,12 +383,11 @@ static struct ata_port_operations cmd648_port_ops = {
 	.inherits	= &cmd64x_base_ops,
 	.bmdma_stop	= cmd648_bmdma_stop,
 	.cable_detect	= cmd648_cable_detect,
+	.qc_defer	= ata_std_qc_defer
 };
 
 static int cmd64x_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 {
-	u32 class_rev;
-
 	static const struct ata_port_info cmd_info[6] = {
 		{	/* CMD 643 - no UDMA */
 			.flags = ATA_FLAG_SLAVE_POSS,
@@ -340,40 +432,43 @@ static int cmd64x_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	const struct ata_port_info *ppi[] = { &cmd_info[id->driver_data], NULL };
 	u8 mrdmode;
 	int rc;
+	struct ata_host *host;
 
 	rc = pcim_enable_device(pdev);
 	if (rc)
 		return rc;
 
-	pci_read_config_dword(pdev, PCI_CLASS_REVISION, &class_rev);
-	class_rev &= 0xFF;
-
 	if (id->driver_data == 0)	/* 643 */
 		ata_pci_bmdma_clear_simplex(pdev);
 
 	if (pdev->device == PCI_DEVICE_ID_CMD_646) {
 		/* Does UDMA work ? */
-		if (class_rev > 4)
+		if (pdev->revision > 4)
 			ppi[0] = &cmd_info[2];
 		/* Early rev with other problems ? */
-		else if (class_rev == 1)
+		else if (pdev->revision == 1)
 			ppi[0] = &cmd_info[3];
 	}
 
+
 	pci_write_config_byte(pdev, PCI_LATENCY_TIMER, 64);
 	pci_read_config_byte(pdev, MRDMODE, &mrdmode);
 	mrdmode &= ~ 0x30;	/* IRQ set up */
 	mrdmode |= 0x02;	/* Memory read line enable */
 	pci_write_config_byte(pdev, MRDMODE, mrdmode);
 
-	/* Force PIO 0 here.. */
-
 	/* PPC specific fixup copied from old driver */
 #ifdef CONFIG_PPC
 	pci_write_config_byte(pdev, UDIDETCR0, 0xF0);
 #endif
+	rc = ata_pci_sff_prepare_host(pdev, ppi, &host);
+	if (rc)
+		return rc;
+	/* We use this pointer to track the AP which has DMA running */
+	host->private_data = NULL;
 
-	return ata_pci_sff_init_one(pdev, ppi, &cmd64x_sht, NULL);
+	pci_set_master(pdev);
+	return ata_pci_sff_activate_host(host, cmd64x_interrupt, &cmd64x_sht);
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/ata/pata_cs5520.c b/drivers/ata/pata_cs5520.c
index 0df83cf..95ebdac 100644
--- a/drivers/ata/pata_cs5520.c
+++ b/drivers/ata/pata_cs5520.c
@@ -90,48 +90,12 @@ static void cs5520_set_timings(struct ata_port *ap, struct ata_device *adev, int
 }
 
 /**
- *	cs5520_enable_dma	-	turn on DMA bits
- *
- *	Turn on the DMA bits for this disk. Needed because the BIOS probably
- *	has not done the work for us. Belongs in the core SATA code.
- */
-
-static void cs5520_enable_dma(struct ata_port *ap, struct ata_device *adev)
-{
-	/* Set the DMA enable/disable flag */
-	u8 reg = ioread8(ap->ioaddr.bmdma_addr + 0x02);
-	reg |= 1<<(adev->devno + 5);
-	iowrite8(reg, ap->ioaddr.bmdma_addr + 0x02);
-}
-
-/**
- *	cs5520_set_dmamode	-	program DMA timings
- *	@ap: ATA port
- *	@adev: ATA device
- *
- *	Program the DMA mode timings for the controller according to the pio
- *	clocking table. Note that this device sets the DMA timings to PIO
- *	mode values. This may seem bizarre but the 5520 architecture talks
- *	PIO mode to the disk and DMA mode to the controller so the underlying
- *	transfers are PIO timed.
- */
-
-static void cs5520_set_dmamode(struct ata_port *ap, struct ata_device *adev)
-{
-	static const int dma_xlate[3] = { XFER_PIO_0, XFER_PIO_3, XFER_PIO_4 };
-	cs5520_set_timings(ap, adev, dma_xlate[adev->dma_mode]);
-	cs5520_enable_dma(ap, adev);
-}
-
-/**
  *	cs5520_set_piomode	-	program PIO timings
  *	@ap: ATA port
  *	@adev: ATA device
  *
  *	Program the PIO mode timings for the controller according to the pio
- *	clocking table. We know pio_mode will equal dma_mode because of the
- *	CS5520 architecture. At least once we turned DMA on and wrote a
- *	mode setter.
+ *	clocking table.
  */
 
 static void cs5520_set_piomode(struct ata_port *ap, struct ata_device *adev)
@@ -149,7 +113,6 @@ static struct ata_port_operations cs5520_port_ops = {
 	.qc_prep		= ata_sff_dumb_qc_prep,
 	.cable_detect		= ata_cable_40wire,
 	.set_piomode		= cs5520_set_piomode,
-	.set_dmamode		= cs5520_set_dmamode,
 };
 
 static int __devinit cs5520_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/ata/pata_cs5536.c b/drivers/ata/pata_cs5536.c
index 6da4cb4..ffee397 100644
--- a/drivers/ata/pata_cs5536.c
+++ b/drivers/ata/pata_cs5536.c
@@ -224,7 +224,7 @@ static struct scsi_host_template cs5536_sht = {
 };
 
 static struct ata_port_operations cs5536_port_ops = {
-	.inherits		= &ata_bmdma_port_ops,
+	.inherits		= &ata_bmdma32_port_ops,
 	.cable_detect		= cs5536_cable_detect,
 	.set_piomode		= cs5536_set_piomode,
 	.set_dmamode		= cs5536_set_dmamode,
diff --git a/drivers/ata/pata_efar.c b/drivers/ata/pata_efar.c
index 2a6412f..b2e71e6 100644
--- a/drivers/ata/pata_efar.c
+++ b/drivers/ata/pata_efar.c
@@ -2,6 +2,7 @@
  *    pata_efar.c - EFAR PIIX clone controller driver
  *
  *	(C) 2005 Red Hat
+ *	(C) 2009 Bartlomiej Zolnierkiewicz
  *
  *    Some parts based on ata_piix.c by Jeff Garzik and others.
  *
@@ -118,12 +119,12 @@ static void efar_set_piomode (struct ata_port *ap, struct ata_device *adev)
 		int shift = 4 * ap->port_no;
 		u8 slave_data;
 
-		idetm_data &= 0xCC0F;
+		idetm_data &= 0xFF0F;
 		idetm_data |= (control << 4);
 
 		/* Slave timing in separate register */
 		pci_read_config_byte(dev, 0x44, &slave_data);
-		slave_data &= 0x0F << shift;
+		slave_data &= ap->port_no ? 0x0F : 0xF0;
 		slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << shift;
 		pci_write_config_byte(dev, 0x44, slave_data);
 	}
@@ -200,7 +201,7 @@ static void efar_set_dmamode (struct ata_port *ap, struct ata_device *adev)
 			master_data &= 0xFF4F;  /* Mask out IORDY|TIME1|DMAONLY */
 			master_data |= control << 4;
 			pci_read_config_byte(dev, 0x44, &slave_data);
-			slave_data &= (0x0F + 0xE1 * ap->port_no);
+			slave_data &= ap->port_no ? 0x0F : 0xF0;
 			/* Load the matching timing */
 			slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << (ap->port_no ? 4 : 0);
 			pci_write_config_byte(dev, 0x44, slave_data);
@@ -251,7 +252,7 @@ static int efar_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	static const struct ata_port_info info = {
 		.flags		= ATA_FLAG_SLAVE_POSS,
 		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
+		.mwdma_mask	= ATA_MWDMA12_ONLY,
 		.udma_mask 	= ATA_UDMA4,
 		.port_ops	= &efar_ops,
 	};
diff --git a/drivers/ata/pata_hpt366.c b/drivers/ata/pata_hpt366.c
index d7f2da1..0bd48e8 100644
--- a/drivers/ata/pata_hpt366.c
+++ b/drivers/ata/pata_hpt366.c
@@ -27,7 +27,7 @@
 #include <linux/libata.h>
 
 #define DRV_NAME	"pata_hpt366"
-#define DRV_VERSION	"0.6.2"
+#define DRV_VERSION	"0.6.7"
 
 struct hpt_clock {
 	u8	xfer_mode;
@@ -36,24 +36,22 @@ struct hpt_clock {
 
 /* key for bus clock timings
  * bit
- * 0:3    data_high_time. inactive time of DIOW_/DIOR_ for PIO and MW
- *        DMA. cycles = value + 1
- * 4:8    data_low_time. active time of DIOW_/DIOR_ for PIO and MW
- *        DMA. cycles = value + 1
- * 9:12   cmd_high_time. inactive time of DIOW_/DIOR_ during task file
+ * 0:3    data_high_time. Inactive time of DIOW_/DIOR_ for PIO and MW DMA.
+ *        cycles = value + 1
+ * 4:7    data_low_time. Active time of DIOW_/DIOR_ for PIO and MW DMA.
+ *        cycles = value + 1
+ * 8:11   cmd_high_time. Inactive time of DIOW_/DIOR_ during task file
  *        register access.
- * 13:17  cmd_low_time. active time of DIOW_/DIOR_ during task file
+ * 12:15  cmd_low_time. Active time of DIOW_/DIOR_ during task file
  *        register access.
- * 18:21  udma_cycle_time. clock freq and clock cycles for UDMA xfer.
- *        during task file register access.
- * 22:24  pre_high_time. time to initialize 1st cycle for PIO and MW DMA
- *        xfer.
- * 25:27  cmd_pre_high_time. time to initialize 1st PIO cycle for task
+ * 16:18  udma_cycle_time. Clock cycles for UDMA xfer?
+ * 19:21  pre_high_time. Time to initialize 1st cycle for PIO and MW DMA xfer.
+ * 22:24  cmd_pre_high_time. Time to initialize 1st PIO cycle for task file
  *        register access.
- * 28     UDMA enable
- * 29     DMA enable
- * 30     PIO_MST enable. if set, the chip is in bus master mode during
- *        PIO.
+ * 28     UDMA enable.
+ * 29     DMA  enable.
+ * 30     PIO_MST enable. If set, the chip is in bus master mode during
+ *        PIO xfer.
  * 31     FIFO enable.
  */
 
@@ -344,7 +342,6 @@ static int hpt36x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 	const struct ata_port_info *ppi[] = { &info_hpt366, NULL };
 
 	void *hpriv = NULL;
-	u32 class_rev;
 	u32 reg1;
 	int rc;
 
@@ -352,13 +349,10 @@ static int hpt36x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
-	pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev);
-	class_rev &= 0xFF;
-
 	/* May be a later chip in disguise. Check */
 	/* Newer chips are not in the HPT36x driver. Ignore them */
-	if (class_rev > 2)
-			return -ENODEV;
+	if (dev->revision > 2)
+		return -ENODEV;
 
 	hpt36x_init_chipset(dev);
 
diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c
index d0a7df2..4224cfc 100644
--- a/drivers/ata/pata_hpt37x.c
+++ b/drivers/ata/pata_hpt37x.c
@@ -24,7 +24,7 @@
 #include <linux/libata.h>
 
 #define DRV_NAME	"pata_hpt37x"
-#define DRV_VERSION	"0.6.12"
+#define DRV_VERSION	"0.6.14"
 
 struct hpt_clock {
 	u8	xfer_speed;
@@ -303,72 +303,79 @@ static unsigned long hpt370a_filter(struct ata_device *adev, unsigned long mask)
 }
 
 /**
- *	hpt37x_pre_reset	-	reset the hpt37x bus
- *	@link: ATA link to reset
- *	@deadline: deadline jiffies for the operation
+ *	hpt37x_cable_detect	-	Detect the cable type
+ *	@ap: ATA port to detect on
  *
- *	Perform the initial reset handling for the 370/372 and 374 func 0
+ *	Return the cable type attached to this port
  */
 
-static int hpt37x_pre_reset(struct ata_link *link, unsigned long deadline)
+static int hpt37x_cable_detect(struct ata_port *ap)
 {
-	u8 scr2, ata66;
-	struct ata_port *ap = link->ap;
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
-	static const struct pci_bits hpt37x_enable_bits[] = {
-		{ 0x50, 1, 0x04, 0x04 },
-		{ 0x54, 1, 0x04, 0x04 }
-	};
-	if (!pci_test_config_bits(pdev, &hpt37x_enable_bits[ap->port_no]))
-		return -ENOENT;
+	u8 scr2, ata66;
 
 	pci_read_config_byte(pdev, 0x5B, &scr2);
 	pci_write_config_byte(pdev, 0x5B, scr2 & ~0x01);
+
+	udelay(10); /* debounce */
+
 	/* Cable register now active */
 	pci_read_config_byte(pdev, 0x5A, &ata66);
 	/* Restore state */
 	pci_write_config_byte(pdev, 0x5B, scr2);
 
 	if (ata66 & (2 >> ap->port_no))
-		ap->cbl = ATA_CBL_PATA40;
+		return ATA_CBL_PATA40;
 	else
-		ap->cbl = ATA_CBL_PATA80;
-
-	/* Reset the state machine */
-	pci_write_config_byte(pdev, 0x50 + 4 * ap->port_no, 0x37);
-	udelay(100);
-
-	return ata_sff_prereset(link, deadline);
+		return ATA_CBL_PATA80;
 }
 
-static int hpt374_fn1_pre_reset(struct ata_link *link, unsigned long deadline)
+/**
+ *	hpt374_fn1_cable_detect	-	Detect the cable type
+ *	@ap: ATA port to detect on
+ *
+ *	Return the cable type attached to this port
+ */
+
+static int hpt374_fn1_cable_detect(struct ata_port *ap)
 {
-	static const struct pci_bits hpt37x_enable_bits[] = {
-		{ 0x50, 1, 0x04, 0x04 },
-		{ 0x54, 1, 0x04, 0x04 }
-	};
-	u16 mcr3;
-	u8 ata66;
-	struct ata_port *ap = link->ap;
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
 	unsigned int mcrbase = 0x50 + 4 * ap->port_no;
-
-	if (!pci_test_config_bits(pdev, &hpt37x_enable_bits[ap->port_no]))
-		return -ENOENT;
+	u16 mcr3;
+	u8 ata66;
 
 	/* Do the extra channel work */
 	pci_read_config_word(pdev, mcrbase + 2, &mcr3);
-	/* Set bit 15 of 0x52 to enable TCBLID as input
-	 */
+	/* Set bit 15 of 0x52 to enable TCBLID as input */
 	pci_write_config_word(pdev, mcrbase + 2, mcr3 | 0x8000);
 	pci_read_config_byte(pdev, 0x5A, &ata66);
 	/* Reset TCBLID/FCBLID to output */
 	pci_write_config_word(pdev, mcrbase + 2, mcr3);
 
 	if (ata66 & (2 >> ap->port_no))
-		ap->cbl = ATA_CBL_PATA40;
+		return ATA_CBL_PATA40;
 	else
-		ap->cbl = ATA_CBL_PATA80;
+		return ATA_CBL_PATA80;
+}
+
+/**
+ *	hpt37x_pre_reset	-	reset the hpt37x bus
+ *	@link: ATA link to reset
+ *	@deadline: deadline jiffies for the operation
+ *
+ *	Perform the initial reset handling for the HPT37x.
+ */
+
+static int hpt37x_pre_reset(struct ata_link *link, unsigned long deadline)
+{
+	struct ata_port *ap = link->ap;
+	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
+	static const struct pci_bits hpt37x_enable_bits[] = {
+		{ 0x50, 1, 0x04, 0x04 },
+		{ 0x54, 1, 0x04, 0x04 }
+	};
+	if (!pci_test_config_bits(pdev, &hpt37x_enable_bits[ap->port_no]))
+		return -ENOENT;
 
 	/* Reset the state machine */
 	pci_write_config_byte(pdev, 0x50 + 4 * ap->port_no, 0x37);
@@ -404,9 +411,8 @@ static void hpt370_set_piomode(struct ata_port *ap, struct ata_device *adev)
 
 	pci_read_config_dword(pdev, addr1, &reg);
 	mode = hpt37x_find_mode(ap, adev->pio_mode);
-	mode &= ~0x8000000;	/* No FIFO in PIO */
-	mode &= ~0x30070000;	/* Leave config bits alone */
-	reg &= 0x30070000;	/* Strip timing bits */
+	mode &= 0xCFC3FFFF;	/* Leave DMA bits alone */
+	reg &= ~0xCFC3FFFF;	/* Strip timing bits */
 	pci_write_config_dword(pdev, addr1, reg | mode);
 }
 
@@ -423,8 +429,7 @@ static void hpt370_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 {
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
 	u32 addr1, addr2;
-	u32 reg;
-	u32 mode;
+	u32 reg, mode, mask;
 	u8 fast;
 
 	addr1 = 0x40 + 4 * (adev->devno + 2 * ap->port_no);
@@ -436,11 +441,12 @@ static void hpt370_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 	fast |= 0x01;
 	pci_write_config_byte(pdev, addr2, fast);
 
+	mask = adev->dma_mode < XFER_UDMA_0 ? 0x31C001FF : 0x303C0000;
+
 	pci_read_config_dword(pdev, addr1, &reg);
 	mode = hpt37x_find_mode(ap, adev->dma_mode);
-	mode |= 0x8000000;	/* FIFO in MWDMA or UDMA */
-	mode &= ~0xC0000000;	/* Leave config bits alone */
-	reg &= 0xC0000000;	/* Strip timing bits */
+	mode &= mask;
+	reg &= ~mask;
 	pci_write_config_dword(pdev, addr1, reg | mode);
 }
 
@@ -508,9 +514,8 @@ static void hpt372_set_piomode(struct ata_port *ap, struct ata_device *adev)
 	mode = hpt37x_find_mode(ap, adev->pio_mode);
 
 	printk("Find mode for %d reports %X\n", adev->pio_mode, mode);
-	mode &= ~0x80000000;	/* No FIFO in PIO */
-	mode &= ~0x30070000;	/* Leave config bits alone */
-	reg &= 0x30070000;	/* Strip timing bits */
+	mode &= 0xCFC3FFFF;	/* Leave DMA bits alone */
+	reg &= ~0xCFC3FFFF;	/* Strip timing bits */
 	pci_write_config_dword(pdev, addr1, reg | mode);
 }
 
@@ -527,8 +532,7 @@ static void hpt372_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 {
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
 	u32 addr1, addr2;
-	u32 reg;
-	u32 mode;
+	u32 reg, mode, mask;
 	u8 fast;
 
 	addr1 = 0x40 + 4 * (adev->devno + 2 * ap->port_no);
@@ -539,12 +543,13 @@ static void hpt372_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 	fast &= ~0x07;
 	pci_write_config_byte(pdev, addr2, fast);
 
+	mask = adev->dma_mode < XFER_UDMA_0 ? 0x31C001FF : 0x303C0000;
+
 	pci_read_config_dword(pdev, addr1, &reg);
 	mode = hpt37x_find_mode(ap, adev->dma_mode);
 	printk("Find mode for DMA %d reports %X\n", adev->dma_mode, mode);
-	mode &= ~0xC0000000;	/* Leave config bits alone */
-	mode |= 0x80000000;	/* FIFO in MWDMA or UDMA */
-	reg &= 0xC0000000;	/* Strip timing bits */
+	mode &= mask;
+	reg &= ~mask;
 	pci_write_config_dword(pdev, addr1, reg | mode);
 }
 
@@ -584,6 +589,7 @@ static struct ata_port_operations hpt370_port_ops = {
 	.bmdma_stop	= hpt370_bmdma_stop,
 
 	.mode_filter	= hpt370_filter,
+	.cable_detect	= hpt37x_cable_detect,
 	.set_piomode	= hpt370_set_piomode,
 	.set_dmamode	= hpt370_set_dmamode,
 	.prereset	= hpt37x_pre_reset,
@@ -608,6 +614,7 @@ static struct ata_port_operations hpt372_port_ops = {
 
 	.bmdma_stop	= hpt37x_bmdma_stop,
 
+	.cable_detect	= hpt37x_cable_detect,
 	.set_piomode	= hpt372_set_piomode,
 	.set_dmamode	= hpt372_set_dmamode,
 	.prereset	= hpt37x_pre_reset,
@@ -620,7 +627,8 @@ static struct ata_port_operations hpt372_port_ops = {
 
 static struct ata_port_operations hpt374_fn1_port_ops = {
 	.inherits	= &hpt372_port_ops,
-	.prereset	= hpt374_fn1_pre_reset,
+	.cable_detect	= hpt374_fn1_cable_detect,
+	.prereset	= hpt37x_pre_reset,
 };
 
 /**
@@ -791,9 +799,8 @@ static int hpt37x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 	static const int MHz[4] = { 33, 40, 50, 66 };
 	void *private_data = NULL;
 	const struct ata_port_info *ppi[] = { NULL, NULL };
-
+	u8 rev = dev->revision;
 	u8 irqmask;
-	u32 class_rev;
 	u8 mcr1;
 	u32 freq;
 	int prefer_dpll = 1;
@@ -808,19 +815,16 @@ static int hpt37x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
-	pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev);
-	class_rev &= 0xFF;
-
 	if (dev->device == PCI_DEVICE_ID_TTI_HPT366) {
 		/* May be a later chip in disguise. Check */
 		/* Older chips are in the HPT366 driver. Ignore them */
-		if (class_rev < 3)
+		if (rev < 3)
 			return -ENODEV;
 		/* N series chips have their own driver. Ignore */
-		if (class_rev == 6)
+		if (rev == 6)
 			return -ENODEV;
 
-		switch(class_rev) {
+		switch(rev) {
 			case 3:
 				ppi[0] = &info_hpt370;
 				chip_table = &hpt370;
@@ -836,28 +840,29 @@ static int hpt37x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 				chip_table = &hpt372;
 				break;
 			default:
-				printk(KERN_ERR "pata_hpt37x: Unknown HPT366 subtype please report (%d).\n", class_rev);
+				printk(KERN_ERR "pata_hpt37x: Unknown HPT366 "
+				       "subtype, please report (%d).\n", rev);
 				return -ENODEV;
 		}
 	} else {
 		switch(dev->device) {
 			case PCI_DEVICE_ID_TTI_HPT372:
 				/* 372N if rev >= 2*/
-				if (class_rev >= 2)
+				if (rev >= 2)
 					return -ENODEV;
 				ppi[0] = &info_hpt372;
 				chip_table = &hpt372a;
 				break;
 			case PCI_DEVICE_ID_TTI_HPT302:
 				/* 302N if rev > 1 */
-				if (class_rev > 1)
+				if (rev > 1)
 					return -ENODEV;
 				ppi[0] = &info_hpt372;
 				/* Check this */
 				chip_table = &hpt302;
 				break;
 			case PCI_DEVICE_ID_TTI_HPT371:
-				if (class_rev > 1)
+				if (rev > 1)
 					return -ENODEV;
 				ppi[0] = &info_hpt372;
 				chip_table = &hpt371;
diff --git a/drivers/ata/pata_hpt3x2n.c b/drivers/ata/pata_hpt3x2n.c
index 3d59fe0..9a09a1b 100644
--- a/drivers/ata/pata_hpt3x2n.c
+++ b/drivers/ata/pata_hpt3x2n.c
@@ -25,7 +25,7 @@
 #include <linux/libata.h>
 
 #define DRV_NAME	"pata_hpt3x2n"
-#define DRV_VERSION	"0.3.4"
+#define DRV_VERSION	"0.3.7"
 
 enum {
 	HPT_PCI_FAST	=	(1 << 31),
@@ -80,14 +80,13 @@ static struct hpt_clock hpt3x2n_clocks[] = {
 
 	{	XFER_MW_DMA_2,	0x2c829c62	},
 	{	XFER_MW_DMA_1,	0x2c829c66	},
-	{	XFER_MW_DMA_0,	0x2c829d2c	},
+	{	XFER_MW_DMA_0,	0x2c829d2e	},
 
 	{	XFER_PIO_4,	0x0c829c62	},
 	{	XFER_PIO_3,	0x0c829c84	},
 	{	XFER_PIO_2,	0x0c829ca6	},
 	{	XFER_PIO_1,	0x0d029d26	},
 	{	XFER_PIO_0,	0x0d029d5e	},
-	{	0,		0x0d029d5e	}
 };
 
 /**
@@ -128,12 +127,15 @@ static int hpt3x2n_cable_detect(struct ata_port *ap)
 
 	pci_read_config_byte(pdev, 0x5B, &scr2);
 	pci_write_config_byte(pdev, 0x5B, scr2 & ~0x01);
+
+	udelay(10); /* debounce */
+
 	/* Cable register now active */
 	pci_read_config_byte(pdev, 0x5A, &ata66);
 	/* Restore state */
 	pci_write_config_byte(pdev, 0x5B, scr2);
 
-	if (ata66 & (1 << ap->port_no))
+	if (ata66 & (2 >> ap->port_no))
 		return ATA_CBL_PATA40;
 	else
 		return ATA_CBL_PATA80;
@@ -185,9 +187,8 @@ static void hpt3x2n_set_piomode(struct ata_port *ap, struct ata_device *adev)
 
 	pci_read_config_dword(pdev, addr1, &reg);
 	mode = hpt3x2n_find_mode(ap, adev->pio_mode);
-	mode &= ~0x8000000;	/* No FIFO in PIO */
-	mode &= ~0x30070000;	/* Leave config bits alone */
-	reg &= 0x30070000;	/* Strip timing bits */
+	mode &= 0xCFC3FFFF;	/* Leave DMA bits alone */
+	reg &= ~0xCFC3FFFF;	/* Strip timing bits */
 	pci_write_config_dword(pdev, addr1, reg | mode);
 }
 
@@ -204,8 +205,7 @@ static void hpt3x2n_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 {
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
 	u32 addr1, addr2;
-	u32 reg;
-	u32 mode;
+	u32 reg, mode, mask;
 	u8 fast;
 
 	addr1 = 0x40 + 4 * (adev->devno + 2 * ap->port_no);
@@ -216,11 +216,12 @@ static void hpt3x2n_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 	fast &= ~0x07;
 	pci_write_config_byte(pdev, addr2, fast);
 
+	mask = adev->dma_mode < XFER_UDMA_0 ? 0x31C001FF : 0x303C0000;
+
 	pci_read_config_dword(pdev, addr1, &reg);
 	mode = hpt3x2n_find_mode(ap, adev->dma_mode);
-	mode |= 0x8000000;	/* FIFO in MWDMA or UDMA */
-	mode &= ~0xC0000000;	/* Leave config bits alone */
-	reg &= 0xC0000000;	/* Strip timing bits */
+	mode &= mask;
+	reg &= ~mask;
 	pci_write_config_dword(pdev, addr1, reg | mode);
 }
 
@@ -447,10 +448,8 @@ static int hpt3x2n_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 		.port_ops = &hpt3x2n_port_ops
 	};
 	const struct ata_port_info *ppi[] = { &info, NULL };
-
+	u8 rev = dev->revision;
 	u8 irqmask;
-	u32 class_rev;
-
 	unsigned int pci_mhz;
 	unsigned int f_low, f_high;
 	int adjust;
@@ -462,26 +461,23 @@ static int hpt3x2n_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
-	pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev);
-	class_rev &= 0xFF;
-
 	switch(dev->device) {
 		case PCI_DEVICE_ID_TTI_HPT366:
-			if (class_rev < 6)
+			if (rev < 6)
 				return -ENODEV;
 			break;
 		case PCI_DEVICE_ID_TTI_HPT371:
-			if (class_rev < 2)
+			if (rev < 2)
 				return -ENODEV;
 			/* 371N if rev > 1 */
 			break;
 		case PCI_DEVICE_ID_TTI_HPT372:
 			/* 372N if rev >= 2*/
-			if (class_rev < 2)
+			if (rev < 2)
 				return -ENODEV;
 			break;
 		case PCI_DEVICE_ID_TTI_HPT302:
-			if (class_rev < 2)
+			if (rev < 2)
 				return -ENODEV;
 			break;
 		case PCI_DEVICE_ID_TTI_HPT372N:
diff --git a/drivers/ata/pata_hpt3x3.c b/drivers/ata/pata_hpt3x3.c
index 7e31025..c86c716 100644
--- a/drivers/ata/pata_hpt3x3.c
+++ b/drivers/ata/pata_hpt3x3.c
@@ -255,8 +255,17 @@ static int hpt3x3_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 #ifdef CONFIG_PM
 static int hpt3x3_reinit_one(struct pci_dev *dev)
 {
+	struct ata_host *host = dev_get_drvdata(&dev->dev);
+	int rc;
+
+	rc = ata_pci_device_do_resume(dev);
+	if (rc)
+		return rc;
+
 	hpt3x3_init_chipset(dev);
-	return ata_pci_device_resume(dev);
+
+	ata_host_resume(host);
+	return 0;
 }
 #endif
 
diff --git a/drivers/ata/pata_it8213.c b/drivers/ata/pata_it8213.c
index f156da8..8f3325a 100644
--- a/drivers/ata/pata_it8213.c
+++ b/drivers/ata/pata_it8213.c
@@ -22,7 +22,7 @@
 #define DRV_VERSION	"0.0.3"
 
 /**
- *	it8213_pre_reset	-	check for 40/80 pin
+ *	it8213_pre_reset	-	probe begin
  *	@link: link
  *	@deadline: deadline jiffies for the operation
  *
@@ -92,18 +92,17 @@ static void it8213_set_piomode (struct ata_port *ap, struct ata_device *adev)
 			    { 2, 1 },
 			    { 2, 3 }, };
 
-	if (pio > 2)
-		control |= 1;	/* TIME1 enable */
+	if (pio > 1)
+		control |= 1;	/* TIME */
 	if (ata_pio_need_iordy(adev))	/* PIO 3/4 require IORDY */
-		control |= 2;	/* IORDY enable */
+		control |= 2;	/* IE */
 	/* Bit 2 is set for ATAPI on the IT8213 - reverse of ICH/PIIX */
 	if (adev->class != ATA_DEV_ATA)
-		control |= 4;
+		control |= 4;	/* PPE */
 
 	pci_read_config_word(dev, idetm_port, &idetm_data);
 
-	/* Enable PPE, IE and TIME as appropriate */
-
+	/* Set PPE, IE, and TIME as appropriate */
 	if (adev->devno == 0) {
 		idetm_data &= 0xCCF0;
 		idetm_data |= control;
@@ -112,17 +111,17 @@ static void it8213_set_piomode (struct ata_port *ap, struct ata_device *adev)
 	} else {
 		u8 slave_data;
 
-		idetm_data &= 0xCC0F;
+		idetm_data &= 0xFF0F;
 		idetm_data |= (control << 4);
 
 		/* Slave timing in separate register */
 		pci_read_config_byte(dev, 0x44, &slave_data);
 		slave_data &= 0xF0;
-		slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << 4;
+		slave_data |= (timings[pio][0] << 2) | timings[pio][1];
 		pci_write_config_byte(dev, 0x44, slave_data);
 	}
 
-	idetm_data |= 0x4000;	/* Ensure SITRE is enabled */
+	idetm_data |= 0x4000;	/* Ensure SITRE is set */
 	pci_write_config_word(dev, idetm_port, idetm_data);
 }
 
@@ -173,10 +172,10 @@ static void it8213_set_dmamode (struct ata_port *ap, struct ata_device *adev)
 
 		udma_enable |= (1 << devid);
 
-		/* Load the UDMA mode number */
+		/* Load the UDMA cycle time */
 		pci_read_config_word(dev, 0x4A, &udma_timing);
 		udma_timing &= ~(3 << (4 * devid));
-		udma_timing |= (udma & 3) << (4 * devid);
+		udma_timing |= u_speed << (4 * devid);
 		pci_write_config_word(dev, 0x4A, udma_timing);
 
 		/* Load the clock selection */
@@ -211,7 +210,7 @@ static void it8213_set_dmamode (struct ata_port *ap, struct ata_device *adev)
 			master_data &= 0xFF4F;  /* Mask out IORDY|TIME1|DMAONLY */
 			master_data |= control << 4;
 			pci_read_config_byte(dev, 0x44, &slave_data);
-			slave_data &= (0x0F + 0xE1 * ap->port_no);
+			slave_data &= 0xF0;
 			/* Load the matching timing */
 			slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << (ap->port_no ? 4 : 0);
 			pci_write_config_byte(dev, 0x44, slave_data);
@@ -263,7 +262,7 @@ static int it8213_init_one (struct pci_dev *pdev, const struct pci_device_id *en
 	static const struct ata_port_info info = {
 		.flags		= ATA_FLAG_SLAVE_POSS,
 		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
+		.mwdma_mask	= ATA_MWDMA12_ONLY,
 		.udma_mask 	= ATA_UDMA4, /* FIXME: want UDMA 100? */
 		.port_ops	= &it8213_ops,
 	};
diff --git a/drivers/ata/pata_it821x.c b/drivers/ata/pata_it821x.c
index 188bc2f..edc5c1f 100644
--- a/drivers/ata/pata_it821x.c
+++ b/drivers/ata/pata_it821x.c
@@ -955,7 +955,7 @@ static int it821x_reinit_one(struct pci_dev *pdev)
 static const struct pci_device_id it821x[] = {
 	{ PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8211), },
 	{ PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8212), },
-	{ PCI_VDEVICE(RDC, 0x1010), },
+	{ PCI_VDEVICE(RDC, PCI_DEVICE_ID_RDC_D1010), },
 
 	{ },
 };
diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c
index 6932e56..9df1ff7 100644
--- a/drivers/ata/pata_legacy.c
+++ b/drivers/ata/pata_legacy.c
@@ -25,6 +25,13 @@
  *		http://www.ryston.cz/petr/vlb/pdc20230b.html
  *		http://www.ryston.cz/petr/vlb/pdc20230c.html
  *		http://www.ryston.cz/petr/vlb/pdc20630.html
+ *	QDI65x0:
+ *		http://www.ryston.cz/petr/vlb/qd6500.html
+ *		http://www.ryston.cz/petr/vlb/qd6580.html
+ *
+ *	QDI65x0 probe code based on drivers/ide/legacy/qd65xx.c
+ *	Rewritten from the work of Colten Edwards <pje120@cs.usask.ca> by
+ *	Samuel Thibault <samuel.thibault@ens-lyon.org>
  *
  *  Unsupported but docs exist:
  *	Appian/Adaptec AIC25VL01/Cirrus Logic PD7220
@@ -35,7 +42,7 @@
  *  the MPIIX where the tuning is PCI side but the IDE is "ISA side".
  *
  *  Specific support is included for the ht6560a/ht6560b/opti82c611a/
- *  opti82c465mv/promise 20230c/20630/winbond83759A
+ *  opti82c465mv/promise 20230c/20630/qdi65x0/winbond83759A
  *
  *  Use the autospeed and pio_mask options with:
  *	Appian ADI/2 aka CLPD7220 or AIC25VL01.
@@ -672,7 +679,7 @@ static void qdi6580dp_set_piomode(struct ata_port *ap, struct ata_device *adev)
 	outb(timing, ld_qdi->timing + 2 * ap->port_no);
 	/* Clear the FIFO */
 	if (adev->class != ATA_DEV_ATA)
-		outb(0x5F, ld_qdi->timing + 3);
+		outb(0x5F, (ld_qdi->timing & 0xFFF0) + 3);
 }
 
 /**
@@ -707,7 +714,7 @@ static void qdi6580_set_piomode(struct ata_port *ap, struct ata_device *adev)
 	outb(timing, ld_qdi->timing + 2 * adev->devno);
 	/* Clear the FIFO */
 	if (adev->class != ATA_DEV_ATA)
-		outb(0x5F, ld_qdi->timing + 3);
+		outb(0x5F, (ld_qdi->timing & 0xFFF0) + 3);
 }
 
 /**
@@ -787,6 +794,7 @@ static struct ata_port_operations qdi6580_port_ops = {
 static struct ata_port_operations qdi6580dp_port_ops = {
 	.inherits	= &legacy_base_port_ops,
 	.set_piomode	= qdi6580dp_set_piomode,
+	.qc_issue	= qdi_qc_issue,
 	.sff_data_xfer	= vlb32_data_xfer,
 };
 
diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c
index 2096fb7..950da39 100644
--- a/drivers/ata/pata_marvell.c
+++ b/drivers/ata/pata_marvell.c
@@ -58,7 +58,7 @@ static int marvell_pata_active(struct pci_dev *pdev)
 }
 
 /**
- *	marvell_pre_reset	-	check for 40/80 pin
+ *	marvell_pre_reset	-	probe begin
  *	@link: link
  *	@deadline: deadline jiffies for the operation
  *
diff --git a/drivers/ata/pata_ns87415.c b/drivers/ata/pata_ns87415.c
index 773b159..061aa1c 100644
--- a/drivers/ata/pata_ns87415.c
+++ b/drivers/ata/pata_ns87415.c
@@ -325,6 +325,13 @@ static struct scsi_host_template ns87415_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
+static void ns87415_fixup(struct pci_dev *pdev)
+{
+	/* Select 512 byte sectors */
+	pci_write_config_byte(pdev, 0x55, 0xEE);
+	/* Select PIO0 8bit clocking */
+	pci_write_config_byte(pdev, 0x54, 0xB7);
+}
 
 /**
  *	ns87415_init_one - Register 87415 ATA PCI device with kernel services
@@ -371,10 +378,8 @@ static int ns87415_init_one (struct pci_dev *pdev, const struct pci_device_id *e
 	if (rc)
 		return rc;
 
-	/* Select 512 byte sectors */
-	pci_write_config_byte(pdev, 0x55, 0xEE);
-	/* Select PIO0 8bit clocking */
-	pci_write_config_byte(pdev, 0x54, 0xB7);
+	ns87415_fixup(pdev);
+
 	return ata_pci_sff_init_one(pdev, ppi, &ns87415_sht, NULL);
 }
 
@@ -384,6 +389,23 @@ static const struct pci_device_id ns87415_pci_tbl[] = {
 	{ }	/* terminate list */
 };
 
+#ifdef CONFIG_PM
+static int ns87415_reinit_one(struct pci_dev *pdev)
+{
+	struct ata_host *host = dev_get_drvdata(&pdev->dev);
+	int rc;
+
+	rc = ata_pci_device_do_resume(pdev);
+	if (rc)
+		return rc;
+
+	ns87415_fixup(pdev);
+
+	ata_host_resume(host);
+	return 0;
+}
+#endif
+
 static struct pci_driver ns87415_pci_driver = {
 	.name			= DRV_NAME,
 	.id_table		= ns87415_pci_tbl,
@@ -391,7 +413,7 @@ static struct pci_driver ns87415_pci_driver = {
 	.remove			= ata_pci_remove_one,
 #ifdef CONFIG_PM
 	.suspend		= ata_pci_device_suspend,
-	.resume			= ata_pci_device_resume,
+	.resume			= ns87415_reinit_one,
 #endif
 };
 
diff --git a/drivers/ata/pata_oldpiix.c b/drivers/ata/pata_oldpiix.c
index 84ac503..9a8687d 100644
--- a/drivers/ata/pata_oldpiix.c
+++ b/drivers/ata/pata_oldpiix.c
@@ -239,7 +239,7 @@ static int oldpiix_init_one (struct pci_dev *pdev, const struct pci_device_id *e
 	static const struct ata_port_info info = {
 		.flags		= ATA_FLAG_SLAVE_POSS,
 		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
+		.mwdma_mask	= ATA_MWDMA12_ONLY,
 		.port_ops	= &oldpiix_pata_ops,
 	};
 	const struct ata_port_info *ppi[] = { &info, NULL };
diff --git a/drivers/ata/pata_piccolo.c b/drivers/ata/pata_piccolo.c
new file mode 100644
index 0000000..bfe0180
--- /dev/null
+++ b/drivers/ata/pata_piccolo.c
@@ -0,0 +1,140 @@
+/*
+ *  pata_piccolo.c - Toshiba Piccolo PATA/SATA controller driver.
+ *
+ *  This is basically an update to ata_generic.c to add Toshiba Piccolo support
+ *  then split out to keep ata_generic "clean".
+ *
+ *  Copyright 2005 Red Hat Inc, all rights reserved.
+ *
+ *  Elements from ide/pci/generic.c
+ *	    Copyright (C) 2001-2002	Andre Hedrick <andre@linux-ide.org>
+ *	    Portions (C) Copyright 2002  Red Hat Inc <alan@redhat.com>
+ *
+ *  May be copied or modified under the terms of the GNU General Public License
+ *
+ *  The timing data tables/programming info are courtesy of the NetBSD driver
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <scsi/scsi_host.h>
+#include <linux/libata.h>
+
+#define DRV_NAME "pata_piccolo"
+#define DRV_VERSION "0.0.1"
+
+
+
+static void tosh_set_piomode(struct ata_port *ap, struct ata_device *adev)
+{
+	static const u16 pio[6] = {	/* For reg 0x50 low word & E088 */
+		0x0566, 0x0433, 0x0311, 0x0201, 0x0200, 0x0100
+	};
+	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
+	u16 conf;
+	pci_read_config_word(pdev, 0x50, &conf);
+	conf &= 0xE088;
+	conf |= pio[adev->pio_mode - XFER_PIO_0];
+	pci_write_config_word(pdev, 0x50, conf);
+}
+
+static void tosh_set_dmamode(struct ata_port *ap, struct ata_device *adev)
+{
+	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
+	u32 conf;
+	pci_read_config_dword(pdev, 0x5C, &conf);
+	conf &= 0x78FFE088;	/* Keep the other bits */
+	if (adev->dma_mode >= XFER_UDMA_0) {
+		int udma = adev->dma_mode - XFER_UDMA_0;
+		conf |= 0x80000000;
+		conf |= (udma + 2) << 28;
+		conf |= (2 - udma) * 0x111;	/* spread into three nibbles */
+	} else {
+		static const u32 mwdma[4] = {
+			0x0655, 0x0200, 0x0200, 0x0100
+		};
+		conf |= mwdma[adev->dma_mode - XFER_MW_DMA_0];
+	}
+	pci_write_config_dword(pdev, 0x5C, conf);
+}
+
+
+static struct scsi_host_template tosh_sht = {
+	ATA_BMDMA_SHT(DRV_NAME),
+};
+
+static struct ata_port_operations tosh_port_ops = {
+	.inherits	= &ata_bmdma_port_ops,
+	.cable_detect	= ata_cable_unknown,
+	.set_piomode	= tosh_set_piomode,
+	.set_dmamode	= tosh_set_dmamode
+};
+
+/**
+ *	ata_tosh_init		-	attach generic IDE
+ *	@dev: PCI device found
+ *	@id: match entry
+ *
+ *	Called each time a matching IDE interface is found. We check if the
+ *	interface is one we wish to claim and if so we perform any chip
+ *	specific hacks then let the ATA layer do the heavy lifting.
+ */
+
+static int ata_tosh_init_one(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	static const struct ata_port_info info = {
+		.flags = ATA_FLAG_SLAVE_POSS,
+		.pio_mask = ATA_PIO5,
+		.mwdma_mask = ATA_MWDMA2,
+		.udma_mask = ATA_UDMA2,
+		.port_ops = &tosh_port_ops
+	};
+	const struct ata_port_info *ppi[] = { &info, &ata_dummy_port_info };
+	/* Just one port for the moment */
+	return ata_pci_sff_init_one(dev, ppi, &tosh_sht, NULL);
+}
+
+static struct pci_device_id ata_tosh[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_1), },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_2),  },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_3),  },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_5),  },
+	{ 0, },
+};
+
+static struct pci_driver ata_tosh_pci_driver = {
+	.name 		= DRV_NAME,
+	.id_table	= ata_tosh,
+	.probe 		= ata_tosh_init_one,
+	.remove		= ata_pci_remove_one,
+#ifdef CONFIG_PM
+	.suspend	= ata_pci_device_suspend,
+	.resume		= ata_pci_device_resume,
+#endif
+};
+
+static int __init ata_tosh_init(void)
+{
+	return pci_register_driver(&ata_tosh_pci_driver);
+}
+
+
+static void __exit ata_tosh_exit(void)
+{
+	pci_unregister_driver(&ata_tosh_pci_driver);
+}
+
+
+MODULE_AUTHOR("Alan Cox");
+MODULE_DESCRIPTION("Low level driver for Toshiba Piccolo ATA");
+MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(pci, ata_tosh);
+MODULE_VERSION(DRV_VERSION);
+
+module_init(ata_tosh_init);
+module_exit(ata_tosh_exit);
+
diff --git a/drivers/ata/pata_radisys.c b/drivers/ata/pata_radisys.c
index 4401b33..4fd25e7 100644
--- a/drivers/ata/pata_radisys.c
+++ b/drivers/ata/pata_radisys.c
@@ -139,9 +139,9 @@ static void radisys_set_dmamode (struct ata_port *ap, struct ata_device *adev)
 		pci_read_config_byte(dev, 0x4A, &udma_mode);
 
 		if (adev->xfer_mode == XFER_UDMA_2)
-			udma_mode &= ~ (1 << adev->devno);
+			udma_mode &= ~(2 << (adev->devno * 4));
 		else /* UDMA 4 */
-			udma_mode |= (1 << adev->devno);
+			udma_mode |= (2 << (adev->devno * 4));
 
 		pci_write_config_byte(dev, 0x4A, udma_mode);
 
diff --git a/drivers/ata/pata_rdc.c b/drivers/ata/pata_rdc.c
index c843a1e..237a24d 100644
--- a/drivers/ata/pata_rdc.c
+++ b/drivers/ata/pata_rdc.c
@@ -284,7 +284,7 @@ static struct ata_port_info rdc_port_info = {
 
 	.flags		= ATA_FLAG_SLAVE_POSS,
 	.pio_mask	= ATA_PIO4,
-	.mwdma_mask	= ATA_MWDMA2,
+	.mwdma_mask	= ATA_MWDMA12_ONLY,
 	.udma_mask	= ATA_UDMA5,
 	.port_ops	= &rdc_pata_ops,
 };
diff --git a/drivers/ata/pata_rz1000.c b/drivers/ata/pata_rz1000.c
index a5e4dfe..2932998 100644
--- a/drivers/ata/pata_rz1000.c
+++ b/drivers/ata/pata_rz1000.c
@@ -105,11 +105,20 @@ static int rz1000_init_one (struct pci_dev *pdev, const struct pci_device_id *en
 #ifdef CONFIG_PM
 static int rz1000_reinit_one(struct pci_dev *pdev)
 {
+	struct ata_host *host = dev_get_drvdata(&pdev->dev);
+	int rc;
+
+	rc = ata_pci_device_do_resume(pdev);
+	if (rc)
+		return rc;
+
 	/* If this fails on resume (which is a "cant happen" case), we
 	   must stop as any progress risks data loss */
 	if (rz1000_fifo_disable(pdev))
 		panic("rz1000 fifo");
-	return ata_pci_device_resume(pdev);
+
+	ata_host_resume(host);
+	return 0;
 }
 #endif
 
diff --git a/drivers/ata/pata_sil680.c b/drivers/ata/pata_sil680.c
index 4cb649d..a2ace48 100644
--- a/drivers/ata/pata_sil680.c
+++ b/drivers/ata/pata_sil680.c
@@ -212,13 +212,11 @@ static struct ata_port_operations sil680_port_ops = {
 
 static u8 sil680_init_chip(struct pci_dev *pdev, int *try_mmio)
 {
-	u32 class_rev	= 0;
 	u8 tmpbyte	= 0;
 
-        pci_read_config_dword(pdev, PCI_CLASS_REVISION, &class_rev);
-        class_rev &= 0xff;
         /* FIXME: double check */
-	pci_write_config_byte(pdev, PCI_CACHE_LINE_SIZE, (class_rev) ? 1 : 255);
+	pci_write_config_byte(pdev, PCI_CACHE_LINE_SIZE,
+			      pdev->revision ? 1 : 255);
 
 	pci_write_config_byte(pdev, 0x80, 0x00);
 	pci_write_config_byte(pdev, 0x84, 0x00);
diff --git a/drivers/ata/pata_sis.c b/drivers/ata/pata_sis.c
index 488e77b..5c30d56 100644
--- a/drivers/ata/pata_sis.c
+++ b/drivers/ata/pata_sis.c
@@ -2,7 +2,7 @@
  *    pata_sis.c - SiS ATA driver
  *
  *	(C) 2005 Red Hat
- *	(C) 2007 Bartlomiej Zolnierkiewicz
+ *	(C) 2007,2009 Bartlomiej Zolnierkiewicz
  *
  *    Based upon linux/drivers/ide/pci/sis5513.c
  * Copyright (C) 1999-2000	Andre Hedrick <andre@linux-ide.org>
@@ -829,6 +829,23 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	return ata_pci_sff_init_one(pdev, ppi, &sis_sht, chipset);
 }
 
+#ifdef CONFIG_PM
+static int sis_reinit_one(struct pci_dev *pdev)
+{
+	struct ata_host *host = dev_get_drvdata(&pdev->dev);
+	int rc;
+
+	rc = ata_pci_device_do_resume(pdev);
+	if (rc)
+		return rc;
+
+	sis_fixup(pdev, host->private_data);
+
+	ata_host_resume(host);
+	return 0;
+}
+#endif
+
 static const struct pci_device_id sis_pci_tbl[] = {
 	{ PCI_VDEVICE(SI, 0x5513), },	/* SiS 5513 */
 	{ PCI_VDEVICE(SI, 0x5518), },	/* SiS 5518 */
@@ -844,7 +861,7 @@ static struct pci_driver sis_pci_driver = {
 	.remove			= ata_pci_remove_one,
 #ifdef CONFIG_PM
 	.suspend		= ata_pci_device_suspend,
-	.resume			= ata_pci_device_resume,
+	.resume			= sis_reinit_one,
 #endif
 };
 
diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
index 88984b8..0d97890 100644
--- a/drivers/ata/pata_via.c
+++ b/drivers/ata/pata_via.c
@@ -303,14 +303,21 @@ static void via_do_set_mode(struct ata_port *ap, struct ata_device *adev, int mo
 	}
 
 	/* Set UDMA unless device is not UDMA capable */
-	if (udma_type && t.udma) {
-		u8 cable80_status;
+	if (udma_type) {
+		u8 udma_etc;
 
-		/* Get 80-wire cable detection bit */
-		pci_read_config_byte(pdev, 0x50 + offset, &cable80_status);
-		cable80_status &= 0x10;
+		pci_read_config_byte(pdev, 0x50 + offset, &udma_etc);
 
-		pci_write_config_byte(pdev, 0x50 + offset, ut | cable80_status);
+		/* clear transfer mode bit */
+		udma_etc &= ~0x20;
+
+		if (t.udma) {
+			/* preserve 80-wire cable detection bit */
+			udma_etc &= 0x10;
+			udma_etc |= ut;
+		}
+
+		pci_write_config_byte(pdev, 0x50 + offset, udma_etc);
 	}
 }
 
@@ -337,6 +344,32 @@ static void via_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 }
 
 /**
+ *	via_mode_filter		-	filter buggy device/mode pairs
+ *	@dev: ATA device
+ *	@mask: Mode bitmask
+ *
+ *	We need to apply some minimal filtering for old controllers and at least
+ *	one breed of Transcend SSD. Return the updated mask.
+ */
+
+static unsigned long via_mode_filter(struct ata_device *dev, unsigned long mask)
+{
+	struct ata_host *host = dev->link->ap->host;
+	const struct via_isa_bridge *config = host->private_data;
+	unsigned char model_num[ATA_ID_PROD_LEN + 1];
+
+	if (config->id == PCI_DEVICE_ID_VIA_82C586_0) {
+		ata_id_c_string(dev->id, model_num, ATA_ID_PROD, sizeof(model_num));
+		if (strcmp(model_num, "TS64GSSD25-M") == 0) {
+			ata_dev_printk(dev, KERN_WARNING,
+	"disabling UDMA mode due to reported lockups with this device.\n");
+			mask &= ~ ATA_MASK_UDMA;
+		}
+	}
+	return ata_bmdma_mode_filter(dev, mask);
+}
+
+/**
  *	via_tf_load - send taskfile registers to host controller
  *	@ap: Port to which output is sent
  *	@tf: ATA taskfile register set
@@ -427,6 +460,7 @@ static struct ata_port_operations via_port_ops = {
 	.prereset	= via_pre_reset,
 	.sff_tf_load	= via_tf_load,
 	.port_start	= via_port_start,
+	.mode_filter	= via_mode_filter,
 };
 
 static struct ata_port_operations via_port_ops_noirq = {
@@ -526,7 +560,7 @@ static int via_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 		.port_ops = &via_port_ops
 	};
 	const struct ata_port_info *ppi[] = { NULL, NULL };
-	struct pci_dev *isa = NULL;
+	struct pci_dev *isa;
 	const struct via_isa_bridge *config;
 	static int printed_version;
 	u8 enable;
@@ -551,15 +585,13 @@ static int via_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 		if ((isa = pci_get_device(PCI_VENDOR_ID_VIA +
 			!!(config->flags & VIA_BAD_ID),
 			config->id, NULL))) {
+			u8 rev = isa->revision;
+			pci_dev_put(isa);
 
-			if (isa->revision >= config->rev_min &&
-			    isa->revision <= config->rev_max)
+			if (rev >= config->rev_min && rev <= config->rev_max)
 				break;
-			pci_dev_put(isa);
 		}
 
-	pci_dev_put(isa);
-
 	if (!(config->flags & VIA_NO_ENABLES)) {
 		/* 0x40 low bits indicate enabled channels */
 		pci_read_config_byte(pdev, 0x40 , &enable);
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index 172b57e..8a5d35b 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -34,7 +34,7 @@ enum {
 
 	SATA_FSL_HOST_FLAGS	= (ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY |
 				ATA_FLAG_MMIO | ATA_FLAG_PIO_DMA |
-				ATA_FLAG_PMP | ATA_FLAG_NCQ),
+				ATA_FLAG_PMP | ATA_FLAG_NCQ | ATA_FLAG_AN),
 
 	SATA_FSL_MAX_CMDS	= SATA_FSL_QUEUE_DEPTH,
 	SATA_FSL_CMD_HDR_SIZE	= 16,	/* 4 DWORDS */
@@ -132,7 +132,7 @@ enum {
 	INT_ON_SINGL_DEVICE_ERR = (1 << 1),
 	INT_ON_CMD_COMPLETE = 1,
 
-	INT_ON_ERROR = INT_ON_FATAL_ERR |
+	INT_ON_ERROR = INT_ON_FATAL_ERR | INT_ON_SNOTIFY_UPDATE |
 	    INT_ON_PHYRDY_CHG | INT_ON_SINGL_DEVICE_ERR,
 
 	/*
@@ -153,7 +153,7 @@ enum {
 	IE_ON_CMD_COMPLETE = 1,
 
 	DEFAULT_PORT_IRQ_ENABLE_MASK = IE_ON_FATAL_ERR | IE_ON_PHYRDY_CHG |
-	    IE_ON_SIGNATURE_UPDATE |
+	    IE_ON_SIGNATURE_UPDATE | IE_ON_SNOTIFY_UPDATE |
 	    IE_ON_SINGL_DEVICE_ERR | IE_ON_CMD_COMPLETE,
 
 	EXT_INDIRECT_SEG_PRD_FLAG = (1 << 31),
@@ -992,9 +992,8 @@ static void sata_fsl_error_intr(struct ata_port *ap)
 	 */
 
 	sata_fsl_scr_read(&ap->link, SCR_ERROR, &SError);
-	if (unlikely(SError & 0xFFFF0000)) {
+	if (unlikely(SError & 0xFFFF0000))
 		sata_fsl_scr_write(&ap->link, SCR_ERROR, SError);
-	}
 
 	DPRINTK("error_intr,hStat=0x%x,CE=0x%x,DE =0x%x,SErr=0x%x\n",
 		hstatus, cereg, ioread32(hcr_base + DE), SError);
@@ -1007,6 +1006,10 @@ static void sata_fsl_error_intr(struct ata_port *ap)
 		freeze = 1;
 	}
 
+	/* Handle SDB FIS receive & notify update */
+	if (hstatus & INT_ON_SNOTIFY_UPDATE)
+		sata_async_notification(ap);
+
 	/* Handle PHYRDY change notification */
 	if (hstatus & INT_ON_PHYRDY_CHG) {
 		DPRINTK("SATA FSL: PHYRDY change indication\n");
@@ -1070,9 +1073,9 @@ static void sata_fsl_error_intr(struct ata_port *ap)
 	}
 
 	/* record error info */
-	if (qc) {
+	if (qc)
 		qc->err_mask |= err_mask;
-	} else
+	else
 		ehi->err_mask |= err_mask;
 
 	ehi->action |= action;
@@ -1103,7 +1106,6 @@ static void sata_fsl_host_intr(struct ata_port *ap)
 	if (unlikely(SError & 0xFFFF0000)) {
 		DPRINTK("serror @host_intr : 0x%x\n", SError);
 		sata_fsl_error_intr(ap);
-
 	}
 
 	if (unlikely(hstatus & INT_ON_ERROR)) {
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index 6f5093b..a8a7be0 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -2217,7 +2217,7 @@ static unsigned int mv_qc_issue_fis(struct ata_queued_cmd *qc)
 	int err = 0;
 
 	ata_tf_to_fis(&qc->tf, link->pmp, 1, (void *)fis);
-	err = mv_send_fis(ap, fis, sizeof(fis) / sizeof(fis[0]));
+	err = mv_send_fis(ap, fis, ARRAY_SIZE(fis));
 	if (err)
 		return err;
 
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index e6946fc..1370df6 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -417,6 +417,10 @@ static struct ata_port_operations sil24_ops = {
 #endif
 };
 
+static int sata_sil24_msi;    /* Disable MSI */
+module_param_named(msi, sata_sil24_msi, bool, S_IRUGO);
+MODULE_PARM_DESC(msi, "Enable MSI (Default: false)");
+
 /*
  * Use bits 30-31 of port_flags to encode available port numbers.
  * Current maxium is 4.
@@ -1340,6 +1344,11 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	sil24_init_controller(host);
 
+	if (sata_sil24_msi && !pci_enable_msi(pdev)) {
+		dev_printk(KERN_INFO, &pdev->dev, "Using MSI\n");
+		pci_intx(pdev, 0);
+	}
+
 	pci_set_master(pdev);
 	return ata_host_activate(host, pdev->irq, sil24_interrupt, IRQF_SHARED,
 				 &sil24_sht);
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 846d89e..5a01ece 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -185,6 +185,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq)
 	}
 
 	dev->power.runtime_status = RPM_SUSPENDING;
+	dev->power.deferred_resume = false;
 
 	if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) {
 		spin_unlock_irq(&dev->power.lock);
@@ -200,7 +201,6 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq)
 	if (retval) {
 		dev->power.runtime_status = RPM_ACTIVE;
 		pm_runtime_cancel_pending(dev);
-		dev->power.deferred_resume = false;
 
 		if (retval == -EAGAIN || retval == -EBUSY) {
 			notify = true;
@@ -217,7 +217,6 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq)
 	wake_up_all(&dev->power.wait_queue);
 
 	if (dev->power.deferred_resume) {
-		dev->power.deferred_resume = false;
 		__pm_runtime_resume(dev, false);
 		retval = -EAGAIN;
 		goto out;
@@ -626,6 +625,8 @@ int pm_schedule_suspend(struct device *dev, unsigned int delay)
 		goto out;
 
 	dev->power.timer_expires = jiffies + msecs_to_jiffies(delay);
+	if (!dev->power.timer_expires)
+		dev->power.timer_expires = 1;
 	mod_timer(&dev->power.suspend_timer, dev->power.timer_expires);
 
  out:
@@ -659,13 +660,17 @@ static int __pm_request_resume(struct device *dev)
 
 	pm_runtime_deactivate_timer(dev);
 
+	if (dev->power.runtime_status == RPM_SUSPENDING) {
+		dev->power.deferred_resume = true;
+		return retval;
+	}
 	if (dev->power.request_pending) {
 		/* If non-resume request is pending, we can overtake it. */
 		dev->power.request = retval ? RPM_REQ_NONE : RPM_REQ_RESUME;
 		return retval;
-	} else if (retval) {
-		return retval;
 	}
+	if (retval)
+		return retval;
 
 	dev->power.request = RPM_REQ_RESUME;
 	dev->power.request_pending = true;
@@ -777,7 +782,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status)
 	}
 
 	if (parent) {
-		spin_lock(&parent->power.lock);
+		spin_lock_nested(&parent->power.lock, SINGLE_DEPTH_NESTING);
 
 		/*
 		 * It is invalid to put an active child under a parent that is
@@ -786,12 +791,10 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status)
 		 */
 		if (!parent->power.disable_depth
 		    && !parent->power.ignore_children
-		    && parent->power.runtime_status != RPM_ACTIVE) {
+		    && parent->power.runtime_status != RPM_ACTIVE)
 			error = -EBUSY;
-		} else {
-			if (dev->power.runtime_status == RPM_SUSPENDED)
-				atomic_inc(&parent->power.child_count);
-		}
+		else if (dev->power.runtime_status == RPM_SUSPENDED)
+			atomic_inc(&parent->power.child_count);
 
 		spin_unlock(&parent->power.lock);
 
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e0..77bfce5 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
 	  instead, which can be configured to be on-disk compatible with the
 	  cryptoloop device.
 
+source "drivers/block/drbd/Kconfig"
+
 config BLK_DEV_NBD
 	tristate "Network block device support"
 	depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8..aff5ac9 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB)	+= ub.o
 obj-$(CONFIG_BLK_DEV_HD)	+= hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
 
 swim_mod-objs	:= swim.o swim_asm.o
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 92b1263..873e594 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -179,19 +179,17 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl);
 static int deregister_disk(ctlr_info_t *h, int drv_index,
 			   int clear_all, int via_ioctl);
 
-static void cciss_read_capacity(int ctlr, int logvol, int withirq,
+static void cciss_read_capacity(int ctlr, int logvol,
 			sector_t *total_size, unsigned int *block_size);
-static void cciss_read_capacity_16(int ctlr, int logvol, int withirq,
+static void cciss_read_capacity_16(int ctlr, int logvol,
 			sector_t *total_size, unsigned int *block_size);
 static void cciss_geometry_inquiry(int ctlr, int logvol,
-			int withirq, sector_t total_size,
+			sector_t total_size,
 			unsigned int block_size, InquiryData_struct *inq_buff,
 				   drive_info_struct *drv);
 static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *,
 					   __u32);
 static void start_io(ctlr_info_t *h);
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-		   __u8 page_code, unsigned char *scsi3addr, int cmd_type);
 static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
 			__u8 page_code, unsigned char scsi3addr[],
 			int cmd_type);
@@ -424,12 +422,9 @@ cciss_proc_write(struct file *file, const char __user *buf,
 	if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) {
 		struct seq_file *seq = file->private_data;
 		ctlr_info_t *h = seq->private;
-		int rc;
 
-		rc = cciss_engage_scsi(h->ctlr);
-		if (rc != 0)
-			err = -rc;
-		else
+		err = cciss_engage_scsi(h->ctlr);
+		if (err == 0)
 			err = length;
 	} else
 #endif /* CONFIG_CISS_SCSI_TAPE */
@@ -1657,9 +1652,11 @@ static void cciss_softirq_done(struct request *rq)
 {
 	CommandList_struct *cmd = rq->completion_data;
 	ctlr_info_t *h = hba[cmd->ctlr];
+	SGDescriptor_struct *curr_sg = cmd->SG;
 	unsigned long flags;
 	u64bit temp64;
 	int i, ddir;
+	int sg_index = 0;
 
 	if (cmd->Request.Type.Direction == XFER_READ)
 		ddir = PCI_DMA_FROMDEVICE;
@@ -1669,9 +1666,22 @@ static void cciss_softirq_done(struct request *rq)
 	/* command did not need to be retried */
 	/* unmap the DMA mapping for all the scatter gather elements */
 	for (i = 0; i < cmd->Header.SGList; i++) {
-		temp64.val32.lower = cmd->SG[i].Addr.lower;
-		temp64.val32.upper = cmd->SG[i].Addr.upper;
-		pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir);
+		if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) {
+			temp64.val32.lower = cmd->SG[i].Addr.lower;
+			temp64.val32.upper = cmd->SG[i].Addr.upper;
+			pci_dma_sync_single_for_cpu(h->pdev, temp64.val,
+						cmd->SG[i].Len, ddir);
+			pci_unmap_single(h->pdev, temp64.val,
+						cmd->SG[i].Len, ddir);
+			/* Point to the next block */
+			curr_sg = h->cmd_sg_list[cmd->cmdindex]->sgchain;
+			sg_index = 0;
+		}
+		temp64.val32.lower = curr_sg[sg_index].Addr.lower;
+		temp64.val32.upper = curr_sg[sg_index].Addr.upper;
+		pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len,
+				ddir);
+		++sg_index;
 	}
 
 #ifdef CCISS_DEBUG
@@ -1701,7 +1711,7 @@ static inline void log_unit_to_scsi3addr(ctlr_info_t *h,
  * via the inquiry page 0.  Model, vendor, and rev are set to empty strings if
  * they cannot be read.
  */
-static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
+static void cciss_get_device_descr(int ctlr, int logvol,
 				   char *vendor, char *model, char *rev)
 {
 	int rc;
@@ -1717,14 +1727,8 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
 		return;
 
 	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq)
-		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf,
-			     sizeof(InquiryData_struct), 0,
-				scsi3addr, TYPE_CMD);
-	else
-		rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
-			     sizeof(InquiryData_struct), 0,
-				scsi3addr, TYPE_CMD);
+	rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0,
+			scsi3addr, TYPE_CMD);
 	if (rc == IO_OK) {
 		memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
 		vendor[VENDOR_LEN] = '\0';
@@ -1743,7 +1747,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
  * number cannot be had, for whatever reason, 16 bytes of 0xff
  * are returned instead.
  */
-static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
+static void cciss_get_serial_no(int ctlr, int logvol,
 				unsigned char *serial_no, int buflen)
 {
 #define PAGE_83_INQ_BYTES 64
@@ -1759,12 +1763,8 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
 		return;
 	memset(serial_no, 0, buflen);
 	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq)
-		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
-			PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
-	else
-		rc = sendcmd(CISS_INQUIRY, ctlr, buf,
-			PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
+	rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
+		PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
 	if (rc == IO_OK)
 		memcpy(serial_no, &buf[8], buflen);
 	kfree(buf);
@@ -1793,10 +1793,10 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 	blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
 
 	/* This is a hardware imposed limit. */
-	blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES);
+	blk_queue_max_hw_segments(disk->queue, h->maxsgentries);
 
 	/* This is a limit in the driver and could be eliminated. */
-	blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES);
+	blk_queue_max_phys_segments(disk->queue, h->maxsgentries);
 
 	blk_queue_max_sectors(disk->queue, h->cciss_max_sectors);
 
@@ -1852,18 +1852,16 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
 
 	/* testing to see if 16-byte CDBs are already being used */
 	if (h->cciss_read == CCISS_READ_16) {
-		cciss_read_capacity_16(h->ctlr, drv_index, 1,
+		cciss_read_capacity_16(h->ctlr, drv_index,
 			&total_size, &block_size);
 
 	} else {
-		cciss_read_capacity(ctlr, drv_index, 1,
-				    &total_size, &block_size);
-
+		cciss_read_capacity(ctlr, drv_index, &total_size, &block_size);
 		/* if read_capacity returns all F's this volume is >2TB */
 		/* in size so we switch to 16-byte CDB's for all */
 		/* read/write ops */
 		if (total_size == 0xFFFFFFFFULL) {
-			cciss_read_capacity_16(ctlr, drv_index, 1,
+			cciss_read_capacity_16(ctlr, drv_index,
 			&total_size, &block_size);
 			h->cciss_read = CCISS_READ_16;
 			h->cciss_write = CCISS_WRITE_16;
@@ -1873,14 +1871,14 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
 		}
 	}
 
-	cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size,
+	cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size,
 			       inq_buff, drvinfo);
 	drvinfo->block_size = block_size;
 	drvinfo->nr_blocks = total_size + 1;
 
-	cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor,
+	cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor,
 				drvinfo->model, drvinfo->rev);
-	cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no,
+	cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no,
 			sizeof(drvinfo->serial_no));
 	/* Save the lunid in case we deregister the disk, below. */
 	memcpy(drvinfo->LunID, h->drv[drv_index]->LunID,
@@ -2531,6 +2529,8 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
 		case 0: return IO_OK; /* no sense */
 		case 1: return IO_OK; /* recovered error */
 		default:
+			if (check_for_unit_attention(h, c))
+				return IO_NEEDS_RETRY;
 			printk(KERN_WARNING "cciss%d: cmd 0x%02x "
 				"check condition, sense key = 0x%02x\n",
 				h->ctlr, c->Request.CDB[0],
@@ -2672,7 +2672,7 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
 }
 
 static void cciss_geometry_inquiry(int ctlr, int logvol,
-				   int withirq, sector_t total_size,
+				   sector_t total_size,
 				   unsigned int block_size,
 				   InquiryData_struct *inq_buff,
 				   drive_info_struct *drv)
@@ -2683,14 +2683,8 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
 
 	memset(inq_buff, 0, sizeof(InquiryData_struct));
 	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq)
-		return_code = sendcmd_withirq(CISS_INQUIRY, ctlr,
-					      inq_buff, sizeof(*inq_buff),
-					      0xC1, scsi3addr, TYPE_CMD);
-	else
-		return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff,
-				      sizeof(*inq_buff), 0xC1, scsi3addr,
-				      TYPE_CMD);
+	return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff,
+			sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD);
 	if (return_code == IO_OK) {
 		if (inq_buff->data_byte[8] == 0xFF) {
 			printk(KERN_WARNING
@@ -2723,7 +2717,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
 }
 
 static void
-cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
+cciss_read_capacity(int ctlr, int logvol, sector_t *total_size,
 		    unsigned int *block_size)
 {
 	ReadCapdata_struct *buf;
@@ -2737,14 +2731,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
 	}
 
 	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq)
-		return_code = sendcmd_withirq(CCISS_READ_CAPACITY,
-				ctlr, buf, sizeof(ReadCapdata_struct),
-					0, scsi3addr, TYPE_CMD);
-	else
-		return_code = sendcmd(CCISS_READ_CAPACITY,
-				ctlr, buf, sizeof(ReadCapdata_struct),
-					0, scsi3addr, TYPE_CMD);
+	return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf,
+		sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD);
 	if (return_code == IO_OK) {
 		*total_size = be32_to_cpu(*(__be32 *) buf->total_size);
 		*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2756,8 +2744,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
 	kfree(buf);
 }
 
-static void
-cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, 				unsigned int *block_size)
+static void cciss_read_capacity_16(int ctlr, int logvol,
+	sector_t *total_size, unsigned int *block_size)
 {
 	ReadCapdata_struct_16 *buf;
 	int return_code;
@@ -2770,16 +2758,9 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
 	}
 
 	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq) {
-		return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
-			ctlr, buf, sizeof(ReadCapdata_struct_16),
-				0, scsi3addr, TYPE_CMD);
-	}
-	else {
-		return_code = sendcmd(CCISS_READ_CAPACITY_16,
-			ctlr, buf, sizeof(ReadCapdata_struct_16),
-				0, scsi3addr, TYPE_CMD);
-	}
+	return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
+		ctlr, buf, sizeof(ReadCapdata_struct_16),
+			0, scsi3addr, TYPE_CMD);
 	if (return_code == IO_OK) {
 		*total_size = be64_to_cpu(*(__be64 *) buf->total_size);
 		*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2820,13 +2801,13 @@ static int cciss_revalidate(struct gendisk *disk)
 		return 1;
 	}
 	if (h->cciss_read == CCISS_READ_10) {
-		cciss_read_capacity(h->ctlr, logvol, 1,
+		cciss_read_capacity(h->ctlr, logvol,
 					&total_size, &block_size);
 	} else {
-		cciss_read_capacity_16(h->ctlr, logvol, 1,
+		cciss_read_capacity_16(h->ctlr, logvol,
 					&total_size, &block_size);
 	}
-	cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size,
+	cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size,
 			       inq_buff, drv);
 
 	blk_queue_logical_block_size(drv->queue, drv->block_size);
@@ -2837,167 +2818,6 @@ static int cciss_revalidate(struct gendisk *disk)
 }
 
 /*
- *   Wait polling for a command to complete.
- *   The memory mapped FIFO is polled for the completion.
- *   Used only at init time, interrupts from the HBA are disabled.
- */
-static unsigned long pollcomplete(int ctlr)
-{
-	unsigned long done;
-	int i;
-
-	/* Wait (up to 20 seconds) for a command to complete */
-
-	for (i = 20 * HZ; i > 0; i--) {
-		done = hba[ctlr]->access.command_completed(hba[ctlr]);
-		if (done == FIFO_EMPTY)
-			schedule_timeout_uninterruptible(1);
-		else
-			return done;
-	}
-	/* Invalid address to tell caller we ran out of time */
-	return 1;
-}
-
-/* Send command c to controller h and poll for it to complete.
- * Turns interrupts off on the board.  Used at driver init time
- * and during SCSI error recovery.
- */
-static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
-{
-	int i;
-	unsigned long complete;
-	int status = IO_ERROR;
-	u64bit buff_dma_handle;
-
-resend_cmd1:
-
-	/* Disable interrupt on the board. */
-	h->access.set_intr_mask(h, CCISS_INTR_OFF);
-
-	/* Make sure there is room in the command FIFO */
-	/* Actually it should be completely empty at this time */
-	/* unless we are in here doing error handling for the scsi */
-	/* tape side of the driver. */
-	for (i = 200000; i > 0; i--) {
-		/* if fifo isn't full go */
-		if (!(h->access.fifo_full(h)))
-			break;
-		udelay(10);
-		printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full,"
-		       " waiting!\n", h->ctlr);
-	}
-	h->access.submit_command(h, c); /* Send the cmd */
-	do {
-		complete = pollcomplete(h->ctlr);
-
-#ifdef CCISS_DEBUG
-		printk(KERN_DEBUG "cciss: command completed\n");
-#endif				/* CCISS_DEBUG */
-
-		if (complete == 1) {
-			printk(KERN_WARNING
-			       "cciss cciss%d: SendCmd Timeout out, "
-			       "No command list address returned!\n", h->ctlr);
-			status = IO_ERROR;
-			break;
-		}
-
-		/* Make sure it's the command we're expecting. */
-		if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
-			printk(KERN_WARNING "cciss%d: Unexpected command "
-				"completion.\n", h->ctlr);
-			continue;
-		}
-
-		/* It is our command.  If no error, we're done. */
-		if (!(complete & CISS_ERROR_BIT)) {
-			status = IO_OK;
-			break;
-		}
-
-		/* There is an error... */
-
-		/* if data overrun or underun on Report command ignore it */
-		if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
-		     (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
-		     (c->Request.CDB[0] == CISS_INQUIRY)) &&
-			((c->err_info->CommandStatus == CMD_DATA_OVERRUN) ||
-			 (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) {
-			complete = c->busaddr;
-			status = IO_OK;
-			break;
-		}
-
-		if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) {
-			printk(KERN_WARNING "cciss%d: unsolicited abort %p\n",
-				h->ctlr, c);
-			if (c->retry_count < MAX_CMD_RETRIES) {
-				printk(KERN_WARNING "cciss%d: retrying %p\n",
-				   h->ctlr, c);
-				c->retry_count++;
-				/* erase the old error information */
-				memset(c->err_info, 0, sizeof(c->err_info));
-				goto resend_cmd1;
-			}
-			printk(KERN_WARNING "cciss%d: retried %p too many "
-				"times\n", h->ctlr, c);
-			status = IO_ERROR;
-			break;
-		}
-
-		if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
-			printk(KERN_WARNING "cciss%d: command could not be "
-				"aborted.\n", h->ctlr);
-			status = IO_ERROR;
-			break;
-		}
-
-		if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
-			status = check_target_status(h, c);
-			break;
-		}
-
-		printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
-		printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
-			c->Request.CDB[0], c->err_info->CommandStatus);
-		status = IO_ERROR;
-		break;
-
-	} while (1);
-
-	/* unlock the data buffer from DMA */
-	buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
-	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
-	pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
-			 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
-	return status;
-}
-
-/*
- * Send a command to the controller, and wait for it to complete.
- * Used at init time, and during SCSI error recovery.
- */
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-	__u8 page_code, unsigned char *scsi3addr, int cmd_type)
-{
-	CommandList_struct *c;
-	int status;
-
-	c = cmd_alloc(hba[ctlr], 1);
-	if (!c) {
-		printk(KERN_WARNING "cciss: unable to get memory");
-		return IO_ERROR;
-	}
-	status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
-		scsi3addr, cmd_type);
-	if (status == IO_OK)
-		status = sendcmd_core(hba[ctlr], c);
-	cmd_free(hba[ctlr], c, 1);
-	return status;
-}
-
-/*
  * Map (physical) PCI mem into (virtual) kernel space
  */
 static void __iomem *remap_pci_mem(ulong base, ulong size)
@@ -3255,9 +3075,13 @@ static void do_cciss_request(struct request_queue *q)
 	int seg;
 	struct request *creq;
 	u64bit temp64;
-	struct scatterlist tmp_sg[MAXSGENTRIES];
+	struct scatterlist *tmp_sg;
+	SGDescriptor_struct *curr_sg;
 	drive_info_struct *drv;
 	int i, dir;
+	int nseg = 0;
+	int sg_index = 0;
+	int chained = 0;
 
 	/* We call start_io here in case there is a command waiting on the
 	 * queue that has not been sent.
@@ -3270,13 +3094,14 @@ static void do_cciss_request(struct request_queue *q)
 	if (!creq)
 		goto startio;
 
-	BUG_ON(creq->nr_phys_segments > MAXSGENTRIES);
+	BUG_ON(creq->nr_phys_segments > h->maxsgentries);
 
 	if ((c = cmd_alloc(h, 1)) == NULL)
 		goto full;
 
 	blk_start_request(creq);
 
+	tmp_sg = h->scatter_list[c->cmdindex];
 	spin_unlock_irq(q->queue_lock);
 
 	c->cmd_type = CMD_RWREQ;
@@ -3305,7 +3130,7 @@ static void do_cciss_request(struct request_queue *q)
 	       (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
 #endif				/* CCISS_DEBUG */
 
-	sg_init_table(tmp_sg, MAXSGENTRIES);
+	sg_init_table(tmp_sg, h->maxsgentries);
 	seg = blk_rq_map_sg(q, creq, tmp_sg);
 
 	/* get the DMA records for the setup */
@@ -3314,25 +3139,70 @@ static void do_cciss_request(struct request_queue *q)
 	else
 		dir = PCI_DMA_TODEVICE;
 
+	curr_sg = c->SG;
+	sg_index = 0;
+	chained = 0;
+
 	for (i = 0; i < seg; i++) {
-		c->SG[i].Len = tmp_sg[i].length;
+		if (((sg_index+1) == (h->max_cmd_sgentries)) &&
+			!chained && ((seg - i) > 1)) {
+			nseg = seg - i;
+			curr_sg[sg_index].Len = (nseg) *
+					sizeof(SGDescriptor_struct);
+			curr_sg[sg_index].Ext = CCISS_SG_CHAIN;
+
+			/* Point to next chain block. */
+			curr_sg = h->cmd_sg_list[c->cmdindex]->sgchain;
+			sg_index = 0;
+			chained = 1;
+		}
+		curr_sg[sg_index].Len = tmp_sg[i].length;
 		temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]),
-						  tmp_sg[i].offset,
-						  tmp_sg[i].length, dir);
-		c->SG[i].Addr.lower = temp64.val32.lower;
-		c->SG[i].Addr.upper = temp64.val32.upper;
-		c->SG[i].Ext = 0;	// we are not chaining
+						tmp_sg[i].offset,
+						tmp_sg[i].length, dir);
+		curr_sg[sg_index].Addr.lower = temp64.val32.lower;
+		curr_sg[sg_index].Addr.upper = temp64.val32.upper;
+		curr_sg[sg_index].Ext = 0;  /* we are not chaining */
+
+		++sg_index;
+	}
+
+	if (chained) {
+		int len;
+		curr_sg = c->SG;
+		sg_index = h->max_cmd_sgentries - 1;
+		len = curr_sg[sg_index].Len;
+		/* Setup pointer to next chain block.
+		 * Fill out last element in current chain
+		 * block with address of next chain block.
+		 */
+		temp64.val = pci_map_single(h->pdev,
+					h->cmd_sg_list[c->cmdindex]->sgchain,
+					len, dir);
+
+		h->cmd_sg_list[c->cmdindex]->sg_chain_dma = temp64.val;
+		curr_sg[sg_index].Addr.lower = temp64.val32.lower;
+		curr_sg[sg_index].Addr.upper = temp64.val32.upper;
+
+		pci_dma_sync_single_for_device(h->pdev,
+				h->cmd_sg_list[c->cmdindex]->sg_chain_dma,
+				len, dir);
 	}
+
 	/* track how many SG entries we are using */
 	if (seg > h->maxSG)
 		h->maxSG = seg;
 
 #ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n",
-	       blk_rq_sectors(creq), seg);
+	printk(KERN_DEBUG "cciss: Submitting %ld sectors in %d segments "
+			"chained[%d]\n",
+			blk_rq_sectors(creq), seg, chained);
 #endif				/* CCISS_DEBUG */
 
-	c->Header.SGList = c->Header.SGTotal = seg;
+	c->Header.SGList = c->Header.SGTotal = seg + chained;
+	if (seg > h->max_cmd_sgentries)
+		c->Header.SGList = h->max_cmd_sgentries;
+
 	if (likely(blk_fs_request(creq))) {
 		if(h->cciss_read == CCISS_READ_10) {
 			c->Request.CDB[1] = 0;
@@ -3513,28 +3383,33 @@ static int add_to_scan_list(struct ctlr_info *h)
  * @h:			   Pointer to the controller.
  *
  * Removes the controller from the rescan queue if present. Blocks if
- * the controller is currently conducting a rescan.
+ * the controller is currently conducting a rescan.  The controller
+ * can be in one of three states:
+ * 1. Doesn't need a scan
+ * 2. On the scan list, but not scanning yet (we remove it)
+ * 3. Busy scanning (and not on the list). In this case we want to wait for
+ *    the scan to complete to make sure the scanning thread for this
+ *    controller is completely idle.
  **/
 static void remove_from_scan_list(struct ctlr_info *h)
 {
 	struct ctlr_info *test_h, *tmp_h;
-	int scanning = 0;
 
 	mutex_lock(&scan_mutex);
 	list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) {
-		if (test_h == h) {
+		if (test_h == h) { /* state 2. */
 			list_del(&h->scan_list);
 			complete_all(&h->scan_wait);
 			mutex_unlock(&scan_mutex);
 			return;
 		}
 	}
-	if (&h->busy_scanning)
-		scanning = 0;
-	mutex_unlock(&scan_mutex);
-
-	if (scanning)
+	if (h->busy_scanning) { /* state 3. */
+		mutex_unlock(&scan_mutex);
 		wait_for_completion(&h->scan_wait);
+	} else { /* state 1, nothing to do. */
+		mutex_unlock(&scan_mutex);
+	}
 }
 
 /**
@@ -3573,13 +3448,11 @@ static int scan_thread(void *data)
 			h->busy_scanning = 1;
 			mutex_unlock(&scan_mutex);
 
-			if (h) {
-				rebuild_lun_table(h, 0, 0);
-				complete_all(&h->scan_wait);
-				mutex_lock(&scan_mutex);
-				h->busy_scanning = 0;
-				mutex_unlock(&scan_mutex);
-			}
+			rebuild_lun_table(h, 0, 0);
+			complete_all(&h->scan_wait);
+			mutex_lock(&scan_mutex);
+			h->busy_scanning = 0;
+			mutex_unlock(&scan_mutex);
 		}
 	}
 
@@ -3605,8 +3478,22 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c)
 	case REPORT_LUNS_CHANGED:
 		printk(KERN_WARNING "cciss%d: report LUN data "
 			"changed\n", h->ctlr);
-		add_to_scan_list(h);
-		wake_up_process(cciss_scan_thread);
+	/*
+	 * Here, we could call add_to_scan_list and wake up the scan thread,
+	 * except that it's quite likely that we will get more than one
+	 * REPORT_LUNS_CHANGED condition in quick succession, which means
+	 * that those which occur after the first one will likely happen
+	 * *during* the scan_thread's rescan.  And the rescan code is not
+	 * robust enough to restart in the middle, undoing what it has already
+	 * done, and it's not clear that it's even possible to do this, since
+	 * part of what it does is notify the block layer, which starts
+	 * doing it's own i/o to read partition tables and so on, and the
+	 * driver doesn't have visibility to know what might need undoing.
+	 * In any event, if possible, it is horribly complicated to get right
+	 * so we just don't do it for now.
+	 *
+	 * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012.
+	 */
 		return 1;
 	break;
 	case POWER_OR_RESET:
@@ -3888,6 +3775,23 @@ static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 	 * leave a little room for ioctl calls.
 	 */
 	c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
+	c->maxsgentries = readl(&(c->cfgtable->MaxSGElements));
+
+	/*
+	 * Limit native command to 32 s/g elements to save dma'able memory.
+	 * Howvever spec says if 0, use 31
+	 */
+
+	c->max_cmd_sgentries = 31;
+	if (c->maxsgentries > 512) {
+		c->max_cmd_sgentries = 32;
+		c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1;
+		c->maxsgentries -= 1;   /* account for chain pointer */
+	} else {
+		c->maxsgentries = 31;   /* Default to traditional value */
+		c->chainsize = 0;       /* traditional */
+	}
+
 	c->product_name = products[prod_index].product_name;
 	c->access = *(products[prod_index].access);
 	c->nr_cmds = c->max_commands - 4;
@@ -4214,6 +4118,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 {
 	int i;
 	int j = 0;
+	int k = 0;
 	int rc;
 	int dac, return_code;
 	InquiryData_struct *inq_buff;
@@ -4317,6 +4222,53 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 		printk(KERN_ERR "cciss: out of memory");
 		goto clean4;
 	}
+
+	/* Need space for temp scatter list */
+	hba[i]->scatter_list = kmalloc(hba[i]->max_commands *
+						sizeof(struct scatterlist *),
+						GFP_KERNEL);
+	for (k = 0; k < hba[i]->nr_cmds; k++) {
+		hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
+							hba[i]->maxsgentries,
+							GFP_KERNEL);
+		if (hba[i]->scatter_list[k] == NULL) {
+			printk(KERN_ERR "cciss%d: could not allocate "
+				"s/g lists\n", i);
+			goto clean4;
+		}
+	}
+	hba[i]->cmd_sg_list = kmalloc(sizeof(struct Cmd_sg_list *) *
+						hba[i]->nr_cmds,
+						GFP_KERNEL);
+	if (!hba[i]->cmd_sg_list) {
+		printk(KERN_ERR "cciss%d: Cannot get memory for "
+			"s/g chaining.\n", i);
+		goto clean4;
+	}
+	/* Build up chain blocks for each command */
+	if (hba[i]->chainsize > 0) {
+		for (j = 0; j < hba[i]->nr_cmds; j++) {
+			hba[i]->cmd_sg_list[j] =
+					kmalloc(sizeof(struct Cmd_sg_list),
+							GFP_KERNEL);
+			if (!hba[i]->cmd_sg_list[j]) {
+				printk(KERN_ERR "cciss%d: Cannot get memory "
+					"for chain block.\n", i);
+				goto clean4;
+			}
+			/* Need a block of chainsized s/g elements. */
+			hba[i]->cmd_sg_list[j]->sgchain =
+					kmalloc((hba[i]->chainsize *
+						sizeof(SGDescriptor_struct)),
+						GFP_KERNEL);
+			if (!hba[i]->cmd_sg_list[j]->sgchain) {
+				printk(KERN_ERR "cciss%d: Cannot get memory "
+					"for s/g chains\n", i);
+				goto clean4;
+			}
+		}
+	}
+
 	spin_lock_init(&hba[i]->lock);
 
 	/* Initialize the pdev driver private data.
@@ -4362,7 +4314,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 
 	cciss_procinit(i);
 
-	hba[i]->cciss_max_sectors = 2048;
+	hba[i]->cciss_max_sectors = 8192;
 
 	rebuild_lun_table(hba[i], 1, 0);
 	hba[i]->busy_initializing = 0;
@@ -4370,6 +4322,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 
 clean4:
 	kfree(hba[i]->cmd_pool_bits);
+	/* Free up sg elements */
+	for (k = 0; k < hba[i]->nr_cmds; k++)
+		kfree(hba[i]->scatter_list[k]);
+	kfree(hba[i]->scatter_list);
+	/* Only free up extra s/g lists if controller supports them */
+	if (hba[i]->chainsize > 0) {
+		for (j = 0; j < hba[i]->nr_cmds; j++) {
+			if (hba[i]->cmd_sg_list[j]) {
+				kfree(hba[i]->cmd_sg_list[j]->sgchain);
+				kfree(hba[i]->cmd_sg_list[j]);
+			}
+		}
+		kfree(hba[i]->cmd_sg_list);
+	}
 	if (hba[i]->cmd_pool)
 		pci_free_consistent(hba[i]->pdev,
 				    hba[i]->nr_cmds * sizeof(CommandList_struct),
@@ -4400,30 +4366,28 @@ clean_no_release_regions:
 
 static void cciss_shutdown(struct pci_dev *pdev)
 {
-	ctlr_info_t *tmp_ptr;
-	int i;
-	char flush_buf[4];
+	ctlr_info_t *h;
+	char *flush_buf;
 	int return_code;
 
-	tmp_ptr = pci_get_drvdata(pdev);
-	if (tmp_ptr == NULL)
-		return;
-	i = tmp_ptr->ctlr;
-	if (hba[i] == NULL)
+	h = pci_get_drvdata(pdev);
+	flush_buf = kzalloc(4, GFP_KERNEL);
+	if (!flush_buf) {
+		printk(KERN_WARNING
+			"cciss:%d cache not flushed, out of memory.\n",
+			h->ctlr);
 		return;
-
-	/* Turn board interrupts off  and send the flush cache command */
-	/* sendcmd will turn off interrupt, and send the flush...
-	 * To write all data in the battery backed cache to disks */
-	memset(flush_buf, 0, 4);
-	return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0,
-		CTLR_LUNID, TYPE_CMD);
-	if (return_code == IO_OK) {
-		printk(KERN_INFO "Completed flushing cache on controller %d\n", i);
-	} else {
-		printk(KERN_WARNING "Error flushing cache on controller %d\n", i);
 	}
-	free_irq(hba[i]->intr[2], hba[i]);
+	/* write all data in the battery backed cache to disk */
+	memset(flush_buf, 0, 4);
+	return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf,
+		4, 0, CTLR_LUNID, TYPE_CMD);
+	kfree(flush_buf);
+	if (return_code != IO_OK)
+		printk(KERN_WARNING "cciss%d: Error flushing cache\n",
+			h->ctlr);
+	h->access.set_intr_mask(h, CCISS_INTR_OFF);
+	free_irq(h->intr[2], h);
 }
 
 static void __devexit cciss_remove_one(struct pci_dev *pdev)
@@ -4485,6 +4449,20 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
 	pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
 			    hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
 	kfree(hba[i]->cmd_pool_bits);
+	/* Free up sg elements */
+	for (j = 0; j < hba[i]->nr_cmds; j++)
+		kfree(hba[i]->scatter_list[j]);
+	kfree(hba[i]->scatter_list);
+	/* Only free up extra s/g lists if controller supports them */
+	if (hba[i]->chainsize > 0) {
+		for (j = 0; j < hba[i]->nr_cmds; j++) {
+			if (hba[i]->cmd_sg_list[j]) {
+				kfree(hba[i]->cmd_sg_list[j]->sgchain);
+				kfree(hba[i]->cmd_sg_list[j]);
+			}
+		}
+		kfree(hba[i]->cmd_sg_list);
+	}
 	/*
 	 * Deliberately omit pci_disable_device(): it does something nasty to
 	 * Smart Array controllers that pci_enable_device does not undo
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 31524cf..1d95db2 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -55,7 +55,13 @@ typedef struct _drive_info_struct
 	char device_initialized;     /* indicates whether dev is initialized */
 } drive_info_struct;
 
-struct ctlr_info 
+struct Cmd_sg_list {
+	SGDescriptor_struct	*sgchain;
+	dma_addr_t		sg_chain_dma;
+	int			chain_block_size;
+};
+
+struct ctlr_info
 {
 	int	ctlr;
 	char	devname[8];
@@ -75,6 +81,16 @@ struct ctlr_info
 	int	num_luns;
 	int 	highest_lun;
 	int	usage_count;  /* number of opens all all minor devices */
+	/* Need space for temp sg list
+	 * number of scatter/gathers supported
+	 * number of scatter/gathers in chained block
+	 */
+	struct	scatterlist **scatter_list;
+	int	maxsgentries;
+	int	chainsize;
+	int	max_cmd_sgentries;
+	struct Cmd_sg_list **cmd_sg_list;
+
 #	define DOORBELL_INT	0
 #	define PERF_MODE_INT	1
 #	define SIMPLE_MODE_INT	2
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index dbaed1e..b50a9b2 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -7,7 +7,8 @@
 
 //general boundary defintions
 #define SENSEINFOBYTES          32//note that this value may vary between host implementations
-#define MAXSGENTRIES            31
+#define MAXSGENTRIES            32
+#define CCISS_SG_CHAIN          0x80000000
 #define MAXREPLYQS              256
 
 //Command Status value
@@ -319,6 +320,10 @@ typedef struct _CfgTable_struct {
   BYTE             ServerName[16];
   DWORD            HeartBeat;
   DWORD            SCSI_Prefetch;
+  DWORD            MaxSGElements;
+  DWORD            MaxLogicalUnits;
+  DWORD            MaxPhysicalDrives;
+  DWORD            MaxPhysicalDrivesPerLogicalUnit;
 } CfgTable_struct;
 #pragma pack()	 
 #endif // CCISS_CMD_H
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 3315268..5d0e46d 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -755,7 +755,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
                         			cp,  
 						ei->ScsiStatus); 
 #endif
-					cmd->result |= (ei->ScsiStatus < 1);
+					cmd->result |= (ei->ScsiStatus << 1);
                 		}
 				else {  /* scsi status is zero??? How??? */
 					
@@ -1547,7 +1547,7 @@ cciss_engage_scsi(int ctlr)
 	if (sa->registered) {
 		printk("cciss%d: SCSI subsystem already engaged.\n", ctlr);
 		spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-		return ENXIO;
+		return -ENXIO;
 	}
 	sa->registered = 1;
 	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 0000000..f4acd04
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,71 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
+	depends on !PROC_FS || !INET || !CONNECTOR
+
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed Replicated Block Device support"
+	depends on PROC_FS && INET && CONNECTOR
+	select LRU_CACHE
+	default n
+	help
+
+	  NOTE: In order to authenticate connections you have to select
+	  CRYPTO_HMAC and a hash function as well.
+
+	  DRBD is a shared-nothing, synchronously replicated block device. It
+	  is designed to serve as a building block for high availability
+	  clusters and in this context, is a "drop-in" replacement for shared
+	  storage. Simplistically, you could see it as a network RAID 1.
+
+	  Each minor device has a role, which can be 'primary' or 'secondary'.
+	  On the node with the primary device the application is supposed to
+	  run and to access the device (/dev/drbdX). Every write is sent to
+	  the local 'lower level block device' and, across the network, to the
+	  node with the device in 'secondary' state.  The secondary device
+	  simply writes the data to its lower level block device.
+
+	  DRBD can also be used in dual-Primary mode (device writable on both
+	  nodes), which means it can exhibit shared disk semantics in a
+	  shared-nothing cluster.  Needless to say, on top of dual-Primary
+	  DRBD utilizing a cluster file system is necessary to maintain for
+	  cache coherency.
+
+	  For automatic failover you need a cluster manager (e.g. heartbeat).
+	  See also: http://www.drbd.org/, http://www.linux-ha.org
+
+	  If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+	bool "DRBD fault injection"
+	depends on BLK_DEV_DRBD
+	help
+
+	  Say Y here if you want to simulate IO errors, in order to test DRBD's
+	  behavior.
+
+	  The actual simulation of IO errors is done by writing 3 values to
+	  /sys/module/drbd/parameters/
+
+	  enable_faults: bitmask of...
+	  1	meta data write
+	  2               read
+	  4	resync data write
+	  8	            read
+	  16	data write
+	  32	data read
+	  64	read ahead
+	  128	kmalloc of bitmap
+	  256	allocation of EE (epoch_entries)
+
+	  fault_devs: bitmask of minor numbers
+	  fault_rate: frequency in percent
+
+	  Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+		echo 16 > /sys/module/drbd/parameters/enable_faults
+		echo 1 > /sys/module/drbd/parameters/fault_devs
+		echo 5 > /sys/module/drbd/parameters/fault_rate
+
+	  If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 0000000..0d3f337
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,5 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 0000000..17956ff
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1424 @@
+/*
+   drbd_actlog.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_wrappers.h"
+
+/* We maintain a trivial check sum in our on disk activity log.
+ * With that we can ensure correct operation even when the storage
+ * device might do a partial (last) sector write while loosing power.
+ */
+struct __packed al_transaction {
+	u32       magic;
+	u32       tr_number;
+	struct __packed {
+		u32 pos;
+		u32 extent; } updates[1 + AL_EXTENTS_PT];
+	u32       xor_sum;
+};
+
+struct update_odbm_work {
+	struct drbd_work w;
+	unsigned int enr;
+};
+
+struct update_al_work {
+	struct drbd_work w;
+	struct lc_element *al_ext;
+	struct completion event;
+	unsigned int enr;
+	/* if old_enr != LC_FREE, write corresponding bitmap sector, too */
+	unsigned int old_enr;
+};
+
+struct drbd_atodb_wait {
+	atomic_t           count;
+	struct completion  io_done;
+	struct drbd_conf   *mdev;
+	int                error;
+};
+
+
+int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+
+static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
+				 struct drbd_backing_dev *bdev,
+				 struct page *page, sector_t sector,
+				 int rw, int size)
+{
+	struct bio *bio;
+	struct drbd_md_io md_io;
+	int ok;
+
+	md_io.mdev = mdev;
+	init_completion(&md_io.event);
+	md_io.error = 0;
+
+	if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
+		rw |= (1 << BIO_RW_BARRIER);
+	rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
+
+ retry:
+	bio = bio_alloc(GFP_NOIO, 1);
+	bio->bi_bdev = bdev->md_bdev;
+	bio->bi_sector = sector;
+	ok = (bio_add_page(bio, page, size, 0) == size);
+	if (!ok)
+		goto out;
+	bio->bi_private = &md_io;
+	bio->bi_end_io = drbd_md_io_complete;
+	bio->bi_rw = rw;
+
+	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+		bio_endio(bio, -EIO);
+	else
+		submit_bio(rw, bio);
+	wait_for_completion(&md_io.event);
+	ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+
+	/* check for unsupported barrier op.
+	 * would rather check on EOPNOTSUPP, but that is not reliable.
+	 * don't try again for ANY return value != 0 */
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
+		/* Try again with no barrier */
+		dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+		rw &= ~(1 << BIO_RW_BARRIER);
+		bio_put(bio);
+		goto retry;
+	}
+ out:
+	bio_put(bio);
+	return ok;
+}
+
+int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+			 sector_t sector, int rw)
+{
+	int logical_block_size, mask, ok;
+	int offset = 0;
+	struct page *iop = mdev->md_io_page;
+
+	D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+
+	BUG_ON(!bdev->md_bdev);
+
+	logical_block_size = bdev_logical_block_size(bdev->md_bdev);
+	if (logical_block_size == 0)
+		logical_block_size = MD_SECTOR_SIZE;
+
+	/* in case logical_block_size != 512 [ s390 only? ] */
+	if (logical_block_size != MD_SECTOR_SIZE) {
+		mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
+		D_ASSERT(mask == 1 || mask == 3 || mask == 7);
+		D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
+		offset = sector & mask;
+		sector = sector & ~mask;
+		iop = mdev->md_io_tmpp;
+
+		if (rw & WRITE) {
+			/* these are GFP_KERNEL pages, pre-allocated
+			 * on device initialization */
+			void *p = page_address(mdev->md_io_page);
+			void *hp = page_address(mdev->md_io_tmpp);
+
+			ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
+					READ, logical_block_size);
+
+			if (unlikely(!ok)) {
+				dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
+				    "READ [logical_block_size!=512]) failed!\n",
+				    (unsigned long long)sector);
+				return 0;
+			}
+
+			memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
+		}
+	}
+
+	if (sector < drbd_md_first_sector(bdev) ||
+	    sector > drbd_md_last_sector(bdev))
+		dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+		     current->comm, current->pid, __func__,
+		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+
+	ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
+	if (unlikely(!ok)) {
+		dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
+		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+		return 0;
+	}
+
+	if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
+		void *p = page_address(mdev->md_io_page);
+		void *hp = page_address(mdev->md_io_tmpp);
+
+		memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
+	}
+
+	return ok;
+}
+
+static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+{
+	struct lc_element *al_ext;
+	struct lc_element *tmp;
+	unsigned long     al_flags = 0;
+
+	spin_lock_irq(&mdev->al_lock);
+	tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+	if (unlikely(tmp != NULL)) {
+		struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			spin_unlock_irq(&mdev->al_lock);
+			return NULL;
+		}
+	}
+	al_ext   = lc_get(mdev->act_log, enr);
+	al_flags = mdev->act_log->flags;
+	spin_unlock_irq(&mdev->al_lock);
+
+	/*
+	if (!al_ext) {
+		if (al_flags & LC_STARVING)
+			dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
+		if (al_flags & LC_DIRTY)
+			dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
+	}
+	*/
+
+	return al_ext;
+}
+
+void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+	struct lc_element *al_ext;
+	struct update_al_work al_work;
+
+	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+	wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
+
+	if (al_ext->lc_number != enr) {
+		/* drbd_al_write_transaction(mdev,al_ext,enr);
+		 * recurses into generic_make_request(), which
+		 * disallows recursion, bios being serialized on the
+		 * current->bio_tail list now.
+		 * we have to delegate updates to the activity log
+		 * to the worker thread. */
+		init_completion(&al_work.event);
+		al_work.al_ext = al_ext;
+		al_work.enr = enr;
+		al_work.old_enr = al_ext->lc_number;
+		al_work.w.cb = w_al_write_transaction;
+		drbd_queue_work_front(&mdev->data.work, &al_work.w);
+		wait_for_completion(&al_work.event);
+
+		mdev->al_writ_cnt++;
+
+		spin_lock_irq(&mdev->al_lock);
+		lc_changed(mdev->act_log, al_ext);
+		spin_unlock_irq(&mdev->al_lock);
+		wake_up(&mdev->al_wait);
+	}
+}
+
+void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+	struct lc_element *extent;
+	unsigned long flags;
+
+	spin_lock_irqsave(&mdev->al_lock, flags);
+
+	extent = lc_find(mdev->act_log, enr);
+
+	if (!extent) {
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+		dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
+		return;
+	}
+
+	if (lc_put(mdev->act_log, extent) == 0)
+		wake_up(&mdev->al_wait);
+
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+int
+w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct update_al_work *aw = container_of(w, struct update_al_work, w);
+	struct lc_element *updated = aw->al_ext;
+	const unsigned int new_enr = aw->enr;
+	const unsigned int evicted = aw->old_enr;
+	struct al_transaction *buffer;
+	sector_t sector;
+	int i, n, mx;
+	unsigned int extent_nr;
+	u32 xor_sum = 0;
+
+	if (!get_ldev(mdev)) {
+		dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+		complete(&((struct update_al_work *)w)->event);
+		return 1;
+	}
+	/* do we have to do a bitmap write, first?
+	 * TODO reduce maximum latency:
+	 * submit both bios, then wait for both,
+	 * instead of doing two synchronous sector writes. */
+	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
+		drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
+
+	mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+	buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+
+	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
+	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
+
+	n = lc_index_of(mdev->act_log, updated);
+
+	buffer->updates[0].pos = cpu_to_be32(n);
+	buffer->updates[0].extent = cpu_to_be32(new_enr);
+
+	xor_sum ^= new_enr;
+
+	mx = min_t(int, AL_EXTENTS_PT,
+		   mdev->act_log->nr_elements - mdev->al_tr_cycle);
+	for (i = 0; i < mx; i++) {
+		unsigned idx = mdev->al_tr_cycle + i;
+		extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
+		buffer->updates[i+1].pos = cpu_to_be32(idx);
+		buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
+		xor_sum ^= extent_nr;
+	}
+	for (; i < AL_EXTENTS_PT; i++) {
+		buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
+		buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
+		xor_sum ^= LC_FREE;
+	}
+	mdev->al_tr_cycle += AL_EXTENTS_PT;
+	if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
+		mdev->al_tr_cycle = 0;
+
+	buffer->xor_sum = cpu_to_be32(xor_sum);
+
+	sector =  mdev->ldev->md.md_offset
+		+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
+
+	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
+		drbd_chk_io_error(mdev, 1, TRUE);
+
+	if (++mdev->al_tr_pos >
+	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+		mdev->al_tr_pos = 0;
+
+	D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
+	mdev->al_tr_number++;
+
+	mutex_unlock(&mdev->md_io_mutex);
+
+	complete(&((struct update_al_work *)w)->event);
+	put_ldev(mdev);
+
+	return 1;
+}
+
+/**
+ * drbd_al_read_tr() - Read a single transaction from the on disk activity log
+ * @mdev:	DRBD device.
+ * @bdev:	Block device to read form.
+ * @b:		pointer to an al_transaction.
+ * @index:	On disk slot of the transaction to read.
+ *
+ * Returns -1 on IO error, 0 on checksum error and 1 upon success.
+ */
+static int drbd_al_read_tr(struct drbd_conf *mdev,
+			   struct drbd_backing_dev *bdev,
+			   struct al_transaction *b,
+			   int index)
+{
+	sector_t sector;
+	int rv, i;
+	u32 xor_sum = 0;
+
+	sector = bdev->md.md_offset + bdev->md.al_offset + index;
+
+	/* Dont process error normally,
+	 * as this is done before disk is attached! */
+	if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
+		return -1;
+
+	rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
+
+	for (i = 0; i < AL_EXTENTS_PT + 1; i++)
+		xor_sum ^= be32_to_cpu(b->updates[i].extent);
+	rv &= (xor_sum == be32_to_cpu(b->xor_sum));
+
+	return rv;
+}
+
+/**
+ * drbd_al_read_log() - Restores the activity log from its on disk representation.
+ * @mdev:	DRBD device.
+ * @bdev:	Block device to read form.
+ *
+ * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
+ */
+int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+	struct al_transaction *buffer;
+	int i;
+	int rv;
+	int mx;
+	int active_extents = 0;
+	int transactions = 0;
+	int found_valid = 0;
+	int from = 0;
+	int to = 0;
+	u32 from_tnr = 0;
+	u32 to_tnr = 0;
+	u32 cnr;
+
+	mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
+
+	/* lock out all other meta data io for now,
+	 * and make sure the page is mapped.
+	 */
+	mutex_lock(&mdev->md_io_mutex);
+	buffer = page_address(mdev->md_io_page);
+
+	/* Find the valid transaction in the log */
+	for (i = 0; i <= mx; i++) {
+		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+		if (rv == 0)
+			continue;
+		if (rv == -1) {
+			mutex_unlock(&mdev->md_io_mutex);
+			return 0;
+		}
+		cnr = be32_to_cpu(buffer->tr_number);
+
+		if (++found_valid == 1) {
+			from = i;
+			to = i;
+			from_tnr = cnr;
+			to_tnr = cnr;
+			continue;
+		}
+		if ((int)cnr - (int)from_tnr < 0) {
+			D_ASSERT(from_tnr - cnr + i - from == mx+1);
+			from = i;
+			from_tnr = cnr;
+		}
+		if ((int)cnr - (int)to_tnr > 0) {
+			D_ASSERT(cnr - to_tnr == i - to);
+			to = i;
+			to_tnr = cnr;
+		}
+	}
+
+	if (!found_valid) {
+		dev_warn(DEV, "No usable activity log found.\n");
+		mutex_unlock(&mdev->md_io_mutex);
+		return 1;
+	}
+
+	/* Read the valid transactions.
+	 * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
+	i = from;
+	while (1) {
+		int j, pos;
+		unsigned int extent_nr;
+		unsigned int trn;
+
+		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+		ERR_IF(rv == 0) goto cancel;
+		if (rv == -1) {
+			mutex_unlock(&mdev->md_io_mutex);
+			return 0;
+		}
+
+		trn = be32_to_cpu(buffer->tr_number);
+
+		spin_lock_irq(&mdev->al_lock);
+
+		/* This loop runs backwards because in the cyclic
+		   elements there might be an old version of the
+		   updated element (in slot 0). So the element in slot 0
+		   can overwrite old versions. */
+		for (j = AL_EXTENTS_PT; j >= 0; j--) {
+			pos = be32_to_cpu(buffer->updates[j].pos);
+			extent_nr = be32_to_cpu(buffer->updates[j].extent);
+
+			if (extent_nr == LC_FREE)
+				continue;
+
+			lc_set(mdev->act_log, extent_nr, pos);
+			active_extents++;
+		}
+		spin_unlock_irq(&mdev->al_lock);
+
+		transactions++;
+
+cancel:
+		if (i == to)
+			break;
+		i++;
+		if (i > mx)
+			i = 0;
+	}
+
+	mdev->al_tr_number = to_tnr+1;
+	mdev->al_tr_pos = to;
+	if (++mdev->al_tr_pos >
+	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+		mdev->al_tr_pos = 0;
+
+	/* ok, we are done with it */
+	mutex_unlock(&mdev->md_io_mutex);
+
+	dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
+	     transactions, active_extents);
+
+	return 1;
+}
+
+static void atodb_endio(struct bio *bio, int error)
+{
+	struct drbd_atodb_wait *wc = bio->bi_private;
+	struct drbd_conf *mdev = wc->mdev;
+	struct page *page;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+	/* strange behavior of some lower level drivers...
+	 * fail the request by clearing the uptodate flag,
+	 * but do not return any error?! */
+	if (!error && !uptodate)
+		error = -EIO;
+
+	drbd_chk_io_error(mdev, error, TRUE);
+	if (error && wc->error == 0)
+		wc->error = error;
+
+	if (atomic_dec_and_test(&wc->count))
+		complete(&wc->io_done);
+
+	page = bio->bi_io_vec[0].bv_page;
+	put_page(page);
+	bio_put(bio);
+	mdev->bm_writ_cnt++;
+	put_ldev(mdev);
+}
+
+#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* activity log to on disk bitmap -- prepare bio unless that sector
+ * is already covered by previously prepared bios */
+static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
+					struct bio **bios,
+					unsigned int enr,
+					struct drbd_atodb_wait *wc) __must_hold(local)
+{
+	struct bio *bio;
+	struct page *page;
+	sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+				      + mdev->ldev->md.bm_offset;
+	unsigned int page_offset = PAGE_SIZE;
+	int offset;
+	int i = 0;
+	int err = -ENOMEM;
+
+	/* Check if that enr is already covered by an already created bio.
+	 * Caution, bios[] is not NULL terminated,
+	 * but only initialized to all NULL.
+	 * For completely scattered activity log,
+	 * the last invocation iterates over all bios,
+	 * and finds the last NULL entry.
+	 */
+	while ((bio = bios[i])) {
+		if (bio->bi_sector == on_disk_sector)
+			return 0;
+		i++;
+	}
+	/* bios[i] == NULL, the next not yet used slot */
+
+	/* GFP_KERNEL, we are not in the write-out path */
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (bio == NULL)
+		return -ENOMEM;
+
+	if (i > 0) {
+		const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
+		page_offset = prev_bv->bv_offset + prev_bv->bv_len;
+		page = prev_bv->bv_page;
+	}
+	if (page_offset == PAGE_SIZE) {
+		page = alloc_page(__GFP_HIGHMEM);
+		if (page == NULL)
+			goto out_bio_put;
+		page_offset = 0;
+	} else {
+		get_page(page);
+	}
+
+	offset = S2W(enr);
+	drbd_bm_get_lel(mdev, offset,
+			min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
+			kmap(page) + page_offset);
+	kunmap(page);
+
+	bio->bi_private = wc;
+	bio->bi_end_io = atodb_endio;
+	bio->bi_bdev = mdev->ldev->md_bdev;
+	bio->bi_sector = on_disk_sector;
+
+	if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE)
+		goto out_put_page;
+
+	atomic_inc(&wc->count);
+	/* we already know that we may do this...
+	 * get_ldev_if_state(mdev,D_ATTACHING);
+	 * just get the extra reference, so that the local_cnt reflects
+	 * the number of pending IO requests DRBD at its backing device.
+	 */
+	atomic_inc(&mdev->local_cnt);
+
+	bios[i] = bio;
+
+	return 0;
+
+out_put_page:
+	err = -EINVAL;
+	put_page(page);
+out_bio_put:
+	bio_put(bio);
+	return err;
+}
+
+/**
+ * drbd_al_to_on_disk_bm() -  * Writes bitmap parts covered by active AL extents
+ * @mdev:	DRBD device.
+ *
+ * Called when we detach (unconfigure) local storage,
+ * or when we go from R_PRIMARY to R_SECONDARY role.
+ */
+void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
+{
+	int i, nr_elements;
+	unsigned int enr;
+	struct bio **bios;
+	struct drbd_atodb_wait wc;
+
+	ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
+		return; /* sorry, I don't have any act_log etc... */
+
+	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+	nr_elements = mdev->act_log->nr_elements;
+
+	/* GFP_KERNEL, we are not in anyone's write-out path */
+	bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
+	if (!bios)
+		goto submit_one_by_one;
+
+	atomic_set(&wc.count, 0);
+	init_completion(&wc.io_done);
+	wc.mdev = mdev;
+	wc.error = 0;
+
+	for (i = 0; i < nr_elements; i++) {
+		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+		if (enr == LC_FREE)
+			continue;
+		/* next statement also does atomic_inc wc.count and local_cnt */
+		if (atodb_prepare_unless_covered(mdev, bios,
+						enr/AL_EXT_PER_BM_SECT,
+						&wc))
+			goto free_bios_submit_one_by_one;
+	}
+
+	/* unnecessary optimization? */
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+
+	/* all prepared, submit them */
+	for (i = 0; i < nr_elements; i++) {
+		if (bios[i] == NULL)
+			break;
+		if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
+			bios[i]->bi_rw = WRITE;
+			bio_endio(bios[i], -EIO);
+		} else {
+			submit_bio(WRITE, bios[i]);
+		}
+	}
+
+	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
+
+	/* always (try to) flush bitmap to stable storage */
+	drbd_md_flush(mdev);
+
+	/* In case we did not submit a single IO do not wait for
+	 * them to complete. ( Because we would wait forever here. )
+	 *
+	 * In case we had IOs and they are already complete, there
+	 * is not point in waiting anyways.
+	 * Therefore this if () ... */
+	if (atomic_read(&wc.count))
+		wait_for_completion(&wc.io_done);
+
+	put_ldev(mdev);
+
+	kfree(bios);
+	return;
+
+ free_bios_submit_one_by_one:
+	/* free everything by calling the endio callback directly. */
+	for (i = 0; i < nr_elements && bios[i]; i++)
+		bio_endio(bios[i], 0);
+
+	kfree(bios);
+
+ submit_one_by_one:
+	dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
+
+	for (i = 0; i < mdev->act_log->nr_elements; i++) {
+		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+		if (enr == LC_FREE)
+			continue;
+		/* Really slow: if we have al-extents 16..19 active,
+		 * sector 4 will be written four times! Synchronous! */
+		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
+	}
+
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+	put_ldev(mdev);
+}
+
+/**
+ * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
+ * @mdev:	DRBD device.
+ */
+void drbd_al_apply_to_bm(struct drbd_conf *mdev)
+{
+	unsigned int enr;
+	unsigned long add = 0;
+	char ppb[10];
+	int i;
+
+	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+	for (i = 0; i < mdev->act_log->nr_elements; i++) {
+		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+		if (enr == LC_FREE)
+			continue;
+		add += drbd_bm_ALe_set_all(mdev, enr);
+	}
+
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+
+	dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
+	     ppsize(ppb, Bit2KB(add)));
+}
+
+static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
+{
+	int rv;
+
+	spin_lock_irq(&mdev->al_lock);
+	rv = (al_ext->refcnt == 0);
+	if (likely(rv))
+		lc_del(mdev->act_log, al_ext);
+	spin_unlock_irq(&mdev->al_lock);
+
+	return rv;
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents form the activity log
+ * @mdev:	DRBD device.
+ *
+ * Removes all active extents form the activity log, waiting until
+ * the reference count of each entry dropped to 0 first, of course.
+ *
+ * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_conf *mdev)
+{
+	struct lc_element *al_ext;
+	int i;
+
+	D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
+
+	for (i = 0; i < mdev->act_log->nr_elements; i++) {
+		al_ext = lc_element_by_index(mdev->act_log, i);
+		if (al_ext->lc_number == LC_FREE)
+			continue;
+		wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
+	}
+
+	wake_up(&mdev->al_wait);
+}
+
+static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+
+	if (!get_ldev(mdev)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
+		kfree(udw);
+		return 1;
+	}
+
+	drbd_bm_write_sect(mdev, udw->enr);
+	put_ldev(mdev);
+
+	kfree(udw);
+
+	if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
+		switch (mdev->state.conn) {
+		case C_SYNC_SOURCE:  case C_SYNC_TARGET:
+		case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
+			drbd_resync_finished(mdev);
+		default:
+			/* nothing to do */
+			break;
+		}
+	}
+	drbd_bcast_sync_progress(mdev);
+
+	return 1;
+}
+
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold an get_ldev() reference.
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
+				      int count, int success)
+{
+	struct lc_element *e;
+	struct update_odbm_work *udw;
+
+	unsigned int enr;
+
+	D_ASSERT(atomic_read(&mdev->local_cnt));
+
+	/* I simply assume that a sector/size pair never crosses
+	 * a 16 MB extent border. (Currently this is true...) */
+	enr = BM_SECT_TO_EXT(sector);
+
+	e = lc_get(mdev->resync, enr);
+	if (e) {
+		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+		if (ext->lce.lc_number == enr) {
+			if (success)
+				ext->rs_left -= count;
+			else
+				ext->rs_failed += count;
+			if (ext->rs_left < ext->rs_failed) {
+				dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
+				    "rs_failed=%d count=%d\n",
+				     (unsigned long long)sector,
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->rs_failed, count);
+				dump_stack();
+
+				lc_put(mdev->resync, &ext->lce);
+				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+				return;
+			}
+		} else {
+			/* Normally this element should be in the cache,
+			 * since drbd_rs_begin_io() pulled it already in.
+			 *
+			 * But maybe an application write finished, and we set
+			 * something outside the resync lru_cache in sync.
+			 */
+			int rs_left = drbd_bm_e_weight(mdev, enr);
+			if (ext->flags != 0) {
+				dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
+				     " -> %d[%u;00]\n",
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->flags, enr, rs_left);
+				ext->flags = 0;
+			}
+			if (ext->rs_failed) {
+				dev_warn(DEV, "Kicking resync_lru element enr=%u "
+				     "out with rs_failed=%d\n",
+				     ext->lce.lc_number, ext->rs_failed);
+				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+			}
+			ext->rs_left = rs_left;
+			ext->rs_failed = success ? 0 : count;
+			lc_changed(mdev->resync, &ext->lce);
+		}
+		lc_put(mdev->resync, &ext->lce);
+		/* no race, we are within the al_lock! */
+
+		if (ext->rs_left == ext->rs_failed) {
+			ext->rs_failed = 0;
+
+			udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
+			if (udw) {
+				udw->enr = ext->lce.lc_number;
+				udw->w.cb = w_update_odbm;
+				drbd_queue_work_front(&mdev->data.work, &udw->w);
+			} else {
+				dev_warn(DEV, "Could not kmalloc an udw\n");
+				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+			}
+		}
+	} else {
+		dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
+		    mdev->resync_locked,
+		    mdev->resync->nr_elements,
+		    mdev->resync->flags);
+	}
+}
+
+/* clear the bit corresponding to the piece of storage in question:
+ * size byte of data starting from sector.  Only clear a bits of the affected
+ * one ore more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
+ *
+ */
+void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
+		       const char *file, const unsigned int line)
+{
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count = 0;
+	sector_t esector, nr_sectors;
+	int wake_up = 0;
+	unsigned long flags;
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+		dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
+				(unsigned long long)sector, size);
+		return;
+	}
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	ERR_IF(sector >= nr_sectors) return;
+	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/* we clear it (in sync).
+	 * round up start sector, round down end sector.  we make sure we only
+	 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
+	if (unlikely(esector < BM_SECT_PER_BIT-1))
+		return;
+	if (unlikely(esector == (nr_sectors-1)))
+		ebnr = lbnr;
+	else
+		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+	if (sbnr > ebnr)
+		return;
+
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	spin_lock_irqsave(&mdev->al_lock, flags);
+	count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
+	if (count) {
+		/* we need the lock for drbd_try_clear_on_disk_bm */
+		if (jiffies - mdev->rs_mark_time > HZ*10) {
+			/* should be rolling marks,
+			 * but we estimate only anyways. */
+			if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
+			    mdev->state.conn != C_PAUSED_SYNC_T &&
+			    mdev->state.conn != C_PAUSED_SYNC_S) {
+				mdev->rs_mark_time = jiffies;
+				mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+			}
+		}
+		if (get_ldev(mdev)) {
+			drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
+			put_ldev(mdev);
+		}
+		/* just wake_up unconditional now, various lc_chaged(),
+		 * lc_put() in drbd_try_clear_on_disk_bm(). */
+		wake_up = 1;
+	}
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+	if (wake_up)
+		wake_up(&mdev->al_wait);
+}
+
+/*
+ * this is intended to set one request worth of data out of sync.
+ * affects at least 1 bit,
+ * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
+ *
+ * called by tl_clear and drbd_send_dblock (==drbd_make_request).
+ * so this can be _any_ process.
+ */
+void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
+			    const char *file, const unsigned int line)
+{
+	unsigned long sbnr, ebnr, lbnr, flags;
+	sector_t esector, nr_sectors;
+	unsigned int enr, count;
+	struct lc_element *e;
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+		dev_err(DEV, "sector: %llus, size: %d\n",
+			(unsigned long long)sector, size);
+		return;
+	}
+
+	if (!get_ldev(mdev))
+		return; /* no disk, no metadata, no bitmap to set bits in */
+
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	ERR_IF(sector >= nr_sectors)
+		goto out;
+	ERR_IF(esector >= nr_sectors)
+		esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/* we set it out of sync,
+	 * we do not need to round anything here */
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	/* ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.  */
+	spin_lock_irqsave(&mdev->al_lock, flags);
+	count = drbd_bm_set_bits(mdev, sbnr, ebnr);
+
+	enr = BM_SECT_TO_EXT(sector);
+	e = lc_find(mdev->resync, enr);
+	if (e)
+		lc_entry(e, struct bm_extent, lce)->rs_left += count;
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+
+out:
+	put_ldev(mdev);
+}
+
+static
+struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
+{
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int wakeup = 0;
+	unsigned long rs_flags;
+
+	spin_lock_irq(&mdev->al_lock);
+	if (mdev->resync_locked > mdev->resync->nr_elements/2) {
+		spin_unlock_irq(&mdev->al_lock);
+		return NULL;
+	}
+	e = lc_get(mdev->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (bm_ext) {
+		if (bm_ext->lce.lc_number != enr) {
+			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+			bm_ext->rs_failed = 0;
+			lc_changed(mdev->resync, &bm_ext->lce);
+			wakeup = 1;
+		}
+		if (bm_ext->lce.refcnt == 1)
+			mdev->resync_locked++;
+		set_bit(BME_NO_WRITES, &bm_ext->flags);
+	}
+	rs_flags = mdev->resync->flags;
+	spin_unlock_irq(&mdev->al_lock);
+	if (wakeup)
+		wake_up(&mdev->al_wait);
+
+	if (!bm_ext) {
+		if (rs_flags & LC_STARVING)
+			dev_warn(DEV, "Have to wait for element"
+			     " (resync LRU too small?)\n");
+		BUG_ON(rs_flags & LC_DIRTY);
+	}
+
+	return bm_ext;
+}
+
+static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
+{
+	struct lc_element *al_ext;
+	int rv = 0;
+
+	spin_lock_irq(&mdev->al_lock);
+	if (unlikely(enr == mdev->act_log->new_number))
+		rv = 1;
+	else {
+		al_ext = lc_find(mdev->act_log, enr);
+		if (al_ext) {
+			if (al_ext->refcnt)
+				rv = 1;
+		}
+	}
+	spin_unlock_irq(&mdev->al_lock);
+
+	/*
+	if (unlikely(rv)) {
+		dev_info(DEV, "Delaying sync read until app's write is done\n");
+	}
+	*/
+	return rv;
+}
+
+/**
+ * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
+ * @mdev:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
+ */
+int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct bm_extent *bm_ext;
+	int i, sig;
+
+	sig = wait_event_interruptible(mdev->al_wait,
+			(bm_ext = _bme_get(mdev, enr)));
+	if (sig)
+		return 0;
+
+	if (test_bit(BME_LOCKED, &bm_ext->flags))
+		return 1;
+
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		sig = wait_event_interruptible(mdev->al_wait,
+				!_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
+		if (sig) {
+			spin_lock_irq(&mdev->al_lock);
+			if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+				clear_bit(BME_NO_WRITES, &bm_ext->flags);
+				mdev->resync_locked--;
+				wake_up(&mdev->al_wait);
+			}
+			spin_unlock_irq(&mdev->al_lock);
+			return 0;
+		}
+	}
+
+	set_bit(BME_LOCKED, &bm_ext->flags);
+
+	return 1;
+}
+
+/**
+ * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
+ * @mdev:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
+ * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
+ * if there is still application IO going on in this area.
+ */
+int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+
+	spin_lock_irq(&mdev->al_lock);
+	if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
+		/* in case you have very heavy scattered io, it may
+		 * stall the syncer undefined if we give up the ref count
+		 * when we try again and requeue.
+		 *
+		 * if we don't give up the refcount, but the next time
+		 * we are scheduled this extent has been "synced" by new
+		 * application writes, we'd miss the lc_put on the
+		 * extent we keep the refcount on.
+		 * so we remembered which extent we had to try again, and
+		 * if the next requested one is something else, we do
+		 * the lc_put here...
+		 * we also have to wake_up
+		 */
+		e = lc_find(mdev->resync, mdev->resync_wenr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (bm_ext) {
+			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+			clear_bit(BME_NO_WRITES, &bm_ext->flags);
+			mdev->resync_wenr = LC_FREE;
+			if (lc_put(mdev->resync, &bm_ext->lce) == 0)
+				mdev->resync_locked--;
+			wake_up(&mdev->al_wait);
+		} else {
+			dev_alert(DEV, "LOGIC BUG\n");
+		}
+	}
+	/* TRY. */
+	e = lc_try_get(mdev->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (bm_ext) {
+		if (test_bit(BME_LOCKED, &bm_ext->flags))
+			goto proceed;
+		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			mdev->resync_locked++;
+		} else {
+			/* we did set the BME_NO_WRITES,
+			 * but then could not set BME_LOCKED,
+			 * so we tried again.
+			 * drop the extra reference. */
+			bm_ext->lce.refcnt--;
+			D_ASSERT(bm_ext->lce.refcnt > 0);
+		}
+		goto check_al;
+	} else {
+		/* do we rather want to try later? */
+		if (mdev->resync_locked > mdev->resync->nr_elements-3)
+			goto try_again;
+		/* Do or do not. There is no try. -- Yoda */
+		e = lc_get(mdev->resync, enr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (!bm_ext) {
+			const unsigned long rs_flags = mdev->resync->flags;
+			if (rs_flags & LC_STARVING)
+				dev_warn(DEV, "Have to wait for element"
+				     " (resync LRU too small?)\n");
+			BUG_ON(rs_flags & LC_DIRTY);
+			goto try_again;
+		}
+		if (bm_ext->lce.lc_number != enr) {
+			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+			bm_ext->rs_failed = 0;
+			lc_changed(mdev->resync, &bm_ext->lce);
+			wake_up(&mdev->al_wait);
+			D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+		}
+		set_bit(BME_NO_WRITES, &bm_ext->flags);
+		D_ASSERT(bm_ext->lce.refcnt == 1);
+		mdev->resync_locked++;
+		goto check_al;
+	}
+check_al:
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		if (unlikely(al_enr+i == mdev->act_log->new_number))
+			goto try_again;
+		if (lc_is_used(mdev->act_log, al_enr+i))
+			goto try_again;
+	}
+	set_bit(BME_LOCKED, &bm_ext->flags);
+proceed:
+	mdev->resync_wenr = LC_FREE;
+	spin_unlock_irq(&mdev->al_lock);
+	return 0;
+
+try_again:
+	if (bm_ext)
+		mdev->resync_wenr = enr;
+	spin_unlock_irq(&mdev->al_lock);
+	return -EAGAIN;
+}
+
+void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	unsigned long flags;
+
+	spin_lock_irqsave(&mdev->al_lock, flags);
+	e = lc_find(mdev->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (!bm_ext) {
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
+		return;
+	}
+
+	if (bm_ext->lce.refcnt == 0) {
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+		dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
+		    "but refcnt is 0!?\n",
+		    (unsigned long long)sector, enr);
+		return;
+	}
+
+	if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+		clear_bit(BME_LOCKED, &bm_ext->flags);
+		clear_bit(BME_NO_WRITES, &bm_ext->flags);
+		mdev->resync_locked--;
+		wake_up(&mdev->al_wait);
+	}
+
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+/**
+ * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
+ * @mdev:	DRBD device.
+ */
+void drbd_rs_cancel_all(struct drbd_conf *mdev)
+{
+	spin_lock_irq(&mdev->al_lock);
+
+	if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
+		lc_reset(mdev->resync);
+		put_ldev(mdev);
+	}
+	mdev->resync_locked = 0;
+	mdev->resync_wenr = LC_FREE;
+	spin_unlock_irq(&mdev->al_lock);
+	wake_up(&mdev->al_wait);
+}
+
+/**
+ * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
+ * @mdev:	DRBD device.
+ *
+ * Returns 0 upon success, -EAGAIN if at least one reference count was
+ * not zero.
+ */
+int drbd_rs_del_all(struct drbd_conf *mdev)
+{
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+
+	spin_lock_irq(&mdev->al_lock);
+
+	if (get_ldev_if_state(mdev, D_FAILED)) {
+		/* ok, ->resync is there. */
+		for (i = 0; i < mdev->resync->nr_elements; i++) {
+			e = lc_element_by_index(mdev->resync, i);
+			bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+			if (bm_ext->lce.lc_number == LC_FREE)
+				continue;
+			if (bm_ext->lce.lc_number == mdev->resync_wenr) {
+				dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
+				     " got 'synced' by application io\n",
+				     mdev->resync_wenr);
+				D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+				D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+				clear_bit(BME_NO_WRITES, &bm_ext->flags);
+				mdev->resync_wenr = LC_FREE;
+				lc_put(mdev->resync, &bm_ext->lce);
+			}
+			if (bm_ext->lce.refcnt != 0) {
+				dev_info(DEV, "Retrying drbd_rs_del_all() later. "
+				     "refcnt=%d\n", bm_ext->lce.refcnt);
+				put_ldev(mdev);
+				spin_unlock_irq(&mdev->al_lock);
+				return -EAGAIN;
+			}
+			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
+			lc_del(mdev->resync, &bm_ext->lce);
+		}
+		D_ASSERT(mdev->resync->used == 0);
+		put_ldev(mdev);
+	}
+	spin_unlock_irq(&mdev->al_lock);
+
+	return 0;
+}
+
+/**
+ * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
+ * @mdev:	DRBD device.
+ * @sector:	The sector number.
+ * @size:	Size of failed IO operation, in byte.
+ */
+void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count;
+	sector_t esector, nr_sectors;
+	int wake_up = 0;
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+		dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
+				(unsigned long long)sector, size);
+		return;
+	}
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	ERR_IF(sector >= nr_sectors) return;
+	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/*
+	 * round up start sector, round down end sector.  we make sure we only
+	 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
+	if (unlikely(esector < BM_SECT_PER_BIT-1))
+		return;
+	if (unlikely(esector == (nr_sectors-1)))
+		ebnr = lbnr;
+	else
+		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+	if (sbnr > ebnr)
+		return;
+
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	spin_lock_irq(&mdev->al_lock);
+	count = drbd_bm_count_bits(mdev, sbnr, ebnr);
+	if (count) {
+		mdev->rs_failed += count;
+
+		if (get_ldev(mdev)) {
+			drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
+			put_ldev(mdev);
+		}
+
+		/* just wake_up unconditional now, various lc_chaged(),
+		 * lc_put() in drbd_try_clear_on_disk_bm(). */
+		wake_up = 1;
+	}
+	spin_unlock_irq(&mdev->al_lock);
+	if (wake_up)
+		wake_up(&mdev->al_wait);
+}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 0000000..b61057e
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1327 @@
+/*
+   drbd_bitmap.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bitops.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/drbd.h>
+#include <asm/kmap_types.h>
+#include "drbd_int.h"
+
+/* OPAQUE outside this file!
+ * interface defined in drbd_int.h
+
+ * convention:
+ * function name drbd_bm_... => used elsewhere, "public".
+ * function name      bm_... => internal to implementation, "private".
+
+ * Note that since find_first_bit returns int, at the current granularity of
+ * the bitmap (4KB per byte), this implementation "only" supports up to
+ * 1<<(32+12) == 16 TB...
+ */
+
+/*
+ * NOTE
+ *  Access to the *bm_pages is protected by bm_lock.
+ *  It is safe to read the other members within the lock.
+ *
+ *  drbd_bm_set_bits is called from bio_endio callbacks,
+ *  We may be called with irq already disabled,
+ *  so we need spin_lock_irqsave().
+ *  And we need the kmap_atomic.
+ */
+struct drbd_bitmap {
+	struct page **bm_pages;
+	spinlock_t bm_lock;
+	/* WARNING unsigned long bm_*:
+	 * 32bit number of bit offset is just enough for 512 MB bitmap.
+	 * it will blow up if we make the bitmap bigger...
+	 * not that it makes much sense to have a bitmap that large,
+	 * rather change the granularity to 16k or 64k or something.
+	 * (that implies other problems, however...)
+	 */
+	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
+	unsigned long bm_bits;
+	size_t   bm_words;
+	size_t   bm_number_of_pages;
+	sector_t bm_dev_capacity;
+	struct semaphore bm_change; /* serializes resize operations */
+
+	atomic_t bm_async_io;
+	wait_queue_head_t bm_io_wait;
+
+	unsigned long  bm_flags;
+
+	/* debugging aid, in case we are still racy somewhere */
+	char          *bm_why;
+	struct task_struct *bm_task;
+};
+
+/* definition of bits in bm_flags */
+#define BM_LOCKED       0
+#define BM_MD_IO_ERROR  1
+#define BM_P_VMALLOCED  2
+
+static int bm_is_locked(struct drbd_bitmap *b)
+{
+	return test_bit(BM_LOCKED, &b->bm_flags);
+}
+
+#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
+static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	if (!__ratelimit(&drbd_ratelimit_state))
+		return;
+	dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
+	    current == mdev->receiver.task ? "receiver" :
+	    current == mdev->asender.task  ? "asender"  :
+	    current == mdev->worker.task   ? "worker"   : current->comm,
+	    func, b->bm_why ?: "?",
+	    b->bm_task == mdev->receiver.task ? "receiver" :
+	    b->bm_task == mdev->asender.task  ? "asender"  :
+	    b->bm_task == mdev->worker.task   ? "worker"   : "?");
+}
+
+void drbd_bm_lock(struct drbd_conf *mdev, char *why)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	int trylock_failed;
+
+	if (!b) {
+		dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
+		return;
+	}
+
+	trylock_failed = down_trylock(&b->bm_change);
+
+	if (trylock_failed) {
+		dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
+		    current == mdev->receiver.task ? "receiver" :
+		    current == mdev->asender.task  ? "asender"  :
+		    current == mdev->worker.task   ? "worker"   : current->comm,
+		    why, b->bm_why ?: "?",
+		    b->bm_task == mdev->receiver.task ? "receiver" :
+		    b->bm_task == mdev->asender.task  ? "asender"  :
+		    b->bm_task == mdev->worker.task   ? "worker"   : "?");
+		down(&b->bm_change);
+	}
+	if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
+		dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
+
+	b->bm_why  = why;
+	b->bm_task = current;
+}
+
+void drbd_bm_unlock(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	if (!b) {
+		dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
+		return;
+	}
+
+	if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
+		dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
+
+	b->bm_why  = NULL;
+	b->bm_task = NULL;
+	up(&b->bm_change);
+}
+
+/* word offset to long pointer */
+static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
+{
+	struct page *page;
+	unsigned long page_nr;
+
+	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
+	page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	page = b->bm_pages[page_nr];
+
+	return (unsigned long *) kmap_atomic(page, km);
+}
+
+static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
+{
+	return __bm_map_paddr(b, offset, KM_IRQ1);
+}
+
+static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
+{
+	kunmap_atomic(p_addr, km);
+};
+
+static void bm_unmap(unsigned long *p_addr)
+{
+	return __bm_unmap(p_addr, KM_IRQ1);
+}
+
+/* long word offset of _bitmap_ sector */
+#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* word offset from start of bitmap to word number _in_page_
+ * modulo longs per page
+#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
+ hm, well, Philipp thinks gcc might not optimze the % into & (... - 1)
+ so do it explicitly:
+ */
+#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
+
+/* Long words per page */
+#define LWPP (PAGE_SIZE/sizeof(long))
+
+/*
+ * actually most functions herein should take a struct drbd_bitmap*, not a
+ * struct drbd_conf*, but for the debug macros I like to have the mdev around
+ * to be able to report device specific.
+ */
+
+static void bm_free_pages(struct page **pages, unsigned long number)
+{
+	unsigned long i;
+	if (!pages)
+		return;
+
+	for (i = 0; i < number; i++) {
+		if (!pages[i]) {
+			printk(KERN_ALERT "drbd: bm_free_pages tried to free "
+					  "a NULL pointer; i=%lu n=%lu\n",
+					  i, number);
+			continue;
+		}
+		__free_page(pages[i]);
+		pages[i] = NULL;
+	}
+}
+
+static void bm_vk_free(void *ptr, int v)
+{
+	if (v)
+		vfree(ptr);
+	else
+		kfree(ptr);
+}
+
+/*
+ * "have" and "want" are NUMBER OF PAGES.
+ */
+static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
+{
+	struct page **old_pages = b->bm_pages;
+	struct page **new_pages, *page;
+	unsigned int i, bytes, vmalloced = 0;
+	unsigned long have = b->bm_number_of_pages;
+
+	BUG_ON(have == 0 && old_pages != NULL);
+	BUG_ON(have != 0 && old_pages == NULL);
+
+	if (have == want)
+		return old_pages;
+
+	/* Trying kmalloc first, falling back to vmalloc.
+	 * GFP_KERNEL is ok, as this is done when a lower level disk is
+	 * "attached" to the drbd.  Context is receiver thread or cqueue
+	 * thread.  As we have no disk yet, we are not in the IO path,
+	 * not even the IO path of the peer. */
+	bytes = sizeof(struct page *)*want;
+	new_pages = kmalloc(bytes, GFP_KERNEL);
+	if (!new_pages) {
+		new_pages = vmalloc(bytes);
+		if (!new_pages)
+			return NULL;
+		vmalloced = 1;
+	}
+
+	memset(new_pages, 0, bytes);
+	if (want >= have) {
+		for (i = 0; i < have; i++)
+			new_pages[i] = old_pages[i];
+		for (; i < want; i++) {
+			page = alloc_page(GFP_HIGHUSER);
+			if (!page) {
+				bm_free_pages(new_pages + have, i - have);
+				bm_vk_free(new_pages, vmalloced);
+				return NULL;
+			}
+			new_pages[i] = page;
+		}
+	} else {
+		for (i = 0; i < want; i++)
+			new_pages[i] = old_pages[i];
+		/* NOT HERE, we are outside the spinlock!
+		bm_free_pages(old_pages + want, have - want);
+		*/
+	}
+
+	if (vmalloced)
+		set_bit(BM_P_VMALLOCED, &b->bm_flags);
+	else
+		clear_bit(BM_P_VMALLOCED, &b->bm_flags);
+
+	return new_pages;
+}
+
+/*
+ * called on driver init only. TODO call when a device is created.
+ * allocates the drbd_bitmap, and stores it in mdev->bitmap.
+ */
+int drbd_bm_init(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	WARN_ON(b != NULL);
+	b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
+	if (!b)
+		return -ENOMEM;
+	spin_lock_init(&b->bm_lock);
+	init_MUTEX(&b->bm_change);
+	init_waitqueue_head(&b->bm_io_wait);
+
+	mdev->bitmap = b;
+
+	return 0;
+}
+
+sector_t drbd_bm_capacity(struct drbd_conf *mdev)
+{
+	ERR_IF(!mdev->bitmap) return 0;
+	return mdev->bitmap->bm_dev_capacity;
+}
+
+/* called on driver unload. TODO: call when a device is destroyed.
+ */
+void drbd_bm_cleanup(struct drbd_conf *mdev)
+{
+	ERR_IF (!mdev->bitmap) return;
+	bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
+	bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
+	kfree(mdev->bitmap);
+	mdev->bitmap = NULL;
+}
+
+/*
+ * since (b->bm_bits % BITS_PER_LONG) != 0,
+ * this masks out the remaining bits.
+ * Returns the number of bits cleared.
+ */
+static int bm_clear_surplus(struct drbd_bitmap *b)
+{
+	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
+	size_t w = b->bm_bits >> LN2_BPL;
+	int cleared = 0;
+	unsigned long *p_addr, *bm;
+
+	p_addr = bm_map_paddr(b, w);
+	bm = p_addr + MLPP(w);
+	if (w < b->bm_words) {
+		cleared = hweight_long(*bm & ~mask);
+		*bm &= mask;
+		w++; bm++;
+	}
+
+	if (w < b->bm_words) {
+		cleared += hweight_long(*bm);
+		*bm = 0;
+	}
+	bm_unmap(p_addr);
+	return cleared;
+}
+
+static void bm_set_surplus(struct drbd_bitmap *b)
+{
+	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
+	size_t w = b->bm_bits >> LN2_BPL;
+	unsigned long *p_addr, *bm;
+
+	p_addr = bm_map_paddr(b, w);
+	bm = p_addr + MLPP(w);
+	if (w < b->bm_words) {
+		*bm |= ~mask;
+		bm++; w++;
+	}
+
+	if (w < b->bm_words) {
+		*bm = ~(0UL);
+	}
+	bm_unmap(p_addr);
+}
+
+static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
+{
+	unsigned long *p_addr, *bm, offset = 0;
+	unsigned long bits = 0;
+	unsigned long i, do_now;
+
+	while (offset < b->bm_words) {
+		i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
+		p_addr = __bm_map_paddr(b, offset, KM_USER0);
+		bm = p_addr + MLPP(offset);
+		while (i--) {
+#ifndef __LITTLE_ENDIAN
+			if (swap_endian)
+				*bm = lel_to_cpu(*bm);
+#endif
+			bits += hweight_long(*bm++);
+		}
+		__bm_unmap(p_addr, KM_USER0);
+		offset += do_now;
+		cond_resched();
+	}
+
+	return bits;
+}
+
+static unsigned long bm_count_bits(struct drbd_bitmap *b)
+{
+	return __bm_count_bits(b, 0);
+}
+
+static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
+{
+	return __bm_count_bits(b, 1);
+}
+
+/* offset and len in long words.*/
+static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
+{
+	unsigned long *p_addr, *bm;
+	size_t do_now, end;
+
+#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
+
+	end = offset + len;
+
+	if (end > b->bm_words) {
+		printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+		return;
+	}
+
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
+		p_addr = bm_map_paddr(b, offset);
+		bm = p_addr + MLPP(offset);
+		if (bm+do_now > p_addr + LWPP) {
+			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+			       p_addr, bm, (int)do_now);
+			break; /* breaks to after catch_oob_access_end() only! */
+		}
+		memset(bm, c, do_now * sizeof(long));
+		bm_unmap(p_addr);
+		offset += do_now;
+	}
+}
+
+/*
+ * make sure the bitmap has enough room for the attached storage,
+ * if necessary, resize.
+ * called whenever we may have changed the device size.
+ * returns -ENOMEM if we could not allocate enough memory, 0 on success.
+ * In case this is actually a resize, we copy the old bitmap into the new one.
+ * Otherwise, the bitmap is initialized to all bits set.
+ */
+int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long bits, words, owords, obits, *p_addr, *bm;
+	unsigned long want, have, onpages; /* number of pages */
+	struct page **npages, **opages = NULL;
+	int err = 0, growing;
+	int opages_vmalloced;
+
+	ERR_IF(!b) return -ENOMEM;
+
+	drbd_bm_lock(mdev, "resize");
+
+	dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
+			(unsigned long long)capacity);
+
+	if (capacity == b->bm_dev_capacity)
+		goto out;
+
+	opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
+
+	if (capacity == 0) {
+		spin_lock_irq(&b->bm_lock);
+		opages = b->bm_pages;
+		onpages = b->bm_number_of_pages;
+		owords = b->bm_words;
+		b->bm_pages = NULL;
+		b->bm_number_of_pages =
+		b->bm_set   =
+		b->bm_bits  =
+		b->bm_words =
+		b->bm_dev_capacity = 0;
+		spin_unlock_irq(&b->bm_lock);
+		bm_free_pages(opages, onpages);
+		bm_vk_free(opages, opages_vmalloced);
+		goto out;
+	}
+	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
+
+	/* if we would use
+	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
+	   a 32bit host could present the wrong number of words
+	   to a 64bit host.
+	*/
+	words = ALIGN(bits, 64) >> LN2_BPL;
+
+	if (get_ldev(mdev)) {
+		D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
+		put_ldev(mdev);
+	}
+
+	/* one extra long to catch off by one errors */
+	want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+	have = b->bm_number_of_pages;
+	if (want == have) {
+		D_ASSERT(b->bm_pages != NULL);
+		npages = b->bm_pages;
+	} else {
+		if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
+			npages = NULL;
+		else
+			npages = bm_realloc_pages(b, want);
+	}
+
+	if (!npages) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock_irq(&b->bm_lock);
+	opages = b->bm_pages;
+	owords = b->bm_words;
+	obits  = b->bm_bits;
+
+	growing = bits > obits;
+	if (opages)
+		bm_set_surplus(b);
+
+	b->bm_pages = npages;
+	b->bm_number_of_pages = want;
+	b->bm_bits  = bits;
+	b->bm_words = words;
+	b->bm_dev_capacity = capacity;
+
+	if (growing) {
+		bm_memset(b, owords, 0xff, words-owords);
+		b->bm_set += bits - obits;
+	}
+
+	if (want < have) {
+		/* implicit: (opages != NULL) && (opages != npages) */
+		bm_free_pages(opages + want, have - want);
+	}
+
+	p_addr = bm_map_paddr(b, words);
+	bm = p_addr + MLPP(words);
+	*bm = DRBD_MAGIC;
+	bm_unmap(p_addr);
+
+	(void)bm_clear_surplus(b);
+
+	spin_unlock_irq(&b->bm_lock);
+	if (opages != npages)
+		bm_vk_free(opages, opages_vmalloced);
+	if (!growing)
+		b->bm_set = bm_count_bits(b);
+	dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
+
+ out:
+	drbd_bm_unlock(mdev);
+	return err;
+}
+
+/* inherently racy:
+ * if not protected by other means, return value may be out of date when
+ * leaving this function...
+ * we still need to lock it, since it is important that this returns
+ * bm_set == 0 precisely.
+ *
+ * maybe bm_set should be atomic_t ?
+ */
+static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long s;
+	unsigned long flags;
+
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	s = b->bm_set;
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+
+	return s;
+}
+
+unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
+{
+	unsigned long s;
+	/* if I don't have a disk, I don't know about out-of-sync status */
+	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
+		return 0;
+	s = _drbd_bm_total_weight(mdev);
+	put_ldev(mdev);
+	return s;
+}
+
+size_t drbd_bm_words(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	return b->bm_words;
+}
+
+unsigned long drbd_bm_bits(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return 0;
+
+	return b->bm_bits;
+}
+
+/* merge number words from buffer into the bitmap starting at offset.
+ * buffer[i] is expected to be little endian unsigned long.
+ * bitmap must be locked by drbd_bm_lock.
+ * currently only used from receive_bitmap.
+ */
+void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+			unsigned long *buffer)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
+	unsigned long word, bits;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+	if (number == 0)
+		return;
+	WARN_ON(offset >= b->bm_words);
+	WARN_ON(end    >  b->bm_words);
+
+	spin_lock_irq(&b->bm_lock);
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+		p_addr = bm_map_paddr(b, offset);
+		bm = p_addr + MLPP(offset);
+		offset += do_now;
+		while (do_now--) {
+			bits = hweight_long(*bm);
+			word = *bm | lel_to_cpu(*buffer++);
+			*bm++ = word;
+			b->bm_set += hweight_long(word) - bits;
+		}
+		bm_unmap(p_addr);
+	}
+	/* with 32bit <-> 64bit cross-platform connect
+	 * this is only correct for current usage,
+	 * where we _know_ that we are 64 bit aligned,
+	 * and know that this function is used in this way, too...
+	 */
+	if (end == b->bm_words)
+		b->bm_set -= bm_clear_surplus(b);
+
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* copy number words from the bitmap starting at offset into the buffer.
+ * buffer[i] will be little endian unsigned long.
+ */
+void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+		     unsigned long *buffer)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	if ((offset >= b->bm_words) ||
+	    (end    >  b->bm_words) ||
+	    (number <= 0))
+		dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
+			(unsigned long)	offset,
+			(unsigned long)	number,
+			(unsigned long) b->bm_words);
+	else {
+		while (offset < end) {
+			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+			p_addr = bm_map_paddr(b, offset);
+			bm = p_addr + MLPP(offset);
+			offset += do_now;
+			while (do_now--)
+				*buffer++ = cpu_to_lel(*bm++);
+			bm_unmap(p_addr);
+		}
+	}
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* set all bits in the bitmap */
+void drbd_bm_set_all(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	bm_memset(b, 0, 0xff, b->bm_words);
+	(void)bm_clear_surplus(b);
+	b->bm_set = b->bm_bits;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* clear all bits in the bitmap */
+void drbd_bm_clear_all(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	bm_memset(b, 0, 0, b->bm_words);
+	b->bm_set = 0;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+static void bm_async_io_complete(struct bio *bio, int error)
+{
+	struct drbd_bitmap *b = bio->bi_private;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+
+	/* strange behavior of some lower level drivers...
+	 * fail the request by clearing the uptodate flag,
+	 * but do not return any error?!
+	 * do we want to WARN() on this? */
+	if (!error && !uptodate)
+		error = -EIO;
+
+	if (error) {
+		/* doh. what now?
+		 * for now, set all bits, and flag MD_IO_ERROR */
+		__set_bit(BM_MD_IO_ERROR, &b->bm_flags);
+	}
+	if (atomic_dec_and_test(&b->bm_async_io))
+		wake_up(&b->bm_io_wait);
+
+	bio_put(bio);
+}
+
+static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
+{
+	/* we are process context. we always get a bio */
+	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	unsigned int len;
+	sector_t on_disk_sector =
+		mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
+	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
+
+	/* this might happen with very small
+	 * flexible external meta data device */
+	len = min_t(unsigned int, PAGE_SIZE,
+		(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
+
+	bio->bi_bdev = mdev->ldev->md_bdev;
+	bio->bi_sector = on_disk_sector;
+	bio_add_page(bio, b->bm_pages[page_nr], len, 0);
+	bio->bi_private = b;
+	bio->bi_end_io = bm_async_io_complete;
+
+	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+		bio->bi_rw |= rw;
+		bio_endio(bio, -EIO);
+	} else {
+		submit_bio(rw, bio);
+	}
+}
+
+# if defined(__LITTLE_ENDIAN)
+	/* nothing to do, on disk == in memory */
+# define bm_cpu_to_lel(x) ((void)0)
+# else
+void bm_cpu_to_lel(struct drbd_bitmap *b)
+{
+	/* need to cpu_to_lel all the pages ...
+	 * this may be optimized by using
+	 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
+	 * the following is still not optimal, but better than nothing */
+	unsigned int i;
+	unsigned long *p_addr, *bm;
+	if (b->bm_set == 0) {
+		/* no page at all; avoid swap if all is 0 */
+		i = b->bm_number_of_pages;
+	} else if (b->bm_set == b->bm_bits) {
+		/* only the last page */
+		i = b->bm_number_of_pages - 1;
+	} else {
+		/* all pages */
+		i = 0;
+	}
+	for (; i < b->bm_number_of_pages; i++) {
+		p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
+		for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
+			*bm = cpu_to_lel(*bm);
+		kunmap_atomic(p_addr, KM_USER0);
+	}
+}
+# endif
+/* lel_to_cpu == cpu_to_lel */
+# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
+
+/*
+ * bm_rw: read/write the whole bitmap from/to its on disk location.
+ */
+static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	/* sector_t sector; */
+	int bm_words, num_pages, i;
+	unsigned long now;
+	char ppb[10];
+	int err = 0;
+
+	WARN_ON(!bm_is_locked(b));
+
+	/* no spinlock here, the drbd_bm_lock should be enough! */
+
+	bm_words  = drbd_bm_words(mdev);
+	num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+	/* on disk bitmap is little endian */
+	if (rw == WRITE)
+		bm_cpu_to_lel(b);
+
+	now = jiffies;
+	atomic_set(&b->bm_async_io, num_pages);
+	__clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
+
+	/* let the layers below us try to merge these bios... */
+	for (i = 0; i < num_pages; i++)
+		bm_page_io_async(mdev, b, i, rw);
+
+	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
+	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
+
+	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
+		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
+		drbd_chk_io_error(mdev, 1, TRUE);
+		err = -EIO;
+	}
+
+	now = jiffies;
+	if (rw == WRITE) {
+		/* swap back endianness */
+		bm_lel_to_cpu(b);
+		/* flush bitmap to stable storage */
+		drbd_md_flush(mdev);
+	} else /* rw == READ */ {
+		/* just read, if necessary adjust endianness */
+		b->bm_set = bm_count_bits_swap_endian(b);
+		dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
+		     jiffies - now);
+	}
+	now = b->bm_set;
+
+	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+	     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+
+	return err;
+}
+
+/**
+ * drbd_bm_read() - Read the whole bitmap from its on disk location.
+ * @mdev:	DRBD device.
+ */
+int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, READ);
+}
+
+/**
+ * drbd_bm_write() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ */
+int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE);
+}
+
+/**
+ * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
+ * @mdev:	DRBD device.
+ * @enr:	Extent number in the resync lru (happens to be sector offset)
+ *
+ * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
+ * by a single sector write. Therefore enr == sector offset from the
+ * start of the bitmap.
+ */
+int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
+{
+	sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+				      + mdev->ldev->md.bm_offset;
+	int bm_words, num_words, offset;
+	int err = 0;
+
+	mutex_lock(&mdev->md_io_mutex);
+	bm_words  = drbd_bm_words(mdev);
+	offset    = S2W(enr);	/* word offset into bitmap */
+	num_words = min(S2W(1), bm_words - offset);
+	if (num_words < S2W(1))
+		memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
+	drbd_bm_get_lel(mdev, offset, num_words,
+			page_address(mdev->md_io_page));
+	if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
+		int i;
+		err = -EIO;
+		dev_err(DEV, "IO ERROR writing bitmap sector %lu "
+		    "(meta-disk sector %llus)\n",
+		    enr, (unsigned long long)on_disk_sector);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
+			drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
+	}
+	mdev->bm_writ_cnt++;
+	mutex_unlock(&mdev->md_io_mutex);
+	return err;
+}
+
+/* NOTE
+ * find_first_bit returns int, we return unsigned long.
+ * should not make much difference anyways, but ...
+ *
+ * this returns a bit number, NOT a sector!
+ */
+#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
+static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
+	const int find_zero_bit, const enum km_type km)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long i = -1UL;
+	unsigned long *p_addr;
+	unsigned long bit_offset; /* bit offset of the mapped page. */
+
+	if (bm_fo > b->bm_bits) {
+		dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+	} else {
+		while (bm_fo < b->bm_bits) {
+			unsigned long offset;
+			bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
+			offset = bit_offset >> LN2_BPL;    /* word offset of the page */
+			p_addr = __bm_map_paddr(b, offset, km);
+
+			if (find_zero_bit)
+				i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+			else
+				i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+
+			__bm_unmap(p_addr, km);
+			if (i < PAGE_SIZE*8) {
+				i = bit_offset + i;
+				if (i >= b->bm_bits)
+					break;
+				goto found;
+			}
+			bm_fo = bit_offset + PAGE_SIZE*8;
+		}
+		i = -1UL;
+	}
+ found:
+	return i;
+}
+
+static unsigned long bm_find_next(struct drbd_conf *mdev,
+	unsigned long bm_fo, const int find_zero_bit)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long i = -1UL;
+
+	ERR_IF(!b) return i;
+	ERR_IF(!b->bm_pages) return i;
+
+	spin_lock_irq(&b->bm_lock);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+
+	i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
+
+	spin_unlock_irq(&b->bm_lock);
+	return i;
+}
+
+unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	return bm_find_next(mdev, bm_fo, 0);
+}
+
+#if 0
+/* not yet needed for anything. */
+unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	return bm_find_next(mdev, bm_fo, 1);
+}
+#endif
+
+/* does not spin_lock_irqsave.
+ * you must take drbd_bm_lock() first */
+unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	/* WARN_ON(!bm_is_locked(mdev)); */
+	return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
+}
+
+unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	/* WARN_ON(!bm_is_locked(mdev)); */
+	return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector.
+ * expected to be called for only a few bits (e - s about BITS_PER_LONG).
+ * Must hold bitmap lock already. */
+int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+	unsigned long e, int val, const enum km_type km)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr = NULL;
+	unsigned long bitnr;
+	unsigned long last_page_nr = -1UL;
+	int c = 0;
+
+	if (e >= b->bm_bits) {
+		dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
+				s, e, b->bm_bits);
+		e = b->bm_bits ? b->bm_bits -1 : 0;
+	}
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		unsigned long offset = bitnr>>LN2_BPL;
+		unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+		if (page_nr != last_page_nr) {
+			if (p_addr)
+				__bm_unmap(p_addr, km);
+			p_addr = __bm_map_paddr(b, offset, km);
+			last_page_nr = page_nr;
+		}
+		if (val)
+			c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
+		else
+			c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
+	}
+	if (p_addr)
+		__bm_unmap(p_addr, km);
+	b->bm_set += c;
+	return c;
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector */
+int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+	const unsigned long e, int val)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = mdev->bitmap;
+	int c = 0;
+
+	ERR_IF(!b) return 1;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+
+	c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+/* returns number of bits changed 0 -> 1 */
+int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	return bm_change_bits_to(mdev, s, e, 1);
+}
+
+/* returns number of bits changed 1 -> 0 */
+int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	return -bm_change_bits_to(mdev, s, e, 0);
+}
+
+/* sets all bits in full words,
+ * from first_word up to, but not including, last_word */
+static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
+		int page_nr, int first_word, int last_word)
+{
+	int i;
+	int bits;
+	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
+	for (i = first_word; i < last_word; i++) {
+		bits = hweight_long(paddr[i]);
+		paddr[i] = ~0UL;
+		b->bm_set += BITS_PER_LONG - bits;
+	}
+	kunmap_atomic(paddr, KM_USER0);
+}
+
+/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
+ * You must first drbd_bm_lock().
+ * Can be called to set the whole bitmap in one go.
+ * Sets bits from s to e _inclusive_. */
+void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	/* First set_bit from the first bit (s)
+	 * up to the next long boundary (sl),
+	 * then assign full words up to the last long boundary (el),
+	 * then set_bit up to and including the last bit (e).
+	 *
+	 * Do not use memset, because we must account for changes,
+	 * so we need to loop over the words with hweight() anyways.
+	 */
+	unsigned long sl = ALIGN(s,BITS_PER_LONG);
+	unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
+	int first_page;
+	int last_page;
+	int page_nr;
+	int first_word;
+	int last_word;
+
+	if (e - s <= 3*BITS_PER_LONG) {
+		/* don't bother; el and sl may even be wrong. */
+		__bm_change_bits_to(mdev, s, e, 1, KM_USER0);
+		return;
+	}
+
+	/* difference is large enough that we can trust sl and el */
+
+	/* bits filling the current long */
+	if (sl)
+		__bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
+
+	first_page = sl >> (3 + PAGE_SHIFT);
+	last_page = el >> (3 + PAGE_SHIFT);
+
+	/* MLPP: modulo longs per page */
+	/* LWPP: long words per page */
+	first_word = MLPP(sl >> LN2_BPL);
+	last_word = LWPP;
+
+	/* first and full pages, unless first page == last page */
+	for (page_nr = first_page; page_nr < last_page; page_nr++) {
+		bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
+		cond_resched();
+		first_word = 0;
+	}
+
+	/* last page (respectively only page, for first page == last page) */
+	last_word = MLPP(el >> LN2_BPL);
+	bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
+
+	/* possibly trailing bits.
+	 * example: (e & 63) == 63, el will be e+1.
+	 * if that even was the very last bit,
+	 * it would trigger an assert in __bm_change_bits_to()
+	 */
+	if (el <= e)
+		__bm_change_bits_to(mdev, el, e, 1, KM_USER0);
+}
+
+/* returns bit state
+ * wants bitnr, NOT sector.
+ * inherently racy... area needs to be locked by means of {al,rs}_lru
+ *  1 ... bit set
+ *  0 ... bit not set
+ * -1 ... first out of bounds access, stop testing for bits!
+ */
+int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr;
+	int i;
+
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+	if (bitnr < b->bm_bits) {
+		unsigned long offset = bitnr>>LN2_BPL;
+		p_addr = bm_map_paddr(b, offset);
+		i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
+		bm_unmap(p_addr);
+	} else if (bitnr == b->bm_bits) {
+		i = -1;
+	} else { /* (bitnr > b->bm_bits) */
+		dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
+		i = 0;
+	}
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return i;
+}
+
+/* returns number of bits set in the range [s, e] */
+int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr = NULL, page_nr = -1;
+	unsigned long bitnr;
+	int c = 0;
+	size_t w;
+
+	/* If this is called without a bitmap, that is a bug.  But just to be
+	 * robust in case we screwed up elsewhere, in that case pretend there
+	 * was one dirty bit in the requested area, so we won't try to do a
+	 * local read there (no bitmap probably implies no disk) */
+	ERR_IF(!b) return 1;
+	ERR_IF(!b->bm_pages) return 1;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		w = bitnr >> LN2_BPL;
+		if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
+			page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
+			if (p_addr)
+				bm_unmap(p_addr);
+			p_addr = bm_map_paddr(b, w);
+		}
+		ERR_IF (bitnr >= b->bm_bits) {
+			dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
+		} else {
+			c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+		}
+	}
+	if (p_addr)
+		bm_unmap(p_addr);
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+
+/* inherently racy...
+ * return value may be already out-of-date when this function returns.
+ * but the general usage is that this is only use during a cstate when bits are
+ * only cleared, not set, and typically only care for the case when the return
+ * value is zero, or we already "locked" this "bitmap extent" by other means.
+ *
+ * enr is bm-extent number, since we chose to name one sector (512 bytes)
+ * worth of the bitmap a "bitmap extent".
+ *
+ * TODO
+ * I think since we use it like a reference count, we should use the real
+ * reference count of some bitmap extent element from some lru instead...
+ *
+ */
+int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	int count, s, e;
+	unsigned long flags;
+	unsigned long *p_addr, *bm;
+
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+
+	s = S2W(enr);
+	e = min((size_t)S2W(enr+1), b->bm_words);
+	count = 0;
+	if (s < b->bm_words) {
+		int n = e-s;
+		p_addr = bm_map_paddr(b, s);
+		bm = p_addr + MLPP(s);
+		while (n--)
+			count += hweight_long(*bm++);
+		bm_unmap(p_addr);
+	} else {
+		dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
+	}
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return count;
+}
+
+/* set all bits covered by the AL-extent al_enr */
+unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
+	unsigned long weight;
+	int count, s, e, i, do_now;
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irq(&b->bm_lock);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+	weight = b->bm_set;
+
+	s = al_enr * BM_WORDS_PER_AL_EXT;
+	e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
+	/* assert that s and e are on the same page */
+	D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
+	      ==  s    >> (PAGE_SHIFT - LN2_BPL + 3));
+	count = 0;
+	if (s < b->bm_words) {
+		i = do_now = e-s;
+		p_addr = bm_map_paddr(b, s);
+		bm = p_addr + MLPP(s);
+		while (i--) {
+			count += hweight_long(*bm);
+			*bm = -1UL;
+			bm++;
+		}
+		bm_unmap(p_addr);
+		b->bm_set += do_now*BITS_PER_LONG - count;
+		if (e == b->bm_words)
+			b->bm_set -= bm_clear_surplus(b);
+	} else {
+		dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
+	}
+	weight = b->bm_set - weight;
+	spin_unlock_irq(&b->bm_lock);
+	return weight;
+}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 0000000..2312d78
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2252 @@
+/*
+  drbd_int.h
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#ifndef _DRBD_INT_H
+#define _DRBD_INT_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/ratelimit.h>
+#include <linux/tcp.h>
+#include <linux/mutex.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#include <net/tcp.h>
+#include <linux/lru_cache.h>
+
+#ifdef __CHECKER__
+# define __protected_by(x)       __attribute__((require_context(x,1,999,"rdwr")))
+# define __protected_read_by(x)  __attribute__((require_context(x,1,999,"read")))
+# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
+# define __must_hold(x)       __attribute__((context(x,1,1), require_context(x,1,999,"call")))
+#else
+# define __protected_by(x)
+# define __protected_read_by(x)
+# define __protected_write_by(x)
+# define __must_hold(x)
+#endif
+
+#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
+
+/* module parameter, defined in drbd_main.c */
+extern unsigned int minor_count;
+extern int disable_sendpage;
+extern int allow_oos;
+extern unsigned int cn_idx;
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern int enable_faults;
+extern int fault_rate;
+extern int fault_devs;
+#endif
+
+extern char usermode_helper[];
+
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/* I don't remember why XCPU ...
+ * This is used to wake the asender,
+ * and to interrupt sending the sending task
+ * on disconnect.
+ */
+#define DRBD_SIG SIGXCPU
+
+/* This is used to stop/restart our threads.
+ * Cannot use SIGTERM nor SIGKILL, since these
+ * are sent out by init on runlevel changes
+ * I choose SIGHUP for now.
+ */
+#define DRBD_SIGKILL SIGHUP
+
+/* All EEs on the free list should have ID_VACANT (== 0)
+ * freshly allocated EEs get !ID_VACANT (== 1)
+ * so if it says "cannot dereference null pointer at adress 0x00000001",
+ * it is most likely one of these :( */
+
+#define ID_IN_SYNC      (4711ULL)
+#define ID_OUT_OF_SYNC  (4712ULL)
+
+#define ID_SYNCER (-1ULL)
+#define ID_VACANT 0
+#define is_syncer_block_id(id) ((id) == ID_SYNCER)
+
+struct drbd_conf;
+
+
+/* to shorten dev_warn(DEV, "msg"); and relatives statements */
+#define DEV (disk_to_dev(mdev->vdisk))
+
+#define D_ASSERT(exp)	if (!(exp)) \
+	 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
+
+#define ERR_IF(exp) if (({				\
+	int _b = (exp) != 0;				\
+	if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n",	\
+		__func__, #exp, __FILE__, __LINE__);	\
+	 _b;						\
+	}))
+
+/* Defines to control fault insertion */
+enum {
+	DRBD_FAULT_MD_WR = 0,	/* meta data write */
+	DRBD_FAULT_MD_RD = 1,	/*           read  */
+	DRBD_FAULT_RS_WR = 2,	/* resync          */
+	DRBD_FAULT_RS_RD = 3,
+	DRBD_FAULT_DT_WR = 4,	/* data            */
+	DRBD_FAULT_DT_RD = 5,
+	DRBD_FAULT_DT_RA = 6,	/* data read ahead */
+	DRBD_FAULT_BM_ALLOC = 7,	/* bitmap allocation */
+	DRBD_FAULT_AL_EE = 8,	/* alloc ee */
+
+	DRBD_FAULT_MAX,
+};
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern unsigned int
+_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
+static inline int
+drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
+	return fault_rate &&
+		(enable_faults & (1<<type)) &&
+		_drbd_insert_fault(mdev, type);
+}
+#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
+
+#else
+#define FAULT_ACTIVE(_m, _t) (0)
+#endif
+
+/* integer division, round _UP_ to the next integer */
+#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
+/* usual integer division */
+#define div_floor(A, B) ((A)/(B))
+
+/* drbd_meta-data.c (still in drbd_main.c) */
+/* 4th incarnation of the disk layout. */
+#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
+
+extern struct drbd_conf **minor_table;
+extern struct ratelimit_state drbd_ratelimit_state;
+
+/* on the wire */
+enum drbd_packets {
+	/* receiver (data socket) */
+	P_DATA		      = 0x00,
+	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
+	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
+	P_BARRIER	      = 0x03,
+	P_BITMAP	      = 0x04,
+	P_BECOME_SYNC_TARGET  = 0x05,
+	P_BECOME_SYNC_SOURCE  = 0x06,
+	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
+	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
+	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
+	P_SYNC_PARAM	      = 0x0a,
+	P_PROTOCOL	      = 0x0b,
+	P_UUIDS		      = 0x0c,
+	P_SIZES		      = 0x0d,
+	P_STATE		      = 0x0e,
+	P_SYNC_UUID	      = 0x0f,
+	P_AUTH_CHALLENGE      = 0x10,
+	P_AUTH_RESPONSE	      = 0x11,
+	P_STATE_CHG_REQ	      = 0x12,
+
+	/* asender (meta socket */
+	P_PING		      = 0x13,
+	P_PING_ACK	      = 0x14,
+	P_RECV_ACK	      = 0x15, /* Used in protocol B */
+	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
+	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
+	P_DISCARD_ACK	      = 0x18, /* Used in proto C, two-primaries conflict detection */
+	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
+	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
+	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
+	P_BARRIER_ACK	      = 0x1c,
+	P_STATE_CHG_REPLY     = 0x1d,
+
+	/* "new" commands, no longer fitting into the ordering scheme above */
+
+	P_OV_REQUEST	      = 0x1e, /* data socket */
+	P_OV_REPLY	      = 0x1f,
+	P_OV_RESULT	      = 0x20, /* meta socket */
+	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
+	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
+	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
+	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */
+
+	P_MAX_CMD	      = 0x25,
+	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
+	P_MAX_OPT_CMD	      = 0x101,
+
+	/* special command ids for handshake */
+
+	P_HAND_SHAKE_M	      = 0xfff1, /* First Packet on the MetaSock */
+	P_HAND_SHAKE_S	      = 0xfff2, /* First Packet on the Socket */
+
+	P_HAND_SHAKE	      = 0xfffe	/* FIXED for the next century! */
+};
+
+static inline const char *cmdname(enum drbd_packets cmd)
+{
+	/* THINK may need to become several global tables
+	 * when we want to support more than
+	 * one PRO_VERSION */
+	static const char *cmdnames[] = {
+		[P_DATA]	        = "Data",
+		[P_DATA_REPLY]	        = "DataReply",
+		[P_RS_DATA_REPLY]	= "RSDataReply",
+		[P_BARRIER]	        = "Barrier",
+		[P_BITMAP]	        = "ReportBitMap",
+		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
+		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
+		[P_UNPLUG_REMOTE]	= "UnplugRemote",
+		[P_DATA_REQUEST]	= "DataRequest",
+		[P_RS_DATA_REQUEST]     = "RSDataRequest",
+		[P_SYNC_PARAM]	        = "SyncParam",
+		[P_SYNC_PARAM89]	= "SyncParam89",
+		[P_PROTOCOL]            = "ReportProtocol",
+		[P_UUIDS]	        = "ReportUUIDs",
+		[P_SIZES]	        = "ReportSizes",
+		[P_STATE]	        = "ReportState",
+		[P_SYNC_UUID]           = "ReportSyncUUID",
+		[P_AUTH_CHALLENGE]      = "AuthChallenge",
+		[P_AUTH_RESPONSE]	= "AuthResponse",
+		[P_PING]		= "Ping",
+		[P_PING_ACK]	        = "PingAck",
+		[P_RECV_ACK]	        = "RecvAck",
+		[P_WRITE_ACK]	        = "WriteAck",
+		[P_RS_WRITE_ACK]	= "RSWriteAck",
+		[P_DISCARD_ACK]	        = "DiscardAck",
+		[P_NEG_ACK]	        = "NegAck",
+		[P_NEG_DREPLY]	        = "NegDReply",
+		[P_NEG_RS_DREPLY]	= "NegRSDReply",
+		[P_BARRIER_ACK]	        = "BarrierAck",
+		[P_STATE_CHG_REQ]       = "StateChgRequest",
+		[P_STATE_CHG_REPLY]     = "StateChgReply",
+		[P_OV_REQUEST]          = "OVRequest",
+		[P_OV_REPLY]            = "OVReply",
+		[P_OV_RESULT]           = "OVResult",
+		[P_MAX_CMD]	        = NULL,
+	};
+
+	if (cmd == P_HAND_SHAKE_M)
+		return "HandShakeM";
+	if (cmd == P_HAND_SHAKE_S)
+		return "HandShakeS";
+	if (cmd == P_HAND_SHAKE)
+		return "HandShake";
+	if (cmd >= P_MAX_CMD)
+		return "Unknown";
+	return cmdnames[cmd];
+}
+
+/* for sending/receiving the bitmap,
+ * possibly in some encoding scheme */
+struct bm_xfer_ctx {
+	/* "const"
+	 * stores total bits and long words
+	 * of the bitmap, so we don't need to
+	 * call the accessor functions over and again. */
+	unsigned long bm_bits;
+	unsigned long bm_words;
+	/* during xfer, current position within the bitmap */
+	unsigned long bit_offset;
+	unsigned long word_offset;
+
+	/* statistics; index: (h->command == P_BITMAP) */
+	unsigned packets[2];
+	unsigned bytes[2];
+};
+
+extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+		const char *direction, struct bm_xfer_ctx *c);
+
+static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
+{
+	/* word_offset counts "native long words" (32 or 64 bit),
+	 * aligned at 64 bit.
+	 * Encoded packet may end at an unaligned bit offset.
+	 * In case a fallback clear text packet is transmitted in
+	 * between, we adjust this offset back to the last 64bit
+	 * aligned "native long word", which makes coding and decoding
+	 * the plain text bitmap much more convenient.  */
+#if BITS_PER_LONG == 64
+	c->word_offset = c->bit_offset >> 6;
+#elif BITS_PER_LONG == 32
+	c->word_offset = c->bit_offset >> 5;
+	c->word_offset &= ~(1UL);
+#else
+# error "unsupported BITS_PER_LONG"
+#endif
+}
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+/* This is the layout for a packet on the wire.
+ * The byteorder is the network byte order.
+ *     (except block_id and barrier fields.
+ *	these are pointers to local structs
+ *	and have no relevance for the partner,
+ *	which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
+ */
+struct p_header {
+	u32	  magic;
+	u16	  command;
+	u16	  length;	/* bytes of data after this header */
+	u8	  payload[0];
+} __packed;
+/* 8 bytes. packet FIXED for the next century! */
+
+/*
+ * short commands, packets without payload, plain p_header:
+ *   P_PING
+ *   P_PING_ACK
+ *   P_BECOME_SYNC_TARGET
+ *   P_BECOME_SYNC_SOURCE
+ *   P_UNPLUG_REMOTE
+ */
+
+/*
+ * commands with out-of-struct payload:
+ *   P_BITMAP    (no additional fields)
+ *   P_DATA, P_DATA_REPLY (see p_data)
+ *   P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
+ */
+
+/* these defines must not be changed without changing the protocol version */
+#define DP_HARDBARRIER	      1
+#define DP_RW_SYNC	      2
+#define DP_MAY_SET_IN_SYNC    4
+
+struct p_data {
+	struct p_header head;
+	u64	    sector;    /* 64 bits sector number */
+	u64	    block_id;  /* to identify the request in protocol B&C */
+	u32	    seq_num;
+	u32	    dp_flags;
+} __packed;
+
+/*
+ * commands which share a struct:
+ *  p_block_ack:
+ *   P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
+ *   P_DISCARD_ACK (proto C, two-primaries conflict detection)
+ *  p_block_req:
+ *   P_DATA_REQUEST, P_RS_DATA_REQUEST
+ */
+struct p_block_ack {
+	struct p_header head;
+	u64	    sector;
+	u64	    block_id;
+	u32	    blksize;
+	u32	    seq_num;
+} __packed;
+
+
+struct p_block_req {
+	struct p_header head;
+	u64 sector;
+	u64 block_id;
+	u32 blksize;
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+/*
+ * commands with their own struct for additional fields:
+ *   P_HAND_SHAKE
+ *   P_BARRIER
+ *   P_BARRIER_ACK
+ *   P_SYNC_PARAM
+ *   ReportParams
+ */
+
+struct p_handshake {
+	struct p_header head;	/* 8 bytes */
+	u32 protocol_min;
+	u32 feature_flags;
+	u32 protocol_max;
+
+	/* should be more than enough for future enhancements
+	 * for now, feature_flags and the reserverd array shall be zero.
+	 */
+
+	u32 _pad;
+	u64 reserverd[7];
+} __packed;
+/* 80 bytes, FIXED for the next century */
+
+struct p_barrier {
+	struct p_header head;
+	u32 barrier;	/* barrier number _handle_ only */
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+struct p_barrier_ack {
+	struct p_header head;
+	u32 barrier;
+	u32 set_size;
+} __packed;
+
+struct p_rs_param {
+	struct p_header head;
+	u32 rate;
+
+	      /* Since protocol version 88 and higher. */
+	char verify_alg[0];
+} __packed;
+
+struct p_rs_param_89 {
+	struct p_header head;
+	u32 rate;
+        /* protocol version 89: */
+	char verify_alg[SHARED_SECRET_MAX];
+	char csums_alg[SHARED_SECRET_MAX];
+} __packed;
+
+struct p_protocol {
+	struct p_header head;
+	u32 protocol;
+	u32 after_sb_0p;
+	u32 after_sb_1p;
+	u32 after_sb_2p;
+	u32 want_lose;
+	u32 two_primaries;
+
+              /* Since protocol version 87 and higher. */
+	char integrity_alg[0];
+
+} __packed;
+
+struct p_uuids {
+	struct p_header head;
+	u64 uuid[UI_EXTENDED_SIZE];
+} __packed;
+
+struct p_rs_uuid {
+	struct p_header head;
+	u64	    uuid;
+} __packed;
+
+struct p_sizes {
+	struct p_header head;
+	u64	    d_size;  /* size of disk */
+	u64	    u_size;  /* user requested size */
+	u64	    c_size;  /* current exported size */
+	u32	    max_segment_size;  /* Maximal size of a BIO */
+	u32	    queue_order_type;
+} __packed;
+
+struct p_state {
+	struct p_header head;
+	u32	    state;
+} __packed;
+
+struct p_req_state {
+	struct p_header head;
+	u32	    mask;
+	u32	    val;
+} __packed;
+
+struct p_req_state_reply {
+	struct p_header head;
+	u32	    retcode;
+} __packed;
+
+struct p_drbd06_param {
+	u64	  size;
+	u32	  state;
+	u32	  blksize;
+	u32	  protocol;
+	u32	  version;
+	u32	  gen_cnt[5];
+	u32	  bit_map_gen[5];
+} __packed;
+
+struct p_discard {
+	struct p_header head;
+	u64	    block_id;
+	u32	    seq_num;
+	u32	    pad;
+} __packed;
+
+/* Valid values for the encoding field.
+ * Bump proto version when changing this. */
+enum drbd_bitmap_code {
+	/* RLE_VLI_Bytes = 0,
+	 * and other bit variants had been defined during
+	 * algorithm evaluation. */
+	RLE_VLI_Bits = 2,
+};
+
+struct p_compressed_bm {
+	struct p_header head;
+	/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
+	 * (encoding & 0x80): polarity (set/unset) of first runlength
+	 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
+	 * used to pad up to head.length bytes
+	 */
+	u8 encoding;
+
+	u8 code[0];
+} __packed;
+
+/* DCBP: Drbd Compressed Bitmap Packet ... */
+static inline enum drbd_bitmap_code
+DCBP_get_code(struct p_compressed_bm *p)
+{
+	return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static inline void
+DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
+{
+	BUG_ON(code & ~0xf);
+	p->encoding = (p->encoding & ~0xf) | code;
+}
+
+static inline int
+DCBP_get_start(struct p_compressed_bm *p)
+{
+	return (p->encoding & 0x80) != 0;
+}
+
+static inline void
+DCBP_set_start(struct p_compressed_bm *p, int set)
+{
+	p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
+
+static inline int
+DCBP_get_pad_bits(struct p_compressed_bm *p)
+{
+	return (p->encoding >> 4) & 0x7;
+}
+
+static inline void
+DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+	BUG_ON(n & ~0x7);
+	p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
+}
+
+/* one bitmap packet, including the p_header,
+ * should fit within one _architecture independend_ page.
+ * so we need to use the fixed size 4KiB page size
+ * most architechtures have used for a long time.
+ */
+#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
+#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
+#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
+#if (PAGE_SIZE < 4096)
+/* drbd_send_bitmap / receive_bitmap would break horribly */
+#error "PAGE_SIZE too small"
+#endif
+
+union p_polymorph {
+        struct p_header          header;
+        struct p_handshake       handshake;
+        struct p_data            data;
+        struct p_block_ack       block_ack;
+        struct p_barrier         barrier;
+        struct p_barrier_ack     barrier_ack;
+        struct p_rs_param_89     rs_param_89;
+        struct p_protocol        protocol;
+        struct p_sizes           sizes;
+        struct p_uuids           uuids;
+        struct p_state           state;
+        struct p_req_state       req_state;
+        struct p_req_state_reply req_state_reply;
+        struct p_block_req       block_req;
+} __packed;
+
+/**********************************************************************/
+enum drbd_thread_state {
+	None,
+	Running,
+	Exiting,
+	Restarting
+};
+
+struct drbd_thread {
+	spinlock_t t_lock;
+	struct task_struct *task;
+	struct completion stop;
+	enum drbd_thread_state t_state;
+	int (*function) (struct drbd_thread *);
+	struct drbd_conf *mdev;
+	int reset_cpu_mask;
+};
+
+static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
+{
+	/* THINK testing the t_state seems to be uncritical in all cases
+	 * (but thread_{start,stop}), so we can read it *without* the lock.
+	 *	--lge */
+
+	smp_rmb();
+	return thi->t_state;
+}
+
+
+/*
+ * Having this as the first member of a struct provides sort of "inheritance".
+ * "derived" structs can be "drbd_queue_work()"ed.
+ * The callback should know and cast back to the descendant struct.
+ * drbd_request and drbd_epoch_entry are descendants of drbd_work.
+ */
+struct drbd_work;
+typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
+struct drbd_work {
+	struct list_head list;
+	drbd_work_cb cb;
+};
+
+struct drbd_tl_epoch;
+struct drbd_request {
+	struct drbd_work w;
+	struct drbd_conf *mdev;
+
+	/* if local IO is not allowed, will be NULL.
+	 * if local IO _is_ allowed, holds the locally submitted bio clone,
+	 * or, after local IO completion, the ERR_PTR(error).
+	 * see drbd_endio_pri(). */
+	struct bio *private_bio;
+
+	struct hlist_node colision;
+	sector_t sector;
+	unsigned int size;
+	unsigned int epoch; /* barrier_nr */
+
+	/* barrier_nr: used to check on "completion" whether this req was in
+	 * the current epoch, and we therefore have to close it,
+	 * starting a new epoch...
+	 */
+
+	/* up to here, the struct layout is identical to drbd_epoch_entry;
+	 * we might be able to use that to our advantage...  */
+
+	struct list_head tl_requests; /* ring list in the transfer log */
+	struct bio *master_bio;       /* master bio pointer */
+	unsigned long rq_state; /* see comments above _req_mod() */
+	int seq_num;
+	unsigned long start_time;
+};
+
+struct drbd_tl_epoch {
+	struct drbd_work w;
+	struct list_head requests; /* requests before */
+	struct drbd_tl_epoch *next; /* pointer to the next barrier */
+	unsigned int br_number;  /* the barriers identifier. */
+	int n_req;	/* number of requests attached before this barrier */
+};
+
+struct drbd_request;
+
+/* These Tl_epoch_entries may be in one of 6 lists:
+   active_ee .. data packet being written
+   sync_ee   .. syncer block being written
+   done_ee   .. block written, need to send P_WRITE_ACK
+   read_ee   .. [RS]P_DATA_REQUEST being read
+*/
+
+struct drbd_epoch {
+	struct list_head list;
+	unsigned int barrier_nr;
+	atomic_t epoch_size; /* increased on every request added. */
+	atomic_t active;     /* increased on every req. added, and dec on every finished. */
+	unsigned long flags;
+};
+
+/* drbd_epoch flag bits */
+enum {
+	DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
+	DE_BARRIER_IN_NEXT_EPOCH_DONE,
+	DE_CONTAINS_A_BARRIER,
+	DE_HAVE_BARRIER_NUMBER,
+	DE_IS_FINISHING,
+};
+
+enum epoch_event {
+	EV_PUT,
+	EV_GOT_BARRIER_NR,
+	EV_BARRIER_DONE,
+	EV_BECAME_LAST,
+	EV_CLEANUP = 32, /* used as flag */
+};
+
+struct drbd_epoch_entry {
+	struct drbd_work    w;
+	struct drbd_conf *mdev;
+	struct bio *private_bio;
+	struct hlist_node colision;
+	sector_t sector;
+	unsigned int size;
+	struct drbd_epoch *epoch;
+
+	/* up to here, the struct layout is identical to drbd_request;
+	 * we might be able to use that to our advantage...  */
+
+	unsigned int flags;
+	u64    block_id;
+};
+
+struct drbd_wq_barrier {
+	struct drbd_work w;
+	struct completion done;
+};
+
+struct digest_info {
+	int digest_size;
+	void *digest;
+};
+
+/* ee flag bits */
+enum {
+	__EE_CALL_AL_COMPLETE_IO,
+	__EE_CONFLICT_PENDING,
+	__EE_MAY_SET_IN_SYNC,
+	__EE_IS_BARRIER,
+};
+#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
+#define EE_CONFLICT_PENDING    (1<<__EE_CONFLICT_PENDING)
+#define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
+
+/* global flag bits */
+enum {
+	CREATE_BARRIER,		/* next P_DATA is preceeded by a P_BARRIER */
+	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
+	SEND_PING,		/* whether asender should send a ping asap */
+
+	STOP_SYNC_TIMER,	/* tell timer to cancel itself */
+	UNPLUG_QUEUED,		/* only relevant with kernel 2.4 */
+	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
+	MD_DIRTY,		/* current uuids and flags not yet on disk */
+	DISCARD_CONCURRENT,	/* Set on one node, cleared on the peer! */
+	USE_DEGR_WFC_T,		/* degr-wfc-timeout instead of wfc-timeout. */
+	CLUSTER_ST_CHANGE,	/* Cluster wide state change going on... */
+	CL_ST_CHG_SUCCESS,
+	CL_ST_CHG_FAIL,
+	CRASHED_PRIMARY,	/* This node was a crashed primary.
+				 * Gets cleared when the state.conn
+				 * goes into C_CONNECTED state. */
+	WRITE_BM_AFTER_RESYNC,	/* A kmalloc() during resync failed */
+	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
+	CONSIDER_RESYNC,
+
+	MD_NO_BARRIER,		/* meta data device does not support barriers,
+				   so don't even try */
+	SUSPEND_IO,		/* suspend application io */
+	BITMAP_IO,		/* suspend application io;
+				   once no more io in flight, start bitmap io */
+	BITMAP_IO_QUEUED,       /* Started bitmap IO */
+	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
+	NET_CONGESTED,		/* The data socket is congested */
+
+	CONFIG_PENDING,		/* serialization of (re)configuration requests.
+				 * if set, also prevents the device from dying */
+	DEVICE_DYING,		/* device became unconfigured,
+				 * but worker thread is still handling the cleanup.
+				 * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed,
+				 * while this is set. */
+	RESIZE_PENDING,		/* Size change detected locally, waiting for the response from
+				 * the peer, if it changed there as well. */
+};
+
+struct drbd_bitmap; /* opaque for drbd_conf */
+
+/* TODO sort members for performance
+ * MAYBE group them further */
+
+/* THINK maybe we actually want to use the default "event/%s" worker threads
+ * or similar in linux 2.6, which uses per cpu data and threads.
+ *
+ * To be general, this might need a spin_lock member.
+ * For now, please use the mdev->req_lock to protect list_head,
+ * see drbd_queue_work below.
+ */
+struct drbd_work_queue {
+	struct list_head q;
+	struct semaphore s; /* producers up it, worker down()s it */
+	spinlock_t q_lock;  /* to protect the list. */
+};
+
+struct drbd_socket {
+	struct drbd_work_queue work;
+	struct mutex mutex;
+	struct socket    *socket;
+	/* this way we get our
+	 * send/receive buffers off the stack */
+	union p_polymorph sbuf;
+	union p_polymorph rbuf;
+};
+
+struct drbd_md {
+	u64 md_offset;		/* sector offset to 'super' block */
+
+	u64 la_size_sect;	/* last agreed size, unit sectors */
+	u64 uuid[UI_SIZE];
+	u64 device_uuid;
+	u32 flags;
+	u32 md_size_sect;
+
+	s32 al_offset;	/* signed relative sector offset to al area */
+	s32 bm_offset;	/* signed relative sector offset to bitmap */
+
+	/* u32 al_nr_extents;	   important for restoring the AL
+	 * is stored into  sync_conf.al_extents, which in turn
+	 * gets applied to act_log->nr_elements
+	 */
+};
+
+/* for sync_conf and other types... */
+#define NL_PACKET(name, number, fields) struct name { fields };
+#define NL_INTEGER(pn,pr,member) int member;
+#define NL_INT64(pn,pr,member) __u64 member;
+#define NL_BIT(pn,pr,member)   unsigned member:1;
+#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
+#include "linux/drbd_nl.h"
+
+struct drbd_backing_dev {
+	struct block_device *backing_bdev;
+	struct block_device *md_bdev;
+	struct file *lo_file;
+	struct file *md_file;
+	struct drbd_md md;
+	struct disk_conf dc; /* The user provided config... */
+	sector_t known_size; /* last known size of that backing device */
+};
+
+struct drbd_md_io {
+	struct drbd_conf *mdev;
+	struct completion event;
+	int error;
+};
+
+struct bm_io_work {
+	struct drbd_work w;
+	char *why;
+	int (*io_fn)(struct drbd_conf *mdev);
+	void (*done)(struct drbd_conf *mdev, int rv);
+};
+
+enum write_ordering_e {
+	WO_none,
+	WO_drain_io,
+	WO_bdev_flush,
+	WO_bio_barrier
+};
+
+struct drbd_conf {
+	/* things that are stored as / read from meta data on disk */
+	unsigned long flags;
+
+	/* configured by drbdsetup */
+	struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
+	struct syncer_conf sync_conf;
+	struct drbd_backing_dev *ldev __protected_by(local);
+
+	sector_t p_size;     /* partner's disk size */
+	struct request_queue *rq_queue;
+	struct block_device *this_bdev;
+	struct gendisk	    *vdisk;
+
+	struct drbd_socket data; /* data/barrier/cstate/parameter packets */
+	struct drbd_socket meta; /* ping/ack (metadata) packets */
+	int agreed_pro_version;  /* actually used protocol version */
+	unsigned long last_received; /* in jiffies, either socket */
+	unsigned int ko_count;
+	struct drbd_work  resync_work,
+			  unplug_work,
+			  md_sync_work;
+	struct timer_list resync_timer;
+	struct timer_list md_sync_timer;
+
+	/* Used after attach while negotiating new disk state. */
+	union drbd_state new_state_tmp;
+
+	union drbd_state state;
+	wait_queue_head_t misc_wait;
+	wait_queue_head_t state_wait;  /* upon each state change. */
+	unsigned int send_cnt;
+	unsigned int recv_cnt;
+	unsigned int read_cnt;
+	unsigned int writ_cnt;
+	unsigned int al_writ_cnt;
+	unsigned int bm_writ_cnt;
+	atomic_t ap_bio_cnt;	 /* Requests we need to complete */
+	atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
+	atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
+	atomic_t unacked_cnt;	 /* Need to send replys for */
+	atomic_t local_cnt;	 /* Waiting for local completion */
+	atomic_t net_cnt;	 /* Users of net_conf */
+	spinlock_t req_lock;
+	struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
+	struct drbd_tl_epoch *newest_tle;
+	struct drbd_tl_epoch *oldest_tle;
+	struct list_head out_of_sequence_requests;
+	struct hlist_head *tl_hash;
+	unsigned int tl_hash_s;
+
+	/* blocks to sync in this run [unit BM_BLOCK_SIZE] */
+	unsigned long rs_total;
+	/* number of sync IOs that failed in this run */
+	unsigned long rs_failed;
+	/* Syncer's start time [unit jiffies] */
+	unsigned long rs_start;
+	/* cumulated time in PausedSyncX state [unit jiffies] */
+	unsigned long rs_paused;
+	/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
+	unsigned long rs_mark_left;
+	/* marks's time [unit jiffies] */
+	unsigned long rs_mark_time;
+	/* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
+	unsigned long rs_same_csum;
+
+	/* where does the admin want us to start? (sector) */
+	sector_t ov_start_sector;
+	/* where are we now? (sector) */
+	sector_t ov_position;
+	/* Start sector of out of sync range (to merge printk reporting). */
+	sector_t ov_last_oos_start;
+	/* size of out-of-sync range in sectors. */
+	sector_t ov_last_oos_size;
+	unsigned long ov_left; /* in bits */
+	struct crypto_hash *csums_tfm;
+	struct crypto_hash *verify_tfm;
+
+	struct drbd_thread receiver;
+	struct drbd_thread worker;
+	struct drbd_thread asender;
+	struct drbd_bitmap *bitmap;
+	unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
+
+	/* Used to track operations of resync... */
+	struct lru_cache *resync;
+	/* Number of locked elements in resync LRU */
+	unsigned int resync_locked;
+	/* resync extent number waiting for application requests */
+	unsigned int resync_wenr;
+
+	int open_cnt;
+	u64 *p_uuid;
+	struct drbd_epoch *current_epoch;
+	spinlock_t epoch_lock;
+	unsigned int epochs;
+	enum write_ordering_e write_ordering;
+	struct list_head active_ee; /* IO in progress */
+	struct list_head sync_ee;   /* IO in progress */
+	struct list_head done_ee;   /* send ack */
+	struct list_head read_ee;   /* IO in progress */
+	struct list_head net_ee;    /* zero-copy network send in progress */
+	struct hlist_head *ee_hash; /* is proteced by req_lock! */
+	unsigned int ee_hash_s;
+
+	/* this one is protected by ee_lock, single thread */
+	struct drbd_epoch_entry *last_write_w_barrier;
+
+	int next_barrier_nr;
+	struct hlist_head *app_reads_hash; /* is proteced by req_lock */
+	struct list_head resync_reads;
+	atomic_t pp_in_use;
+	wait_queue_head_t ee_wait;
+	struct page *md_io_page;	/* one page buffer for md_io */
+	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
+	struct mutex md_io_mutex;	/* protects the md_io_buffer */
+	spinlock_t al_lock;
+	wait_queue_head_t al_wait;
+	struct lru_cache *act_log;	/* activity log */
+	unsigned int al_tr_number;
+	int al_tr_cycle;
+	int al_tr_pos;   /* position of the next transaction in the journal */
+	struct crypto_hash *cram_hmac_tfm;
+	struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
+	struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
+	void *int_dig_out;
+	void *int_dig_in;
+	void *int_dig_vv;
+	wait_queue_head_t seq_wait;
+	atomic_t packet_seq;
+	unsigned int peer_seq;
+	spinlock_t peer_seq_lock;
+	unsigned int minor;
+	unsigned long comm_bm_set; /* communicated number of set bits. */
+	cpumask_var_t cpu_mask;
+	struct bm_io_work bm_io_work;
+	u64 ed_uuid; /* UUID of the exposed data */
+	struct mutex state_mutex;
+	char congestion_reason;  /* Why we where congested... */
+};
+
+static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
+{
+	struct drbd_conf *mdev;
+
+	mdev = minor < minor_count ? minor_table[minor] : NULL;
+
+	return mdev;
+}
+
+static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
+{
+	return mdev->minor;
+}
+
+/* returns 1 if it was successfull,
+ * returns 0 if there was no data socket.
+ * so wherever you are going to use the data.socket, e.g. do
+ * if (!drbd_get_data_sock(mdev))
+ *	return 0;
+ *	CODE();
+ * drbd_put_data_sock(mdev);
+ */
+static inline int drbd_get_data_sock(struct drbd_conf *mdev)
+{
+	mutex_lock(&mdev->data.mutex);
+	/* drbd_disconnect() could have called drbd_free_sock()
+	 * while we were waiting in down()... */
+	if (unlikely(mdev->data.socket == NULL)) {
+		mutex_unlock(&mdev->data.mutex);
+		return 0;
+	}
+	return 1;
+}
+
+static inline void drbd_put_data_sock(struct drbd_conf *mdev)
+{
+	mutex_unlock(&mdev->data.mutex);
+}
+
+/*
+ * function declarations
+ *************************/
+
+/* drbd_main.c */
+
+enum chg_state_flags {
+	CS_HARD	= 1,
+	CS_VERBOSE = 2,
+	CS_WAIT_COMPLETE = 4,
+	CS_SERIALIZE    = 8,
+	CS_ORDERED      = CS_WAIT_COMPLETE + CS_SERIALIZE,
+};
+
+extern void drbd_init_set_defaults(struct drbd_conf *mdev);
+extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
+			union drbd_state mask, union drbd_state val);
+extern void drbd_force_state(struct drbd_conf *, union drbd_state,
+			union drbd_state);
+extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
+			union drbd_state, enum chg_state_flags);
+extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
+			    enum chg_state_flags, struct completion *done);
+extern void print_st_err(struct drbd_conf *, union drbd_state,
+			union drbd_state, int);
+extern int  drbd_thread_start(struct drbd_thread *thi);
+extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+#ifdef CONFIG_SMP
+extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
+extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
+#else
+#define drbd_thread_current_set_cpu(A) ({})
+#define drbd_calc_cpu_mask(A) ({})
+#endif
+extern void drbd_free_resources(struct drbd_conf *mdev);
+extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+		       unsigned int set_size);
+extern void tl_clear(struct drbd_conf *mdev);
+extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
+extern void drbd_free_sock(struct drbd_conf *mdev);
+extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+			void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_protocol(struct drbd_conf *mdev);
+extern int drbd_send_uuids(struct drbd_conf *mdev);
+extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
+extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
+extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
+extern int _drbd_send_state(struct drbd_conf *mdev);
+extern int drbd_send_state(struct drbd_conf *mdev);
+extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
+			enum drbd_packets cmd, struct p_header *h,
+			size_t size, unsigned msg_flags);
+#define USE_DATA_SOCKET 1
+#define USE_META_SOCKET 0
+extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
+			enum drbd_packets cmd, struct p_header *h,
+			size_t size);
+extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
+			char *data, size_t size);
+extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
+extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
+			u32 set_size);
+extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
+			struct drbd_epoch_entry *e);
+extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
+			struct p_block_req *rp);
+extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
+			struct p_data *dp);
+extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+			    sector_t sector, int blksize, u64 block_id);
+extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
+			   struct drbd_epoch_entry *e);
+extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
+extern int _drbd_send_barrier(struct drbd_conf *mdev,
+			struct drbd_tl_epoch *barrier);
+extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+			      sector_t sector, int size, u64 block_id);
+extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
+				   sector_t sector,int size,
+				   void *digest, int digest_size,
+				   enum drbd_packets cmd);
+extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
+
+extern int drbd_send_bitmap(struct drbd_conf *mdev);
+extern int _drbd_send_bitmap(struct drbd_conf *mdev);
+extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
+extern void drbd_free_bc(struct drbd_backing_dev *ldev);
+extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
+
+/* drbd_meta-data.c (still in drbd_main.c) */
+extern void drbd_md_sync(struct drbd_conf *mdev);
+extern int  drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
+/* maybe define them below as inline? */
+extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
+extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
+extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
+				 int (*io_fn)(struct drbd_conf *),
+				 void (*done)(struct drbd_conf *, int),
+				 char *why);
+extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
+extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
+extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
+
+
+/* Meta data layout
+   We reserve a 128MB Block (4k aligned)
+   * either at the end of the backing device
+   * or on a seperate meta data device. */
+
+#define MD_RESERVED_SECT (128LU << 11)  /* 128 MB, unit sectors */
+/* The following numbers are sectors */
+#define MD_AL_OFFSET 8	    /* 8 Sectors after start of meta area */
+#define MD_AL_MAX_SIZE 64   /* = 32 kb LOG  ~ 3776 extents ~ 14 GB Storage */
+/* Allows up to about 3.8TB */
+#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
+
+/* Since the smalles IO unit is usually 512 byte */
+#define MD_SECTOR_SHIFT	 9
+#define MD_SECTOR_SIZE	 (1<<MD_SECTOR_SHIFT)
+
+/* activity log */
+#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
+#define AL_EXTENT_SHIFT 22		 /* One extent represents 4M Storage */
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+#if BITS_PER_LONG == 32
+#define LN2_BPL 5
+#define cpu_to_lel(A) cpu_to_le32(A)
+#define lel_to_cpu(A) le32_to_cpu(A)
+#elif BITS_PER_LONG == 64
+#define LN2_BPL 6
+#define cpu_to_lel(A) cpu_to_le64(A)
+#define lel_to_cpu(A) le64_to_cpu(A)
+#else
+#error "LN2 of BITS_PER_LONG unknown!"
+#endif
+
+/* resync bitmap */
+/* 16MB sized 'bitmap extent' to track syncer usage */
+struct bm_extent {
+	int rs_left; /* number of bits set (out of sync) in this extent. */
+	int rs_failed; /* number of failed resync requests in this extent. */
+	unsigned long flags;
+	struct lc_element lce;
+};
+
+#define BME_NO_WRITES  0  /* bm_extent.flags: no more requests on this one! */
+#define BME_LOCKED     1  /* bm_extent.flags: syncer active on this one. */
+
+/* drbd_bitmap.c */
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define BM_BLOCK_SHIFT  12			 /* 4k per bit */
+#define BM_BLOCK_SIZE	 (1<<BM_BLOCK_SHIFT)
+/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
+ * per sector of on disk bitmap */
+#define BM_EXT_SHIFT	 (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3)  /* = 24 */
+#define BM_EXT_SIZE	 (1<<BM_EXT_SHIFT)
+
+#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
+#error "HAVE YOU FIXED drbdmeta AS WELL??"
+#endif
+
+/* thus many _storage_ sectors are described by one bit */
+#define BM_SECT_TO_BIT(x)   ((x)>>(BM_BLOCK_SHIFT-9))
+#define BM_BIT_TO_SECT(x)   ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
+#define BM_SECT_PER_BIT     BM_BIT_TO_SECT(1)
+
+/* bit to represented kilo byte conversion */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located in */
+#define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
+
+/* how much _storage_ sectors we have per bitmap sector */
+#define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
+#define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
+
+/* in one sector of the bitmap, we have this many activity_log extents. */
+#define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
+#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+
+#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
+#define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
+
+/* the extent in "PER_EXTENT" below is an activity log extent
+ * we need that many (long words/bytes) to store the bitmap
+ *		     of one AL_EXTENT_SIZE chunk of storage.
+ * we can store the bitmap for that many AL_EXTENTS within
+ * one sector of the _on_disk_ bitmap:
+ * bit	 0	  bit 37   bit 38	     bit (512*8)-1
+ *	     ...|........|........|.. // ..|........|
+ * sect. 0	 `296	  `304			   ^(512*8*8)-1
+ *
+#define BM_WORDS_PER_EXT    ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
+#define BM_BYTES_PER_EXT    ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 )  // 128
+#define BM_EXT_PER_SECT	    ( 512 / BM_BYTES_PER_EXTENT )	 //   4
+ */
+
+#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
+#define DRBD_MAX_SECTORS_BM \
+	  ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
+#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_BM
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
+#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
+#else
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_BM
+/* 16 TB in units of sectors */
+#if BITS_PER_LONG == 32
+/* adjust by one page worth of bitmap,
+ * so we won't wrap around in drbd_bm_find_next_bit.
+ * you should use 64bit OS for that much storage, anyways. */
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
+#else
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32)
+#endif
+#endif
+
+/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
+ * With a value of 6 all IO in one 32K block make it to the same slot of the
+ * hash table. */
+#define HT_SHIFT 6
+#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
+
+/* Number of elements in the app_reads_hash */
+#define APP_R_HSIZE 15
+
+extern int  drbd_bm_init(struct drbd_conf *mdev);
+extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
+extern void drbd_bm_cleanup(struct drbd_conf *mdev);
+extern void drbd_bm_set_all(struct drbd_conf *mdev);
+extern void drbd_bm_clear_all(struct drbd_conf *mdev);
+extern int  drbd_bm_set_bits(
+		struct drbd_conf *mdev, unsigned long s, unsigned long e);
+extern int  drbd_bm_clear_bits(
+		struct drbd_conf *mdev, unsigned long s, unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock */
+extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
+		const unsigned long s, const unsigned long e);
+extern int  drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
+extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
+extern int  drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
+extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
+extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
+		unsigned long al_enr);
+extern size_t	     drbd_bm_words(struct drbd_conf *mdev);
+extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
+extern sector_t      drbd_bm_capacity(struct drbd_conf *mdev);
+extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
+/* bm_find_next variants for use while you hold drbd_bm_lock() */
+extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
+extern int drbd_bm_rs_done(struct drbd_conf *mdev);
+/* for receive_bitmap */
+extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
+		size_t number, unsigned long *buffer);
+/* for _drbd_send_bitmap and drbd_bm_write_sect */
+extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
+		size_t number, unsigned long *buffer);
+
+extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
+extern void drbd_bm_unlock(struct drbd_conf *mdev);
+
+extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
+/* drbd_main.c */
+
+extern struct kmem_cache *drbd_request_cache;
+extern struct kmem_cache *drbd_ee_cache;	/* epoch entries */
+extern struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+extern mempool_t *drbd_request_mempool;
+extern mempool_t *drbd_ee_mempool;
+
+extern struct page *drbd_pp_pool; /* drbd's page pool */
+extern spinlock_t   drbd_pp_lock;
+extern int	    drbd_pp_vacant;
+extern wait_queue_head_t drbd_pp_wait;
+
+extern rwlock_t global_state_lock;
+
+extern struct drbd_conf *drbd_new_device(unsigned int minor);
+extern void drbd_free_mdev(struct drbd_conf *mdev);
+
+extern int proc_details;
+
+/* drbd_req */
+extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
+extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
+extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
+extern int is_valid_ar_handle(struct drbd_request *, sector_t);
+
+
+/* drbd_nl.c */
+extern void drbd_suspend_io(struct drbd_conf *mdev);
+extern void drbd_resume_io(struct drbd_conf *mdev);
+extern char *ppsize(char *buf, unsigned long long size);
+extern sector_t drbd_new_dev_size(struct drbd_conf *,
+		struct drbd_backing_dev *);
+enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
+extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local);
+extern void resync_after_online_grow(struct drbd_conf *);
+extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
+extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
+		int force);
+enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
+
+/* drbd_worker.c */
+extern int drbd_worker(struct drbd_thread *thi);
+extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
+extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
+extern void resume_next_sg(struct drbd_conf *mdev);
+extern void suspend_other_sg(struct drbd_conf *mdev);
+extern int drbd_resync_finished(struct drbd_conf *mdev);
+/* maybe rather drbd_main.c ? */
+extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
+		struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
+
+static inline void ov_oos_print(struct drbd_conf *mdev)
+{
+	if (mdev->ov_last_oos_size) {
+		dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
+		     (unsigned long long)mdev->ov_last_oos_start,
+		     (unsigned long)mdev->ov_last_oos_size);
+	}
+	mdev->ov_last_oos_size=0;
+}
+
+
+extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+/* worker callbacks */
+extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
+extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
+extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
+extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
+extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
+extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
+
+extern void resync_timer_fn(unsigned long data);
+
+/* drbd_receiver.c */
+extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
+extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
+					    u64 id,
+					    sector_t sector,
+					    unsigned int data_size,
+					    gfp_t gfp_mask) __must_hold(local);
+extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
+extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+		struct list_head *head);
+extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+		struct list_head *head);
+extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
+extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
+extern void drbd_flush_workqueue(struct drbd_conf *mdev);
+
+/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
+ * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
+static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
+			char __user *optval, int optlen)
+{
+	int err;
+	if (level == SOL_SOCKET)
+		err = sock_setsockopt(sock, level, optname, optval, optlen);
+	else
+		err = sock->ops->setsockopt(sock, level, optname, optval,
+					    optlen);
+	return err;
+}
+
+static inline void drbd_tcp_cork(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_uncork(struct socket *sock)
+{
+	int __user val = 0;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_nodelay(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_quickack(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
+			(char __user *)&val, sizeof(val));
+}
+
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
+
+/* drbd_proc.c */
+extern struct proc_dir_entry *drbd_proc;
+extern struct file_operations drbd_proc_fops;
+extern const char *drbd_conn_str(enum drbd_conns s);
+extern const char *drbd_role_str(enum drbd_role s);
+
+/* drbd_actlog.c */
+extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
+extern int drbd_rs_del_all(struct drbd_conf *mdev);
+extern void drbd_rs_failed_io(struct drbd_conf *mdev,
+		sector_t sector, int size);
+extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
+extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
+		int size, const char *file, const unsigned int line);
+#define drbd_set_in_sync(mdev, sector, size) \
+	__drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
+extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
+		int size, const char *file, const unsigned int line);
+#define drbd_set_out_of_sync(mdev, sector, size) \
+	__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
+extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
+extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
+extern void drbd_al_shrink(struct drbd_conf *mdev);
+
+
+/* drbd_nl.c */
+
+void drbd_nl_cleanup(void);
+int __init drbd_nl_init(void);
+void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
+void drbd_bcast_sync_progress(struct drbd_conf *mdev);
+void drbd_bcast_ee(struct drbd_conf *mdev,
+		const char *reason, const int dgs,
+		const char* seen_hash, const char* calc_hash,
+		const struct drbd_epoch_entry* e);
+
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value, that can be bit ored onto the
+ * current state as soon as the spinlock (req_lock) was taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * Means that the network connection was established and that the peer
+ * is in secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+
+#define NS(T, S) \
+	({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+	({ union drbd_state val; val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask; }), \
+	({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+	({ union drbd_state val;  val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val.T3 = (S3); val; })
+
+#define _NS(D, T, S) \
+	D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+	D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+	D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+
+/*
+ * inline helper functions
+ *************************/
+
+static inline void drbd_state_lock(struct drbd_conf *mdev)
+{
+	wait_event(mdev->misc_wait,
+		   !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
+}
+
+static inline void drbd_state_unlock(struct drbd_conf *mdev)
+{
+	clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
+	wake_up(&mdev->misc_wait);
+}
+
+static inline int _drbd_set_state(struct drbd_conf *mdev,
+				   union drbd_state ns, enum chg_state_flags flags,
+				   struct completion *done)
+{
+	int rv;
+
+	read_lock(&global_state_lock);
+	rv = __drbd_set_state(mdev, ns, flags, done);
+	read_unlock(&global_state_lock);
+
+	return rv;
+}
+
+/**
+ * drbd_request_state() - Reqest a state change
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It is verbose
+ * quite verbose in case the state change is not possible, and all those
+ * state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_conf *mdev,
+				     union drbd_state mask,
+				     union drbd_state val)
+{
+	return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
+}
+
+#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
+static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
+{
+	switch (mdev->ldev->dc.on_io_error) {
+	case EP_PASS_ON:
+		if (!forcedetach) {
+			if (printk_ratelimit())
+				dev_err(DEV, "Local IO failed in %s."
+					     "Passing error on...\n", where);
+			break;
+		}
+		/* NOTE fall through to detach case if forcedetach set */
+	case EP_DETACH:
+	case EP_CALL_HELPER:
+		if (mdev->state.disk > D_FAILED) {
+			_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
+			dev_err(DEV, "Local IO failed in %s."
+				     "Detaching...\n", where);
+		}
+		break;
+	}
+}
+
+/**
+ * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
+ * @mdev:	 DRBD device.
+ * @error:	 Error code passed to the IO completion callback
+ * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
+ *
+ * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
+ */
+#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
+static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
+	int error, int forcedetach, const char *where)
+{
+	if (error) {
+		unsigned long flags;
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		__drbd_chk_io_error_(mdev, forcedetach, where);
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+	}
+}
+
+
+/**
+ * drbd_md_first_sector() - Returns the first sector number of the meta data area
+ * @bdev:	Meta data block device.
+ *
+ * BTW, for internal meta data, this happens to be the maximum capacity
+ * we could agree upon with our peer node.
+ */
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->dc.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + bdev->md.bm_offset;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset;
+	}
+}
+
+/**
+ * drbd_md_last_sector() - Return the last sector number of the meta data area
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->dc.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + MD_AL_OFFSET - 1;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset + bdev->md.md_size_sect;
+	}
+}
+
+/* Returns the number of 512 byte sectors of the device */
+static inline sector_t drbd_get_capacity(struct block_device *bdev)
+{
+	/* return bdev ? get_capacity(bdev->bd_disk) : 0; */
+	return bdev ? bdev->bd_inode->i_size >> 9 : 0;
+}
+
+/**
+ * drbd_get_max_capacity() - Returns the capacity we announce to out peer
+ * @bdev:	Meta data block device.
+ *
+ * returns the capacity we announce to out peer.  we clip ourselves at the
+ * various MAX_SECTORS, because if we don't, current implementation will
+ * oops sooner or later
+ */
+static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
+{
+	sector_t s;
+	switch (bdev->dc.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		s = drbd_get_capacity(bdev->backing_bdev)
+			? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+					drbd_md_first_sector(bdev))
+			: 0;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+				drbd_get_capacity(bdev->backing_bdev));
+		/* clip at maximum size the meta device can support */
+		s = min_t(sector_t, s,
+			BM_EXT_TO_SECT(bdev->md.md_size_sect
+				     - bdev->md.bm_offset));
+		break;
+	default:
+		s = min_t(sector_t, DRBD_MAX_SECTORS,
+				drbd_get_capacity(bdev->backing_bdev));
+	}
+	return s;
+}
+
+/**
+ * drbd_md_ss__() - Return the sector number of our meta data super block
+ * @mdev:	DRBD device.
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
+				    struct drbd_backing_dev *bdev)
+{
+	switch (bdev->dc.meta_dev_idx) {
+	default: /* external, some index */
+		return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
+	case DRBD_MD_INDEX_INTERNAL:
+		/* with drbd08, internal meta data is always "flexible" */
+	case DRBD_MD_INDEX_FLEX_INT:
+		/* sizeof(struct md_on_disk_07) == 4k
+		 * position: last 4k aligned block of 4k size */
+		if (!bdev->backing_bdev) {
+			if (__ratelimit(&drbd_ratelimit_state)) {
+				dev_err(DEV, "bdev->backing_bdev==NULL\n");
+				dump_stack();
+			}
+			return 0;
+		}
+		return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
+			- MD_AL_OFFSET;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		return 0;
+	}
+}
+
+static inline void
+_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	list_add_tail(&w->list, &q->q);
+	up(&q->s);
+}
+
+static inline void
+drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&q->q_lock, flags);
+	list_add(&w->list, &q->q);
+	up(&q->s); /* within the spinlock,
+		      see comment near end of drbd_worker() */
+	spin_unlock_irqrestore(&q->q_lock, flags);
+}
+
+static inline void
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&q->q_lock, flags);
+	list_add_tail(&w->list, &q->q);
+	up(&q->s); /* within the spinlock,
+		      see comment near end of drbd_worker() */
+	spin_unlock_irqrestore(&q->q_lock, flags);
+}
+
+static inline void wake_asender(struct drbd_conf *mdev)
+{
+	if (test_bit(SIGNAL_ASENDER, &mdev->flags))
+		force_sig(DRBD_SIG, mdev->asender.task);
+}
+
+static inline void request_ping(struct drbd_conf *mdev)
+{
+	set_bit(SEND_PING, &mdev->flags);
+	wake_asender(mdev);
+}
+
+static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
+	enum drbd_packets cmd)
+{
+	struct p_header h;
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
+}
+
+static inline int drbd_send_ping(struct drbd_conf *mdev)
+{
+	struct p_header h;
+	return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
+}
+
+static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
+{
+	struct p_header h;
+	return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
+}
+
+static inline void drbd_thread_stop(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, FALSE, TRUE);
+}
+
+static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, FALSE, FALSE);
+}
+
+static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, TRUE, FALSE);
+}
+
+/* counts how many answer packets packets we expect from our peer,
+ * for either explicit application requests,
+ * or implicit barrier packets as necessary.
+ * increased:
+ *  w_send_barrier
+ *  _req_mod(req, queue_for_net_write or queue_for_net_read);
+ *    it is much easier and equally valid to count what we queue for the
+ *    worker, even before it actually was queued or send.
+ *    (drbd_make_request_common; recovery path on read io-error)
+ * decreased:
+ *  got_BarrierAck (respective tl_clear, tl_clear_barrier)
+ *  _req_mod(req, data_received)
+ *     [from receive_DataReply]
+ *  _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
+ *     [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
+ *     for some reason it is NOT decreased in got_NegAck,
+ *     but in the resulting cleanup code from report_params.
+ *     we should try to remember the reason for that...
+ *  _req_mod(req, send_failed or send_canceled)
+ *  _req_mod(req, connection_lost_while_pending)
+ *     [from tl_clear_barrier]
+ */
+static inline void inc_ap_pending(struct drbd_conf *mdev)
+{
+	atomic_inc(&mdev->ap_pending_cnt);
+}
+
+#define ERR_IF_CNT_IS_NEGATIVE(which)				\
+	if (atomic_read(&mdev->which) < 0)			\
+		dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n",	\
+		    __func__ , __LINE__ ,			\
+		    atomic_read(&mdev->which))
+
+#define dec_ap_pending(mdev)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	if (atomic_dec_and_test(&mdev->ap_pending_cnt))		\
+		wake_up(&mdev->misc_wait);			\
+	ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
+
+/* counts how many resync-related answers we still expect from the peer
+ *		     increase			decrease
+ * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
+ * C_SYNC_SOURCE sends P_RS_DATA_REPLY   (and expects P_WRITE_ACK whith ID_SYNCER)
+ *					   (or P_NEG_ACK with ID_SYNCER)
+ */
+static inline void inc_rs_pending(struct drbd_conf *mdev)
+{
+	atomic_inc(&mdev->rs_pending_cnt);
+}
+
+#define dec_rs_pending(mdev)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	atomic_dec(&mdev->rs_pending_cnt);			\
+	ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
+
+/* counts how many answers we still need to send to the peer.
+ * increased on
+ *  receive_Data	unless protocol A;
+ *			we need to send a P_RECV_ACK (proto B)
+ *			or P_WRITE_ACK (proto C)
+ *  receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
+ *  receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
+ *  receive_Barrier_*	we need to send a P_BARRIER_ACK
+ */
+static inline void inc_unacked(struct drbd_conf *mdev)
+{
+	atomic_inc(&mdev->unacked_cnt);
+}
+
+#define dec_unacked(mdev)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	atomic_dec(&mdev->unacked_cnt);				\
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
+
+#define sub_unacked(mdev, n)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	atomic_sub(n, &mdev->unacked_cnt);			\
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
+
+
+static inline void put_net_conf(struct drbd_conf *mdev)
+{
+	if (atomic_dec_and_test(&mdev->net_cnt))
+		wake_up(&mdev->misc_wait);
+}
+
+/**
+ * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
+ * @mdev:	DRBD device.
+ *
+ * You have to call put_net_conf() when finished working with mdev->net_conf.
+ */
+static inline int get_net_conf(struct drbd_conf *mdev)
+{
+	int have_net_conf;
+
+	atomic_inc(&mdev->net_cnt);
+	have_net_conf = mdev->state.conn >= C_UNCONNECTED;
+	if (!have_net_conf)
+		put_net_conf(mdev);
+	return have_net_conf;
+}
+
+/**
+ * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
+ * @M:		DRBD device.
+ *
+ * You have to call put_ldev() when finished working with mdev->ldev.
+ */
+#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT))
+#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))
+
+static inline void put_ldev(struct drbd_conf *mdev)
+{
+	__release(local);
+	if (atomic_dec_and_test(&mdev->local_cnt))
+		wake_up(&mdev->misc_wait);
+	D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
+}
+
+#ifndef __CHECKER__
+static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+{
+	int io_allowed;
+
+	atomic_inc(&mdev->local_cnt);
+	io_allowed = (mdev->state.disk >= mins);
+	if (!io_allowed)
+		put_ldev(mdev);
+	return io_allowed;
+}
+#else
+extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
+#endif
+
+/* you must have an "get_ldev" reference */
+static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
+		unsigned long *bits_left, unsigned int *per_mil_done)
+{
+	/*
+	 * this is to break it at compile time when we change that
+	 * (we may feel 4TB maximum storage per drbd is not enough)
+	 */
+	typecheck(unsigned long, mdev->rs_total);
+
+	/* note: both rs_total and rs_left are in bits, i.e. in
+	 * units of BM_BLOCK_SIZE.
+	 * for the percentage, we don't care. */
+
+	*bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+	/* >> 10 to prevent overflow,
+	 * +1 to prevent division by zero */
+	if (*bits_left > mdev->rs_total) {
+		/* doh. maybe a logic bug somewhere.
+		 * may also be just a race condition
+		 * between this and a disconnect during sync.
+		 * for now, just prevent in-kernel buffer overflow.
+		 */
+		smp_rmb();
+		dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
+				drbd_conn_str(mdev->state.conn),
+				*bits_left, mdev->rs_total, mdev->rs_failed);
+		*per_mil_done = 0;
+	} else {
+		/* make sure the calculation happens in long context */
+		unsigned long tmp = 1000UL -
+				(*bits_left >> 10)*1000UL
+				/ ((mdev->rs_total >> 10) + 1UL);
+		*per_mil_done = tmp;
+	}
+}
+
+
+/* this throttles on-the-fly application requests
+ * according to max_buffers settings;
+ * maybe re-implement using semaphores? */
+static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
+{
+	int mxb = 1000000; /* arbitrary limit on open requests */
+	if (get_net_conf(mdev)) {
+		mxb = mdev->net_conf->max_buffers;
+		put_net_conf(mdev);
+	}
+	return mxb;
+}
+
+static inline int drbd_state_is_stable(union drbd_state s)
+{
+
+	/* DO NOT add a default clause, we want the compiler to warn us
+	 * for any newly introduced state we may have forgotten to add here */
+
+	switch ((enum drbd_conns)s.conn) {
+	/* new io only accepted when there is no connection, ... */
+	case C_STANDALONE:
+	case C_WF_CONNECTION:
+	/* ... or there is a well established connection. */
+	case C_CONNECTED:
+	case C_SYNC_SOURCE:
+	case C_SYNC_TARGET:
+	case C_VERIFY_S:
+	case C_VERIFY_T:
+	case C_PAUSED_SYNC_S:
+	case C_PAUSED_SYNC_T:
+		/* maybe stable, look at the disk state */
+		break;
+
+	/* no new io accepted during tansitional states
+	 * like handshake or teardown */
+	case C_DISCONNECTING:
+	case C_UNCONNECTED:
+	case C_TIMEOUT:
+	case C_BROKEN_PIPE:
+	case C_NETWORK_FAILURE:
+	case C_PROTOCOL_ERROR:
+	case C_TEAR_DOWN:
+	case C_WF_REPORT_PARAMS:
+	case C_STARTING_SYNC_S:
+	case C_STARTING_SYNC_T:
+	case C_WF_BITMAP_S:
+	case C_WF_BITMAP_T:
+	case C_WF_SYNC_UUID:
+	case C_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	switch ((enum drbd_disk_state)s.disk) {
+	case D_DISKLESS:
+	case D_INCONSISTENT:
+	case D_OUTDATED:
+	case D_CONSISTENT:
+	case D_UP_TO_DATE:
+		/* disk state is stable as well. */
+		break;
+
+	/* no new io accepted during tansitional states */
+	case D_ATTACHING:
+	case D_FAILED:
+	case D_NEGOTIATING:
+	case D_UNKNOWN:
+	case D_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
+{
+	int mxb = drbd_get_max_buffers(mdev);
+
+	if (mdev->state.susp)
+		return 0;
+	if (test_bit(SUSPEND_IO, &mdev->flags))
+		return 0;
+
+	/* to avoid potential deadlock or bitmap corruption,
+	 * in various places, we only allow new application io
+	 * to start during "stable" states. */
+
+	/* no new io accepted when attaching or detaching the disk */
+	if (!drbd_state_is_stable(mdev->state))
+		return 0;
+
+	/* since some older kernels don't have atomic_add_unless,
+	 * and we are within the spinlock anyways, we have this workaround.  */
+	if (atomic_read(&mdev->ap_bio_cnt) > mxb)
+		return 0;
+	if (test_bit(BITMAP_IO, &mdev->flags))
+		return 0;
+	return 1;
+}
+
+/* I'd like to use wait_event_lock_irq,
+ * but I'm not sure when it got introduced,
+ * and not sure when it has 3 or 4 arguments */
+static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
+{
+	/* compare with after_state_ch,
+	 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
+	DEFINE_WAIT(wait);
+
+	/* we wait here
+	 *    as long as the device is suspended
+	 *    until the bitmap is no longer on the fly during connection
+	 *    handshake as long as we would exeed the max_buffer limit.
+	 *
+	 * to avoid races with the reconnect code,
+	 * we need to atomic_inc within the spinlock. */
+
+	spin_lock_irq(&mdev->req_lock);
+	while (!__inc_ap_bio_cond(mdev)) {
+		prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&mdev->req_lock);
+		schedule();
+		finish_wait(&mdev->misc_wait, &wait);
+		spin_lock_irq(&mdev->req_lock);
+	}
+	atomic_add(one_or_two, &mdev->ap_bio_cnt);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+static inline void dec_ap_bio(struct drbd_conf *mdev)
+{
+	int mxb = drbd_get_max_buffers(mdev);
+	int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
+
+	D_ASSERT(ap_bio >= 0);
+	/* this currently does wake_up for every dec_ap_bio!
+	 * maybe rather introduce some type of hysteresis?
+	 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
+	if (ap_bio < mxb)
+		wake_up(&mdev->misc_wait);
+	if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+	}
+}
+
+static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
+{
+	mdev->ed_uuid = val;
+}
+
+static inline int seq_cmp(u32 a, u32 b)
+{
+	/* we assume wrap around at 32bit.
+	 * for wrap around at 24bit (old atomic_t),
+	 * we'd have to
+	 *  a <<= 8; b <<= 8;
+	 */
+	return (s32)(a) - (s32)(b);
+}
+#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
+#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
+#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
+#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
+/* CAUTION: please no side effects in arguments! */
+#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
+
+static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
+{
+	unsigned int m;
+	spin_lock(&mdev->peer_seq_lock);
+	m = seq_max(mdev->peer_seq, new_seq);
+	mdev->peer_seq = m;
+	spin_unlock(&mdev->peer_seq_lock);
+	if (m == new_seq)
+		wake_up(&mdev->seq_wait);
+}
+
+static inline void drbd_update_congested(struct drbd_conf *mdev)
+{
+	struct sock *sk = mdev->data.socket->sk;
+	if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+		set_bit(NET_CONGESTED, &mdev->flags);
+}
+
+static inline int drbd_queue_order_type(struct drbd_conf *mdev)
+{
+	/* sorry, we currently have no working implementation
+	 * of distributed TCQ stuff */
+#ifndef QUEUE_ORDERED_NONE
+#define QUEUE_ORDERED_NONE 0
+#endif
+	return QUEUE_ORDERED_NONE;
+}
+
+static inline void drbd_blk_run_queue(struct request_queue *q)
+{
+	if (q && q->unplug_fn)
+		q->unplug_fn(q);
+}
+
+static inline void drbd_kick_lo(struct drbd_conf *mdev)
+{
+	if (get_ldev(mdev)) {
+		drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
+		put_ldev(mdev);
+	}
+}
+
+static inline void drbd_md_flush(struct drbd_conf *mdev)
+{
+	int r;
+
+	if (test_bit(MD_NO_BARRIER, &mdev->flags))
+		return;
+
+	r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
+	if (r) {
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+		dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
+	}
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 0000000..157d1e4
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3699 @@
+/*
+   drbd.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+   from Logicworks, Inc. for making SDP replication support possible.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/drbd.h>
+#include <asm/uaccess.h>
+#include <asm/types.h>
+#include <net/sock.h>
+#include <linux/ctype.h>
+#include <linux/smp_lock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
+
+#include "drbd_vli.h"
+
+struct after_state_chg_work {
+	struct drbd_work w;
+	union drbd_state os;
+	union drbd_state ns;
+	enum chg_state_flags flags;
+	struct completion *done;
+};
+
+int drbdd_init(struct drbd_thread *);
+int drbd_worker(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+int drbd_init(void);
+static int drbd_open(struct block_device *bdev, fmode_t mode);
+static int drbd_release(struct gendisk *gd, fmode_t mode);
+static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags);
+static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void md_sync_timer_fn(unsigned long data);
+static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+
+MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
+	      "Lars Ellenberg <lars@linbit.com>");
+MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
+MODULE_VERSION(REL_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
+MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
+
+#include <linux/moduleparam.h>
+/* allow_open_on_secondary */
+MODULE_PARM_DESC(allow_oos, "DONT USE!");
+/* thanks to these macros, if compiled into the kernel (not-module),
+ * this becomes the boot parameter drbd.minor_count */
+module_param(minor_count, uint, 0444);
+module_param(disable_sendpage, bool, 0644);
+module_param(allow_oos, bool, 0);
+module_param(cn_idx, uint, 0444);
+module_param(proc_details, int, 0644);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+int enable_faults;
+int fault_rate;
+static int fault_count;
+int fault_devs;
+/* bitmap of enabled faults */
+module_param(enable_faults, int, 0664);
+/* fault rate % value - applies to all enabled faults */
+module_param(fault_rate, int, 0664);
+/* count of faults inserted */
+module_param(fault_count, int, 0664);
+/* bitmap of devices to insert faults on */
+module_param(fault_devs, int, 0644);
+#endif
+
+/* module parameter, defined */
+unsigned int minor_count = 32;
+int disable_sendpage;
+int allow_oos;
+unsigned int cn_idx = CN_IDX_DRBD;
+int proc_details;       /* Detail level in proc drbd*/
+
+/* Module parameter for setting the user mode helper program
+ * to run. Default is /sbin/drbdadm */
+char usermode_helper[80] = "/sbin/drbdadm";
+
+module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
+
+/* in 2.6.x, our device mapping and config info contains our virtual gendisks
+ * as member "struct gendisk *vdisk;"
+ */
+struct drbd_conf **minor_table;
+
+struct kmem_cache *drbd_request_cache;
+struct kmem_cache *drbd_ee_cache;	/* epoch entries */
+struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+mempool_t *drbd_request_mempool;
+mempool_t *drbd_ee_mempool;
+
+/* I do not use a standard mempool, because:
+   1) I want to hand out the pre-allocated objects first.
+   2) I want to be able to interrupt sleeping allocation with a signal.
+   Note: This is a single linked list, the next pointer is the private
+	 member of struct page.
+ */
+struct page *drbd_pp_pool;
+spinlock_t   drbd_pp_lock;
+int          drbd_pp_vacant;
+wait_queue_head_t drbd_pp_wait;
+
+DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
+
+static struct block_device_operations drbd_ops = {
+	.owner =   THIS_MODULE,
+	.open =    drbd_open,
+	.release = drbd_release,
+};
+
+#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+
+#ifdef __CHECKER__
+/* When checking with sparse, and this is an inline function, sparse will
+   give tons of false positives. When this is a real functions sparse works.
+ */
+int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+{
+	int io_allowed;
+
+	atomic_inc(&mdev->local_cnt);
+	io_allowed = (mdev->state.disk >= mins);
+	if (!io_allowed) {
+		if (atomic_dec_and_test(&mdev->local_cnt))
+			wake_up(&mdev->misc_wait);
+	}
+	return io_allowed;
+}
+
+#endif
+
+/**
+ * DOC: The transfer log
+ *
+ * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
+ * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
+ * of the list. There is always at least one &struct drbd_tl_epoch object.
+ *
+ * Each &struct drbd_tl_epoch has a circular double linked list of requests
+ * attached.
+ */
+static int tl_init(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+
+	/* during device minor initialization, we may well use GFP_KERNEL */
+	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
+	if (!b)
+		return 0;
+	INIT_LIST_HEAD(&b->requests);
+	INIT_LIST_HEAD(&b->w.list);
+	b->next = NULL;
+	b->br_number = 4711;
+	b->n_req = 0;
+	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
+
+	mdev->oldest_tle = b;
+	mdev->newest_tle = b;
+	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
+
+	mdev->tl_hash = NULL;
+	mdev->tl_hash_s = 0;
+
+	return 1;
+}
+
+static void tl_cleanup(struct drbd_conf *mdev)
+{
+	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
+	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
+	kfree(mdev->oldest_tle);
+	mdev->oldest_tle = NULL;
+	kfree(mdev->unused_spare_tle);
+	mdev->unused_spare_tle = NULL;
+	kfree(mdev->tl_hash);
+	mdev->tl_hash = NULL;
+	mdev->tl_hash_s = 0;
+}
+
+/**
+ * _tl_add_barrier() - Adds a barrier to the transfer log
+ * @mdev:	DRBD device.
+ * @new:	Barrier to be added before the current head of the TL.
+ *
+ * The caller must hold the req_lock.
+ */
+void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
+{
+	struct drbd_tl_epoch *newest_before;
+
+	INIT_LIST_HEAD(&new->requests);
+	INIT_LIST_HEAD(&new->w.list);
+	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
+	new->next = NULL;
+	new->n_req = 0;
+
+	newest_before = mdev->newest_tle;
+	/* never send a barrier number == 0, because that is special-cased
+	 * when using TCQ for our write ordering code */
+	new->br_number = (newest_before->br_number+1) ?: 1;
+	if (mdev->newest_tle != new) {
+		mdev->newest_tle->next = new;
+		mdev->newest_tle = new;
+	}
+}
+
+/**
+ * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
+ * @mdev:	DRBD device.
+ * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
+ * @set_size:	Expected number of requests before that barrier.
+ *
+ * In case the passed barrier_nr or set_size does not match the oldest
+ * &struct drbd_tl_epoch objects this function will cause a termination
+ * of the connection.
+ */
+void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+		       unsigned int set_size)
+{
+	struct drbd_tl_epoch *b, *nob; /* next old barrier */
+	struct list_head *le, *tle;
+	struct drbd_request *r;
+
+	spin_lock_irq(&mdev->req_lock);
+
+	b = mdev->oldest_tle;
+
+	/* first some paranoia code */
+	if (b == NULL) {
+		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+			barrier_nr);
+		goto bail;
+	}
+	if (b->br_number != barrier_nr) {
+		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
+			barrier_nr, b->br_number);
+		goto bail;
+	}
+	if (b->n_req != set_size) {
+		dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
+			barrier_nr, set_size, b->n_req);
+		goto bail;
+	}
+
+	/* Clean up list of requests processed during current epoch */
+	list_for_each_safe(le, tle, &b->requests) {
+		r = list_entry(le, struct drbd_request, tl_requests);
+		_req_mod(r, barrier_acked);
+	}
+	/* There could be requests on the list waiting for completion
+	   of the write to the local disk. To avoid corruptions of
+	   slab's data structures we have to remove the lists head.
+
+	   Also there could have been a barrier ack out of sequence, overtaking
+	   the write acks - which would be a bug and violating write ordering.
+	   To not deadlock in case we lose connection while such requests are
+	   still pending, we need some way to find them for the
+	   _req_mode(connection_lost_while_pending).
+
+	   These have been list_move'd to the out_of_sequence_requests list in
+	   _req_mod(, barrier_acked) above.
+	   */
+	list_del_init(&b->requests);
+
+	nob = b->next;
+	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
+		_tl_add_barrier(mdev, b);
+		if (nob)
+			mdev->oldest_tle = nob;
+		/* if nob == NULL b was the only barrier, and becomes the new
+		   barrier. Therefore mdev->oldest_tle points already to b */
+	} else {
+		D_ASSERT(nob != NULL);
+		mdev->oldest_tle = nob;
+		kfree(b);
+	}
+
+	spin_unlock_irq(&mdev->req_lock);
+	dec_ap_pending(mdev);
+
+	return;
+
+bail:
+	spin_unlock_irq(&mdev->req_lock);
+	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+}
+
+
+/**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @mdev:	DRBD device.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer gets marked as our of sync. Called from the
+ * receiver thread and the worker thread.
+ */
+void tl_clear(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b, *tmp;
+	struct list_head *le, *tle;
+	struct drbd_request *r;
+	int new_initial_bnr = net_random();
+
+	spin_lock_irq(&mdev->req_lock);
+
+	b = mdev->oldest_tle;
+	while (b) {
+		list_for_each_safe(le, tle, &b->requests) {
+			r = list_entry(le, struct drbd_request, tl_requests);
+			/* It would be nice to complete outside of spinlock.
+			 * But this is easier for now. */
+			_req_mod(r, connection_lost_while_pending);
+		}
+		tmp = b->next;
+
+		/* there could still be requests on that ring list,
+		 * in case local io is still pending */
+		list_del(&b->requests);
+
+		/* dec_ap_pending corresponding to queue_barrier.
+		 * the newest barrier may not have been queued yet,
+		 * in which case w.cb is still NULL. */
+		if (b->w.cb != NULL)
+			dec_ap_pending(mdev);
+
+		if (b == mdev->newest_tle) {
+			/* recycle, but reinit! */
+			D_ASSERT(tmp == NULL);
+			INIT_LIST_HEAD(&b->requests);
+			INIT_LIST_HEAD(&b->w.list);
+			b->w.cb = NULL;
+			b->br_number = new_initial_bnr;
+			b->n_req = 0;
+
+			mdev->oldest_tle = b;
+			break;
+		}
+		kfree(b);
+		b = tmp;
+	}
+
+	/* we expect this list to be empty. */
+	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
+
+	/* but just in case, clean it up anyways! */
+	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
+		r = list_entry(le, struct drbd_request, tl_requests);
+		/* It would be nice to complete outside of spinlock.
+		 * But this is easier for now. */
+		_req_mod(r, connection_lost_while_pending);
+	}
+
+	/* ensure bit indicating barrier is required is clear */
+	clear_bit(CREATE_BARRIER, &mdev->flags);
+
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
+ * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
+ * @mdev:	DRBD device.
+ * @os:		old (current) state.
+ * @ns:		new (wanted) state.
+ */
+static int cl_wide_st_chg(struct drbd_conf *mdev,
+			  union drbd_state os, union drbd_state ns)
+{
+	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
+		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
+}
+
+int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
+		      union drbd_state mask, union drbd_state val)
+{
+	unsigned long flags;
+	union drbd_state os, ns;
+	int rv;
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+	rv = _drbd_set_state(mdev, ns, f, NULL);
+	ns = mdev->state;
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	return rv;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ */
+void drbd_force_state(struct drbd_conf *mdev,
+	union drbd_state mask, union drbd_state val)
+{
+	drbd_change_state(mdev, CS_HARD, mask, val);
+}
+
+static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
+static int is_valid_state_transition(struct drbd_conf *,
+				     union drbd_state, union drbd_state);
+static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
+				       union drbd_state ns, int *warn_sync_abort);
+int drbd_send_state_req(struct drbd_conf *,
+			union drbd_state, union drbd_state);
+
+static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
+				    union drbd_state mask, union drbd_state val)
+{
+	union drbd_state os, ns;
+	unsigned long flags;
+	int rv;
+
+	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
+		return SS_CW_SUCCESS;
+
+	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
+		return SS_CW_FAILED_BY_PEER;
+
+	rv = 0;
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+	ns = sanitize_state(mdev, os, ns, NULL);
+
+	if (!cl_wide_st_chg(mdev, os, ns))
+		rv = SS_CW_NO_NEED;
+	if (!rv) {
+		rv = is_valid_state(mdev, ns);
+		if (rv == SS_SUCCESS) {
+			rv = is_valid_state_transition(mdev, ns, os);
+			if (rv == SS_SUCCESS)
+				rv = 0; /* cont waiting, otherwise fail. */
+		}
+	}
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	return rv;
+}
+
+/**
+ * drbd_req_state() - Perform an eventually cluster wide state change
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+static int drbd_req_state(struct drbd_conf *mdev,
+			  union drbd_state mask, union drbd_state val,
+			  enum chg_state_flags f)
+{
+	struct completion done;
+	unsigned long flags;
+	union drbd_state os, ns;
+	int rv;
+
+	init_completion(&done);
+
+	if (f & CS_SERIALIZE)
+		mutex_lock(&mdev->state_mutex);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+	ns = sanitize_state(mdev, os, ns, NULL);
+
+	if (cl_wide_st_chg(mdev, os, ns)) {
+		rv = is_valid_state(mdev, ns);
+		if (rv == SS_SUCCESS)
+			rv = is_valid_state_transition(mdev, ns, os);
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+		if (rv < SS_SUCCESS) {
+			if (f & CS_VERBOSE)
+				print_st_err(mdev, os, ns, rv);
+			goto abort;
+		}
+
+		drbd_state_lock(mdev);
+		if (!drbd_send_state_req(mdev, mask, val)) {
+			drbd_state_unlock(mdev);
+			rv = SS_CW_FAILED_BY_PEER;
+			if (f & CS_VERBOSE)
+				print_st_err(mdev, os, ns, rv);
+			goto abort;
+		}
+
+		wait_event(mdev->state_wait,
+			(rv = _req_st_cond(mdev, mask, val)));
+
+		if (rv < SS_SUCCESS) {
+			drbd_state_unlock(mdev);
+			if (f & CS_VERBOSE)
+				print_st_err(mdev, os, ns, rv);
+			goto abort;
+		}
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		os = mdev->state;
+		ns.i = (os.i & ~mask.i) | val.i;
+		rv = _drbd_set_state(mdev, ns, f, &done);
+		drbd_state_unlock(mdev);
+	} else {
+		rv = _drbd_set_state(mdev, ns, f, &done);
+	}
+
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+		D_ASSERT(current != mdev->worker.task);
+		wait_for_completion(&done);
+	}
+
+abort:
+	if (f & CS_SERIALIZE)
+		mutex_unlock(&mdev->state_mutex);
+
+	return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags
+ *
+ * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
+ * flag, or when logging of failed state change requests is not desired.
+ */
+int _drbd_request_state(struct drbd_conf *mdev,	union drbd_state mask,
+			union drbd_state val,	enum chg_state_flags f)
+{
+	int rv;
+
+	wait_event(mdev->state_wait,
+		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+	return rv;
+}
+
+static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
+{
+	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
+	    name,
+	    drbd_conn_str(ns.conn),
+	    drbd_role_str(ns.role),
+	    drbd_role_str(ns.peer),
+	    drbd_disk_str(ns.disk),
+	    drbd_disk_str(ns.pdsk),
+	    ns.susp ? 's' : 'r',
+	    ns.aftr_isp ? 'a' : '-',
+	    ns.peer_isp ? 'p' : '-',
+	    ns.user_isp ? 'u' : '-'
+	    );
+}
+
+void print_st_err(struct drbd_conf *mdev,
+	union drbd_state os, union drbd_state ns, int err)
+{
+	if (err == SS_IN_TRANSIENT_STATE)
+		return;
+	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
+	print_st(mdev, " state", os);
+	print_st(mdev, "wanted", ns);
+}
+
+
+#define drbd_peer_str drbd_role_str
+#define drbd_pdsk_str drbd_disk_str
+
+#define drbd_susp_str(A)     ((A) ? "1" : "0")
+#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
+#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
+#define drbd_user_isp_str(A) ((A) ? "1" : "0")
+
+#define PSC(A) \
+	({ if (ns.A != os.A) { \
+		pbp += sprintf(pbp, #A "( %s -> %s ) ", \
+			      drbd_##A##_str(os.A), \
+			      drbd_##A##_str(ns.A)); \
+	} })
+
+/**
+ * is_valid_state() - Returns an SS_ error code if ns is not valid
+ * @mdev:	DRBD device.
+ * @ns:		State to consider.
+ */
+static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
+{
+	/* See drbd_state_sw_errors in drbd_strings.c */
+
+	enum drbd_fencing_p fp;
+	int rv = SS_SUCCESS;
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	if (get_net_conf(mdev)) {
+		if (!mdev->net_conf->two_primaries &&
+		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
+			rv = SS_TWO_PRIMARIES;
+		put_net_conf(mdev);
+	}
+
+	if (rv <= 0)
+		/* already found a reason to abort */;
+	else if (ns.role == R_SECONDARY && mdev->open_cnt)
+		rv = SS_DEVICE_IN_USE;
+
+	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (fp >= FP_RESOURCE &&
+		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+		rv = SS_PRIMARY_NOP;
+
+	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+		rv = SS_NO_LOCAL_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+		rv = SS_NO_REMOTE_DISK;
+
+	else if ((ns.conn == C_CONNECTED ||
+		  ns.conn == C_WF_BITMAP_S ||
+		  ns.conn == C_SYNC_SOURCE ||
+		  ns.conn == C_PAUSED_SYNC_S) &&
+		  ns.disk == D_OUTDATED)
+		rv = SS_CONNECTED_OUTDATES;
+
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		 (mdev->sync_conf.verify_alg[0] == 0))
+		rv = SS_NO_VERIFY_ALG;
+
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		  mdev->agreed_pro_version < 88)
+		rv = SS_NOT_SUPPORTED;
+
+	return rv;
+}
+
+/**
+ * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
+ * @mdev:	DRBD device.
+ * @ns:		new state.
+ * @os:		old state.
+ */
+static int is_valid_state_transition(struct drbd_conf *mdev,
+				     union drbd_state ns, union drbd_state os)
+{
+	int rv = SS_SUCCESS;
+
+	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+	    os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+		rv = SS_ALREADY_STANDALONE;
+
+	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+		rv = SS_IS_DISKLESS;
+
+	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+		rv = SS_NO_NET_CONFIG;
+
+	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+		rv = SS_LOWER_THAN_OUTDATED;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+		rv = SS_IN_TRANSIENT_STATE;
+
+	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
+		rv = SS_IN_TRANSIENT_STATE;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+	    ns.conn != os.conn && os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+	    os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	return rv;
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @mdev:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @warn_sync_abort:
+ *
+ * When we loose connection, we have to set the state of the peers disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
+				       union drbd_state ns, int *warn_sync_abort)
+{
+	enum drbd_fencing_p fp;
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	/* Disallow Network errors to configure a device's network part */
+	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
+	    os.conn <= C_DISCONNECTING)
+		ns.conn = os.conn;
+
+	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
+	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
+	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
+		ns.conn = os.conn;
+
+	/* After C_DISCONNECTING only C_STANDALONE may follow */
+	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
+		ns.conn = os.conn;
+
+	if (ns.conn < C_CONNECTED) {
+		ns.peer_isp = 0;
+		ns.peer = R_UNKNOWN;
+		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+			ns.pdsk = D_UNKNOWN;
+	}
+
+	/* Clear the aftr_isp when becoming unconfigured */
+	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+		ns.aftr_isp = 0;
+
+	if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
+		ns.pdsk = D_UNKNOWN;
+
+	/* Abort resync if a disk fails/detaches */
+	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
+	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+		if (warn_sync_abort)
+			*warn_sync_abort = 1;
+		ns.conn = C_CONNECTED;
+	}
+
+	if (ns.conn >= C_CONNECTED &&
+	    ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
+	     (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
+		switch (ns.conn) {
+		case C_WF_BITMAP_T:
+		case C_PAUSED_SYNC_T:
+			ns.disk = D_OUTDATED;
+			break;
+		case C_CONNECTED:
+		case C_WF_BITMAP_S:
+		case C_SYNC_SOURCE:
+		case C_PAUSED_SYNC_S:
+			ns.disk = D_UP_TO_DATE;
+			break;
+		case C_SYNC_TARGET:
+			ns.disk = D_INCONSISTENT;
+			dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
+			break;
+		}
+		if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
+			dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
+	}
+
+	if (ns.conn >= C_CONNECTED &&
+	    (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
+		switch (ns.conn) {
+		case C_CONNECTED:
+		case C_WF_BITMAP_T:
+		case C_PAUSED_SYNC_T:
+		case C_SYNC_TARGET:
+			ns.pdsk = D_UP_TO_DATE;
+			break;
+		case C_WF_BITMAP_S:
+		case C_PAUSED_SYNC_S:
+			ns.pdsk = D_OUTDATED;
+			break;
+		case C_SYNC_SOURCE:
+			ns.pdsk = D_INCONSISTENT;
+			dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
+			break;
+		}
+		if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
+			dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
+	}
+
+	/* Connection breaks down before we finished "Negotiating" */
+	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
+			ns.disk = mdev->new_state_tmp.disk;
+			ns.pdsk = mdev->new_state_tmp.pdsk;
+		} else {
+			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
+			ns.disk = D_DISKLESS;
+			ns.pdsk = D_UNKNOWN;
+		}
+		put_ldev(mdev);
+	}
+
+	if (fp == FP_STONITH &&
+	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
+		ns.susp = 1;
+
+	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+		if (ns.conn == C_SYNC_SOURCE)
+			ns.conn = C_PAUSED_SYNC_S;
+		if (ns.conn == C_SYNC_TARGET)
+			ns.conn = C_PAUSED_SYNC_T;
+	} else {
+		if (ns.conn == C_PAUSED_SYNC_S)
+			ns.conn = C_SYNC_SOURCE;
+		if (ns.conn == C_PAUSED_SYNC_T)
+			ns.conn = C_SYNC_TARGET;
+	}
+
+	return ns;
+}
+
+/* helper for __drbd_set_state */
+static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
+{
+	if (cs == C_VERIFY_T) {
+		/* starting online verify from an arbitrary position
+		 * does not fit well into the existing protocol.
+		 * on C_VERIFY_T, we initialize ov_left and friends
+		 * implicitly in receive_DataRequest once the
+		 * first P_OV_REQUEST is received */
+		mdev->ov_start_sector = ~(sector_t)0;
+	} else {
+		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
+		if (bit >= mdev->rs_total)
+			mdev->ov_start_sector =
+				BM_BIT_TO_SECT(mdev->rs_total - 1);
+		mdev->ov_position = mdev->ov_start_sector;
+	}
+}
+
+/**
+ * __drbd_set_state() - Set a new DRBD state
+ * @mdev:	DRBD device.
+ * @ns:		new state.
+ * @flags:	Flags
+ * @done:	Optional completion, that will get completed after the after_state_ch() finished
+ *
+ * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ */
+int __drbd_set_state(struct drbd_conf *mdev,
+		    union drbd_state ns, enum chg_state_flags flags,
+		    struct completion *done)
+{
+	union drbd_state os;
+	int rv = SS_SUCCESS;
+	int warn_sync_abort = 0;
+	struct after_state_chg_work *ascw;
+
+	os = mdev->state;
+
+	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
+
+	if (ns.i == os.i)
+		return SS_NOTHING_TO_DO;
+
+	if (!(flags & CS_HARD)) {
+		/*  pre-state-change checks ; only look at ns  */
+		/* See drbd_state_sw_errors in drbd_strings.c */
+
+		rv = is_valid_state(mdev, ns);
+		if (rv < SS_SUCCESS) {
+			/* If the old state was illegal as well, then let
+			   this happen...*/
+
+			if (is_valid_state(mdev, os) == rv) {
+				dev_err(DEV, "Considering state change from bad state. "
+				    "Error would be: '%s'\n",
+				    drbd_set_st_err_str(rv));
+				print_st(mdev, "old", os);
+				print_st(mdev, "new", ns);
+				rv = is_valid_state_transition(mdev, ns, os);
+			}
+		} else
+			rv = is_valid_state_transition(mdev, ns, os);
+	}
+
+	if (rv < SS_SUCCESS) {
+		if (flags & CS_VERBOSE)
+			print_st_err(mdev, os, ns, rv);
+		return rv;
+	}
+
+	if (warn_sync_abort)
+		dev_warn(DEV, "Resync aborted.\n");
+
+	{
+		char *pbp, pb[300];
+		pbp = pb;
+		*pbp = 0;
+		PSC(role);
+		PSC(peer);
+		PSC(conn);
+		PSC(disk);
+		PSC(pdsk);
+		PSC(susp);
+		PSC(aftr_isp);
+		PSC(peer_isp);
+		PSC(user_isp);
+		dev_info(DEV, "%s\n", pb);
+	}
+
+	/* solve the race between becoming unconfigured,
+	 * worker doing the cleanup, and
+	 * admin reconfiguring us:
+	 * on (re)configure, first set CONFIG_PENDING,
+	 * then wait for a potentially exiting worker,
+	 * start the worker, and schedule one no_op.
+	 * then proceed with configuration.
+	 */
+	if (ns.disk == D_DISKLESS &&
+	    ns.conn == C_STANDALONE &&
+	    ns.role == R_SECONDARY &&
+	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
+		set_bit(DEVICE_DYING, &mdev->flags);
+
+	mdev->state.i = ns.i;
+	wake_up(&mdev->misc_wait);
+	wake_up(&mdev->state_wait);
+
+	/*   post-state-change actions   */
+	if (os.conn >= C_SYNC_SOURCE   && ns.conn <= C_CONNECTED) {
+		set_bit(STOP_SYNC_TIMER, &mdev->flags);
+		mod_timer(&mdev->resync_timer, jiffies);
+	}
+
+	/* aborted verify run. log the last position */
+	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+	    ns.conn < C_CONNECTED) {
+		mdev->ov_start_sector =
+			BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
+		dev_info(DEV, "Online Verify reached sector %llu\n",
+			(unsigned long long)mdev->ov_start_sector);
+	}
+
+	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
+		dev_info(DEV, "Syncer continues.\n");
+		mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
+		if (ns.conn == C_SYNC_TARGET) {
+			if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
+				mod_timer(&mdev->resync_timer, jiffies);
+			/* This if (!test_bit) is only needed for the case
+			   that a device that has ceased to used its timer,
+			   i.e. it is already in drbd_resync_finished() gets
+			   paused and resumed. */
+		}
+	}
+
+	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
+	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+		dev_info(DEV, "Resync suspended\n");
+		mdev->rs_mark_time = jiffies;
+		if (ns.conn == C_PAUSED_SYNC_T)
+			set_bit(STOP_SYNC_TIMER, &mdev->flags);
+	}
+
+	if (os.conn == C_CONNECTED &&
+	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+		mdev->ov_position = 0;
+		mdev->rs_total =
+		mdev->rs_mark_left = drbd_bm_bits(mdev);
+		if (mdev->agreed_pro_version >= 90)
+			set_ov_position(mdev, ns.conn);
+		else
+			mdev->ov_start_sector = 0;
+		mdev->ov_left = mdev->rs_total
+			      - BM_SECT_TO_BIT(mdev->ov_position);
+		mdev->rs_start     =
+		mdev->rs_mark_time = jiffies;
+		mdev->ov_last_oos_size = 0;
+		mdev->ov_last_oos_start = 0;
+
+		if (ns.conn == C_VERIFY_S) {
+			dev_info(DEV, "Starting Online Verify from sector %llu\n",
+					(unsigned long long)mdev->ov_position);
+			mod_timer(&mdev->resync_timer, jiffies);
+		}
+	}
+
+	if (get_ldev(mdev)) {
+		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
+			mdf |= MDF_CRASHED_PRIMARY;
+		if (mdev->state.role == R_PRIMARY ||
+		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
+			mdf |= MDF_PRIMARY_IND;
+		if (mdev->state.conn > C_WF_REPORT_PARAMS)
+			mdf |= MDF_CONNECTED_IND;
+		if (mdev->state.disk > D_INCONSISTENT)
+			mdf |= MDF_CONSISTENT;
+		if (mdev->state.disk > D_OUTDATED)
+			mdf |= MDF_WAS_UP_TO_DATE;
+		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
+			mdf |= MDF_PEER_OUT_DATED;
+		if (mdf != mdev->ldev->md.flags) {
+			mdev->ldev->md.flags = mdf;
+			drbd_md_mark_dirty(mdev);
+		}
+		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
+		put_ldev(mdev);
+	}
+
+	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+		set_bit(CONSIDER_RESYNC, &mdev->flags);
+
+	/* Receiver should clean up itself */
+	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+		drbd_thread_stop_nowait(&mdev->receiver);
+
+	/* Now the receiver finished cleaning up itself, it should die */
+	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+		drbd_thread_stop_nowait(&mdev->receiver);
+
+	/* Upon network failure, we need to restart the receiver. */
+	if (os.conn > C_TEAR_DOWN &&
+	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+		drbd_thread_restart_nowait(&mdev->receiver);
+
+	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+	if (ascw) {
+		ascw->os = os;
+		ascw->ns = ns;
+		ascw->flags = flags;
+		ascw->w.cb = w_after_state_ch;
+		ascw->done = done;
+		drbd_queue_work(&mdev->data.work, &ascw->w);
+	} else {
+		dev_warn(DEV, "Could not kmalloc an ascw\n");
+	}
+
+	return rv;
+}
+
+static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct after_state_chg_work *ascw =
+		container_of(w, struct after_state_chg_work, w);
+	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
+	if (ascw->flags & CS_WAIT_COMPLETE) {
+		D_ASSERT(ascw->done != NULL);
+		complete(ascw->done);
+	}
+	kfree(ascw);
+
+	return 1;
+}
+
+static void abw_start_sync(struct drbd_conf *mdev, int rv)
+{
+	if (rv) {
+		dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
+		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
+		return;
+	}
+
+	switch (mdev->state.conn) {
+	case C_STARTING_SYNC_T:
+		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		break;
+	case C_STARTING_SYNC_S:
+		drbd_start_resync(mdev, C_SYNC_SOURCE);
+		break;
+	}
+}
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @mdev:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @flags:	Flags
+ */
+static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags)
+{
+	enum drbd_fencing_p fp;
+
+	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+		clear_bit(CRASHED_PRIMARY, &mdev->flags);
+		if (mdev->p_uuid)
+			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
+	}
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	/* Inform userspace about the change... */
+	drbd_bcast_state(mdev, ns);
+
+	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+		drbd_khelper(mdev, "pri-on-incon-degr");
+
+	/* Here we have the actions that are performed after a
+	   state change. This function might sleep */
+
+	if (fp == FP_STONITH && ns.susp) {
+		/* case1: The outdate peer handler is successful:
+		 * case2: The connection was established again: */
+		if ((os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) ||
+		    (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
+			tl_clear(mdev);
+			spin_lock_irq(&mdev->req_lock);
+			_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+			spin_unlock_irq(&mdev->req_lock);
+		}
+	}
+	/* Do not change the order of the if above and the two below... */
+	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
+		drbd_send_uuids(mdev);
+		drbd_send_state(mdev);
+	}
+	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
+		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
+
+	/* Lost contact to peer's copy of the data */
+	if ((os.pdsk >= D_INCONSISTENT &&
+	     os.pdsk != D_UNKNOWN &&
+	     os.pdsk != D_OUTDATED)
+	&&  (ns.pdsk < D_INCONSISTENT ||
+	     ns.pdsk == D_UNKNOWN ||
+	     ns.pdsk == D_OUTDATED)) {
+		kfree(mdev->p_uuid);
+		mdev->p_uuid = NULL;
+		if (get_ldev(mdev)) {
+			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+				drbd_uuid_new_current(mdev);
+				drbd_send_uuids(mdev);
+			}
+			put_ldev(mdev);
+		}
+	}
+
+	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
+		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
+			drbd_uuid_new_current(mdev);
+
+		/* D_DISKLESS Peer becomes secondary */
+		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+			drbd_al_to_on_disk_bm(mdev);
+		put_ldev(mdev);
+	}
+
+	/* Last part of the attaching process ... */
+	if (ns.conn >= C_CONNECTED &&
+	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+		kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
+		mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
+		drbd_send_sizes(mdev, 0);  /* to start sync... */
+		drbd_send_uuids(mdev);
+		drbd_send_state(mdev);
+	}
+
+	/* We want to pause/continue resync, tell peer. */
+	if (ns.conn >= C_CONNECTED &&
+	     ((os.aftr_isp != ns.aftr_isp) ||
+	      (os.user_isp != ns.user_isp)))
+		drbd_send_state(mdev);
+
+	/* In case one of the isp bits got set, suspend other devices. */
+	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+		suspend_other_sg(mdev);
+
+	/* Make sure the peer gets informed about eventual state
+	   changes (ISP bits) while we were in WFReportParams. */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+		drbd_send_state(mdev);
+
+	/* We are in the progress to start a full sync... */
+	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
+
+	/* We are invalidating our self... */
+	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
+	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
+		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
+
+	if (os.disk > D_FAILED && ns.disk == D_FAILED) {
+		enum drbd_io_error_p eh;
+
+		eh = EP_PASS_ON;
+		if (get_ldev_if_state(mdev, D_FAILED)) {
+			eh = mdev->ldev->dc.on_io_error;
+			put_ldev(mdev);
+		}
+
+		drbd_rs_cancel_all(mdev);
+		/* since get_ldev() only works as long as disk>=D_INCONSISTENT,
+		   and it is D_DISKLESS here, local_cnt can only go down, it can
+		   not increase... It will reach zero */
+		wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+		mdev->rs_total = 0;
+		mdev->rs_failed = 0;
+		atomic_set(&mdev->rs_pending_cnt, 0);
+
+		spin_lock_irq(&mdev->req_lock);
+		_drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
+		spin_unlock_irq(&mdev->req_lock);
+
+		if (eh == EP_CALL_HELPER)
+			drbd_khelper(mdev, "local-io-error");
+	}
+
+	if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
+
+		if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
+			if (drbd_send_state(mdev))
+				dev_warn(DEV, "Notified peer that my disk is broken.\n");
+			else
+				dev_err(DEV, "Sending state in drbd_io_error() failed\n");
+		}
+
+		lc_destroy(mdev->resync);
+		mdev->resync = NULL;
+		lc_destroy(mdev->act_log);
+		mdev->act_log = NULL;
+		__no_warn(local,
+			drbd_free_bc(mdev->ldev);
+			mdev->ldev = NULL;);
+
+		if (mdev->md_io_tmpp)
+			__free_page(mdev->md_io_tmpp);
+	}
+
+	/* Disks got bigger while they were detached */
+	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
+		if (ns.conn == C_CONNECTED)
+			resync_after_online_grow(mdev);
+	}
+
+	/* A resync finished or aborted, wake paused devices... */
+	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+	    (os.peer_isp && !ns.peer_isp) ||
+	    (os.user_isp && !ns.user_isp))
+		resume_next_sg(mdev);
+
+	/* Upon network connection, we need to start the receiver */
+	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
+		drbd_thread_start(&mdev->receiver);
+
+	/* Terminate worker thread if we are unconfigured - it will be
+	   restarted as needed... */
+	if (ns.disk == D_DISKLESS &&
+	    ns.conn == C_STANDALONE &&
+	    ns.role == R_SECONDARY) {
+		if (os.aftr_isp != ns.aftr_isp)
+			resume_next_sg(mdev);
+		/* set in __drbd_set_state, unless CONFIG_PENDING was set */
+		if (test_bit(DEVICE_DYING, &mdev->flags))
+			drbd_thread_stop_nowait(&mdev->worker);
+	}
+
+	drbd_md_sync(mdev);
+}
+
+
+static int drbd_thread_setup(void *arg)
+{
+	struct drbd_thread *thi = (struct drbd_thread *) arg;
+	struct drbd_conf *mdev = thi->mdev;
+	unsigned long flags;
+	int retval;
+
+restart:
+	retval = thi->function(thi);
+
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	/* if the receiver has been "Exiting", the last thing it did
+	 * was set the conn state to "StandAlone",
+	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
+	 * and receiver thread will be "started".
+	 * drbd_thread_start needs to set "Restarting" in that case.
+	 * t_state check and assignment needs to be within the same spinlock,
+	 * so either thread_start sees Exiting, and can remap to Restarting,
+	 * or thread_start see None, and can proceed as normal.
+	 */
+
+	if (thi->t_state == Restarting) {
+		dev_info(DEV, "Restarting %s\n", current->comm);
+		thi->t_state = Running;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		goto restart;
+	}
+
+	thi->task = NULL;
+	thi->t_state = None;
+	smp_mb();
+	complete(&thi->stop);
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	dev_info(DEV, "Terminating %s\n", current->comm);
+
+	/* Release mod reference taken when thread was started */
+	module_put(THIS_MODULE);
+	return retval;
+}
+
+static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
+		      int (*func) (struct drbd_thread *))
+{
+	spin_lock_init(&thi->t_lock);
+	thi->task    = NULL;
+	thi->t_state = None;
+	thi->function = func;
+	thi->mdev = mdev;
+}
+
+int drbd_thread_start(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	struct task_struct *nt;
+	unsigned long flags;
+
+	const char *me =
+		thi == &mdev->receiver ? "receiver" :
+		thi == &mdev->asender  ? "asender"  :
+		thi == &mdev->worker   ? "worker"   : "NONSENSE";
+
+	/* is used from state engine doing drbd_thread_stop_nowait,
+	 * while holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	switch (thi->t_state) {
+	case None:
+		dev_info(DEV, "Starting %s thread (from %s [%d])\n",
+				me, current->comm, current->pid);
+
+		/* Get ref on module for thread - this is released when thread exits */
+		if (!try_module_get(THIS_MODULE)) {
+			dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return FALSE;
+		}
+
+		init_completion(&thi->stop);
+		D_ASSERT(thi->task == NULL);
+		thi->reset_cpu_mask = 1;
+		thi->t_state = Running;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
+
+		nt = kthread_create(drbd_thread_setup, (void *) thi,
+				    "drbd%d_%s", mdev_to_minor(mdev), me);
+
+		if (IS_ERR(nt)) {
+			dev_err(DEV, "Couldn't start thread\n");
+
+			module_put(THIS_MODULE);
+			return FALSE;
+		}
+		spin_lock_irqsave(&thi->t_lock, flags);
+		thi->task = nt;
+		thi->t_state = Running;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		wake_up_process(nt);
+		break;
+	case Exiting:
+		thi->t_state = Restarting;
+		dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
+				me, current->comm, current->pid);
+		/* fall through */
+	case Running:
+	case Restarting:
+	default:
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		break;
+	}
+
+	return TRUE;
+}
+
+
+void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
+{
+	unsigned long flags;
+
+	enum drbd_thread_state ns = restart ? Restarting : Exiting;
+
+	/* may be called from state engine, holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	if (thi->t_state == None) {
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		if (restart)
+			drbd_thread_start(thi);
+		return;
+	}
+
+	if (thi->t_state != ns) {
+		if (thi->task == NULL) {
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return;
+		}
+
+		thi->t_state = ns;
+		smp_mb();
+		init_completion(&thi->stop);
+		if (thi->task != current)
+			force_sig(DRBD_SIGKILL, thi->task);
+
+	}
+
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	if (wait)
+		wait_for_completion(&thi->stop);
+}
+
+#ifdef CONFIG_SMP
+/**
+ * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
+ * @mdev:	DRBD device.
+ *
+ * Forces all threads of a device onto the same CPU. This is beneficial for
+ * DRBD's performance. May be overwritten by user's configuration.
+ */
+void drbd_calc_cpu_mask(struct drbd_conf *mdev)
+{
+	int ord, cpu;
+
+	/* user override. */
+	if (cpumask_weight(mdev->cpu_mask))
+		return;
+
+	ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
+	for_each_online_cpu(cpu) {
+		if (ord-- == 0) {
+			cpumask_set_cpu(cpu, mdev->cpu_mask);
+			return;
+		}
+	}
+	/* should not be reached */
+	cpumask_setall(mdev->cpu_mask);
+}
+
+/**
+ * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
+ * @mdev:	DRBD device.
+ *
+ * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
+ * prematurely.
+ */
+void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
+{
+	struct task_struct *p = current;
+	struct drbd_thread *thi =
+		p == mdev->asender.task  ? &mdev->asender  :
+		p == mdev->receiver.task ? &mdev->receiver :
+		p == mdev->worker.task   ? &mdev->worker   :
+		NULL;
+	ERR_IF(thi == NULL)
+		return;
+	if (!thi->reset_cpu_mask)
+		return;
+	thi->reset_cpu_mask = 0;
+	set_cpus_allowed_ptr(p, mdev->cpu_mask);
+}
+#endif
+
+/* the appropriate socket mutex must be held already */
+int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
+			  enum drbd_packets cmd, struct p_header *h,
+			  size_t size, unsigned msg_flags)
+{
+	int sent, ok;
+
+	ERR_IF(!h) return FALSE;
+	ERR_IF(!size) return FALSE;
+
+	h->magic   = BE_DRBD_MAGIC;
+	h->command = cpu_to_be16(cmd);
+	h->length  = cpu_to_be16(size-sizeof(struct p_header));
+
+	sent = drbd_send(mdev, sock, h, size, msg_flags);
+
+	ok = (sent == size);
+	if (!ok)
+		dev_err(DEV, "short sent %s size=%d sent=%d\n",
+		    cmdname(cmd), (int)size, sent);
+	return ok;
+}
+
+/* don't pass the socket. we may only look at it
+ * when we hold the appropriate socket mutex.
+ */
+int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
+		  enum drbd_packets cmd, struct p_header *h, size_t size)
+{
+	int ok = 0;
+	struct socket *sock;
+
+	if (use_data_socket) {
+		mutex_lock(&mdev->data.mutex);
+		sock = mdev->data.socket;
+	} else {
+		mutex_lock(&mdev->meta.mutex);
+		sock = mdev->meta.socket;
+	}
+
+	/* drbd_disconnect() could have called drbd_free_sock()
+	 * while we were waiting in down()... */
+	if (likely(sock != NULL))
+		ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
+
+	if (use_data_socket)
+		mutex_unlock(&mdev->data.mutex);
+	else
+		mutex_unlock(&mdev->meta.mutex);
+	return ok;
+}
+
+int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
+		   size_t size)
+{
+	struct p_header h;
+	int ok;
+
+	h.magic   = BE_DRBD_MAGIC;
+	h.command = cpu_to_be16(cmd);
+	h.length  = cpu_to_be16(size);
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+
+	ok = (sizeof(h) ==
+		drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
+	ok = ok && (size ==
+		drbd_send(mdev, mdev->data.socket, data, size, 0));
+
+	drbd_put_data_sock(mdev);
+
+	return ok;
+}
+
+int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
+{
+	struct p_rs_param_89 *p;
+	struct socket *sock;
+	int size, rv;
+	const int apv = mdev->agreed_pro_version;
+
+	size = apv <= 87 ? sizeof(struct p_rs_param)
+		: apv == 88 ? sizeof(struct p_rs_param)
+			+ strlen(mdev->sync_conf.verify_alg) + 1
+		: /* 89 */    sizeof(struct p_rs_param_89);
+
+	/* used from admin command context and receiver/worker context.
+	 * to avoid kmalloc, grab the socket right here,
+	 * then use the pre-allocated sbuf there */
+	mutex_lock(&mdev->data.mutex);
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
+
+		p = &mdev->data.sbuf.rs_param_89;
+
+		/* initialize verify_alg and csums_alg */
+		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+		p->rate = cpu_to_be32(sc->rate);
+
+		if (apv >= 88)
+			strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
+		if (apv >= 89)
+			strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
+
+		rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
+	} else
+		rv = 0; /* not ok */
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return rv;
+}
+
+int drbd_send_protocol(struct drbd_conf *mdev)
+{
+	struct p_protocol *p;
+	int size, rv;
+
+	size = sizeof(struct p_protocol);
+
+	if (mdev->agreed_pro_version >= 87)
+		size += strlen(mdev->net_conf->integrity_alg) + 1;
+
+	/* we must not recurse into our own queue,
+	 * as that is blocked during handshake */
+	p = kmalloc(size, GFP_NOIO);
+	if (p == NULL)
+		return 0;
+
+	p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
+	p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
+	p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
+	p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
+	p->want_lose     = cpu_to_be32(mdev->net_conf->want_lose);
+	p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
+
+	if (mdev->agreed_pro_version >= 87)
+		strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
+
+	rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
+			   (struct p_header *)p, size);
+	kfree(p);
+	return rv;
+}
+
+int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
+{
+	struct p_uuids p;
+	int i;
+
+	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
+		return 1;
+
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
+
+	mdev->comm_bm_set = drbd_bm_total_weight(mdev);
+	p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
+	uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
+	uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
+	uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
+	p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+
+	put_ldev(mdev);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
+			     (struct p_header *)&p, sizeof(p));
+}
+
+int drbd_send_uuids(struct drbd_conf *mdev)
+{
+	return _drbd_send_uuids(mdev, 0);
+}
+
+int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
+{
+	return _drbd_send_uuids(mdev, 8);
+}
+
+
+int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
+{
+	struct p_rs_uuid p;
+
+	p.uuid = cpu_to_be64(val);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
+			     (struct p_header *)&p, sizeof(p));
+}
+
+int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
+{
+	struct p_sizes p;
+	sector_t d_size, u_size;
+	int q_order_type;
+	int ok;
+
+	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		D_ASSERT(mdev->ldev->backing_bdev);
+		d_size = drbd_get_max_capacity(mdev->ldev);
+		u_size = mdev->ldev->dc.disk_size;
+		q_order_type = drbd_queue_order_type(mdev);
+		p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
+		put_ldev(mdev);
+	} else {
+		d_size = 0;
+		u_size = 0;
+		q_order_type = QUEUE_ORDERED_NONE;
+	}
+
+	p.d_size = cpu_to_be64(d_size);
+	p.u_size = cpu_to_be64(u_size);
+	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
+	p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
+	p.queue_order_type = cpu_to_be32(q_order_type);
+
+	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
+			   (struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+/**
+ * drbd_send_state() - Sends the drbd state to the peer
+ * @mdev:	DRBD device.
+ */
+int drbd_send_state(struct drbd_conf *mdev)
+{
+	struct socket *sock;
+	struct p_state p;
+	int ok = 0;
+
+	/* Grab state lock so we wont send state if we're in the middle
+	 * of a cluster wide state change on another thread */
+	drbd_state_lock(mdev);
+
+	mutex_lock(&mdev->data.mutex);
+
+	p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		ok = _drbd_send_cmd(mdev, sock, P_STATE,
+				    (struct p_header *)&p, sizeof(p), 0);
+	}
+
+	mutex_unlock(&mdev->data.mutex);
+
+	drbd_state_unlock(mdev);
+	return ok;
+}
+
+int drbd_send_state_req(struct drbd_conf *mdev,
+	union drbd_state mask, union drbd_state val)
+{
+	struct p_req_state p;
+
+	p.mask    = cpu_to_be32(mask.i);
+	p.val     = cpu_to_be32(val.i);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
+			     (struct p_header *)&p, sizeof(p));
+}
+
+int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
+{
+	struct p_req_state_reply p;
+
+	p.retcode    = cpu_to_be32(retcode);
+
+	return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
+			     (struct p_header *)&p, sizeof(p));
+}
+
+int fill_bitmap_rle_bits(struct drbd_conf *mdev,
+	struct p_compressed_bm *p,
+	struct bm_xfer_ctx *c)
+{
+	struct bitstream bs;
+	unsigned long plain_bits;
+	unsigned long tmp;
+	unsigned long rl;
+	unsigned len;
+	unsigned toggle;
+	int bits;
+
+	/* may we use this feature? */
+	if ((mdev->sync_conf.use_rle == 0) ||
+		(mdev->agreed_pro_version < 90))
+			return 0;
+
+	if (c->bit_offset >= c->bm_bits)
+		return 0; /* nothing to do. */
+
+	/* use at most thus many bytes */
+	bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
+	memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
+	/* plain bits covered in this code string */
+	plain_bits = 0;
+
+	/* p->encoding & 0x80 stores whether the first run length is set.
+	 * bit offset is implicit.
+	 * start with toggle == 2 to be able to tell the first iteration */
+	toggle = 2;
+
+	/* see how much plain bits we can stuff into one packet
+	 * using RLE and VLI. */
+	do {
+		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
+				    : _drbd_bm_find_next(mdev, c->bit_offset);
+		if (tmp == -1UL)
+			tmp = c->bm_bits;
+		rl = tmp - c->bit_offset;
+
+		if (toggle == 2) { /* first iteration */
+			if (rl == 0) {
+				/* the first checked bit was set,
+				 * store start value, */
+				DCBP_set_start(p, 1);
+				/* but skip encoding of zero run length */
+				toggle = !toggle;
+				continue;
+			}
+			DCBP_set_start(p, 0);
+		}
+
+		/* paranoia: catch zero runlength.
+		 * can only happen if bitmap is modified while we scan it. */
+		if (rl == 0) {
+			dev_err(DEV, "unexpected zero runlength while encoding bitmap "
+			    "t:%u bo:%lu\n", toggle, c->bit_offset);
+			return -1;
+		}
+
+		bits = vli_encode_bits(&bs, rl);
+		if (bits == -ENOBUFS) /* buffer full */
+			break;
+		if (bits <= 0) {
+			dev_err(DEV, "error while encoding bitmap: %d\n", bits);
+			return 0;
+		}
+
+		toggle = !toggle;
+		plain_bits += rl;
+		c->bit_offset = tmp;
+	} while (c->bit_offset < c->bm_bits);
+
+	len = bs.cur.b - p->code + !!bs.cur.bit;
+
+	if (plain_bits < (len << 3)) {
+		/* incompressible with this method.
+		 * we need to rewind both word and bit position. */
+		c->bit_offset -= plain_bits;
+		bm_xfer_ctx_bit_to_word_offset(c);
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+		return 0;
+	}
+
+	/* RLE + VLI was able to compress it just fine.
+	 * update c->word_offset. */
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	/* store pad_bits */
+	DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+
+	return len;
+}
+
+enum { OK, FAILED, DONE }
+send_bitmap_rle_or_plain(struct drbd_conf *mdev,
+	struct p_header *h, struct bm_xfer_ctx *c)
+{
+	struct p_compressed_bm *p = (void*)h;
+	unsigned long num_words;
+	int len;
+	int ok;
+
+	len = fill_bitmap_rle_bits(mdev, p, c);
+
+	if (len < 0)
+		return FAILED;
+
+	if (len) {
+		DCBP_set_code(p, RLE_VLI_Bits);
+		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
+			sizeof(*p) + len, 0);
+
+		c->packets[0]++;
+		c->bytes[0] += sizeof(*p) + len;
+
+		if (c->bit_offset >= c->bm_bits)
+			len = 0; /* DONE */
+	} else {
+		/* was not compressible.
+		 * send a buffer full of plain text bits instead. */
+		num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
+		len = num_words * sizeof(long);
+		if (len)
+			drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
+		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
+				   h, sizeof(struct p_header) + len, 0);
+		c->word_offset += num_words;
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+
+		c->packets[1]++;
+		c->bytes[1] += sizeof(struct p_header) + len;
+
+		if (c->bit_offset > c->bm_bits)
+			c->bit_offset = c->bm_bits;
+	}
+	ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
+
+	if (ok == DONE)
+		INFO_bm_xfer_stats(mdev, "send", c);
+	return ok;
+}
+
+/* See the comment at receive_bitmap() */
+int _drbd_send_bitmap(struct drbd_conf *mdev)
+{
+	struct bm_xfer_ctx c;
+	struct p_header *p;
+	int ret;
+
+	ERR_IF(!mdev->bitmap) return FALSE;
+
+	/* maybe we should use some per thread scratch page,
+	 * and allocate that during initial device creation? */
+	p = (struct p_header *) __get_free_page(GFP_NOIO);
+	if (!p) {
+		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+		return FALSE;
+	}
+
+	if (get_ldev(mdev)) {
+		if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+			dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
+			drbd_bm_set_all(mdev);
+			if (drbd_bm_write(mdev)) {
+				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
+				 * but otherwise process as per normal - need to tell other
+				 * side that a full resync is required! */
+				dev_err(DEV, "Failed to write bitmap to disk!\n");
+			} else {
+				drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
+				drbd_md_sync(mdev);
+			}
+		}
+		put_ldev(mdev);
+	}
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(mdev),
+		.bm_words = drbd_bm_words(mdev),
+	};
+
+	do {
+		ret = send_bitmap_rle_or_plain(mdev, p, &c);
+	} while (ret == OK);
+
+	free_page((unsigned long) p);
+	return (ret == DONE);
+}
+
+int drbd_send_bitmap(struct drbd_conf *mdev)
+{
+	int err;
+
+	if (!drbd_get_data_sock(mdev))
+		return -1;
+	err = !_drbd_send_bitmap(mdev);
+	drbd_put_data_sock(mdev);
+	return err;
+}
+
+int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
+{
+	int ok;
+	struct p_barrier_ack p;
+
+	p.barrier  = barrier_nr;
+	p.set_size = cpu_to_be32(set_size);
+
+	if (mdev->state.conn < C_CONNECTED)
+		return FALSE;
+	ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
+			(struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+/**
+ * _drbd_send_ack() - Sends an ack packet
+ * @mdev:	DRBD device.
+ * @cmd:	Packet command code.
+ * @sector:	sector, needs to be in big endian byte order
+ * @blksize:	size in byte, needs to be in big endian byte order
+ * @block_id:	Id, big endian byte order
+ */
+static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
+			  u64 sector,
+			  u32 blksize,
+			  u64 block_id)
+{
+	int ok;
+	struct p_block_ack p;
+
+	p.sector   = sector;
+	p.block_id = block_id;
+	p.blksize  = blksize;
+	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+
+	if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
+		return FALSE;
+	ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
+				(struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
+		     struct p_data *dp)
+{
+	const int header_size = sizeof(struct p_data)
+			      - sizeof(struct p_header);
+	int data_size  = ((struct p_header *)dp)->length - header_size;
+
+	return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
+			      dp->block_id);
+}
+
+int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
+		     struct p_block_req *rp)
+{
+	return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
+}
+
+/**
+ * drbd_send_ack() - Sends an ack packet
+ * @mdev:	DRBD device.
+ * @cmd:	Packet command code.
+ * @e:		Epoch entry.
+ */
+int drbd_send_ack(struct drbd_conf *mdev,
+	enum drbd_packets cmd, struct drbd_epoch_entry *e)
+{
+	return _drbd_send_ack(mdev, cmd,
+			      cpu_to_be64(e->sector),
+			      cpu_to_be32(e->size),
+			      e->block_id);
+}
+
+/* This function misuses the block_id field to signal if the blocks
+ * are is sync or not. */
+int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+		     sector_t sector, int blksize, u64 block_id)
+{
+	return _drbd_send_ack(mdev, cmd,
+			      cpu_to_be64(sector),
+			      cpu_to_be32(blksize),
+			      cpu_to_be64(block_id));
+}
+
+int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+		       sector_t sector, int size, u64 block_id)
+{
+	int ok;
+	struct p_block_req p;
+
+	p.sector   = cpu_to_be64(sector);
+	p.block_id = block_id;
+	p.blksize  = cpu_to_be32(size);
+
+	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
+				(struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+int drbd_send_drequest_csum(struct drbd_conf *mdev,
+			    sector_t sector, int size,
+			    void *digest, int digest_size,
+			    enum drbd_packets cmd)
+{
+	int ok;
+	struct p_block_req p;
+
+	p.sector   = cpu_to_be64(sector);
+	p.block_id = BE_DRBD_MAGIC + 0xbeef;
+	p.blksize  = cpu_to_be32(size);
+
+	p.head.magic   = BE_DRBD_MAGIC;
+	p.head.command = cpu_to_be16(cmd);
+	p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
+
+	mutex_lock(&mdev->data.mutex);
+
+	ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
+	ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return ok;
+}
+
+int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	int ok;
+	struct p_block_req p;
+
+	p.sector   = cpu_to_be64(sector);
+	p.block_id = BE_DRBD_MAGIC + 0xbabe;
+	p.blksize  = cpu_to_be32(size);
+
+	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
+			   (struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+/* called on sndtimeo
+ * returns FALSE if we should retry,
+ * TRUE if we think connection is dead
+ */
+static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
+{
+	int drop_it;
+	/* long elapsed = (long)(jiffies - mdev->last_received); */
+
+	drop_it =   mdev->meta.socket == sock
+		|| !mdev->asender.task
+		|| get_t_state(&mdev->asender) != Running
+		|| mdev->state.conn < C_CONNECTED;
+
+	if (drop_it)
+		return TRUE;
+
+	drop_it = !--mdev->ko_count;
+	if (!drop_it) {
+		dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+		       current->comm, current->pid, mdev->ko_count);
+		request_ping(mdev);
+	}
+
+	return drop_it; /* && (mdev->state == R_PRIMARY) */;
+}
+
+/* The idea of sendpage seems to be to put some kind of reference
+ * to the page into the skb, and to hand it over to the NIC. In
+ * this process get_page() gets called.
+ *
+ * As soon as the page was really sent over the network put_page()
+ * gets called by some part of the network layer. [ NIC driver? ]
+ *
+ * [ get_page() / put_page() increment/decrement the count. If count
+ *   reaches 0 the page will be freed. ]
+ *
+ * This works nicely with pages from FSs.
+ * But this means that in protocol A we might signal IO completion too early!
+ *
+ * In order not to corrupt data during a resync we must make sure
+ * that we do not reuse our own buffer pages (EEs) to early, therefore
+ * we have the net_ee list.
+ *
+ * XFS seems to have problems, still, it submits pages with page_count == 0!
+ * As a workaround, we disable sendpage on pages
+ * with page_count == 0 or PageSlab.
+ */
+static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
+		   int offset, size_t size)
+{
+	int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
+	kunmap(page);
+	if (sent == size)
+		mdev->send_cnt += size>>9;
+	return sent == size;
+}
+
+static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
+		    int offset, size_t size)
+{
+	mm_segment_t oldfs = get_fs();
+	int sent, ok;
+	int len = size;
+
+	/* e.g. XFS meta- & log-data is in slab pages, which have a
+	 * page_count of 0 and/or have PageSlab() set.
+	 * we cannot use send_page for those, as that does get_page();
+	 * put_page(); and would cause either a VM_BUG directly, or
+	 * __page_cache_release a page that would actually still be referenced
+	 * by someone, leading to some obscure delayed Oops somewhere else. */
+	if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
+		return _drbd_no_send_page(mdev, page, offset, size);
+
+	drbd_update_congested(mdev);
+	set_fs(KERNEL_DS);
+	do {
+		sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
+							offset, len,
+							MSG_NOSIGNAL);
+		if (sent == -EAGAIN) {
+			if (we_should_drop_the_connection(mdev,
+							  mdev->data.socket))
+				break;
+			else
+				continue;
+		}
+		if (sent <= 0) {
+			dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
+			     __func__, (int)size, len, sent);
+			break;
+		}
+		len    -= sent;
+		offset += sent;
+	} while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
+	set_fs(oldfs);
+	clear_bit(NET_CONGESTED, &mdev->flags);
+
+	ok = (len == 0);
+	if (likely(ok))
+		mdev->send_cnt += size>>9;
+	return ok;
+}
+
+static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		if (!_drbd_no_send_page(mdev, bvec->bv_page,
+				     bvec->bv_offset, bvec->bv_len))
+			return 0;
+	}
+	return 1;
+}
+
+static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		if (!_drbd_send_page(mdev, bvec->bv_page,
+				     bvec->bv_offset, bvec->bv_len))
+			return 0;
+	}
+
+	return 1;
+}
+
+/* Used to send write requests
+ * R_PRIMARY -> Peer	(P_DATA)
+ */
+int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
+{
+	int ok = 1;
+	struct p_data p;
+	unsigned int dp_flags = 0;
+	void *dgb;
+	int dgs;
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+
+	p.head.magic   = BE_DRBD_MAGIC;
+	p.head.command = cpu_to_be16(P_DATA);
+	p.head.length  =
+		cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
+
+	p.sector   = cpu_to_be64(req->sector);
+	p.block_id = (unsigned long)req;
+	p.seq_num  = cpu_to_be32(req->seq_num =
+				 atomic_add_return(1, &mdev->packet_seq));
+	dp_flags = 0;
+
+	/* NOTE: no need to check if barriers supported here as we would
+	 *       not pass the test in make_request_common in that case
+	 */
+	if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
+		dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
+		/* dp_flags |= DP_HARDBARRIER; */
+	}
+	if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
+		dp_flags |= DP_RW_SYNC;
+	/* for now handle SYNCIO and UNPLUG
+	 * as if they still were one and the same flag */
+	if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
+		dp_flags |= DP_RW_SYNC;
+	if (mdev->state.conn >= C_SYNC_SOURCE &&
+	    mdev->state.conn <= C_PAUSED_SYNC_T)
+		dp_flags |= DP_MAY_SET_IN_SYNC;
+
+	p.dp_flags = cpu_to_be32(dp_flags);
+	set_bit(UNPLUG_REMOTE, &mdev->flags);
+	ok = (sizeof(p) ==
+		drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
+	if (ok && dgs) {
+		dgb = mdev->int_dig_out;
+		drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
+		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+	}
+	if (ok) {
+		if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
+			ok = _drbd_send_bio(mdev, req->master_bio);
+		else
+			ok = _drbd_send_zc_bio(mdev, req->master_bio);
+	}
+
+	drbd_put_data_sock(mdev);
+	return ok;
+}
+
+/* answer packet, used to send data back for read requests:
+ *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
+ *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
+ */
+int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
+		    struct drbd_epoch_entry *e)
+{
+	int ok;
+	struct p_data p;
+	void *dgb;
+	int dgs;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+
+	p.head.magic   = BE_DRBD_MAGIC;
+	p.head.command = cpu_to_be16(cmd);
+	p.head.length  =
+		cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
+
+	p.sector   = cpu_to_be64(e->sector);
+	p.block_id = e->block_id;
+	/* p.seq_num  = 0;    No sequence numbers here.. */
+
+	/* Only called by our kernel thread.
+	 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
+	 * in response to admin command or module unload.
+	 */
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+
+	ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
+					sizeof(p), MSG_MORE);
+	if (ok && dgs) {
+		dgb = mdev->int_dig_out;
+		drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
+		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+	}
+	if (ok)
+		ok = _drbd_send_zc_bio(mdev, e->private_bio);
+
+	drbd_put_data_sock(mdev);
+	return ok;
+}
+
+/*
+  drbd_send distinguishes two cases:
+
+  Packets sent via the data socket "sock"
+  and packets sent via the meta data socket "msock"
+
+		    sock                      msock
+  -----------------+-------------------------+------------------------------
+  timeout           conf.timeout / 2          conf.timeout / 2
+  timeout action    send a ping via msock     Abort communication
+					      and close all sockets
+*/
+
+/*
+ * you must have down()ed the appropriate [m]sock_mutex elsewhere!
+ */
+int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+	      void *buf, size_t size, unsigned msg_flags)
+{
+	struct kvec iov;
+	struct msghdr msg;
+	int rv, sent = 0;
+
+	if (!sock)
+		return -1000;
+
+	/* THINK  if (signal_pending) return ... ? */
+
+	iov.iov_base = buf;
+	iov.iov_len  = size;
+
+	msg.msg_name       = NULL;
+	msg.msg_namelen    = 0;
+	msg.msg_control    = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
+
+	if (sock == mdev->data.socket) {
+		mdev->ko_count = mdev->net_conf->ko_count;
+		drbd_update_congested(mdev);
+	}
+	do {
+		/* STRANGE
+		 * tcp_sendmsg does _not_ use its size parameter at all ?
+		 *
+		 * -EAGAIN on timeout, -EINTR on signal.
+		 */
+/* THINK
+ * do we need to block DRBD_SIG if sock == &meta.socket ??
+ * otherwise wake_asender() might interrupt some send_*Ack !
+ */
+		rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
+		if (rv == -EAGAIN) {
+			if (we_should_drop_the_connection(mdev, sock))
+				break;
+			else
+				continue;
+		}
+		D_ASSERT(rv != 0);
+		if (rv == -EINTR) {
+			flush_signals(current);
+			rv = 0;
+		}
+		if (rv < 0)
+			break;
+		sent += rv;
+		iov.iov_base += rv;
+		iov.iov_len  -= rv;
+	} while (sent < size);
+
+	if (sock == mdev->data.socket)
+		clear_bit(NET_CONGESTED, &mdev->flags);
+
+	if (rv <= 0) {
+		if (rv != -EAGAIN) {
+			dev_err(DEV, "%s_sendmsg returned %d\n",
+			    sock == mdev->meta.socket ? "msock" : "sock",
+			    rv);
+			drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+		} else
+			drbd_force_state(mdev, NS(conn, C_TIMEOUT));
+	}
+
+	return sent;
+}
+
+static int drbd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct drbd_conf *mdev = bdev->bd_disk->private_data;
+	unsigned long flags;
+	int rv = 0;
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	/* to have a stable mdev->state.role
+	 * and no race with updating open_cnt */
+
+	if (mdev->state.role != R_PRIMARY) {
+		if (mode & FMODE_WRITE)
+			rv = -EROFS;
+		else if (!allow_oos)
+			rv = -EMEDIUMTYPE;
+	}
+
+	if (!rv)
+		mdev->open_cnt++;
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	return rv;
+}
+
+static int drbd_release(struct gendisk *gd, fmode_t mode)
+{
+	struct drbd_conf *mdev = gd->private_data;
+	mdev->open_cnt--;
+	return 0;
+}
+
+static void drbd_unplug_fn(struct request_queue *q)
+{
+	struct drbd_conf *mdev = q->queuedata;
+
+	/* unplug FIRST */
+	spin_lock_irq(q->queue_lock);
+	blk_remove_plug(q);
+	spin_unlock_irq(q->queue_lock);
+
+	/* only if connected */
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
+		D_ASSERT(mdev->state.role == R_PRIMARY);
+		if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
+			/* add to the data.work queue,
+			 * unless already queued.
+			 * XXX this might be a good addition to drbd_queue_work
+			 * anyways, to detect "double queuing" ... */
+			if (list_empty(&mdev->unplug_work.list))
+				drbd_queue_work(&mdev->data.work,
+						&mdev->unplug_work);
+		}
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (mdev->state.disk >= D_INCONSISTENT)
+		drbd_kick_lo(mdev);
+}
+
+static void drbd_set_defaults(struct drbd_conf *mdev)
+{
+	mdev->sync_conf.after      = DRBD_AFTER_DEF;
+	mdev->sync_conf.rate       = DRBD_RATE_DEF;
+	mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
+	mdev->state = (union drbd_state) {
+		{ .role = R_SECONDARY,
+		  .peer = R_UNKNOWN,
+		  .conn = C_STANDALONE,
+		  .disk = D_DISKLESS,
+		  .pdsk = D_UNKNOWN,
+		  .susp = 0
+		} };
+}
+
+void drbd_init_set_defaults(struct drbd_conf *mdev)
+{
+	/* the memset(,0,) did most of this.
+	 * note: only assignments, no allocation in here */
+
+	drbd_set_defaults(mdev);
+
+	/* for now, we do NOT yet support it,
+	 * even though we start some framework
+	 * to eventually support barriers */
+	set_bit(NO_BARRIER_SUPP, &mdev->flags);
+
+	atomic_set(&mdev->ap_bio_cnt, 0);
+	atomic_set(&mdev->ap_pending_cnt, 0);
+	atomic_set(&mdev->rs_pending_cnt, 0);
+	atomic_set(&mdev->unacked_cnt, 0);
+	atomic_set(&mdev->local_cnt, 0);
+	atomic_set(&mdev->net_cnt, 0);
+	atomic_set(&mdev->packet_seq, 0);
+	atomic_set(&mdev->pp_in_use, 0);
+
+	mutex_init(&mdev->md_io_mutex);
+	mutex_init(&mdev->data.mutex);
+	mutex_init(&mdev->meta.mutex);
+	sema_init(&mdev->data.work.s, 0);
+	sema_init(&mdev->meta.work.s, 0);
+	mutex_init(&mdev->state_mutex);
+
+	spin_lock_init(&mdev->data.work.q_lock);
+	spin_lock_init(&mdev->meta.work.q_lock);
+
+	spin_lock_init(&mdev->al_lock);
+	spin_lock_init(&mdev->req_lock);
+	spin_lock_init(&mdev->peer_seq_lock);
+	spin_lock_init(&mdev->epoch_lock);
+
+	INIT_LIST_HEAD(&mdev->active_ee);
+	INIT_LIST_HEAD(&mdev->sync_ee);
+	INIT_LIST_HEAD(&mdev->done_ee);
+	INIT_LIST_HEAD(&mdev->read_ee);
+	INIT_LIST_HEAD(&mdev->net_ee);
+	INIT_LIST_HEAD(&mdev->resync_reads);
+	INIT_LIST_HEAD(&mdev->data.work.q);
+	INIT_LIST_HEAD(&mdev->meta.work.q);
+	INIT_LIST_HEAD(&mdev->resync_work.list);
+	INIT_LIST_HEAD(&mdev->unplug_work.list);
+	INIT_LIST_HEAD(&mdev->md_sync_work.list);
+	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
+	mdev->resync_work.cb  = w_resync_inactive;
+	mdev->unplug_work.cb  = w_send_write_hint;
+	mdev->md_sync_work.cb = w_md_sync;
+	mdev->bm_io_work.w.cb = w_bitmap_io;
+	init_timer(&mdev->resync_timer);
+	init_timer(&mdev->md_sync_timer);
+	mdev->resync_timer.function = resync_timer_fn;
+	mdev->resync_timer.data = (unsigned long) mdev;
+	mdev->md_sync_timer.function = md_sync_timer_fn;
+	mdev->md_sync_timer.data = (unsigned long) mdev;
+
+	init_waitqueue_head(&mdev->misc_wait);
+	init_waitqueue_head(&mdev->state_wait);
+	init_waitqueue_head(&mdev->ee_wait);
+	init_waitqueue_head(&mdev->al_wait);
+	init_waitqueue_head(&mdev->seq_wait);
+
+	drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
+	drbd_thread_init(mdev, &mdev->worker, drbd_worker);
+	drbd_thread_init(mdev, &mdev->asender, drbd_asender);
+
+	mdev->agreed_pro_version = PRO_VERSION_MAX;
+	mdev->write_ordering = WO_bio_barrier;
+	mdev->resync_wenr = LC_FREE;
+}
+
+void drbd_mdev_cleanup(struct drbd_conf *mdev)
+{
+	if (mdev->receiver.t_state != None)
+		dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
+				mdev->receiver.t_state);
+
+	/* no need to lock it, I'm the only thread alive */
+	if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
+		dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
+	mdev->al_writ_cnt  =
+	mdev->bm_writ_cnt  =
+	mdev->read_cnt     =
+	mdev->recv_cnt     =
+	mdev->send_cnt     =
+	mdev->writ_cnt     =
+	mdev->p_size       =
+	mdev->rs_start     =
+	mdev->rs_total     =
+	mdev->rs_failed    =
+	mdev->rs_mark_left =
+	mdev->rs_mark_time = 0;
+	D_ASSERT(mdev->net_conf == NULL);
+
+	drbd_set_my_capacity(mdev, 0);
+	if (mdev->bitmap) {
+		/* maybe never allocated. */
+		drbd_bm_resize(mdev, 0);
+		drbd_bm_cleanup(mdev);
+	}
+
+	drbd_free_resources(mdev);
+
+	/*
+	 * currently we drbd_init_ee only on module load, so
+	 * we may do drbd_release_ee only on module unload!
+	 */
+	D_ASSERT(list_empty(&mdev->active_ee));
+	D_ASSERT(list_empty(&mdev->sync_ee));
+	D_ASSERT(list_empty(&mdev->done_ee));
+	D_ASSERT(list_empty(&mdev->read_ee));
+	D_ASSERT(list_empty(&mdev->net_ee));
+	D_ASSERT(list_empty(&mdev->resync_reads));
+	D_ASSERT(list_empty(&mdev->data.work.q));
+	D_ASSERT(list_empty(&mdev->meta.work.q));
+	D_ASSERT(list_empty(&mdev->resync_work.list));
+	D_ASSERT(list_empty(&mdev->unplug_work.list));
+
+}
+
+
+static void drbd_destroy_mempools(void)
+{
+	struct page *page;
+
+	while (drbd_pp_pool) {
+		page = drbd_pp_pool;
+		drbd_pp_pool = (struct page *)page_private(page);
+		__free_page(page);
+		drbd_pp_vacant--;
+	}
+
+	/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
+
+	if (drbd_ee_mempool)
+		mempool_destroy(drbd_ee_mempool);
+	if (drbd_request_mempool)
+		mempool_destroy(drbd_request_mempool);
+	if (drbd_ee_cache)
+		kmem_cache_destroy(drbd_ee_cache);
+	if (drbd_request_cache)
+		kmem_cache_destroy(drbd_request_cache);
+	if (drbd_bm_ext_cache)
+		kmem_cache_destroy(drbd_bm_ext_cache);
+	if (drbd_al_ext_cache)
+		kmem_cache_destroy(drbd_al_ext_cache);
+
+	drbd_ee_mempool      = NULL;
+	drbd_request_mempool = NULL;
+	drbd_ee_cache        = NULL;
+	drbd_request_cache   = NULL;
+	drbd_bm_ext_cache    = NULL;
+	drbd_al_ext_cache    = NULL;
+
+	return;
+}
+
+static int drbd_create_mempools(void)
+{
+	struct page *page;
+	const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
+	int i;
+
+	/* prepare our caches and mempools */
+	drbd_request_mempool = NULL;
+	drbd_ee_cache        = NULL;
+	drbd_request_cache   = NULL;
+	drbd_bm_ext_cache    = NULL;
+	drbd_al_ext_cache    = NULL;
+	drbd_pp_pool         = NULL;
+
+	/* caches */
+	drbd_request_cache = kmem_cache_create(
+		"drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
+	if (drbd_request_cache == NULL)
+		goto Enomem;
+
+	drbd_ee_cache = kmem_cache_create(
+		"drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
+	if (drbd_ee_cache == NULL)
+		goto Enomem;
+
+	drbd_bm_ext_cache = kmem_cache_create(
+		"drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
+	if (drbd_bm_ext_cache == NULL)
+		goto Enomem;
+
+	drbd_al_ext_cache = kmem_cache_create(
+		"drbd_al", sizeof(struct lc_element), 0, 0, NULL);
+	if (drbd_al_ext_cache == NULL)
+		goto Enomem;
+
+	/* mempools */
+	drbd_request_mempool = mempool_create(number,
+		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
+	if (drbd_request_mempool == NULL)
+		goto Enomem;
+
+	drbd_ee_mempool = mempool_create(number,
+		mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
+	if (drbd_request_mempool == NULL)
+		goto Enomem;
+
+	/* drbd's page pool */
+	spin_lock_init(&drbd_pp_lock);
+
+	for (i = 0; i < number; i++) {
+		page = alloc_page(GFP_HIGHUSER);
+		if (!page)
+			goto Enomem;
+		set_page_private(page, (unsigned long)drbd_pp_pool);
+		drbd_pp_pool = page;
+	}
+	drbd_pp_vacant = number;
+
+	return 0;
+
+Enomem:
+	drbd_destroy_mempools(); /* in case we allocated some */
+	return -ENOMEM;
+}
+
+static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
+	void *unused)
+{
+	/* just so we have it.  you never know what interesting things we
+	 * might want to do here some day...
+	 */
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block drbd_notifier = {
+	.notifier_call = drbd_notify_sys,
+};
+
+static void drbd_release_ee_lists(struct drbd_conf *mdev)
+{
+	int rr;
+
+	rr = drbd_release_ee(mdev, &mdev->active_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in active list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->sync_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in sync list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->read_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in read list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->done_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in done list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->net_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in net list found!\n", rr);
+}
+
+/* caution. no locking.
+ * currently only used from module cleanup code. */
+static void drbd_delete_device(unsigned int minor)
+{
+	struct drbd_conf *mdev = minor_to_mdev(minor);
+
+	if (!mdev)
+		return;
+
+	/* paranoia asserts */
+	if (mdev->open_cnt != 0)
+		dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
+				__FILE__ , __LINE__);
+
+	ERR_IF (!list_empty(&mdev->data.work.q)) {
+		struct list_head *lp;
+		list_for_each(lp, &mdev->data.work.q) {
+			dev_err(DEV, "lp = %p\n", lp);
+		}
+	};
+	/* end paranoia asserts */
+
+	del_gendisk(mdev->vdisk);
+
+	/* cleanup stuff that may have been allocated during
+	 * device (re-)configuration or state changes */
+
+	if (mdev->this_bdev)
+		bdput(mdev->this_bdev);
+
+	drbd_free_resources(mdev);
+
+	drbd_release_ee_lists(mdev);
+
+	/* should be free'd on disconnect? */
+	kfree(mdev->ee_hash);
+	/*
+	mdev->ee_hash_s = 0;
+	mdev->ee_hash = NULL;
+	*/
+
+	lc_destroy(mdev->act_log);
+	lc_destroy(mdev->resync);
+
+	kfree(mdev->p_uuid);
+	/* mdev->p_uuid = NULL; */
+
+	kfree(mdev->int_dig_out);
+	kfree(mdev->int_dig_in);
+	kfree(mdev->int_dig_vv);
+
+	/* cleanup the rest that has been
+	 * allocated from drbd_new_device
+	 * and actually free the mdev itself */
+	drbd_free_mdev(mdev);
+}
+
+static void drbd_cleanup(void)
+{
+	unsigned int i;
+
+	unregister_reboot_notifier(&drbd_notifier);
+
+	drbd_nl_cleanup();
+
+	if (minor_table) {
+		if (drbd_proc)
+			remove_proc_entry("drbd", NULL);
+		i = minor_count;
+		while (i--)
+			drbd_delete_device(i);
+		drbd_destroy_mempools();
+	}
+
+	kfree(minor_table);
+
+	unregister_blkdev(DRBD_MAJOR, "drbd");
+
+	printk(KERN_INFO "drbd: module cleanup done.\n");
+}
+
+/**
+ * drbd_congested() - Callback for pdflush
+ * @congested_data:	User data
+ * @bdi_bits:		Bits pdflush is currently interested in
+ *
+ * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ */
+static int drbd_congested(void *congested_data, int bdi_bits)
+{
+	struct drbd_conf *mdev = congested_data;
+	struct request_queue *q;
+	char reason = '-';
+	int r = 0;
+
+	if (!__inc_ap_bio_cond(mdev)) {
+		/* DRBD has frozen IO */
+		r = bdi_bits;
+		reason = 'd';
+		goto out;
+	}
+
+	if (get_ldev(mdev)) {
+		q = bdev_get_queue(mdev->ldev->backing_bdev);
+		r = bdi_congested(&q->backing_dev_info, bdi_bits);
+		put_ldev(mdev);
+		if (r)
+			reason = 'b';
+	}
+
+	if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
+		r |= (1 << BDI_async_congested);
+		reason = reason == 'b' ? 'a' : 'n';
+	}
+
+out:
+	mdev->congestion_reason = reason;
+	return r;
+}
+
+struct drbd_conf *drbd_new_device(unsigned int minor)
+{
+	struct drbd_conf *mdev;
+	struct gendisk *disk;
+	struct request_queue *q;
+
+	/* GFP_KERNEL, we are outside of all write-out paths */
+	mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
+	if (!mdev)
+		return NULL;
+	if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
+		goto out_no_cpumask;
+
+	mdev->minor = minor;
+
+	drbd_init_set_defaults(mdev);
+
+	q = blk_alloc_queue(GFP_KERNEL);
+	if (!q)
+		goto out_no_q;
+	mdev->rq_queue = q;
+	q->queuedata   = mdev;
+	blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
+
+	disk = alloc_disk(1);
+	if (!disk)
+		goto out_no_disk;
+	mdev->vdisk = disk;
+
+	set_disk_ro(disk, TRUE);
+
+	disk->queue = q;
+	disk->major = DRBD_MAJOR;
+	disk->first_minor = minor;
+	disk->fops = &drbd_ops;
+	sprintf(disk->disk_name, "drbd%d", minor);
+	disk->private_data = mdev;
+
+	mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
+	/* we have no partitions. we contain only ourselves. */
+	mdev->this_bdev->bd_contains = mdev->this_bdev;
+
+	q->backing_dev_info.congested_fn = drbd_congested;
+	q->backing_dev_info.congested_data = mdev;
+
+	blk_queue_make_request(q, drbd_make_request_26);
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+	blk_queue_merge_bvec(q, drbd_merge_bvec);
+	q->queue_lock = &mdev->req_lock; /* needed since we use */
+		/* plugging on a queue, that actually has no requests! */
+	q->unplug_fn = drbd_unplug_fn;
+
+	mdev->md_io_page = alloc_page(GFP_KERNEL);
+	if (!mdev->md_io_page)
+		goto out_no_io_page;
+
+	if (drbd_bm_init(mdev))
+		goto out_no_bitmap;
+	/* no need to lock access, we are still initializing this minor device. */
+	if (!tl_init(mdev))
+		goto out_no_tl;
+
+	mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
+	if (!mdev->app_reads_hash)
+		goto out_no_app_reads;
+
+	mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+	if (!mdev->current_epoch)
+		goto out_no_epoch;
+
+	INIT_LIST_HEAD(&mdev->current_epoch->list);
+	mdev->epochs = 1;
+
+	return mdev;
+
+/* out_whatever_else:
+	kfree(mdev->current_epoch); */
+out_no_epoch:
+	kfree(mdev->app_reads_hash);
+out_no_app_reads:
+	tl_cleanup(mdev);
+out_no_tl:
+	drbd_bm_cleanup(mdev);
+out_no_bitmap:
+	__free_page(mdev->md_io_page);
+out_no_io_page:
+	put_disk(disk);
+out_no_disk:
+	blk_cleanup_queue(q);
+out_no_q:
+	free_cpumask_var(mdev->cpu_mask);
+out_no_cpumask:
+	kfree(mdev);
+	return NULL;
+}
+
+/* counterpart of drbd_new_device.
+ * last part of drbd_delete_device. */
+void drbd_free_mdev(struct drbd_conf *mdev)
+{
+	kfree(mdev->current_epoch);
+	kfree(mdev->app_reads_hash);
+	tl_cleanup(mdev);
+	if (mdev->bitmap) /* should no longer be there. */
+		drbd_bm_cleanup(mdev);
+	__free_page(mdev->md_io_page);
+	put_disk(mdev->vdisk);
+	blk_cleanup_queue(mdev->rq_queue);
+	free_cpumask_var(mdev->cpu_mask);
+	kfree(mdev);
+}
+
+
+int __init drbd_init(void)
+{
+	int err;
+
+	if (sizeof(struct p_handshake) != 80) {
+		printk(KERN_ERR
+		       "drbd: never change the size or layout "
+		       "of the HandShake packet.\n");
+		return -EINVAL;
+	}
+
+	if (1 > minor_count || minor_count > 255) {
+		printk(KERN_ERR
+			"drbd: invalid minor_count (%d)\n", minor_count);
+#ifdef MODULE
+		return -EINVAL;
+#else
+		minor_count = 8;
+#endif
+	}
+
+	err = drbd_nl_init();
+	if (err)
+		return err;
+
+	err = register_blkdev(DRBD_MAJOR, "drbd");
+	if (err) {
+		printk(KERN_ERR
+		       "drbd: unable to register block device major %d\n",
+		       DRBD_MAJOR);
+		return err;
+	}
+
+	register_reboot_notifier(&drbd_notifier);
+
+	/*
+	 * allocate all necessary structs
+	 */
+	err = -ENOMEM;
+
+	init_waitqueue_head(&drbd_pp_wait);
+
+	drbd_proc = NULL; /* play safe for drbd_cleanup */
+	minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
+				GFP_KERNEL);
+	if (!minor_table)
+		goto Enomem;
+
+	err = drbd_create_mempools();
+	if (err)
+		goto Enomem;
+
+	drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
+	if (!drbd_proc)	{
+		printk(KERN_ERR "drbd: unable to register proc file\n");
+		goto Enomem;
+	}
+
+	rwlock_init(&global_state_lock);
+
+	printk(KERN_INFO "drbd: initialized. "
+	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
+	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
+	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
+	printk(KERN_INFO "drbd: registered as block device major %d\n",
+		DRBD_MAJOR);
+	printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
+
+	return 0; /* Success! */
+
+Enomem:
+	drbd_cleanup();
+	if (err == -ENOMEM)
+		/* currently always the case */
+		printk(KERN_ERR "drbd: ran out of memory\n");
+	else
+		printk(KERN_ERR "drbd: initialization failure\n");
+	return err;
+}
+
+void drbd_free_bc(struct drbd_backing_dev *ldev)
+{
+	if (ldev == NULL)
+		return;
+
+	bd_release(ldev->backing_bdev);
+	bd_release(ldev->md_bdev);
+
+	fput(ldev->lo_file);
+	fput(ldev->md_file);
+
+	kfree(ldev);
+}
+
+void drbd_free_sock(struct drbd_conf *mdev)
+{
+	if (mdev->data.socket) {
+		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
+		sock_release(mdev->data.socket);
+		mdev->data.socket = NULL;
+	}
+	if (mdev->meta.socket) {
+		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
+		sock_release(mdev->meta.socket);
+		mdev->meta.socket = NULL;
+	}
+}
+
+
+void drbd_free_resources(struct drbd_conf *mdev)
+{
+	crypto_free_hash(mdev->csums_tfm);
+	mdev->csums_tfm = NULL;
+	crypto_free_hash(mdev->verify_tfm);
+	mdev->verify_tfm = NULL;
+	crypto_free_hash(mdev->cram_hmac_tfm);
+	mdev->cram_hmac_tfm = NULL;
+	crypto_free_hash(mdev->integrity_w_tfm);
+	mdev->integrity_w_tfm = NULL;
+	crypto_free_hash(mdev->integrity_r_tfm);
+	mdev->integrity_r_tfm = NULL;
+
+	drbd_free_sock(mdev);
+
+	__no_warn(local,
+		  drbd_free_bc(mdev->ldev);
+		  mdev->ldev = NULL;);
+}
+
+/* meta data management */
+
+struct meta_data_on_disk {
+	u64 la_size;           /* last agreed size. */
+	u64 uuid[UI_SIZE];   /* UUIDs. */
+	u64 device_uuid;
+	u64 reserved_u64_1;
+	u32 flags;             /* MDF */
+	u32 magic;
+	u32 md_size_sect;
+	u32 al_offset;         /* offset to this block */
+	u32 al_nr_extents;     /* important for restoring the AL */
+	      /* `-- act_log->nr_elements <-- sync_conf.al_extents */
+	u32 bm_offset;         /* offset to the bitmap, from here */
+	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
+	u32 reserved_u32[4];
+
+} __packed;
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @mdev:	DRBD device.
+ */
+void drbd_md_sync(struct drbd_conf *mdev)
+{
+	struct meta_data_on_disk *buffer;
+	sector_t sector;
+	int i;
+
+	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
+		return;
+	del_timer(&mdev->md_sync_timer);
+
+	/* We use here D_FAILED and not D_ATTACHING because we try to write
+	 * metadata even if we detach due to a disk failure! */
+	if (!get_ldev_if_state(mdev, D_FAILED))
+		return;
+
+	mutex_lock(&mdev->md_io_mutex);
+	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+	memset(buffer, 0, 512);
+
+	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
+	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
+	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
+
+	buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
+	buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
+	buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
+	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
+	buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
+
+	buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
+
+	D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
+	sector = mdev->ldev->md.md_offset;
+
+	if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+		clear_bit(MD_DIRTY, &mdev->flags);
+	} else {
+		/* this was a try anyways ... */
+		dev_err(DEV, "meta data update failed!\n");
+
+		drbd_chk_io_error(mdev, 1, TRUE);
+	}
+
+	/* Update mdev->ldev->md.la_size_sect,
+	 * since we updated it on metadata. */
+	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
+
+	mutex_unlock(&mdev->md_io_mutex);
+	put_ldev(mdev);
+}
+
+/**
+ * drbd_md_read() - Reads in the meta data super block
+ * @mdev:	DRBD device.
+ * @bdev:	Device from which the meta data should be read in.
+ *
+ * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
+ * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
+ */
+int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+	struct meta_data_on_disk *buffer;
+	int i, rv = NO_ERROR;
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING))
+		return ERR_IO_MD_DISK;
+
+	mutex_lock(&mdev->md_io_mutex);
+	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+
+	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
+		/* NOTE: cant do normal error processing here as this is
+		   called BEFORE disk is attached */
+		dev_err(DEV, "Error while reading metadata.\n");
+		rv = ERR_IO_MD_DISK;
+		goto err;
+	}
+
+	if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
+		dev_err(DEV, "Error while reading metadata, magic not found.\n");
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+	if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
+		dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
+		    be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+		dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
+		    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+		dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
+		    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+
+	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+		dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+		    be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+
+	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+	bdev->md.flags = be32_to_cpu(buffer->flags);
+	mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
+	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+	if (mdev->sync_conf.al_extents < 7)
+		mdev->sync_conf.al_extents = 127;
+
+ err:
+	mutex_unlock(&mdev->md_io_mutex);
+	put_ldev(mdev);
+
+	return rv;
+}
+
+/**
+ * drbd_md_mark_dirty() - Mark meta data super block as dirty
+ * @mdev:	DRBD device.
+ *
+ * Call this function if you change anything that should be written to
+ * the meta-data super block. This function sets MD_DIRTY, and starts a
+ * timer that ensures that within five seconds you have to call drbd_md_sync().
+ */
+void drbd_md_mark_dirty(struct drbd_conf *mdev)
+{
+	set_bit(MD_DIRTY, &mdev->flags);
+	mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
+}
+
+
+static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
+{
+	int i;
+
+	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
+}
+
+void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+{
+	if (idx == UI_CURRENT) {
+		if (mdev->state.role == R_PRIMARY)
+			val |= 1;
+		else
+			val &= ~((u64)1);
+
+		drbd_set_ed_uuid(mdev, val);
+	}
+
+	mdev->ldev->md.uuid[idx] = val;
+	drbd_md_mark_dirty(mdev);
+}
+
+
+void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+{
+	if (mdev->ldev->md.uuid[idx]) {
+		drbd_uuid_move_history(mdev);
+		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
+	}
+	_drbd_uuid_set(mdev, idx, val);
+}
+
+/**
+ * drbd_uuid_new_current() - Creates a new current UUID
+ * @mdev:	DRBD device.
+ *
+ * Creates a new current UUID, and rotates the old current UUID into
+ * the bitmap slot. Causes an incremental resync upon next connect.
+ */
+void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
+{
+	u64 val;
+
+	dev_info(DEV, "Creating new current UUID\n");
+	D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
+	mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
+
+	get_random_bytes(&val, sizeof(u64));
+	_drbd_uuid_set(mdev, UI_CURRENT, val);
+}
+
+void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
+{
+	if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
+		return;
+
+	if (val == 0) {
+		drbd_uuid_move_history(mdev);
+		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
+		mdev->ldev->md.uuid[UI_BITMAP] = 0;
+	} else {
+		if (mdev->ldev->md.uuid[UI_BITMAP])
+			dev_warn(DEV, "bm UUID already set");
+
+		mdev->ldev->md.uuid[UI_BITMAP] = val;
+		mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
+
+	}
+	drbd_md_mark_dirty(mdev);
+}
+
+/**
+ * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @mdev:	DRBD device.
+ *
+ * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_set_n_write(struct drbd_conf *mdev)
+{
+	int rv = -EIO;
+
+	if (get_ldev_if_state(mdev, D_ATTACHING)) {
+		drbd_md_set_flag(mdev, MDF_FULL_SYNC);
+		drbd_md_sync(mdev);
+		drbd_bm_set_all(mdev);
+
+		rv = drbd_bm_write(mdev);
+
+		if (!rv) {
+			drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
+			drbd_md_sync(mdev);
+		}
+
+		put_ldev(mdev);
+	}
+
+	return rv;
+}
+
+/**
+ * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @mdev:	DRBD device.
+ *
+ * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
+{
+	int rv = -EIO;
+
+	if (get_ldev_if_state(mdev, D_ATTACHING)) {
+		drbd_bm_clear_all(mdev);
+		rv = drbd_bm_write(mdev);
+		put_ldev(mdev);
+	}
+
+	return rv;
+}
+
+static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct bm_io_work *work = container_of(w, struct bm_io_work, w);
+	int rv;
+
+	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
+
+	drbd_bm_lock(mdev, work->why);
+	rv = work->io_fn(mdev);
+	drbd_bm_unlock(mdev);
+
+	clear_bit(BITMAP_IO, &mdev->flags);
+	wake_up(&mdev->misc_wait);
+
+	if (work->done)
+		work->done(mdev, rv);
+
+	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
+	work->why = NULL;
+
+	return 1;
+}
+
+/**
+ * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
+ * @mdev:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @done:	callback to be called after the bitmap IO was performed
+ * @why:	Descriptive text of the reason for doing the IO
+ *
+ * While IO on the bitmap happens we freeze application IO thus we ensure
+ * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
+ * called from worker context. It MUST NOT be used while a previous such
+ * work is still pending!
+ */
+void drbd_queue_bitmap_io(struct drbd_conf *mdev,
+			  int (*io_fn)(struct drbd_conf *),
+			  void (*done)(struct drbd_conf *, int),
+			  char *why)
+{
+	D_ASSERT(current == mdev->worker.task);
+
+	D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
+	D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
+	D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
+	if (mdev->bm_io_work.why)
+		dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
+			why, mdev->bm_io_work.why);
+
+	mdev->bm_io_work.io_fn = io_fn;
+	mdev->bm_io_work.done = done;
+	mdev->bm_io_work.why = why;
+
+	set_bit(BITMAP_IO, &mdev->flags);
+	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
+		if (list_empty(&mdev->bm_io_work.w.list)) {
+			set_bit(BITMAP_IO_QUEUED, &mdev->flags);
+			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+		} else
+			dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
+	}
+}
+
+/**
+ * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
+ * @mdev:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @why:	Descriptive text of the reason for doing the IO
+ *
+ * freezes application IO while that the actual IO operations runs. This
+ * functions MAY NOT be called from worker context.
+ */
+int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
+{
+	int rv;
+
+	D_ASSERT(current != mdev->worker.task);
+
+	drbd_suspend_io(mdev);
+
+	drbd_bm_lock(mdev, why);
+	rv = io_fn(mdev);
+	drbd_bm_unlock(mdev);
+
+	drbd_resume_io(mdev);
+
+	return rv;
+}
+
+void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+{
+	if ((mdev->ldev->md.flags & flag) != flag) {
+		drbd_md_mark_dirty(mdev);
+		mdev->ldev->md.flags |= flag;
+	}
+}
+
+void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+{
+	if ((mdev->ldev->md.flags & flag) != 0) {
+		drbd_md_mark_dirty(mdev);
+		mdev->ldev->md.flags &= ~flag;
+	}
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+	return (bdev->md.flags & flag) != 0;
+}
+
+static void md_sync_timer_fn(unsigned long data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
+}
+
+static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+	drbd_md_sync(mdev);
+
+	return 1;
+}
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+/* Fault insertion support including random number generator shamelessly
+ * stolen from kernel/rcutorture.c */
+struct fault_random_state {
+	unsigned long state;
+	unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801  /* prime */
+#define FAULT_RANDOM_ADD	479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static unsigned long
+_drbd_fault_random(struct fault_random_state *rsp)
+{
+	long refresh;
+
+	if (--rsp->count < 0) {
+		get_random_bytes(&refresh, sizeof(refresh));
+		rsp->state += refresh;
+		rsp->count = FAULT_RANDOM_REFRESH;
+	}
+	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+	return swahw32(rsp->state);
+}
+
+static char *
+_drbd_fault_str(unsigned int type) {
+	static char *_faults[] = {
+		[DRBD_FAULT_MD_WR] = "Meta-data write",
+		[DRBD_FAULT_MD_RD] = "Meta-data read",
+		[DRBD_FAULT_RS_WR] = "Resync write",
+		[DRBD_FAULT_RS_RD] = "Resync read",
+		[DRBD_FAULT_DT_WR] = "Data write",
+		[DRBD_FAULT_DT_RD] = "Data read",
+		[DRBD_FAULT_DT_RA] = "Data read ahead",
+		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
+		[DRBD_FAULT_AL_EE] = "EE allocation"
+	};
+
+	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
+}
+
+unsigned int
+_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
+{
+	static struct fault_random_state rrs = {0, 0};
+
+	unsigned int ret = (
+		(fault_devs == 0 ||
+			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
+		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
+
+	if (ret) {
+		fault_count++;
+
+		if (printk_ratelimit())
+			dev_warn(DEV, "***Simulating %s failure\n",
+				_drbd_fault_str(type));
+	}
+
+	return ret;
+}
+#endif
+
+const char *drbd_buildtag(void)
+{
+	/* DRBD built from external sources has here a reference to the
+	   git hash of the source code. */
+
+	static char buildtag[38] = "\0uilt-in";
+
+	if (buildtag[0] == 0) {
+#ifdef CONFIG_MODULES
+		if (THIS_MODULE != NULL)
+			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+		else
+#endif
+			buildtag[0] = 'b';
+	}
+
+	return buildtag;
+}
+
+module_init(drbd_init)
+module_exit(drbd_cleanup)
+
+EXPORT_SYMBOL(drbd_conn_str);
+EXPORT_SYMBOL(drbd_role_str);
+EXPORT_SYMBOL(drbd_disk_str);
+EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 0000000..436a090
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,2364 @@
+/*
+   drbd_nl.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/connector.h>
+#include <linux/blkpg.h>
+#include <linux/cpumask.h>
+#include "drbd_int.h"
+#include "drbd_wrappers.h"
+#include <asm/unaligned.h>
+#include <linux/drbd_tag_magic.h>
+#include <linux/drbd_limits.h>
+
+static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
+static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
+static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
+
+/* see get_sb_bdev and bd_claim */
+static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
+
+/* Generate the tag_list to struct functions */
+#define NL_PACKET(name, number, fields) \
+static int name ## _from_tags(struct drbd_conf *mdev, \
+	unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
+static int name ## _from_tags(struct drbd_conf *mdev, \
+	unsigned short *tags, struct name *arg) \
+{ \
+	int tag; \
+	int dlen; \
+	\
+	while ((tag = get_unaligned(tags++)) != TT_END) {	\
+		dlen = get_unaligned(tags++);			\
+		switch (tag_number(tag)) { \
+		fields \
+		default: \
+			if (tag & T_MANDATORY) { \
+				dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
+				return 0; \
+			} \
+		} \
+		tags = (unsigned short *)((char *)tags + dlen); \
+	} \
+	return 1; \
+}
+#define NL_INTEGER(pn, pr, member) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
+		arg->member = get_unaligned((int *)(tags));	\
+		break;
+#define NL_INT64(pn, pr, member) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
+		arg->member = get_unaligned((u64 *)(tags));	\
+		break;
+#define NL_BIT(pn, pr, member) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
+		arg->member = *(char *)(tags) ? 1 : 0; \
+		break;
+#define NL_STRING(pn, pr, member, len) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
+		if (dlen > len) { \
+			dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
+				#member, dlen, (unsigned int)len); \
+			return 0; \
+		} \
+		 arg->member ## _len = dlen; \
+		 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
+		 break;
+#include "linux/drbd_nl.h"
+
+/* Generate the struct to tag_list functions */
+#define NL_PACKET(name, number, fields) \
+static unsigned short* \
+name ## _to_tags(struct drbd_conf *mdev, \
+	struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
+static unsigned short* \
+name ## _to_tags(struct drbd_conf *mdev, \
+	struct name *arg, unsigned short *tags) \
+{ \
+	fields \
+	return tags; \
+}
+
+#define NL_INTEGER(pn, pr, member) \
+	put_unaligned(pn | pr | TT_INTEGER, tags++);	\
+	put_unaligned(sizeof(int), tags++);		\
+	put_unaligned(arg->member, (int *)tags);	\
+	tags = (unsigned short *)((char *)tags+sizeof(int));
+#define NL_INT64(pn, pr, member) \
+	put_unaligned(pn | pr | TT_INT64, tags++);	\
+	put_unaligned(sizeof(u64), tags++);		\
+	put_unaligned(arg->member, (u64 *)tags);	\
+	tags = (unsigned short *)((char *)tags+sizeof(u64));
+#define NL_BIT(pn, pr, member) \
+	put_unaligned(pn | pr | TT_BIT, tags++);	\
+	put_unaligned(sizeof(char), tags++);		\
+	*(char *)tags = arg->member; \
+	tags = (unsigned short *)((char *)tags+sizeof(char));
+#define NL_STRING(pn, pr, member, len) \
+	put_unaligned(pn | pr | TT_STRING, tags++);	\
+	put_unaligned(arg->member ## _len, tags++);	\
+	memcpy(tags, arg->member, arg->member ## _len); \
+	tags = (unsigned short *)((char *)tags + arg->member ## _len);
+#include "linux/drbd_nl.h"
+
+void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
+void drbd_nl_send_reply(struct cn_msg *, int);
+
+int drbd_khelper(struct drbd_conf *mdev, char *cmd)
+{
+	char *envp[] = { "HOME=/",
+			"TERM=linux",
+			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+			NULL, /* Will be set to address family */
+			NULL, /* Will be set to address */
+			NULL };
+
+	char mb[12], af[20], ad[60], *afs;
+	char *argv[] = {usermode_helper, cmd, mb, NULL };
+	int ret;
+
+	snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
+
+	if (get_net_conf(mdev)) {
+		switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
+		case AF_INET6:
+			afs = "ipv6";
+			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
+				 &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
+			break;
+		case AF_INET:
+			afs = "ipv4";
+			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
+				 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
+			break;
+		default:
+			afs = "ssocks";
+			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
+				 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
+		}
+		snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
+		envp[3]=af;
+		envp[4]=ad;
+		put_net_conf(mdev);
+	}
+
+	dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
+
+	drbd_bcast_ev_helper(mdev, cmd);
+	ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+	if (ret)
+		dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+	else
+		dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+
+	if (ret < 0) /* Ignore any ERRNOs we got. */
+		ret = 0;
+
+	return ret;
+}
+
+enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
+{
+	char *ex_to_string;
+	int r;
+	enum drbd_disk_state nps;
+	enum drbd_fencing_p fp;
+
+	D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+
+	if (get_ldev_if_state(mdev, D_CONSISTENT)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	} else {
+		dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
+		return mdev->state.pdsk;
+	}
+
+	if (fp == FP_STONITH)
+		_drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
+
+	r = drbd_khelper(mdev, "fence-peer");
+
+	switch ((r>>8) & 0xff) {
+	case 3: /* peer is inconsistent */
+		ex_to_string = "peer is inconsistent or worse";
+		nps = D_INCONSISTENT;
+		break;
+	case 4: /* peer got outdated, or was already outdated */
+		ex_to_string = "peer was fenced";
+		nps = D_OUTDATED;
+		break;
+	case 5: /* peer was down */
+		if (mdev->state.disk == D_UP_TO_DATE) {
+			/* we will(have) create(d) a new UUID anyways... */
+			ex_to_string = "peer is unreachable, assumed to be dead";
+			nps = D_OUTDATED;
+		} else {
+			ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
+			nps = mdev->state.pdsk;
+		}
+		break;
+	case 6: /* Peer is primary, voluntarily outdate myself.
+		 * This is useful when an unconnected R_SECONDARY is asked to
+		 * become R_PRIMARY, but finds the other peer being active. */
+		ex_to_string = "peer is active";
+		dev_warn(DEV, "Peer is primary, outdating myself.\n");
+		nps = D_UNKNOWN;
+		_drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
+		break;
+	case 7:
+		if (fp != FP_STONITH)
+			dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
+		ex_to_string = "peer was stonithed";
+		nps = D_OUTDATED;
+		break;
+	default:
+		/* The script is broken ... */
+		nps = D_UNKNOWN;
+		dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+		return nps;
+	}
+
+	dev_info(DEV, "fence-peer helper returned %d (%s)\n",
+			(r>>8) & 0xff, ex_to_string);
+	return nps;
+}
+
+
+int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
+{
+	const int max_tries = 4;
+	int r = 0;
+	int try = 0;
+	int forced = 0;
+	union drbd_state mask, val;
+	enum drbd_disk_state nps;
+
+	if (new_role == R_PRIMARY)
+		request_ping(mdev); /* Detect a dead peer ASAP */
+
+	mutex_lock(&mdev->state_mutex);
+
+	mask.i = 0; mask.role = R_MASK;
+	val.i  = 0; val.role  = new_role;
+
+	while (try++ < max_tries) {
+		r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
+
+		/* in case we first succeeded to outdate,
+		 * but now suddenly could establish a connection */
+		if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+			val.pdsk = 0;
+			mask.pdsk = 0;
+			continue;
+		}
+
+		if (r == SS_NO_UP_TO_DATE_DISK && force &&
+		    (mdev->state.disk == D_INCONSISTENT ||
+		     mdev->state.disk == D_OUTDATED)) {
+			mask.disk = D_MASK;
+			val.disk  = D_UP_TO_DATE;
+			forced = 1;
+			continue;
+		}
+
+		if (r == SS_NO_UP_TO_DATE_DISK &&
+		    mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
+			D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+			nps = drbd_try_outdate_peer(mdev);
+
+			if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
+				val.disk = D_UP_TO_DATE;
+				mask.disk = D_MASK;
+			}
+
+			val.pdsk = nps;
+			mask.pdsk = D_MASK;
+
+			continue;
+		}
+
+		if (r == SS_NOTHING_TO_DO)
+			goto fail;
+		if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
+			nps = drbd_try_outdate_peer(mdev);
+
+			if (force && nps > D_OUTDATED) {
+				dev_warn(DEV, "Forced into split brain situation!\n");
+				nps = D_OUTDATED;
+			}
+
+			mask.pdsk = D_MASK;
+			val.pdsk  = nps;
+
+			continue;
+		}
+		if (r == SS_TWO_PRIMARIES) {
+			/* Maybe the peer is detected as dead very soon...
+			   retry at most once more in this case. */
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
+			if (try < max_tries)
+				try = max_tries - 1;
+			continue;
+		}
+		if (r < SS_SUCCESS) {
+			r = _drbd_request_state(mdev, mask, val,
+						CS_VERBOSE + CS_WAIT_COMPLETE);
+			if (r < SS_SUCCESS)
+				goto fail;
+		}
+		break;
+	}
+
+	if (r < SS_SUCCESS)
+		goto fail;
+
+	if (forced)
+		dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
+
+	/* Wait until nothing is on the fly :) */
+	wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
+
+	if (new_role == R_SECONDARY) {
+		set_disk_ro(mdev->vdisk, TRUE);
+		if (get_ldev(mdev)) {
+			mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+			put_ldev(mdev);
+		}
+	} else {
+		if (get_net_conf(mdev)) {
+			mdev->net_conf->want_lose = 0;
+			put_net_conf(mdev);
+		}
+		set_disk_ro(mdev->vdisk, FALSE);
+		if (get_ldev(mdev)) {
+			if (((mdev->state.conn < C_CONNECTED ||
+			       mdev->state.pdsk <= D_FAILED)
+			      && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
+				drbd_uuid_new_current(mdev);
+
+			mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
+			put_ldev(mdev);
+		}
+	}
+
+	if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
+		drbd_al_to_on_disk_bm(mdev);
+		put_ldev(mdev);
+	}
+
+	if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
+		/* if this was forced, we should consider sync */
+		if (forced)
+			drbd_send_uuids(mdev);
+		drbd_send_state(mdev);
+	}
+
+	drbd_md_sync(mdev);
+
+	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+ fail:
+	mutex_unlock(&mdev->state_mutex);
+	return r;
+}
+
+
+static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	struct primary primary_args;
+
+	memset(&primary_args, 0, sizeof(struct primary));
+	if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		return 0;
+	}
+
+	reply->ret_code =
+		drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer);
+
+	return 0;
+}
+
+static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
+
+	return 0;
+}
+
+/* initializes the md.*_offset members, so we are able to find
+ * the on disk meta data */
+static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
+				       struct drbd_backing_dev *bdev)
+{
+	sector_t md_size_sect = 0;
+	switch (bdev->dc.meta_dev_idx) {
+	default:
+		/* v07 style fixed size indexed meta data */
+		bdev->md.md_size_sect = MD_RESERVED_SECT;
+		bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
+		bdev->md.al_offset = MD_AL_OFFSET;
+		bdev->md.bm_offset = MD_BM_OFFSET;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		/* just occupy the full device; unit: sectors */
+		bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
+		bdev->md.md_offset = 0;
+		bdev->md.al_offset = MD_AL_OFFSET;
+		bdev->md.bm_offset = MD_BM_OFFSET;
+		break;
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
+		/* al size is still fixed */
+		bdev->md.al_offset = -MD_AL_MAX_SIZE;
+		/* we need (slightly less than) ~ this much bitmap sectors: */
+		md_size_sect = drbd_get_capacity(bdev->backing_bdev);
+		md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
+		md_size_sect = BM_SECT_TO_EXT(md_size_sect);
+		md_size_sect = ALIGN(md_size_sect, 8);
+
+		/* plus the "drbd meta data super block",
+		 * and the activity log; */
+		md_size_sect += MD_BM_OFFSET;
+
+		bdev->md.md_size_sect = md_size_sect;
+		/* bitmap offset is adjusted by 'super' block size */
+		bdev->md.bm_offset   = -md_size_sect + MD_AL_OFFSET;
+		break;
+	}
+}
+
+char *ppsize(char *buf, unsigned long long size)
+{
+	/* Needs 9 bytes at max. */
+	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
+	int base = 0;
+	while (size >= 10000) {
+		/* shift + round */
+		size = (size >> 10) + !!(size & (1<<9));
+		base++;
+	}
+	sprintf(buf, "%lu %cB", (long)size, units[base]);
+
+	return buf;
+}
+
+/* there is still a theoretical deadlock when called from receiver
+ * on an D_INCONSISTENT R_PRIMARY:
+ *  remote READ does inc_ap_bio, receiver would need to receive answer
+ *  packet from remote to dec_ap_bio again.
+ *  receiver receive_sizes(), comes here,
+ *  waits for ap_bio_cnt == 0. -> deadlock.
+ * but this cannot happen, actually, because:
+ *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
+ *  (not connected, or bad/no disk on peer):
+ *  see drbd_fail_request_early, ap_bio_cnt is zero.
+ *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
+ *  peer may not initiate a resize.
+ */
+void drbd_suspend_io(struct drbd_conf *mdev)
+{
+	set_bit(SUSPEND_IO, &mdev->flags);
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+}
+
+void drbd_resume_io(struct drbd_conf *mdev)
+{
+	clear_bit(SUSPEND_IO, &mdev->flags);
+	wake_up(&mdev->misc_wait);
+}
+
+/**
+ * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
+ * @mdev:	DRBD device.
+ *
+ * Returns 0 on success, negative return values indicate errors.
+ * You should call drbd_md_sync() after calling this function.
+ */
+enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local)
+{
+	sector_t prev_first_sect, prev_size; /* previous meta location */
+	sector_t la_size;
+	sector_t size;
+	char ppb[10];
+
+	int md_moved, la_size_changed;
+	enum determine_dev_size rv = unchanged;
+
+	/* race:
+	 * application request passes inc_ap_bio,
+	 * but then cannot get an AL-reference.
+	 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+	 *
+	 * to avoid that:
+	 * Suspend IO right here.
+	 * still lock the act_log to not trigger ASSERTs there.
+	 */
+	drbd_suspend_io(mdev);
+
+	/* no wait necessary anymore, actually we could assert that */
+	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+	prev_first_sect = drbd_md_first_sector(mdev->ldev);
+	prev_size = mdev->ldev->md.md_size_sect;
+	la_size = mdev->ldev->md.la_size_sect;
+
+	/* TODO: should only be some assert here, not (re)init... */
+	drbd_md_set_sector_offsets(mdev, mdev->ldev);
+
+	size = drbd_new_dev_size(mdev, mdev->ldev);
+
+	if (drbd_get_capacity(mdev->this_bdev) != size ||
+	    drbd_bm_capacity(mdev) != size) {
+		int err;
+		err = drbd_bm_resize(mdev, size);
+		if (unlikely(err)) {
+			/* currently there is only one error: ENOMEM! */
+			size = drbd_bm_capacity(mdev)>>1;
+			if (size == 0) {
+				dev_err(DEV, "OUT OF MEMORY! "
+				    "Could not allocate bitmap!\n");
+			} else {
+				dev_err(DEV, "BM resizing failed. "
+				    "Leaving size unchanged at size = %lu KB\n",
+				    (unsigned long)size);
+			}
+			rv = dev_size_error;
+		}
+		/* racy, see comments above. */
+		drbd_set_my_capacity(mdev, size);
+		mdev->ldev->md.la_size_sect = size;
+		dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
+		     (unsigned long long)size>>1);
+	}
+	if (rv == dev_size_error)
+		goto out;
+
+	la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
+
+	md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
+		|| prev_size	   != mdev->ldev->md.md_size_sect;
+
+	if (la_size_changed || md_moved) {
+		drbd_al_shrink(mdev); /* All extents inactive. */
+		dev_info(DEV, "Writing the whole bitmap, %s\n",
+			 la_size_changed && md_moved ? "size changed and md moved" :
+			 la_size_changed ? "size changed" : "md moved");
+		rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
+		drbd_md_mark_dirty(mdev);
+	}
+
+	if (size > la_size)
+		rv = grew;
+	if (size < la_size)
+		rv = shrunk;
+out:
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+	drbd_resume_io(mdev);
+
+	return rv;
+}
+
+sector_t
+drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+	sector_t p_size = mdev->p_size;   /* partner's disk size. */
+	sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
+	sector_t m_size; /* my size */
+	sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
+	sector_t size = 0;
+
+	m_size = drbd_get_max_capacity(bdev);
+
+	if (p_size && m_size) {
+		size = min_t(sector_t, p_size, m_size);
+	} else {
+		if (la_size) {
+			size = la_size;
+			if (m_size && m_size < size)
+				size = m_size;
+			if (p_size && p_size < size)
+				size = p_size;
+		} else {
+			if (m_size)
+				size = m_size;
+			if (p_size)
+				size = p_size;
+		}
+	}
+
+	if (size == 0)
+		dev_err(DEV, "Both nodes diskless!\n");
+
+	if (u_size) {
+		if (u_size > size)
+			dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
+			    (unsigned long)u_size>>1, (unsigned long)size>>1);
+		else
+			size = u_size;
+	}
+
+	return size;
+}
+
+/**
+ * drbd_check_al_size() - Ensures that the AL is of the right size
+ * @mdev:	DRBD device.
+ *
+ * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
+ * failed, and 0 on success. You should call drbd_md_sync() after you called
+ * this function.
+ */
+static int drbd_check_al_size(struct drbd_conf *mdev)
+{
+	struct lru_cache *n, *t;
+	struct lc_element *e;
+	unsigned int in_use;
+	int i;
+
+	ERR_IF(mdev->sync_conf.al_extents < 7)
+		mdev->sync_conf.al_extents = 127;
+
+	if (mdev->act_log &&
+	    mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
+		return 0;
+
+	in_use = 0;
+	t = mdev->act_log;
+	n = lc_create("act_log", drbd_al_ext_cache,
+		mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
+
+	if (n == NULL) {
+		dev_err(DEV, "Cannot allocate act_log lru!\n");
+		return -ENOMEM;
+	}
+	spin_lock_irq(&mdev->al_lock);
+	if (t) {
+		for (i = 0; i < t->nr_elements; i++) {
+			e = lc_element_by_index(t, i);
+			if (e->refcnt)
+				dev_err(DEV, "refcnt(%d)==%d\n",
+				    e->lc_number, e->refcnt);
+			in_use += e->refcnt;
+		}
+	}
+	if (!in_use)
+		mdev->act_log = n;
+	spin_unlock_irq(&mdev->al_lock);
+	if (in_use) {
+		dev_err(DEV, "Activity log still in use!\n");
+		lc_destroy(n);
+		return -EBUSY;
+	} else {
+		if (t)
+			lc_destroy(t);
+	}
+	drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */
+	return 0;
+}
+
+void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
+{
+	struct request_queue * const q = mdev->rq_queue;
+	struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
+	int max_segments = mdev->ldev->dc.max_bio_bvecs;
+
+	if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
+		max_seg_s = PAGE_SIZE;
+
+	max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
+
+	blk_queue_max_sectors(q, max_seg_s >> 9);
+	blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS);
+	blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS);
+	blk_queue_max_segment_size(q, max_seg_s);
+	blk_queue_logical_block_size(q, 512);
+	blk_queue_segment_boundary(q, PAGE_SIZE-1);
+	blk_stack_limits(&q->limits, &b->limits, 0);
+
+	if (b->merge_bvec_fn)
+		dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
+		     b->merge_bvec_fn);
+	dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
+
+	if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
+		dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
+		     q->backing_dev_info.ra_pages,
+		     b->backing_dev_info.ra_pages);
+		q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+	}
+}
+
+/* serialize deconfig (worker exiting, doing cleanup)
+ * and reconfig (drbdsetup disk, drbdsetup net)
+ *
+ * wait for a potentially exiting worker, then restart it,
+ * or start a new one.
+ */
+static void drbd_reconfig_start(struct drbd_conf *mdev)
+{
+	wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
+	wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
+	drbd_thread_start(&mdev->worker);
+}
+
+/* if still unconfigured, stops worker again.
+ * if configured now, clears CONFIG_PENDING.
+ * wakes potential waiters */
+static void drbd_reconfig_done(struct drbd_conf *mdev)
+{
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.disk == D_DISKLESS &&
+	    mdev->state.conn == C_STANDALONE &&
+	    mdev->state.role == R_SECONDARY) {
+		set_bit(DEVICE_DYING, &mdev->flags);
+		drbd_thread_stop_nowait(&mdev->worker);
+	} else
+		clear_bit(CONFIG_PENDING, &mdev->flags);
+	spin_unlock_irq(&mdev->req_lock);
+	wake_up(&mdev->state_wait);
+}
+
+/* does always return 0;
+ * interesting return code is in reply->ret_code */
+static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	enum drbd_ret_codes retcode;
+	enum determine_dev_size dd;
+	sector_t max_possible_sectors;
+	sector_t min_md_device_sectors;
+	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+	struct inode *inode, *inode2;
+	struct lru_cache *resync_lru = NULL;
+	union drbd_state ns, os;
+	int rv;
+	int cp_discovered = 0;
+	int logical_block_size;
+
+	drbd_reconfig_start(mdev);
+
+	/* if you want to reconfigure, please tear down first */
+	if (mdev->state.disk > D_DISKLESS) {
+		retcode = ERR_DISK_CONFIGURED;
+		goto fail;
+	}
+
+	/* allocation not in the IO path, cqueue thread context */
+	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
+	if (!nbc) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	nbc->dc.disk_size     = DRBD_DISK_SIZE_SECT_DEF;
+	nbc->dc.on_io_error   = DRBD_ON_IO_ERROR_DEF;
+	nbc->dc.fencing       = DRBD_FENCING_DEF;
+	nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
+
+	if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto fail;
+	}
+
+	nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
+	if (IS_ERR(nbc->lo_file)) {
+		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
+		    PTR_ERR(nbc->lo_file));
+		nbc->lo_file = NULL;
+		retcode = ERR_OPEN_DISK;
+		goto fail;
+	}
+
+	inode = nbc->lo_file->f_dentry->d_inode;
+
+	if (!S_ISBLK(inode->i_mode)) {
+		retcode = ERR_DISK_NOT_BDEV;
+		goto fail;
+	}
+
+	nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
+	if (IS_ERR(nbc->md_file)) {
+		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
+		    PTR_ERR(nbc->md_file));
+		nbc->md_file = NULL;
+		retcode = ERR_OPEN_MD_DISK;
+		goto fail;
+	}
+
+	inode2 = nbc->md_file->f_dentry->d_inode;
+
+	if (!S_ISBLK(inode2->i_mode)) {
+		retcode = ERR_MD_NOT_BDEV;
+		goto fail;
+	}
+
+	nbc->backing_bdev = inode->i_bdev;
+	if (bd_claim(nbc->backing_bdev, mdev)) {
+		printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
+		       nbc->backing_bdev, mdev,
+		       nbc->backing_bdev->bd_holder,
+		       nbc->backing_bdev->bd_contains->bd_holder,
+		       nbc->backing_bdev->bd_holders);
+		retcode = ERR_BDCLAIM_DISK;
+		goto fail;
+	}
+
+	resync_lru = lc_create("resync", drbd_bm_ext_cache,
+			61, sizeof(struct bm_extent),
+			offsetof(struct bm_extent, lce));
+	if (!resync_lru) {
+		retcode = ERR_NOMEM;
+		goto release_bdev_fail;
+	}
+
+	/* meta_dev_idx >= 0: external fixed size,
+	 * possibly multiple drbd sharing one meta device.
+	 * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
+	 * not yet used by some other drbd minor!
+	 * (if you use drbd.conf + drbdadm,
+	 * that should check it for you already; but if you don't, or someone
+	 * fooled it, we need to double check here) */
+	nbc->md_bdev = inode2->i_bdev;
+	if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
+				: (void *) drbd_m_holder)) {
+		retcode = ERR_BDCLAIM_MD_DISK;
+		goto release_bdev_fail;
+	}
+
+	if ((nbc->backing_bdev == nbc->md_bdev) !=
+	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto release_bdev2_fail;
+	}
+
+	/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
+	drbd_md_set_sector_offsets(mdev, nbc);
+
+	if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
+		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
+			(unsigned long long) drbd_get_max_capacity(nbc),
+			(unsigned long long) nbc->dc.disk_size);
+		retcode = ERR_DISK_TO_SMALL;
+		goto release_bdev2_fail;
+	}
+
+	if (nbc->dc.meta_dev_idx < 0) {
+		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
+		/* at least one MB, otherwise it does not make sense */
+		min_md_device_sectors = (2<<10);
+	} else {
+		max_possible_sectors = DRBD_MAX_SECTORS;
+		min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+	}
+
+	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
+		retcode = ERR_MD_DISK_TO_SMALL;
+		dev_warn(DEV, "refusing attach: md-device too small, "
+		     "at least %llu sectors needed for this meta-disk type\n",
+		     (unsigned long long) min_md_device_sectors);
+		goto release_bdev2_fail;
+	}
+
+	/* Make sure the new disk is big enough
+	 * (we may currently be R_PRIMARY with no local disk...) */
+	if (drbd_get_max_capacity(nbc) <
+	    drbd_get_capacity(mdev->this_bdev)) {
+		retcode = ERR_DISK_TO_SMALL;
+		goto release_bdev2_fail;
+	}
+
+	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
+
+	if (nbc->known_size > max_possible_sectors) {
+		dev_warn(DEV, "==> truncating very big lower level device "
+			"to currently maximum possible %llu sectors <==\n",
+			(unsigned long long) max_possible_sectors);
+		if (nbc->dc.meta_dev_idx >= 0)
+			dev_warn(DEV, "==>> using internal or flexible "
+				      "meta data may help <<==\n");
+	}
+
+	drbd_suspend_io(mdev);
+	/* also wait for the last barrier ack. */
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
+	/* and for any other previously queued work */
+	drbd_flush_workqueue(mdev);
+
+	retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
+	drbd_resume_io(mdev);
+	if (retcode < SS_SUCCESS)
+		goto release_bdev2_fail;
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING))
+		goto force_diskless;
+
+	drbd_md_set_sector_offsets(mdev, nbc);
+
+	if (!mdev->bitmap) {
+		if (drbd_bm_init(mdev)) {
+			retcode = ERR_NOMEM;
+			goto force_diskless_dec;
+		}
+	}
+
+	retcode = drbd_md_read(mdev, nbc);
+	if (retcode != NO_ERROR)
+		goto force_diskless_dec;
+
+	if (mdev->state.conn < C_CONNECTED &&
+	    mdev->state.role == R_PRIMARY &&
+	    (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
+		dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
+		    (unsigned long long)mdev->ed_uuid);
+		retcode = ERR_DATA_NOT_CURRENT;
+		goto force_diskless_dec;
+	}
+
+	/* Since we are diskless, fix the activity log first... */
+	if (drbd_check_al_size(mdev)) {
+		retcode = ERR_NOMEM;
+		goto force_diskless_dec;
+	}
+
+	/* Prevent shrinking of consistent devices ! */
+	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
+	   drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) {
+		dev_warn(DEV, "refusing to truncate a consistent device\n");
+		retcode = ERR_DISK_TO_SMALL;
+		goto force_diskless_dec;
+	}
+
+	if (!drbd_al_read_log(mdev, nbc)) {
+		retcode = ERR_IO_MD_DISK;
+		goto force_diskless_dec;
+	}
+
+	/* allocate a second IO page if logical_block_size != 512 */
+	logical_block_size = bdev_logical_block_size(nbc->md_bdev);
+	if (logical_block_size == 0)
+		logical_block_size = MD_SECTOR_SIZE;
+
+	if (logical_block_size != MD_SECTOR_SIZE) {
+		if (!mdev->md_io_tmpp) {
+			struct page *page = alloc_page(GFP_NOIO);
+			if (!page)
+				goto force_diskless_dec;
+
+			dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
+			     logical_block_size, MD_SECTOR_SIZE);
+			dev_warn(DEV, "Workaround engaged (has performance impact).\n");
+
+			mdev->md_io_tmpp = page;
+		}
+	}
+
+	/* Reset the "barriers don't work" bits here, then force meta data to
+	 * be written, to ensure we determine if barriers are supported. */
+	if (nbc->dc.no_md_flush)
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+	else
+		clear_bit(MD_NO_BARRIER, &mdev->flags);
+
+	/* Point of no return reached.
+	 * Devices and memory are no longer released by error cleanup below.
+	 * now mdev takes over responsibility, and the state engine should
+	 * clean it up somewhere.  */
+	D_ASSERT(mdev->ldev == NULL);
+	mdev->ldev = nbc;
+	mdev->resync = resync_lru;
+	nbc = NULL;
+	resync_lru = NULL;
+
+	mdev->write_ordering = WO_bio_barrier;
+	drbd_bump_write_ordering(mdev, WO_bio_barrier);
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
+		set_bit(CRASHED_PRIMARY, &mdev->flags);
+	else
+		clear_bit(CRASHED_PRIMARY, &mdev->flags);
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
+		set_bit(CRASHED_PRIMARY, &mdev->flags);
+		cp_discovered = 1;
+	}
+
+	mdev->send_cnt = 0;
+	mdev->recv_cnt = 0;
+	mdev->read_cnt = 0;
+	mdev->writ_cnt = 0;
+
+	drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
+
+	/* If I am currently not R_PRIMARY,
+	 * but meta data primary indicator is set,
+	 * I just now recover from a hard crash,
+	 * and have been R_PRIMARY before that crash.
+	 *
+	 * Now, if I had no connection before that crash
+	 * (have been degraded R_PRIMARY), chances are that
+	 * I won't find my peer now either.
+	 *
+	 * In that case, and _only_ in that case,
+	 * we use the degr-wfc-timeout instead of the default,
+	 * so we can automatically recover from a crash of a
+	 * degraded but active "cluster" after a certain timeout.
+	 */
+	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+	if (mdev->state.role != R_PRIMARY &&
+	     drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
+	    !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
+		set_bit(USE_DEGR_WFC_T, &mdev->flags);
+
+	dd = drbd_determin_dev_size(mdev);
+	if (dd == dev_size_error) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto force_diskless_dec;
+	} else if (dd == grew)
+		set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+		dev_info(DEV, "Assuming that all blocks are out of sync "
+		     "(aka FullSync)\n");
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	} else {
+		if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	}
+
+	if (cp_discovered) {
+		drbd_al_apply_to_bm(mdev);
+		drbd_al_to_on_disk_bm(mdev);
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+	ns.i = os.i;
+	/* If MDF_CONSISTENT is not set go into inconsistent state,
+	   otherwise investigate MDF_WasUpToDate...
+	   If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
+	   otherwise into D_CONSISTENT state.
+	*/
+	if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
+		if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
+			ns.disk = D_CONSISTENT;
+		else
+			ns.disk = D_OUTDATED;
+	} else {
+		ns.disk = D_INCONSISTENT;
+	}
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
+		ns.pdsk = D_OUTDATED;
+
+	if ( ns.disk == D_CONSISTENT &&
+	    (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
+		ns.disk = D_UP_TO_DATE;
+
+	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
+	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
+	   this point, because drbd_request_state() modifies these
+	   flags. */
+
+	/* In case we are C_CONNECTED postpone any decision on the new disk
+	   state after the negotiation phase. */
+	if (mdev->state.conn == C_CONNECTED) {
+		mdev->new_state_tmp.i = ns.i;
+		ns.i = os.i;
+		ns.disk = D_NEGOTIATING;
+	}
+
+	rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	ns = mdev->state;
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS)
+		goto force_diskless_dec;
+
+	if (mdev->state.role == R_PRIMARY)
+		mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
+	else
+		mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+
+	drbd_md_mark_dirty(mdev);
+	drbd_md_sync(mdev);
+
+	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+	put_ldev(mdev);
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+
+ force_diskless_dec:
+	put_ldev(mdev);
+ force_diskless:
+	drbd_force_state(mdev, NS(disk, D_DISKLESS));
+	drbd_md_sync(mdev);
+ release_bdev2_fail:
+	if (nbc)
+		bd_release(nbc->md_bdev);
+ release_bdev_fail:
+	if (nbc)
+		bd_release(nbc->backing_bdev);
+ fail:
+	if (nbc) {
+		if (nbc->lo_file)
+			fput(nbc->lo_file);
+		if (nbc->md_file)
+			fput(nbc->md_file);
+		kfree(nbc);
+	}
+	lc_destroy(resync_lru);
+
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+}
+
+static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			  struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
+	return 0;
+}
+
+static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			    struct drbd_nl_cfg_reply *reply)
+{
+	int i, ns;
+	enum drbd_ret_codes retcode;
+	struct net_conf *new_conf = NULL;
+	struct crypto_hash *tfm = NULL;
+	struct crypto_hash *integrity_w_tfm = NULL;
+	struct crypto_hash *integrity_r_tfm = NULL;
+	struct hlist_head *new_tl_hash = NULL;
+	struct hlist_head *new_ee_hash = NULL;
+	struct drbd_conf *odev;
+	char hmac_name[CRYPTO_MAX_ALG_NAME];
+	void *int_dig_out = NULL;
+	void *int_dig_in = NULL;
+	void *int_dig_vv = NULL;
+	struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+
+	drbd_reconfig_start(mdev);
+
+	if (mdev->state.conn > C_STANDALONE) {
+		retcode = ERR_NET_CONFIGURED;
+		goto fail;
+	}
+
+	/* allocation not in the IO path, cqueue thread context */
+	new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+	if (!new_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	memset(new_conf, 0, sizeof(struct net_conf));
+	new_conf->timeout	   = DRBD_TIMEOUT_DEF;
+	new_conf->try_connect_int  = DRBD_CONNECT_INT_DEF;
+	new_conf->ping_int	   = DRBD_PING_INT_DEF;
+	new_conf->max_epoch_size   = DRBD_MAX_EPOCH_SIZE_DEF;
+	new_conf->max_buffers	   = DRBD_MAX_BUFFERS_DEF;
+	new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
+	new_conf->sndbuf_size	   = DRBD_SNDBUF_SIZE_DEF;
+	new_conf->rcvbuf_size	   = DRBD_RCVBUF_SIZE_DEF;
+	new_conf->ko_count	   = DRBD_KO_COUNT_DEF;
+	new_conf->after_sb_0p	   = DRBD_AFTER_SB_0P_DEF;
+	new_conf->after_sb_1p	   = DRBD_AFTER_SB_1P_DEF;
+	new_conf->after_sb_2p	   = DRBD_AFTER_SB_2P_DEF;
+	new_conf->want_lose	   = 0;
+	new_conf->two_primaries    = 0;
+	new_conf->wire_protocol    = DRBD_PROT_C;
+	new_conf->ping_timeo	   = DRBD_PING_TIMEO_DEF;
+	new_conf->rr_conflict	   = DRBD_RR_CONFLICT_DEF;
+
+	if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (new_conf->two_primaries
+	    && (new_conf->wire_protocol != DRBD_PROT_C)) {
+		retcode = ERR_NOT_PROTO_C;
+		goto fail;
+	};
+
+	if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
+		retcode = ERR_DISCARD;
+		goto fail;
+	}
+
+	retcode = NO_ERROR;
+
+	new_my_addr = (struct sockaddr *)&new_conf->my_addr;
+	new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
+	for (i = 0; i < minor_count; i++) {
+		odev = minor_to_mdev(i);
+		if (!odev || odev == mdev)
+			continue;
+		if (get_net_conf(odev)) {
+			taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
+			if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
+			    !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
+				retcode = ERR_LOCAL_ADDR;
+
+			taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
+			if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
+			    !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
+				retcode = ERR_PEER_ADDR;
+
+			put_net_conf(odev);
+			if (retcode != NO_ERROR)
+				goto fail;
+		}
+	}
+
+	if (new_conf->cram_hmac_alg[0] != 0) {
+		snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+			new_conf->cram_hmac_alg);
+		tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(tfm)) {
+			tfm = NULL;
+			retcode = ERR_AUTH_ALG;
+			goto fail;
+		}
+
+		if (crypto_tfm_alg_type(crypto_hash_tfm(tfm))
+						!= CRYPTO_ALG_TYPE_HASH) {
+			retcode = ERR_AUTH_ALG_ND;
+			goto fail;
+		}
+	}
+
+	if (new_conf->integrity_alg[0]) {
+		integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(integrity_w_tfm)) {
+			integrity_w_tfm = NULL;
+			retcode=ERR_INTEGRITY_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
+			retcode=ERR_INTEGRITY_ALG_ND;
+			goto fail;
+		}
+
+		integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(integrity_r_tfm)) {
+			integrity_r_tfm = NULL;
+			retcode=ERR_INTEGRITY_ALG;
+			goto fail;
+		}
+	}
+
+	ns = new_conf->max_epoch_size/8;
+	if (mdev->tl_hash_s != ns) {
+		new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
+		if (!new_tl_hash) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	ns = new_conf->max_buffers/8;
+	if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
+		new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
+		if (!new_ee_hash) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+
+	if (integrity_w_tfm) {
+		i = crypto_hash_digestsize(integrity_w_tfm);
+		int_dig_out = kmalloc(i, GFP_KERNEL);
+		if (!int_dig_out) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+		int_dig_in = kmalloc(i, GFP_KERNEL);
+		if (!int_dig_in) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+		int_dig_vv = kmalloc(i, GFP_KERNEL);
+		if (!int_dig_vv) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	if (!mdev->bitmap) {
+		if(drbd_bm_init(mdev)) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->net_conf != NULL) {
+		retcode = ERR_NET_CONFIGURED;
+		spin_unlock_irq(&mdev->req_lock);
+		goto fail;
+	}
+	mdev->net_conf = new_conf;
+
+	mdev->send_cnt = 0;
+	mdev->recv_cnt = 0;
+
+	if (new_tl_hash) {
+		kfree(mdev->tl_hash);
+		mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
+		mdev->tl_hash = new_tl_hash;
+	}
+
+	if (new_ee_hash) {
+		kfree(mdev->ee_hash);
+		mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
+		mdev->ee_hash = new_ee_hash;
+	}
+
+	crypto_free_hash(mdev->cram_hmac_tfm);
+	mdev->cram_hmac_tfm = tfm;
+
+	crypto_free_hash(mdev->integrity_w_tfm);
+	mdev->integrity_w_tfm = integrity_w_tfm;
+
+	crypto_free_hash(mdev->integrity_r_tfm);
+	mdev->integrity_r_tfm = integrity_r_tfm;
+
+	kfree(mdev->int_dig_out);
+	kfree(mdev->int_dig_in);
+	kfree(mdev->int_dig_vv);
+	mdev->int_dig_out=int_dig_out;
+	mdev->int_dig_in=int_dig_in;
+	mdev->int_dig_vv=int_dig_vv;
+	spin_unlock_irq(&mdev->req_lock);
+
+	retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+
+fail:
+	kfree(int_dig_out);
+	kfree(int_dig_in);
+	kfree(int_dig_vv);
+	crypto_free_hash(tfm);
+	crypto_free_hash(integrity_w_tfm);
+	crypto_free_hash(integrity_r_tfm);
+	kfree(new_tl_hash);
+	kfree(new_ee_hash);
+	kfree(new_conf);
+
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+}
+
+static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode;
+
+	retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
+
+	if (retcode == SS_NOTHING_TO_DO)
+		goto done;
+	else if (retcode == SS_ALREADY_STANDALONE)
+		goto done;
+	else if (retcode == SS_PRIMARY_NOP) {
+		/* Our statche checking code wants to see the peer outdated. */
+		retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
+						      pdsk, D_OUTDATED));
+	} else if (retcode == SS_CW_FAILED_BY_PEER) {
+		/* The peer probably wants to see us outdated. */
+		retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
+							disk, D_OUTDATED),
+					      CS_ORDERED);
+		if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			retcode = SS_SUCCESS;
+		}
+	}
+
+	if (retcode < SS_SUCCESS)
+		goto fail;
+
+	if (wait_event_interruptible(mdev->state_wait,
+				     mdev->state.conn != C_DISCONNECTING)) {
+		/* Do not test for mdev->state.conn == C_STANDALONE, since
+		   someone else might connect us in the mean time! */
+		retcode = ERR_INTR;
+		goto fail;
+	}
+
+ done:
+	retcode = NO_ERROR;
+ fail:
+	drbd_md_sync(mdev);
+	reply->ret_code = retcode;
+	return 0;
+}
+
+void resync_after_online_grow(struct drbd_conf *mdev)
+{
+	int iass; /* I am sync source */
+
+	dev_info(DEV, "Resync of new storage after online grow\n");
+	if (mdev->state.role != mdev->state.peer)
+		iass = (mdev->state.role == R_PRIMARY);
+	else
+		iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+
+	if (iass)
+		drbd_start_resync(mdev, C_SYNC_SOURCE);
+	else
+		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+}
+
+static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			  struct drbd_nl_cfg_reply *reply)
+{
+	struct resize rs;
+	int retcode = NO_ERROR;
+	int ldsc = 0; /* local disk size changed */
+	enum determine_dev_size dd;
+
+	memset(&rs, 0, sizeof(struct resize));
+	if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (mdev->state.conn > C_CONNECTED) {
+		retcode = ERR_RESIZE_RESYNC;
+		goto fail;
+	}
+
+	if (mdev->state.role == R_SECONDARY &&
+	    mdev->state.peer == R_SECONDARY) {
+		retcode = ERR_NO_PRIMARY;
+		goto fail;
+	}
+
+	if (!get_ldev(mdev)) {
+		retcode = ERR_NO_DISK;
+		goto fail;
+	}
+
+	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
+		mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+		ldsc = 1;
+	}
+
+	mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
+	dd = drbd_determin_dev_size(mdev);
+	drbd_md_sync(mdev);
+	put_ldev(mdev);
+	if (dd == dev_size_error) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto fail;
+	}
+
+	if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
+		if (dd == grew)
+			set_bit(RESIZE_PENDING, &mdev->flags);
+
+		drbd_send_uuids(mdev);
+		drbd_send_sizes(mdev, 1);
+	}
+
+ fail:
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			       struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+	int err;
+	int ovr; /* online verify running */
+	int rsr; /* re-sync running */
+	struct crypto_hash *verify_tfm = NULL;
+	struct crypto_hash *csums_tfm = NULL;
+	struct syncer_conf sc;
+	cpumask_var_t new_cpu_mask;
+
+	if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
+		memset(&sc, 0, sizeof(struct syncer_conf));
+		sc.rate       = DRBD_RATE_DEF;
+		sc.after      = DRBD_AFTER_DEF;
+		sc.al_extents = DRBD_AL_EXTENTS_DEF;
+	} else
+		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
+
+	if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	/* re-sync running */
+	rsr = (	mdev->state.conn == C_SYNC_SOURCE ||
+		mdev->state.conn == C_SYNC_TARGET ||
+		mdev->state.conn == C_PAUSED_SYNC_S ||
+		mdev->state.conn == C_PAUSED_SYNC_T );
+
+	if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
+		retcode = ERR_CSUMS_RESYNC_RUNNING;
+		goto fail;
+	}
+
+	if (!rsr && sc.csums_alg[0]) {
+		csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(csums_tfm)) {
+			csums_tfm = NULL;
+			retcode = ERR_CSUMS_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
+			retcode = ERR_CSUMS_ALG_ND;
+			goto fail;
+		}
+	}
+
+	/* online verify running */
+	ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
+
+	if (ovr) {
+		if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
+			retcode = ERR_VERIFY_RUNNING;
+			goto fail;
+		}
+	}
+
+	if (!ovr && sc.verify_alg[0]) {
+		verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(verify_tfm)) {
+			verify_tfm = NULL;
+			retcode = ERR_VERIFY_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
+			retcode = ERR_VERIFY_ALG_ND;
+			goto fail;
+		}
+	}
+
+	/* silently ignore cpu mask on UP kernel */
+	if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
+		err = __bitmap_parse(sc.cpu_mask, 32, 0,
+				cpumask_bits(new_cpu_mask), nr_cpu_ids);
+		if (err) {
+			dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
+			retcode = ERR_CPU_MASK_PARSE;
+			goto fail;
+		}
+	}
+
+	ERR_IF (sc.rate < 1) sc.rate = 1;
+	ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
+#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
+	if (sc.al_extents > AL_MAX) {
+		dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
+		sc.al_extents = AL_MAX;
+	}
+#undef AL_MAX
+
+	/* most sanity checks done, try to assign the new sync-after
+	 * dependency.  need to hold the global lock in there,
+	 * to avoid a race in the dependency loop check. */
+	retcode = drbd_alter_sa(mdev, sc.after);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	/* ok, assign the rest of it as well.
+	 * lock against receive_SyncParam() */
+	spin_lock(&mdev->peer_seq_lock);
+	mdev->sync_conf = sc;
+
+	if (!rsr) {
+		crypto_free_hash(mdev->csums_tfm);
+		mdev->csums_tfm = csums_tfm;
+		csums_tfm = NULL;
+	}
+
+	if (!ovr) {
+		crypto_free_hash(mdev->verify_tfm);
+		mdev->verify_tfm = verify_tfm;
+		verify_tfm = NULL;
+	}
+	spin_unlock(&mdev->peer_seq_lock);
+
+	if (get_ldev(mdev)) {
+		wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+		drbd_al_shrink(mdev);
+		err = drbd_check_al_size(mdev);
+		lc_unlock(mdev->act_log);
+		wake_up(&mdev->al_wait);
+
+		put_ldev(mdev);
+		drbd_md_sync(mdev);
+
+		if (err) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	if (mdev->state.conn >= C_CONNECTED)
+		drbd_send_sync_param(mdev, &sc);
+
+	if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
+		cpumask_copy(mdev->cpu_mask, new_cpu_mask);
+		drbd_calc_cpu_mask(mdev);
+		mdev->receiver.reset_cpu_mask = 1;
+		mdev->asender.reset_cpu_mask = 1;
+		mdev->worker.reset_cpu_mask = 1;
+	}
+
+	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+fail:
+	free_cpumask_var(new_cpu_mask);
+	crypto_free_hash(csums_tfm);
+	crypto_free_hash(verify_tfm);
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode;
+
+	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
+
+	if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
+		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
+
+	while (retcode == SS_NEED_CONNECTION) {
+		spin_lock_irq(&mdev->req_lock);
+		if (mdev->state.conn < C_CONNECTED)
+			retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
+		spin_unlock_irq(&mdev->req_lock);
+
+		if (retcode != SS_NEED_CONNECTION)
+			break;
+
+		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
+	}
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+				   struct drbd_nl_cfg_reply *reply)
+{
+
+	reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+
+	return 0;
+}
+
+static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+
+	if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+		retcode = ERR_PAUSE_IS_SET;
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			       struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+
+	if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
+		retcode = ERR_PAUSE_IS_CLEAR;
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
+
+	return 0;
+}
+
+static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
+	return 0;
+}
+
+static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
+	return 0;
+}
+
+static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl;
+
+	tl = reply->tag_list;
+
+	if (get_ldev(mdev)) {
+		tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
+		put_ldev(mdev);
+	}
+
+	if (get_net_conf(mdev)) {
+		tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
+		put_net_conf(mdev);
+	}
+	tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
+
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl = reply->tag_list;
+	union drbd_state s = mdev->state;
+	unsigned long rs_left;
+	unsigned int res;
+
+	tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+
+	/* no local ref, no bitmap, no syncer progress. */
+	if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
+		if (get_ldev(mdev)) {
+			drbd_get_syncer_progress(mdev, &rs_left, &res);
+			tl = tl_add_int(tl, T_sync_progress, &res);
+			put_ldev(mdev);
+		}
+	}
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl;
+
+	tl = reply->tag_list;
+
+	if (get_ldev(mdev)) {
+		tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
+		tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
+		put_ldev(mdev);
+	}
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+/**
+ * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
+ * @mdev:	DRBD device.
+ * @nlp:	Netlink/connector packet from drbdsetup
+ * @reply:	Reply packet for drbdsetup
+ */
+static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+				    struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl;
+	char rv;
+
+	tl = reply->tag_list;
+
+	rv = mdev->state.pdsk == D_OUTDATED        ? UT_PEER_OUTDATED :
+	  test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
+
+	tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+				    struct drbd_nl_cfg_reply *reply)
+{
+	/* default to resume from last known position, if possible */
+	struct start_ov args =
+		{ .start_sector = mdev->ov_start_sector };
+
+	if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		return 0;
+	}
+	/* w_make_ov_request expects position to be aligned */
+	mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
+	reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+	return 0;
+}
+
+
+static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+	int skip_initial_sync = 0;
+	int err;
+
+	struct new_c_uuid args;
+
+	memset(&args, 0, sizeof(struct new_c_uuid));
+	if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		return 0;
+	}
+
+	mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
+
+	if (!get_ldev(mdev)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	/* this is "skip initial sync", assume to be clean */
+	if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
+	    mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
+		dev_info(DEV, "Preparing to skip initial sync\n");
+		skip_initial_sync = 1;
+	} else if (mdev->state.conn != C_STANDALONE) {
+		retcode = ERR_CONNECTED;
+		goto out_dec;
+	}
+
+	drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
+	drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
+
+	if (args.clear_bm) {
+		err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
+		if (err) {
+			dev_err(DEV, "Writing bitmap failed with %d\n",err);
+			retcode = ERR_IO_MD_DISK;
+		}
+		if (skip_initial_sync) {
+			drbd_send_uuids_skip_initial_sync(mdev);
+			_drbd_uuid_set(mdev, UI_BITMAP, 0);
+			spin_lock_irq(&mdev->req_lock);
+			_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			spin_unlock_irq(&mdev->req_lock);
+		}
+	}
+
+	drbd_md_sync(mdev);
+out_dec:
+	put_ldev(mdev);
+out:
+	mutex_unlock(&mdev->state_mutex);
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
+{
+	struct drbd_conf *mdev;
+
+	if (nlp->drbd_minor >= minor_count)
+		return NULL;
+
+	mdev = minor_to_mdev(nlp->drbd_minor);
+
+	if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
+		struct gendisk *disk = NULL;
+		mdev = drbd_new_device(nlp->drbd_minor);
+
+		spin_lock_irq(&drbd_pp_lock);
+		if (minor_table[nlp->drbd_minor] == NULL) {
+			minor_table[nlp->drbd_minor] = mdev;
+			disk = mdev->vdisk;
+			mdev = NULL;
+		} /* else: we lost the race */
+		spin_unlock_irq(&drbd_pp_lock);
+
+		if (disk) /* we won the race above */
+			/* in case we ever add a drbd_delete_device(),
+			 * don't forget the del_gendisk! */
+			add_disk(disk);
+		else /* we lost the race above */
+			drbd_free_mdev(mdev);
+
+		mdev = minor_to_mdev(nlp->drbd_minor);
+	}
+
+	return mdev;
+}
+
+struct cn_handler_struct {
+	int (*function)(struct drbd_conf *,
+			 struct drbd_nl_cfg_req *,
+			 struct drbd_nl_cfg_reply *);
+	int reply_body_size;
+};
+
+static struct cn_handler_struct cnd_table[] = {
+	[ P_primary ]		= { &drbd_nl_primary,		0 },
+	[ P_secondary ]		= { &drbd_nl_secondary,		0 },
+	[ P_disk_conf ]		= { &drbd_nl_disk_conf,		0 },
+	[ P_detach ]		= { &drbd_nl_detach,		0 },
+	[ P_net_conf ]		= { &drbd_nl_net_conf,		0 },
+	[ P_disconnect ]	= { &drbd_nl_disconnect,	0 },
+	[ P_resize ]		= { &drbd_nl_resize,		0 },
+	[ P_syncer_conf ]	= { &drbd_nl_syncer_conf,	0 },
+	[ P_invalidate ]	= { &drbd_nl_invalidate,	0 },
+	[ P_invalidate_peer ]	= { &drbd_nl_invalidate_peer,	0 },
+	[ P_pause_sync ]	= { &drbd_nl_pause_sync,	0 },
+	[ P_resume_sync ]	= { &drbd_nl_resume_sync,	0 },
+	[ P_suspend_io ]	= { &drbd_nl_suspend_io,	0 },
+	[ P_resume_io ]		= { &drbd_nl_resume_io,		0 },
+	[ P_outdate ]		= { &drbd_nl_outdate,		0 },
+	[ P_get_config ]	= { &drbd_nl_get_config,
+				    sizeof(struct syncer_conf_tag_len_struct) +
+				    sizeof(struct disk_conf_tag_len_struct) +
+				    sizeof(struct net_conf_tag_len_struct) },
+	[ P_get_state ]		= { &drbd_nl_get_state,
+				    sizeof(struct get_state_tag_len_struct) +
+				    sizeof(struct sync_progress_tag_len_struct)	},
+	[ P_get_uuids ]		= { &drbd_nl_get_uuids,
+				    sizeof(struct get_uuids_tag_len_struct) },
+	[ P_get_timeout_flag ]	= { &drbd_nl_get_timeout_flag,
+				    sizeof(struct get_timeout_flag_tag_len_struct)},
+	[ P_start_ov ]		= { &drbd_nl_start_ov,		0 },
+	[ P_new_c_uuid ]	= { &drbd_nl_new_c_uuid,	0 },
+};
+
+static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
+{
+	struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
+	struct cn_handler_struct *cm;
+	struct cn_msg *cn_reply;
+	struct drbd_nl_cfg_reply *reply;
+	struct drbd_conf *mdev;
+	int retcode, rr;
+	int reply_size = sizeof(struct cn_msg)
+		+ sizeof(struct drbd_nl_cfg_reply)
+		+ sizeof(short int);
+
+	if (!try_module_get(THIS_MODULE)) {
+		printk(KERN_ERR "drbd: try_module_get() failed!\n");
+		return;
+	}
+
+	if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) {
+		retcode = ERR_PERM;
+		goto fail;
+	}
+
+	mdev = ensure_mdev(nlp);
+	if (!mdev) {
+		retcode = ERR_MINOR_INVALID;
+		goto fail;
+	}
+
+	if (nlp->packet_type >= P_nl_after_last_packet) {
+		retcode = ERR_PACKET_NR;
+		goto fail;
+	}
+
+	cm = cnd_table + nlp->packet_type;
+
+	/* This may happen if packet number is 0: */
+	if (cm->function == NULL) {
+		retcode = ERR_PACKET_NR;
+		goto fail;
+	}
+
+	reply_size += cm->reply_body_size;
+
+	/* allocation not in the IO path, cqueue thread context */
+	cn_reply = kmalloc(reply_size, GFP_KERNEL);
+	if (!cn_reply) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+	reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
+
+	reply->packet_type =
+		cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
+	reply->minor = nlp->drbd_minor;
+	reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
+	/* reply->tag_list; might be modified by cm->function. */
+
+	rr = cm->function(mdev, nlp, reply);
+
+	cn_reply->id = req->id;
+	cn_reply->seq = req->seq;
+	cn_reply->ack = req->ack  + 1;
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
+	cn_reply->flags = 0;
+
+	rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
+	if (rr && rr != -ESRCH)
+		printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+
+	kfree(cn_reply);
+	module_put(THIS_MODULE);
+	return;
+ fail:
+	drbd_nl_send_reply(req, retcode);
+	module_put(THIS_MODULE);
+}
+
+static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
+
+static unsigned short *
+__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
+	unsigned short len, int nul_terminated)
+{
+	unsigned short l = tag_descriptions[tag_number(tag)].max_len;
+	len = (len < l) ? len :  l;
+	put_unaligned(tag, tl++);
+	put_unaligned(len, tl++);
+	memcpy(tl, data, len);
+	tl = (unsigned short*)((char*)tl + len);
+	if (nul_terminated)
+		*((char*)tl - 1) = 0;
+	return tl;
+}
+
+static unsigned short *
+tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
+{
+	return __tl_add_blob(tl, tag, data, len, 0);
+}
+
+static unsigned short *
+tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
+{
+	return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
+}
+
+static unsigned short *
+tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
+{
+	put_unaligned(tag, tl++);
+	switch(tag_type(tag)) {
+	case TT_INTEGER:
+		put_unaligned(sizeof(int), tl++);
+		put_unaligned(*(int *)val, (int *)tl);
+		tl = (unsigned short*)((char*)tl+sizeof(int));
+		break;
+	case TT_INT64:
+		put_unaligned(sizeof(u64), tl++);
+		put_unaligned(*(u64 *)val, (u64 *)tl);
+		tl = (unsigned short*)((char*)tl+sizeof(u64));
+		break;
+	default:
+		/* someone did something stupid. */
+		;
+	}
+	return tl;
+}
+
+void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct get_state_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+
+	/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
+
+	tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
+
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_get_state;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct call_helper_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+
+	/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
+
+	tl = tl_add_str(tl, T_helper, helper_name);
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_call_helper;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+void drbd_bcast_ee(struct drbd_conf *mdev,
+		const char *reason, const int dgs,
+		const char* seen_hash, const char* calc_hash,
+		const struct drbd_epoch_entry* e)
+{
+	struct cn_msg *cn_reply;
+	struct drbd_nl_cfg_reply *reply;
+	struct bio_vec *bvec;
+	unsigned short *tl;
+	int i;
+
+	if (!e)
+		return;
+	if (!reason || !reason[0])
+		return;
+
+	/* apparently we have to memcpy twice, first to prepare the data for the
+	 * struct cn_msg, then within cn_netlink_send from the cn_msg to the
+	 * netlink skb. */
+	/* receiver thread context, which is not in the writeout path (of this node),
+	 * but may be in the writeout path of the _other_ node.
+	 * GFP_NOIO to avoid potential "distributed deadlock". */
+	cn_reply = kmalloc(
+		sizeof(struct cn_msg)+
+		sizeof(struct drbd_nl_cfg_reply)+
+		sizeof(struct dump_ee_tag_len_struct)+
+		sizeof(short int),
+		GFP_NOIO);
+
+	if (!cn_reply) {
+		dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
+				(unsigned long long)e->sector, e->size);
+		return;
+	}
+
+	reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
+	tl = reply->tag_list;
+
+	tl = tl_add_str(tl, T_dump_ee_reason, reason);
+	tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
+	tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
+	tl = tl_add_int(tl, T_ee_sector, &e->sector);
+	tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
+
+	put_unaligned(T_ee_data, tl++);
+	put_unaligned(e->size, tl++);
+
+	__bio_for_each_segment(bvec, e->private_bio, i, 0) {
+		void *d = kmap(bvec->bv_page);
+		memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
+		kunmap(bvec->bv_page);
+		tl=(unsigned short*)((char*)tl + bvec->bv_len);
+	}
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
+	cn_reply->ack = 0; // not used here.
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char*)tl - (char*)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_dump_ee;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+	kfree(cn_reply);
+}
+
+void drbd_bcast_sync_progress(struct drbd_conf *mdev)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct sync_progress_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+	unsigned long rs_left;
+	unsigned int res;
+
+	/* no local ref, no bitmap, no syncer progress, no broadcast. */
+	if (!get_ldev(mdev))
+		return;
+	drbd_get_syncer_progress(mdev, &rs_left, &res);
+	put_ldev(mdev);
+
+	tl = tl_add_int(tl, T_sync_progress, &res);
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_sync_progress;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+int __init drbd_nl_init(void)
+{
+	static struct cb_id cn_id_drbd;
+	int err, try=10;
+
+	cn_id_drbd.val = CN_VAL_DRBD;
+	do {
+		cn_id_drbd.idx = cn_idx;
+		err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
+		if (!err)
+			break;
+		cn_idx = (cn_idx + CN_IDX_STEP);
+	} while (try--);
+
+	if (err) {
+		printk(KERN_ERR "drbd: cn_drbd failed to register\n");
+		return err;
+	}
+
+	return 0;
+}
+
+void drbd_nl_cleanup(void)
+{
+	static struct cb_id cn_id_drbd;
+
+	cn_id_drbd.idx = cn_idx;
+	cn_id_drbd.val = CN_VAL_DRBD;
+
+	cn_del_callback(&cn_id_drbd);
+}
+
+void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
+{
+	char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	int rr;
+
+	cn_reply->id = req->id;
+
+	cn_reply->seq = req->seq;
+	cn_reply->ack = req->ack  + 1;
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
+	cn_reply->flags = 0;
+
+	reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
+	reply->ret_code = ret_code;
+
+	rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+	if (rr && rr != -ESRCH)
+		printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+}
+
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 0000000..bdd0b49
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,265 @@
+/*
+   drbd_proc.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+static int drbd_proc_open(struct inode *inode, struct file *file);
+
+
+struct proc_dir_entry *drbd_proc;
+struct file_operations drbd_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= drbd_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+
+/*lge
+ * progress bars shamelessly adapted from driver/md/md.c
+ * output looks like
+ *	[=====>..............] 33.5% (23456/123456)
+ *	finish: 2:20:20 speed: 6,345 (6,456) K/sec
+ */
+static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
+{
+	unsigned long db, dt, dbdt, rt, rs_left;
+	unsigned int res;
+	int i, x, y;
+
+	drbd_get_syncer_progress(mdev, &rs_left, &res);
+
+	x = res/50;
+	y = 20-x;
+	seq_printf(seq, "\t[");
+	for (i = 1; i < x; i++)
+		seq_printf(seq, "=");
+	seq_printf(seq, ">");
+	for (i = 0; i < y; i++)
+		seq_printf(seq, ".");
+	seq_printf(seq, "] ");
+
+	seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
+	/* if more than 1 GB display in MB */
+	if (mdev->rs_total > 0x100000L)
+		seq_printf(seq, "(%lu/%lu)M\n\t",
+			    (unsigned long) Bit2KB(rs_left >> 10),
+			    (unsigned long) Bit2KB(mdev->rs_total >> 10));
+	else
+		seq_printf(seq, "(%lu/%lu)K\n\t",
+			    (unsigned long) Bit2KB(rs_left),
+			    (unsigned long) Bit2KB(mdev->rs_total));
+
+	/* see drivers/md/md.c
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
+	 *
+	 * dt: time from mark until now
+	 * db: blocks written from mark until now
+	 * rt: remaining time
+	 */
+	dt = (jiffies - mdev->rs_mark_time) / HZ;
+
+	if (dt > 20) {
+		/* if we made no update to rs_mark_time for too long,
+		 * we are stalled. show that. */
+		seq_printf(seq, "stalled\n");
+		return;
+	}
+
+	if (!dt)
+		dt++;
+	db = mdev->rs_mark_left - rs_left;
+	rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
+
+	seq_printf(seq, "finish: %lu:%02lu:%02lu",
+		rt / 3600, (rt % 3600) / 60, rt % 60);
+
+	/* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
+	dbdt = Bit2KB(db/dt);
+	if (dbdt > 1000)
+		seq_printf(seq, " speed: %ld,%03ld",
+			dbdt/1000, dbdt % 1000);
+	else
+		seq_printf(seq, " speed: %ld", dbdt);
+
+	/* mean speed since syncer started
+	 * we do account for PausedSync periods */
+	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+	if (dt <= 0)
+		dt = 1;
+	db = mdev->rs_total - rs_left;
+	dbdt = Bit2KB(db/dt);
+	if (dbdt > 1000)
+		seq_printf(seq, " (%ld,%03ld)",
+			dbdt/1000, dbdt % 1000);
+	else
+		seq_printf(seq, " (%ld)", dbdt);
+
+	seq_printf(seq, " K/sec\n");
+}
+
+static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
+{
+	struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+	seq_printf(seq, "%5d %s %s\n", bme->rs_left,
+		   bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
+		   bme->flags & BME_LOCKED ? "LOCKED" : "------"
+		   );
+}
+
+static int drbd_seq_show(struct seq_file *seq, void *v)
+{
+	int i, hole = 0;
+	const char *sn;
+	struct drbd_conf *mdev;
+
+	static char write_ordering_chars[] = {
+		[WO_none] = 'n',
+		[WO_drain_io] = 'd',
+		[WO_bdev_flush] = 'f',
+		[WO_bio_barrier] = 'b',
+	};
+
+	seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
+		   API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
+
+	/*
+	  cs .. connection state
+	  ro .. node role (local/remote)
+	  ds .. disk state (local/remote)
+	     protocol
+	     various flags
+	  ns .. network send
+	  nr .. network receive
+	  dw .. disk write
+	  dr .. disk read
+	  al .. activity log write count
+	  bm .. bitmap update write count
+	  pe .. pending (waiting for ack or data reply)
+	  ua .. unack'd (still need to send ack or data reply)
+	  ap .. application requests accepted, but not yet completed
+	  ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
+	  wo .. write ordering mode currently in use
+	 oos .. known out-of-sync kB
+	*/
+
+	for (i = 0; i < minor_count; i++) {
+		mdev = minor_to_mdev(i);
+		if (!mdev) {
+			hole = 1;
+			continue;
+		}
+		if (hole) {
+			hole = 0;
+			seq_printf(seq, "\n");
+		}
+
+		sn = drbd_conn_str(mdev->state.conn);
+
+		if (mdev->state.conn == C_STANDALONE &&
+		    mdev->state.disk == D_DISKLESS &&
+		    mdev->state.role == R_SECONDARY) {
+			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
+		} else {
+			seq_printf(seq,
+			   "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
+			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
+			   "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
+			   i, sn,
+			   drbd_role_str(mdev->state.role),
+			   drbd_role_str(mdev->state.peer),
+			   drbd_disk_str(mdev->state.disk),
+			   drbd_disk_str(mdev->state.pdsk),
+			   (mdev->net_conf == NULL ? ' ' :
+			    (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
+			   mdev->state.susp ? 's' : 'r',
+			   mdev->state.aftr_isp ? 'a' : '-',
+			   mdev->state.peer_isp ? 'p' : '-',
+			   mdev->state.user_isp ? 'u' : '-',
+			   mdev->congestion_reason ?: '-',
+			   mdev->send_cnt/2,
+			   mdev->recv_cnt/2,
+			   mdev->writ_cnt/2,
+			   mdev->read_cnt/2,
+			   mdev->al_writ_cnt,
+			   mdev->bm_writ_cnt,
+			   atomic_read(&mdev->local_cnt),
+			   atomic_read(&mdev->ap_pending_cnt) +
+			   atomic_read(&mdev->rs_pending_cnt),
+			   atomic_read(&mdev->unacked_cnt),
+			   atomic_read(&mdev->ap_bio_cnt),
+			   mdev->epochs,
+			   write_ordering_chars[mdev->write_ordering]
+			);
+			seq_printf(seq, " oos:%lu\n",
+				   Bit2KB(drbd_bm_total_weight(mdev)));
+		}
+		if (mdev->state.conn == C_SYNC_SOURCE ||
+		    mdev->state.conn == C_SYNC_TARGET)
+			drbd_syncer_progress(mdev, seq);
+
+		if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+			seq_printf(seq, "\t%3d%%      %lu/%lu\n",
+				   (int)((mdev->rs_total-mdev->ov_left) /
+					 (mdev->rs_total/100+1)),
+				   mdev->rs_total - mdev->ov_left,
+				   mdev->rs_total);
+
+		if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
+			lc_seq_printf_stats(seq, mdev->resync);
+			lc_seq_printf_stats(seq, mdev->act_log);
+			put_ldev(mdev);
+		}
+
+		if (proc_details >= 2) {
+			if (mdev->resync) {
+				lc_seq_dump_details(seq, mdev->resync, "rs_left",
+					resync_dump_detail);
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int drbd_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, drbd_seq_show, PDE(inode)->data);
+}
+
+/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 0000000..c548f24
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,4426 @@
+/*
+   drbd_receiver.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/version.h>
+#include <linux/drbd.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/pkt_sched.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+#include "drbd_vli.h"
+
+struct flush_work {
+	struct drbd_work w;
+	struct drbd_epoch *epoch;
+};
+
+enum finish_epoch {
+	FE_STILL_LIVE,
+	FE_DESTROYED,
+	FE_RECYCLED,
+};
+
+static int drbd_do_handshake(struct drbd_conf *mdev);
+static int drbd_do_auth(struct drbd_conf *mdev);
+
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
+
+static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+{
+	struct drbd_epoch *prev;
+	spin_lock(&mdev->epoch_lock);
+	prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
+	if (prev == epoch || prev == mdev->current_epoch)
+		prev = NULL;
+	spin_unlock(&mdev->epoch_lock);
+	return prev;
+}
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
+{
+	struct page *page = NULL;
+
+	/* Yes, testing drbd_pp_vacant outside the lock is racy.
+	 * So what. It saves a spin_lock. */
+	if (drbd_pp_vacant > 0) {
+		spin_lock(&drbd_pp_lock);
+		page = drbd_pp_pool;
+		if (page) {
+			drbd_pp_pool = (struct page *)page_private(page);
+			set_page_private(page, 0); /* just to be polite */
+			drbd_pp_vacant--;
+		}
+		spin_unlock(&drbd_pp_lock);
+	}
+	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	if (!page)
+		page = alloc_page(GFP_TRY);
+	if (page)
+		atomic_inc(&mdev->pp_in_use);
+	return page;
+}
+
+/* kick lower level device, if we have more than (arbitrary number)
+ * reference counts on it, which typically are locally submitted io
+ * requests.  don't use unacked_cnt, so we speed up proto A and B, too. */
+static void maybe_kick_lo(struct drbd_conf *mdev)
+{
+	if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
+		drbd_kick_lo(mdev);
+}
+
+static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
+{
+	struct drbd_epoch_entry *e;
+	struct list_head *le, *tle;
+
+	/* The EEs are always appended to the end of the list. Since
+	   they are sent in order over the wire, they have to finish
+	   in order. As soon as we see the first not finished we can
+	   stop to examine the list... */
+
+	list_for_each_safe(le, tle, &mdev->net_ee) {
+		e = list_entry(le, struct drbd_epoch_entry, w.list);
+		if (drbd_bio_has_active_page(e->private_bio))
+			break;
+		list_move(le, to_be_freed);
+	}
+}
+
+static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
+{
+	LIST_HEAD(reclaimed);
+	struct drbd_epoch_entry *e, *t;
+
+	maybe_kick_lo(mdev);
+	spin_lock_irq(&mdev->req_lock);
+	reclaim_net_ee(mdev, &reclaimed);
+	spin_unlock_irq(&mdev->req_lock);
+
+	list_for_each_entry_safe(e, t, &reclaimed, w.list)
+		drbd_free_ee(mdev, e);
+}
+
+/**
+ * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
+ * @mdev:	DRBD device.
+ * @retry:	whether or not to retry allocation forever (or until signalled)
+ *
+ * Tries to allocate a page, first from our own page pool, then from the
+ * kernel, unless this allocation would exceed the max_buffers setting.
+ * If @retry is non-zero, retry until DRBD frees a page somewhere else.
+ */
+static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
+{
+	struct page *page = NULL;
+	DEFINE_WAIT(wait);
+
+	if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
+		page = drbd_pp_first_page_or_try_alloc(mdev);
+		if (page)
+			return page;
+	}
+
+	for (;;) {
+		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
+
+		drbd_kick_lo_and_reclaim_net(mdev);
+
+		if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
+			page = drbd_pp_first_page_or_try_alloc(mdev);
+			if (page)
+				break;
+		}
+
+		if (!retry)
+			break;
+
+		if (signal_pending(current)) {
+			dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
+			break;
+		}
+
+		schedule();
+	}
+	finish_wait(&drbd_pp_wait, &wait);
+
+	return page;
+}
+
+/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
+ * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */
+static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
+{
+	int free_it;
+
+	spin_lock(&drbd_pp_lock);
+	if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
+		free_it = 1;
+	} else {
+		set_page_private(page, (unsigned long)drbd_pp_pool);
+		drbd_pp_pool = page;
+		drbd_pp_vacant++;
+		free_it = 0;
+	}
+	spin_unlock(&drbd_pp_lock);
+
+	atomic_dec(&mdev->pp_in_use);
+
+	if (free_it)
+		__free_page(page);
+
+	wake_up(&drbd_pp_wait);
+}
+
+static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
+{
+	struct page *p_to_be_freed = NULL;
+	struct page *page;
+	struct bio_vec *bvec;
+	int i;
+
+	spin_lock(&drbd_pp_lock);
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
+			set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
+			p_to_be_freed = bvec->bv_page;
+		} else {
+			set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
+			drbd_pp_pool = bvec->bv_page;
+			drbd_pp_vacant++;
+		}
+	}
+	spin_unlock(&drbd_pp_lock);
+	atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
+
+	while (p_to_be_freed) {
+		page = p_to_be_freed;
+		p_to_be_freed = (struct page *)page_private(page);
+		set_page_private(page, 0); /* just to be polite */
+		put_page(page);
+	}
+
+	wake_up(&drbd_pp_wait);
+}
+
+/*
+You need to hold the req_lock:
+ _drbd_wait_ee_list_empty()
+
+You must not have the req_lock:
+ drbd_free_ee()
+ drbd_alloc_ee()
+ drbd_init_ee()
+ drbd_release_ee()
+ drbd_ee_fix_bhs()
+ drbd_process_done_ee()
+ drbd_clear_done_ee()
+ drbd_wait_ee_list_empty()
+*/
+
+struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
+				     u64 id,
+				     sector_t sector,
+				     unsigned int data_size,
+				     gfp_t gfp_mask) __must_hold(local)
+{
+	struct request_queue *q;
+	struct drbd_epoch_entry *e;
+	struct page *page;
+	struct bio *bio;
+	unsigned int ds;
+
+	if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
+		return NULL;
+
+	e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+	if (!e) {
+		if (!(gfp_mask & __GFP_NOWARN))
+			dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
+		return NULL;
+	}
+
+	bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
+	if (!bio) {
+		if (!(gfp_mask & __GFP_NOWARN))
+			dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
+		goto fail1;
+	}
+
+	bio->bi_bdev = mdev->ldev->backing_bdev;
+	bio->bi_sector = sector;
+
+	ds = data_size;
+	while (ds) {
+		page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
+		if (!page) {
+			if (!(gfp_mask & __GFP_NOWARN))
+				dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
+			goto fail2;
+		}
+		if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
+			drbd_pp_free(mdev, page);
+			dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
+			    "data_size=%u,ds=%u) failed\n",
+			    (unsigned long long)sector, data_size, ds);
+
+			q = bdev_get_queue(bio->bi_bdev);
+			if (q->merge_bvec_fn) {
+				struct bvec_merge_data bvm = {
+					.bi_bdev = bio->bi_bdev,
+					.bi_sector = bio->bi_sector,
+					.bi_size = bio->bi_size,
+					.bi_rw = bio->bi_rw,
+				};
+				int l = q->merge_bvec_fn(q, &bvm,
+						&bio->bi_io_vec[bio->bi_vcnt]);
+				dev_err(DEV, "merge_bvec_fn() = %d\n", l);
+			}
+
+			/* dump more of the bio. */
+			dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
+			dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
+			dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
+			dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
+
+			goto fail2;
+			break;
+		}
+		ds -= min_t(int, ds, PAGE_SIZE);
+	}
+
+	D_ASSERT(data_size == bio->bi_size);
+
+	bio->bi_private = e;
+	e->mdev = mdev;
+	e->sector = sector;
+	e->size = bio->bi_size;
+
+	e->private_bio = bio;
+	e->block_id = id;
+	INIT_HLIST_NODE(&e->colision);
+	e->epoch = NULL;
+	e->flags = 0;
+
+	return e;
+
+ fail2:
+	drbd_pp_free_bio_pages(mdev, bio);
+	bio_put(bio);
+ fail1:
+	mempool_free(e, drbd_ee_mempool);
+
+	return NULL;
+}
+
+void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+	struct bio *bio = e->private_bio;
+	drbd_pp_free_bio_pages(mdev, bio);
+	bio_put(bio);
+	D_ASSERT(hlist_unhashed(&e->colision));
+	mempool_free(e, drbd_ee_mempool);
+}
+
+int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
+{
+	LIST_HEAD(work_list);
+	struct drbd_epoch_entry *e, *t;
+	int count = 0;
+
+	spin_lock_irq(&mdev->req_lock);
+	list_splice_init(list, &work_list);
+	spin_unlock_irq(&mdev->req_lock);
+
+	list_for_each_entry_safe(e, t, &work_list, w.list) {
+		drbd_free_ee(mdev, e);
+		count++;
+	}
+	return count;
+}
+
+
+/*
+ * This function is called from _asender only_
+ * but see also comments in _req_mod(,barrier_acked)
+ * and receive_Barrier.
+ *
+ * Move entries from net_ee to done_ee, if ready.
+ * Grab done_ee, call all callbacks, free the entries.
+ * The callbacks typically send out ACKs.
+ */
+static int drbd_process_done_ee(struct drbd_conf *mdev)
+{
+	LIST_HEAD(work_list);
+	LIST_HEAD(reclaimed);
+	struct drbd_epoch_entry *e, *t;
+	int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
+
+	spin_lock_irq(&mdev->req_lock);
+	reclaim_net_ee(mdev, &reclaimed);
+	list_splice_init(&mdev->done_ee, &work_list);
+	spin_unlock_irq(&mdev->req_lock);
+
+	list_for_each_entry_safe(e, t, &reclaimed, w.list)
+		drbd_free_ee(mdev, e);
+
+	/* possible callbacks here:
+	 * e_end_block, and e_end_resync_block, e_send_discard_ack.
+	 * all ignore the last argument.
+	 */
+	list_for_each_entry_safe(e, t, &work_list, w.list) {
+		/* list_del not necessary, next/prev members not touched */
+		ok = e->w.cb(mdev, &e->w, !ok) && ok;
+		drbd_free_ee(mdev, e);
+	}
+	wake_up(&mdev->ee_wait);
+
+	return ok;
+}
+
+void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+{
+	DEFINE_WAIT(wait);
+
+	/* avoids spin_lock/unlock
+	 * and calling prepare_to_wait in the fast path */
+	while (!list_empty(head)) {
+		prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&mdev->req_lock);
+		drbd_kick_lo(mdev);
+		schedule();
+		finish_wait(&mdev->ee_wait, &wait);
+		spin_lock_irq(&mdev->req_lock);
+	}
+}
+
+void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+{
+	spin_lock_irq(&mdev->req_lock);
+	_drbd_wait_ee_list_empty(mdev, head);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/* see also kernel_accept; which is only present since 2.6.18.
+ * also we want to log which part of it failed, exactly */
+static int drbd_accept(struct drbd_conf *mdev, const char **what,
+		struct socket *sock, struct socket **newsock)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	*what = "listen";
+	err = sock->ops->listen(sock, 5);
+	if (err < 0)
+		goto out;
+
+	*what = "sock_create_lite";
+	err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
+			       newsock);
+	if (err < 0)
+		goto out;
+
+	*what = "accept";
+	err = sock->ops->accept(sock, *newsock, 0);
+	if (err < 0) {
+		sock_release(*newsock);
+		*newsock = NULL;
+		goto out;
+	}
+	(*newsock)->ops  = sock->ops;
+
+out:
+	return err;
+}
+
+static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
+		    void *buf, size_t size, int flags)
+{
+	mm_segment_t oldfs;
+	struct kvec iov = {
+		.iov_base = buf,
+		.iov_len = size,
+	};
+	struct msghdr msg = {
+		.msg_iovlen = 1,
+		.msg_iov = (struct iovec *)&iov,
+		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
+	};
+	int rv;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
+	set_fs(oldfs);
+
+	return rv;
+}
+
+static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
+{
+	mm_segment_t oldfs;
+	struct kvec iov = {
+		.iov_base = buf,
+		.iov_len = size,
+	};
+	struct msghdr msg = {
+		.msg_iovlen = 1,
+		.msg_iov = (struct iovec *)&iov,
+		.msg_flags = MSG_WAITALL | MSG_NOSIGNAL
+	};
+	int rv;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	for (;;) {
+		rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
+		if (rv == size)
+			break;
+
+		/* Note:
+		 * ECONNRESET	other side closed the connection
+		 * ERESTARTSYS	(on  sock) we got a signal
+		 */
+
+		if (rv < 0) {
+			if (rv == -ECONNRESET)
+				dev_info(DEV, "sock was reset by peer\n");
+			else if (rv != -ERESTARTSYS)
+				dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+			break;
+		} else if (rv == 0) {
+			dev_info(DEV, "sock was shut down by peer\n");
+			break;
+		} else	{
+			/* signal came in, or peer/link went down,
+			 * after we read a partial message
+			 */
+			/* D_ASSERT(signal_pending(current)); */
+			break;
+		}
+	};
+
+	set_fs(oldfs);
+
+	if (rv != size)
+		drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+
+	return rv;
+}
+
+static struct socket *drbd_try_connect(struct drbd_conf *mdev)
+{
+	const char *what;
+	struct socket *sock;
+	struct sockaddr_in6 src_in6;
+	int err;
+	int disconnect_on_error = 1;
+
+	if (!get_net_conf(mdev))
+		return NULL;
+
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+		SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		sock = NULL;
+		goto out;
+	}
+
+	sock->sk->sk_rcvtimeo =
+	sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
+
+       /* explicitly bind to the configured IP as source IP
+	*  for the outgoing connections.
+	*  This is needed for multihomed hosts and to be
+	*  able to use lo: interfaces for drbd.
+	* Make sure to use 0 as port number, so linux selects
+	*  a free one dynamically.
+	*/
+	memcpy(&src_in6, mdev->net_conf->my_addr,
+	       min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
+	if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
+		src_in6.sin6_port = 0;
+	else
+		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+	what = "bind before connect";
+	err = sock->ops->bind(sock,
+			      (struct sockaddr *) &src_in6,
+			      mdev->net_conf->my_addr_len);
+	if (err < 0)
+		goto out;
+
+	/* connect may fail, peer not yet available.
+	 * stay C_WF_CONNECTION, don't go Disconnecting! */
+	disconnect_on_error = 0;
+	what = "connect";
+	err = sock->ops->connect(sock,
+				 (struct sockaddr *)mdev->net_conf->peer_addr,
+				 mdev->net_conf->peer_addr_len, 0);
+
+out:
+	if (err < 0) {
+		if (sock) {
+			sock_release(sock);
+			sock = NULL;
+		}
+		switch (-err) {
+			/* timeout, busy, signal pending */
+		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
+		case EINTR: case ERESTARTSYS:
+			/* peer not (yet) available, network problem */
+		case ECONNREFUSED: case ENETUNREACH:
+		case EHOSTDOWN:    case EHOSTUNREACH:
+			disconnect_on_error = 0;
+			break;
+		default:
+			dev_err(DEV, "%s failed, err = %d\n", what, err);
+		}
+		if (disconnect_on_error)
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	}
+	put_net_conf(mdev);
+	return sock;
+}
+
+static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
+{
+	int timeo, err;
+	struct socket *s_estab = NULL, *s_listen;
+	const char *what;
+
+	if (!get_net_conf(mdev))
+		return NULL;
+
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+		SOCK_STREAM, IPPROTO_TCP, &s_listen);
+	if (err) {
+		s_listen = NULL;
+		goto out;
+	}
+
+	timeo = mdev->net_conf->try_connect_int * HZ;
+	timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
+
+	s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
+	s_listen->sk->sk_rcvtimeo = timeo;
+	s_listen->sk->sk_sndtimeo = timeo;
+
+	what = "bind before listen";
+	err = s_listen->ops->bind(s_listen,
+			      (struct sockaddr *) mdev->net_conf->my_addr,
+			      mdev->net_conf->my_addr_len);
+	if (err < 0)
+		goto out;
+
+	err = drbd_accept(mdev, &what, s_listen, &s_estab);
+
+out:
+	if (s_listen)
+		sock_release(s_listen);
+	if (err < 0) {
+		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+			dev_err(DEV, "%s failed, err = %d\n", what, err);
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		}
+	}
+	put_net_conf(mdev);
+
+	return s_estab;
+}
+
+static int drbd_send_fp(struct drbd_conf *mdev,
+	struct socket *sock, enum drbd_packets cmd)
+{
+	struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+
+	return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
+}
+
+static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
+{
+	struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+	int rr;
+
+	rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
+
+	if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
+		return be16_to_cpu(h->command);
+
+	return 0xffff;
+}
+
+/**
+ * drbd_socket_okay() - Free the socket if its connection is not okay
+ * @mdev:	DRBD device.
+ * @sock:	pointer to the pointer to the socket.
+ */
+static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
+{
+	int rr;
+	char tb[4];
+
+	if (!*sock)
+		return FALSE;
+
+	rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+
+	if (rr > 0 || rr == -EAGAIN) {
+		return TRUE;
+	} else {
+		sock_release(*sock);
+		*sock = NULL;
+		return FALSE;
+	}
+}
+
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ *  -2 We do not have a network config...
+ */
+static int drbd_connect(struct drbd_conf *mdev)
+{
+	struct socket *s, *sock, *msock;
+	int try, h, ok;
+
+	D_ASSERT(!mdev->data.socket);
+
+	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
+		dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
+
+	if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
+		return -2;
+
+	clear_bit(DISCARD_CONCURRENT, &mdev->flags);
+
+	sock  = NULL;
+	msock = NULL;
+
+	do {
+		for (try = 0;;) {
+			/* 3 tries, this should take less than a second! */
+			s = drbd_try_connect(mdev);
+			if (s || ++try >= 3)
+				break;
+			/* give the other side time to call bind() & listen() */
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ / 10);
+		}
+
+		if (s) {
+			if (!sock) {
+				drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
+				sock = s;
+				s = NULL;
+			} else if (!msock) {
+				drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
+				msock = s;
+				s = NULL;
+			} else {
+				dev_err(DEV, "Logic error in drbd_connect()\n");
+				goto out_release_sockets;
+			}
+		}
+
+		if (sock && msock) {
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ / 10);
+			ok = drbd_socket_okay(mdev, &sock);
+			ok = drbd_socket_okay(mdev, &msock) && ok;
+			if (ok)
+				break;
+		}
+
+retry:
+		s = drbd_wait_for_connect(mdev);
+		if (s) {
+			try = drbd_recv_fp(mdev, s);
+			drbd_socket_okay(mdev, &sock);
+			drbd_socket_okay(mdev, &msock);
+			switch (try) {
+			case P_HAND_SHAKE_S:
+				if (sock) {
+					dev_warn(DEV, "initial packet S crossed\n");
+					sock_release(sock);
+				}
+				sock = s;
+				break;
+			case P_HAND_SHAKE_M:
+				if (msock) {
+					dev_warn(DEV, "initial packet M crossed\n");
+					sock_release(msock);
+				}
+				msock = s;
+				set_bit(DISCARD_CONCURRENT, &mdev->flags);
+				break;
+			default:
+				dev_warn(DEV, "Error receiving initial packet\n");
+				sock_release(s);
+				if (random32() & 1)
+					goto retry;
+			}
+		}
+
+		if (mdev->state.conn <= C_DISCONNECTING)
+			goto out_release_sockets;
+		if (signal_pending(current)) {
+			flush_signals(current);
+			smp_rmb();
+			if (get_t_state(&mdev->receiver) == Exiting)
+				goto out_release_sockets;
+		}
+
+		if (sock && msock) {
+			ok = drbd_socket_okay(mdev, &sock);
+			ok = drbd_socket_okay(mdev, &msock) && ok;
+			if (ok)
+				break;
+		}
+	} while (1);
+
+	msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+	sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+
+	sock->sk->sk_allocation = GFP_NOIO;
+	msock->sk->sk_allocation = GFP_NOIO;
+
+	sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+	msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+
+	if (mdev->net_conf->sndbuf_size) {
+		sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
+		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+
+	if (mdev->net_conf->rcvbuf_size) {
+		sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
+		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+
+	/* NOT YET ...
+	 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+	 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+	 * first set it to the P_HAND_SHAKE timeout,
+	 * which we set to 4x the configured ping_timeout. */
+	sock->sk->sk_sndtimeo =
+	sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+
+	msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+	msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+
+	/* we don't want delays.
+	 * we use TCP_CORK where apropriate, though */
+	drbd_tcp_nodelay(sock);
+	drbd_tcp_nodelay(msock);
+
+	mdev->data.socket = sock;
+	mdev->meta.socket = msock;
+	mdev->last_received = jiffies;
+
+	D_ASSERT(mdev->asender.task == NULL);
+
+	h = drbd_do_handshake(mdev);
+	if (h <= 0)
+		return h;
+
+	if (mdev->cram_hmac_tfm) {
+		/* drbd_request_state(mdev, NS(conn, WFAuth)); */
+		if (!drbd_do_auth(mdev)) {
+			dev_err(DEV, "Authentication of peer failed\n");
+			return -1;
+		}
+	}
+
+	if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
+		return 0;
+
+	sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+	sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+
+	atomic_set(&mdev->packet_seq, 0);
+	mdev->peer_seq = 0;
+
+	drbd_thread_start(&mdev->asender);
+
+	drbd_send_protocol(mdev);
+	drbd_send_sync_param(mdev, &mdev->sync_conf);
+	drbd_send_sizes(mdev, 0);
+	drbd_send_uuids(mdev);
+	drbd_send_state(mdev);
+	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+	clear_bit(RESIZE_PENDING, &mdev->flags);
+
+	return 1;
+
+out_release_sockets:
+	if (sock)
+		sock_release(sock);
+	if (msock)
+		sock_release(msock);
+	return -1;
+}
+
+static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
+{
+	int r;
+
+	r = drbd_recv(mdev, h, sizeof(*h));
+
+	if (unlikely(r != sizeof(*h))) {
+		dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
+		return FALSE;
+	};
+	h->command = be16_to_cpu(h->command);
+	h->length  = be16_to_cpu(h->length);
+	if (unlikely(h->magic != BE_DRBD_MAGIC)) {
+		dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
+		    (long)be32_to_cpu(h->magic),
+		    h->command, h->length);
+		return FALSE;
+	}
+	mdev->last_received = jiffies;
+
+	return TRUE;
+}
+
+static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+{
+	int rv;
+
+	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
+		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
+		if (rv) {
+			dev_err(DEV, "local disk flush failed with status %d\n", rv);
+			/* would rather check on EOPNOTSUPP, but that is not reliable.
+			 * don't try again for ANY return value != 0
+			 * if (rv == -EOPNOTSUPP) */
+			drbd_bump_write_ordering(mdev, WO_drain_io);
+		}
+		put_ldev(mdev);
+	}
+
+	return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
+}
+
+static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct flush_work *fw = (struct flush_work *)w;
+	struct drbd_epoch *epoch = fw->epoch;
+
+	kfree(w);
+
+	if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
+		drbd_flush_after_epoch(mdev, epoch);
+
+	drbd_may_finish_epoch(mdev, epoch, EV_PUT |
+			      (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
+
+	return 1;
+}
+
+/**
+ * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
+ * @mdev:	DRBD device.
+ * @epoch:	Epoch object.
+ * @ev:		Epoch event.
+ */
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
+					       struct drbd_epoch *epoch,
+					       enum epoch_event ev)
+{
+	int finish, epoch_size;
+	struct drbd_epoch *next_epoch;
+	int schedule_flush = 0;
+	enum finish_epoch rv = FE_STILL_LIVE;
+
+	spin_lock(&mdev->epoch_lock);
+	do {
+		next_epoch = NULL;
+		finish = 0;
+
+		epoch_size = atomic_read(&epoch->epoch_size);
+
+		switch (ev & ~EV_CLEANUP) {
+		case EV_PUT:
+			atomic_dec(&epoch->active);
+			break;
+		case EV_GOT_BARRIER_NR:
+			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+
+			/* Special case: If we just switched from WO_bio_barrier to
+			   WO_bdev_flush we should not finish the current epoch */
+			if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
+			    mdev->write_ordering != WO_bio_barrier &&
+			    epoch == mdev->current_epoch)
+				clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
+			break;
+		case EV_BARRIER_DONE:
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
+			break;
+		case EV_BECAME_LAST:
+			/* nothing to do*/
+			break;
+		}
+
+		if (epoch_size != 0 &&
+		    atomic_read(&epoch->active) == 0 &&
+		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
+		    epoch->list.prev == &mdev->current_epoch->list &&
+		    !test_bit(DE_IS_FINISHING, &epoch->flags)) {
+			/* Nearly all conditions are met to finish that epoch... */
+			if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
+			    mdev->write_ordering == WO_none ||
+			    (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
+			    ev & EV_CLEANUP) {
+				finish = 1;
+				set_bit(DE_IS_FINISHING, &epoch->flags);
+			} else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
+				 mdev->write_ordering == WO_bio_barrier) {
+				atomic_inc(&epoch->active);
+				schedule_flush = 1;
+			}
+		}
+		if (finish) {
+			if (!(ev & EV_CLEANUP)) {
+				spin_unlock(&mdev->epoch_lock);
+				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
+				spin_lock(&mdev->epoch_lock);
+			}
+			dec_unacked(mdev);
+
+			if (mdev->current_epoch != epoch) {
+				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
+				list_del(&epoch->list);
+				ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
+				mdev->epochs--;
+				kfree(epoch);
+
+				if (rv == FE_STILL_LIVE)
+					rv = FE_DESTROYED;
+			} else {
+				epoch->flags = 0;
+				atomic_set(&epoch->epoch_size, 0);
+				/* atomic_set(&epoch->active, 0); is alrady zero */
+				if (rv == FE_STILL_LIVE)
+					rv = FE_RECYCLED;
+			}
+		}
+
+		if (!next_epoch)
+			break;
+
+		epoch = next_epoch;
+	} while (1);
+
+	spin_unlock(&mdev->epoch_lock);
+
+	if (schedule_flush) {
+		struct flush_work *fw;
+		fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
+		if (fw) {
+			fw->w.cb = w_flush;
+			fw->epoch = epoch;
+			drbd_queue_work(&mdev->data.work, &fw->w);
+		} else {
+			dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+			/* That is not a recursion, only one level */
+			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
+			drbd_may_finish_epoch(mdev, epoch, EV_PUT);
+		}
+	}
+
+	return rv;
+}
+
+/**
+ * drbd_bump_write_ordering() - Fall back to an other write ordering method
+ * @mdev:	DRBD device.
+ * @wo:		Write ordering method to try.
+ */
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
+{
+	enum write_ordering_e pwo;
+	static char *write_ordering_str[] = {
+		[WO_none] = "none",
+		[WO_drain_io] = "drain",
+		[WO_bdev_flush] = "flush",
+		[WO_bio_barrier] = "barrier",
+	};
+
+	pwo = mdev->write_ordering;
+	wo = min(pwo, wo);
+	if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
+		wo = WO_bdev_flush;
+	if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
+		wo = WO_drain_io;
+	if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
+		wo = WO_none;
+	mdev->write_ordering = wo;
+	if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
+		dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
+}
+
+/**
+ * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways (unused in this callback)
+ */
+int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	struct bio *bio = e->private_bio;
+
+	/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
+	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
+	   so that we can finish that epoch in drbd_may_finish_epoch().
+	   That is necessary if we already have a long chain of Epochs, before
+	   we realize that BIO_RW_BARRIER is actually not supported */
+
+	/* As long as the -ENOTSUPP on the barrier is reported immediately
+	   that will never trigger. If it is reported late, we will just
+	   print that warning and continue correctly for all future requests
+	   with WO_bdev_flush */
+	if (previous_epoch(mdev, e->epoch))
+		dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
+
+	/* prepare bio for re-submit,
+	 * re-init volatile members */
+	/* we still have a local reference,
+	 * get_ldev was done in receive_Data. */
+	bio->bi_bdev = mdev->ldev->backing_bdev;
+	bio->bi_sector = e->sector;
+	bio->bi_size = e->size;
+	bio->bi_idx = 0;
+
+	bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+	bio->bi_flags |= 1 << BIO_UPTODATE;
+
+	/* don't know whether this is necessary: */
+	bio->bi_phys_segments = 0;
+	bio->bi_next = NULL;
+
+	/* these should be unchanged: */
+	/* bio->bi_end_io = drbd_endio_write_sec; */
+	/* bio->bi_vcnt = whatever; */
+
+	e->w.cb = e_end_block;
+
+	/* This is no longer a barrier request. */
+	bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
+
+	drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
+
+	return 1;
+}
+
+static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
+{
+	int rv, issue_flush;
+	struct p_barrier *p = (struct p_barrier *)h;
+	struct drbd_epoch *epoch;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+
+	rv = drbd_recv(mdev, h->payload, h->length);
+	ERR_IF(rv != h->length) return FALSE;
+
+	inc_unacked(mdev);
+
+	if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
+		drbd_kick_lo(mdev);
+
+	mdev->current_epoch->barrier_nr = p->barrier;
+	rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
+
+	/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
+	 * the activity log, which means it would not be resynced in case the
+	 * R_PRIMARY crashes now.
+	 * Therefore we must send the barrier_ack after the barrier request was
+	 * completed. */
+	switch (mdev->write_ordering) {
+	case WO_bio_barrier:
+	case WO_none:
+		if (rv == FE_RECYCLED)
+			return TRUE;
+		break;
+
+	case WO_bdev_flush:
+	case WO_drain_io:
+		D_ASSERT(rv == FE_STILL_LIVE);
+		set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
+		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+		rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+		if (rv == FE_RECYCLED)
+			return TRUE;
+
+		/* The asender will send all the ACKs and barrier ACKs out, since
+		   all EEs moved from the active_ee to the done_ee. We need to
+		   provide a new epoch object for the EEs that come in soon */
+		break;
+	}
+
+	/* receiver context, in the writeout path of the other node.
+	 * avoid potential distributed deadlock */
+	epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+	if (!epoch) {
+		dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+		issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+		if (issue_flush) {
+			rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+			if (rv == FE_RECYCLED)
+				return TRUE;
+		}
+
+		drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
+
+		return TRUE;
+	}
+
+	epoch->flags = 0;
+	atomic_set(&epoch->epoch_size, 0);
+	atomic_set(&epoch->active, 0);
+
+	spin_lock(&mdev->epoch_lock);
+	if (atomic_read(&mdev->current_epoch->epoch_size)) {
+		list_add(&epoch->list, &mdev->current_epoch->list);
+		mdev->current_epoch = epoch;
+		mdev->epochs++;
+	} else {
+		/* The current_epoch got recycled while we allocated this one... */
+		kfree(epoch);
+	}
+	spin_unlock(&mdev->epoch_lock);
+
+	return TRUE;
+}
+
+/* used from receive_RSDataReply (recv_resync_read)
+ * and from receive_Data */
+static struct drbd_epoch_entry *
+read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
+{
+	struct drbd_epoch_entry *e;
+	struct bio_vec *bvec;
+	struct page *page;
+	struct bio *bio;
+	int dgs, ds, i, rr;
+	void *dig_in = mdev->int_dig_in;
+	void *dig_vv = mdev->int_dig_vv;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+
+	if (dgs) {
+		rr = drbd_recv(mdev, dig_in, dgs);
+		if (rr != dgs) {
+			dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
+			     rr, dgs);
+			return NULL;
+		}
+	}
+
+	data_size -= dgs;
+
+	ERR_IF(data_size &  0x1ff) return NULL;
+	ERR_IF(data_size >  DRBD_MAX_SEGMENT_SIZE) return NULL;
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
+	if (!e)
+		return NULL;
+	bio = e->private_bio;
+	ds = data_size;
+	bio_for_each_segment(bvec, bio, i) {
+		page = bvec->bv_page;
+		rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
+		kunmap(page);
+		if (rr != min_t(int, ds, PAGE_SIZE)) {
+			drbd_free_ee(mdev, e);
+			dev_warn(DEV, "short read receiving data: read %d expected %d\n",
+			     rr, min_t(int, ds, PAGE_SIZE));
+			return NULL;
+		}
+		ds -= rr;
+	}
+
+	if (dgs) {
+		drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+		if (memcmp(dig_in, dig_vv, dgs)) {
+			dev_err(DEV, "Digest integrity check FAILED.\n");
+			drbd_bcast_ee(mdev, "digest failed",
+					dgs, dig_in, dig_vv, e);
+			drbd_free_ee(mdev, e);
+			return NULL;
+		}
+	}
+	mdev->recv_cnt += data_size>>9;
+	return e;
+}
+
+/* drbd_drain_block() just takes a data block
+ * out of the socket input buffer, and discards it.
+ */
+static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
+{
+	struct page *page;
+	int rr, rv = 1;
+	void *data;
+
+	page = drbd_pp_alloc(mdev, 1);
+
+	data = kmap(page);
+	while (data_size) {
+		rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
+		if (rr != min_t(int, data_size, PAGE_SIZE)) {
+			rv = 0;
+			dev_warn(DEV, "short read receiving data: read %d expected %d\n",
+			     rr, min_t(int, data_size, PAGE_SIZE));
+			break;
+		}
+		data_size -= rr;
+	}
+	kunmap(page);
+	drbd_pp_free(mdev, page);
+	return rv;
+}
+
+static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
+			   sector_t sector, int data_size)
+{
+	struct bio_vec *bvec;
+	struct bio *bio;
+	int dgs, rr, i, expect;
+	void *dig_in = mdev->int_dig_in;
+	void *dig_vv = mdev->int_dig_vv;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+
+	if (dgs) {
+		rr = drbd_recv(mdev, dig_in, dgs);
+		if (rr != dgs) {
+			dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
+			     rr, dgs);
+			return 0;
+		}
+	}
+
+	data_size -= dgs;
+
+	/* optimistically update recv_cnt.  if receiving fails below,
+	 * we disconnect anyways, and counters will be reset. */
+	mdev->recv_cnt += data_size>>9;
+
+	bio = req->master_bio;
+	D_ASSERT(sector == bio->bi_sector);
+
+	bio_for_each_segment(bvec, bio, i) {
+		expect = min_t(int, data_size, bvec->bv_len);
+		rr = drbd_recv(mdev,
+			     kmap(bvec->bv_page)+bvec->bv_offset,
+			     expect);
+		kunmap(bvec->bv_page);
+		if (rr != expect) {
+			dev_warn(DEV, "short read receiving data reply: "
+			     "read %d expected %d\n",
+			     rr, expect);
+			return 0;
+		}
+		data_size -= rr;
+	}
+
+	if (dgs) {
+		drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+		if (memcmp(dig_in, dig_vv, dgs)) {
+			dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
+			return 0;
+		}
+	}
+
+	D_ASSERT(data_size == 0);
+	return 1;
+}
+
+/* e_end_resync_block() is called via
+ * drbd_process_done_ee() by asender only */
+static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	sector_t sector = e->sector;
+	int ok;
+
+	D_ASSERT(hlist_unhashed(&e->colision));
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		drbd_set_in_sync(mdev, sector, e->size);
+		ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
+	} else {
+		/* Record failure to sync */
+		drbd_rs_failed_io(mdev, sector, e->size);
+
+		ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
+	}
+	dec_unacked(mdev);
+
+	return ok;
+}
+
+static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
+{
+	struct drbd_epoch_entry *e;
+
+	e = read_in_block(mdev, ID_SYNCER, sector, data_size);
+	if (!e) {
+		put_ldev(mdev);
+		return FALSE;
+	}
+
+	dec_rs_pending(mdev);
+
+	e->private_bio->bi_end_io = drbd_endio_write_sec;
+	e->private_bio->bi_rw = WRITE;
+	e->w.cb = e_end_resync_block;
+
+	inc_unacked(mdev);
+	/* corresponding dec_unacked() in e_end_resync_block()
+	 * respective _drbd_clear_done_ee */
+
+	spin_lock_irq(&mdev->req_lock);
+	list_add(&e->w.list, &mdev->sync_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
+	/* accounting done in endio */
+
+	maybe_kick_lo(mdev);
+	return TRUE;
+}
+
+static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct drbd_request *req;
+	sector_t sector;
+	unsigned int header_size, data_size;
+	int ok;
+	struct p_data *p = (struct p_data *)h;
+
+	header_size = sizeof(*p) - sizeof(*h);
+	data_size   = h->length  - header_size;
+
+	ERR_IF(data_size == 0) return FALSE;
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	sector = be64_to_cpu(p->sector);
+
+	spin_lock_irq(&mdev->req_lock);
+	req = _ar_id_to_req(mdev, p->block_id, sector);
+	spin_unlock_irq(&mdev->req_lock);
+	if (unlikely(!req)) {
+		dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
+		return FALSE;
+	}
+
+	/* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
+	 * special casing it there for the various failure cases.
+	 * still no race with drbd_fail_pending_reads */
+	ok = recv_dless_read(mdev, req, sector, data_size);
+
+	if (ok)
+		req_mod(req, data_received);
+	/* else: nothing. handled from drbd_disconnect...
+	 * I don't think we may complete this just yet
+	 * in case we are "on-disconnect: freeze" */
+
+	return ok;
+}
+
+static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	sector_t sector;
+	unsigned int header_size, data_size;
+	int ok;
+	struct p_data *p = (struct p_data *)h;
+
+	header_size = sizeof(*p) - sizeof(*h);
+	data_size   = h->length  - header_size;
+
+	ERR_IF(data_size == 0) return FALSE;
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	sector = be64_to_cpu(p->sector);
+	D_ASSERT(p->block_id == ID_SYNCER);
+
+	if (get_ldev(mdev)) {
+		/* data is submitted to disk within recv_resync_read.
+		 * corresponding put_ldev done below on error,
+		 * or in drbd_endio_write_sec. */
+		ok = recv_resync_read(mdev, sector, data_size);
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Can not write resync data to local disk.\n");
+
+		ok = drbd_drain_block(mdev, data_size);
+
+		drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+	}
+
+	return ok;
+}
+
+/* e_end_block() is called via drbd_process_done_ee().
+ * this means this function only runs in the asender thread
+ */
+static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	sector_t sector = e->sector;
+	struct drbd_epoch *epoch;
+	int ok = 1, pcmd;
+
+	if (e->flags & EE_IS_BARRIER) {
+		epoch = previous_epoch(mdev, e->epoch);
+		if (epoch)
+			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
+	}
+
+	if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
+		if (likely(drbd_bio_uptodate(e->private_bio))) {
+			pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
+				mdev->state.conn <= C_PAUSED_SYNC_T &&
+				e->flags & EE_MAY_SET_IN_SYNC) ?
+				P_RS_WRITE_ACK : P_WRITE_ACK;
+			ok &= drbd_send_ack(mdev, pcmd, e);
+			if (pcmd == P_RS_WRITE_ACK)
+				drbd_set_in_sync(mdev, sector, e->size);
+		} else {
+			ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
+			/* we expect it to be marked out of sync anyways...
+			 * maybe assert this?  */
+		}
+		dec_unacked(mdev);
+	}
+	/* we delete from the conflict detection hash _after_ we sent out the
+	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
+	if (mdev->net_conf->two_primaries) {
+		spin_lock_irq(&mdev->req_lock);
+		D_ASSERT(!hlist_unhashed(&e->colision));
+		hlist_del_init(&e->colision);
+		spin_unlock_irq(&mdev->req_lock);
+	} else {
+		D_ASSERT(hlist_unhashed(&e->colision));
+	}
+
+	drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+
+	return ok;
+}
+
+static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	int ok = 1;
+
+	D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+	ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
+
+	spin_lock_irq(&mdev->req_lock);
+	D_ASSERT(!hlist_unhashed(&e->colision));
+	hlist_del_init(&e->colision);
+	spin_unlock_irq(&mdev->req_lock);
+
+	dec_unacked(mdev);
+
+	return ok;
+}
+
+/* Called from receive_Data.
+ * Synchronize packets on sock with packets on msock.
+ *
+ * This is here so even when a P_DATA packet traveling via sock overtook an Ack
+ * packet traveling on msock, they are still processed in the order they have
+ * been sent.
+ *
+ * Note: we don't care for Ack packets overtaking P_DATA packets.
+ *
+ * In case packet_seq is larger than mdev->peer_seq number, there are
+ * outstanding packets on the msock. We wait for them to arrive.
+ * In case we are the logically next packet, we update mdev->peer_seq
+ * ourselves. Correctly handles 32bit wrap around.
+ *
+ * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
+ * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
+ * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
+ * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
+ *
+ * returns 0 if we may process the packet,
+ * -ERESTARTSYS if we were interrupted (by disconnect signal). */
+static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
+{
+	DEFINE_WAIT(wait);
+	unsigned int p_seq;
+	long timeout;
+	int ret = 0;
+	spin_lock(&mdev->peer_seq_lock);
+	for (;;) {
+		prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
+		if (seq_le(packet_seq, mdev->peer_seq+1))
+			break;
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		p_seq = mdev->peer_seq;
+		spin_unlock(&mdev->peer_seq_lock);
+		timeout = schedule_timeout(30*HZ);
+		spin_lock(&mdev->peer_seq_lock);
+		if (timeout == 0 && p_seq == mdev->peer_seq) {
+			ret = -ETIMEDOUT;
+			dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
+			break;
+		}
+	}
+	finish_wait(&mdev->seq_wait, &wait);
+	if (mdev->peer_seq+1 == packet_seq)
+		mdev->peer_seq++;
+	spin_unlock(&mdev->peer_seq_lock);
+	return ret;
+}
+
+/* mirrored write */
+static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
+{
+	sector_t sector;
+	struct drbd_epoch_entry *e;
+	struct p_data *p = (struct p_data *)h;
+	int header_size, data_size;
+	int rw = WRITE;
+	u32 dp_flags;
+
+	header_size = sizeof(*p) - sizeof(*h);
+	data_size   = h->length  - header_size;
+
+	ERR_IF(data_size == 0) return FALSE;
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	if (!get_ldev(mdev)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Can not write mirrored data block "
+			    "to local disk.\n");
+		spin_lock(&mdev->peer_seq_lock);
+		if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
+			mdev->peer_seq++;
+		spin_unlock(&mdev->peer_seq_lock);
+
+		drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+		atomic_inc(&mdev->current_epoch->epoch_size);
+		return drbd_drain_block(mdev, data_size);
+	}
+
+	/* get_ldev(mdev) successful.
+	 * Corresponding put_ldev done either below (on various errors),
+	 * or in drbd_endio_write_sec, if we successfully submit the data at
+	 * the end of this function. */
+
+	sector = be64_to_cpu(p->sector);
+	e = read_in_block(mdev, p->block_id, sector, data_size);
+	if (!e) {
+		put_ldev(mdev);
+		return FALSE;
+	}
+
+	e->private_bio->bi_end_io = drbd_endio_write_sec;
+	e->w.cb = e_end_block;
+
+	spin_lock(&mdev->epoch_lock);
+	e->epoch = mdev->current_epoch;
+	atomic_inc(&e->epoch->epoch_size);
+	atomic_inc(&e->epoch->active);
+
+	if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
+		struct drbd_epoch *epoch;
+		/* Issue a barrier if we start a new epoch, and the previous epoch
+		   was not a epoch containing a single request which already was
+		   a Barrier. */
+		epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
+		if (epoch == e->epoch) {
+			set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+			rw |= (1<<BIO_RW_BARRIER);
+			e->flags |= EE_IS_BARRIER;
+		} else {
+			if (atomic_read(&epoch->epoch_size) > 1 ||
+			    !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
+				set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+				set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+				rw |= (1<<BIO_RW_BARRIER);
+				e->flags |= EE_IS_BARRIER;
+			}
+		}
+	}
+	spin_unlock(&mdev->epoch_lock);
+
+	dp_flags = be32_to_cpu(p->dp_flags);
+	if (dp_flags & DP_HARDBARRIER) {
+		dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
+		/* rw |= (1<<BIO_RW_BARRIER); */
+	}
+	if (dp_flags & DP_RW_SYNC)
+		rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
+	if (dp_flags & DP_MAY_SET_IN_SYNC)
+		e->flags |= EE_MAY_SET_IN_SYNC;
+
+	/* I'm the receiver, I do hold a net_cnt reference. */
+	if (!mdev->net_conf->two_primaries) {
+		spin_lock_irq(&mdev->req_lock);
+	} else {
+		/* don't get the req_lock yet,
+		 * we may sleep in drbd_wait_peer_seq */
+		const int size = e->size;
+		const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+		DEFINE_WAIT(wait);
+		struct drbd_request *i;
+		struct hlist_node *n;
+		struct hlist_head *slot;
+		int first;
+
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		BUG_ON(mdev->ee_hash == NULL);
+		BUG_ON(mdev->tl_hash == NULL);
+
+		/* conflict detection and handling:
+		 * 1. wait on the sequence number,
+		 *    in case this data packet overtook ACK packets.
+		 * 2. check our hash tables for conflicting requests.
+		 *    we only need to walk the tl_hash, since an ee can not
+		 *    have a conflict with an other ee: on the submitting
+		 *    node, the corresponding req had already been conflicting,
+		 *    and a conflicting req is never sent.
+		 *
+		 * Note: for two_primaries, we are protocol C,
+		 * so there cannot be any request that is DONE
+		 * but still on the transfer log.
+		 *
+		 * unconditionally add to the ee_hash.
+		 *
+		 * if no conflicting request is found:
+		 *    submit.
+		 *
+		 * if any conflicting request is found
+		 * that has not yet been acked,
+		 * AND I have the "discard concurrent writes" flag:
+		 *	 queue (via done_ee) the P_DISCARD_ACK; OUT.
+		 *
+		 * if any conflicting request is found:
+		 *	 block the receiver, waiting on misc_wait
+		 *	 until no more conflicting requests are there,
+		 *	 or we get interrupted (disconnect).
+		 *
+		 *	 we do not just write after local io completion of those
+		 *	 requests, but only after req is done completely, i.e.
+		 *	 we wait for the P_DISCARD_ACK to arrive!
+		 *
+		 *	 then proceed normally, i.e. submit.
+		 */
+		if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
+			goto out_interrupted;
+
+		spin_lock_irq(&mdev->req_lock);
+
+		hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
+
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+		slot = tl_hash_slot(mdev, sector);
+		first = 1;
+		for (;;) {
+			int have_unacked = 0;
+			int have_conflict = 0;
+			prepare_to_wait(&mdev->misc_wait, &wait,
+				TASK_INTERRUPTIBLE);
+			hlist_for_each_entry(i, n, slot, colision) {
+				if (OVERLAPS) {
+					/* only ALERT on first iteration,
+					 * we may be woken up early... */
+					if (first)
+						dev_alert(DEV, "%s[%u] Concurrent local write detected!"
+						      "	new: %llus +%u; pending: %llus +%u\n",
+						      current->comm, current->pid,
+						      (unsigned long long)sector, size,
+						      (unsigned long long)i->sector, i->size);
+					if (i->rq_state & RQ_NET_PENDING)
+						++have_unacked;
+					++have_conflict;
+				}
+			}
+#undef OVERLAPS
+			if (!have_conflict)
+				break;
+
+			/* Discard Ack only for the _first_ iteration */
+			if (first && discard && have_unacked) {
+				dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
+				     (unsigned long long)sector);
+				inc_unacked(mdev);
+				e->w.cb = e_send_discard_ack;
+				list_add_tail(&e->w.list, &mdev->done_ee);
+
+				spin_unlock_irq(&mdev->req_lock);
+
+				/* we could probably send that P_DISCARD_ACK ourselves,
+				 * but I don't like the receiver using the msock */
+
+				put_ldev(mdev);
+				wake_asender(mdev);
+				finish_wait(&mdev->misc_wait, &wait);
+				return TRUE;
+			}
+
+			if (signal_pending(current)) {
+				hlist_del_init(&e->colision);
+
+				spin_unlock_irq(&mdev->req_lock);
+
+				finish_wait(&mdev->misc_wait, &wait);
+				goto out_interrupted;
+			}
+
+			spin_unlock_irq(&mdev->req_lock);
+			if (first) {
+				first = 0;
+				dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
+				     "sec=%llus\n", (unsigned long long)sector);
+			} else if (discard) {
+				/* we had none on the first iteration.
+				 * there must be none now. */
+				D_ASSERT(have_unacked == 0);
+			}
+			schedule();
+			spin_lock_irq(&mdev->req_lock);
+		}
+		finish_wait(&mdev->misc_wait, &wait);
+	}
+
+	list_add(&e->w.list, &mdev->active_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	switch (mdev->net_conf->wire_protocol) {
+	case DRBD_PROT_C:
+		inc_unacked(mdev);
+		/* corresponding dec_unacked() in e_end_block()
+		 * respective _drbd_clear_done_ee */
+		break;
+	case DRBD_PROT_B:
+		/* I really don't like it that the receiver thread
+		 * sends on the msock, but anyways */
+		drbd_send_ack(mdev, P_RECV_ACK, e);
+		break;
+	case DRBD_PROT_A:
+		/* nothing to do */
+		break;
+	}
+
+	if (mdev->state.pdsk == D_DISKLESS) {
+		/* In case we have the only disk of the cluster, */
+		drbd_set_out_of_sync(mdev, e->sector, e->size);
+		e->flags |= EE_CALL_AL_COMPLETE_IO;
+		drbd_al_begin_io(mdev, e->sector);
+	}
+
+	e->private_bio->bi_rw = rw;
+	drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
+	/* accounting done in endio */
+
+	maybe_kick_lo(mdev);
+	return TRUE;
+
+out_interrupted:
+	/* yes, the epoch_size now is imbalanced.
+	 * but we drop the connection anyways, so we don't have a chance to
+	 * receive a barrier... atomic_inc(&mdev->epoch_size); */
+	put_ldev(mdev);
+	drbd_free_ee(mdev, e);
+	return FALSE;
+}
+
+static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
+{
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+	struct drbd_epoch_entry *e;
+	struct digest_info *di = NULL;
+	int size, digest_size;
+	unsigned int fault_type;
+	struct p_block_req *p =
+		(struct p_block_req *)h;
+	const int brps = sizeof(*p)-sizeof(*h);
+
+	if (drbd_recv(mdev, h->payload, brps) != brps)
+		return FALSE;
+
+	sector = be64_to_cpu(p->sector);
+	size   = be32_to_cpu(p->blksize);
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+		dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return FALSE;
+	}
+	if (sector + (size>>9) > capacity) {
+		dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return FALSE;
+	}
+
+	if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Can not satisfy peer's read request, "
+			    "no local data.\n");
+		drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
+				 P_NEG_RS_DREPLY , p);
+		return TRUE;
+	}
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
+	if (!e) {
+		put_ldev(mdev);
+		return FALSE;
+	}
+
+	e->private_bio->bi_rw = READ;
+	e->private_bio->bi_end_io = drbd_endio_read_sec;
+
+	switch (h->command) {
+	case P_DATA_REQUEST:
+		e->w.cb = w_e_end_data_req;
+		fault_type = DRBD_FAULT_DT_RD;
+		break;
+	case P_RS_DATA_REQUEST:
+		e->w.cb = w_e_end_rsdata_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		/* Eventually this should become asynchronously. Currently it
+		 * blocks the whole receiver just to delay the reading of a
+		 * resync data block.
+		 * the drbd_work_queue mechanism is made for this...
+		 */
+		if (!drbd_rs_begin_io(mdev, sector)) {
+			/* we have been interrupted,
+			 * probably connection lost! */
+			D_ASSERT(signal_pending(current));
+			goto out_free_e;
+		}
+		break;
+
+	case P_OV_REPLY:
+	case P_CSUM_RS_REQUEST:
+		fault_type = DRBD_FAULT_RS_RD;
+		digest_size = h->length - brps ;
+		di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
+		if (!di)
+			goto out_free_e;
+
+		di->digest_size = digest_size;
+		di->digest = (((char *)di)+sizeof(struct digest_info));
+
+		if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
+			goto out_free_e;
+
+		e->block_id = (u64)(unsigned long)di;
+		if (h->command == P_CSUM_RS_REQUEST) {
+			D_ASSERT(mdev->agreed_pro_version >= 89);
+			e->w.cb = w_e_end_csum_rs_req;
+		} else if (h->command == P_OV_REPLY) {
+			e->w.cb = w_e_end_ov_reply;
+			dec_rs_pending(mdev);
+			break;
+		}
+
+		if (!drbd_rs_begin_io(mdev, sector)) {
+			/* we have been interrupted, probably connection lost! */
+			D_ASSERT(signal_pending(current));
+			goto out_free_e;
+		}
+		break;
+
+	case P_OV_REQUEST:
+		if (mdev->state.conn >= C_CONNECTED &&
+		    mdev->state.conn != C_VERIFY_T)
+			dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
+				drbd_conn_str(mdev->state.conn));
+		if (mdev->ov_start_sector == ~(sector_t)0 &&
+		    mdev->agreed_pro_version >= 90) {
+			mdev->ov_start_sector = sector;
+			mdev->ov_position = sector;
+			mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
+			dev_info(DEV, "Online Verify start sector: %llu\n",
+					(unsigned long long)sector);
+		}
+		e->w.cb = w_e_end_ov_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		/* Eventually this should become asynchronous. Currently it
+		 * blocks the whole receiver just to delay the reading of a
+		 * resync data block.
+		 * the drbd_work_queue mechanism is made for this...
+		 */
+		if (!drbd_rs_begin_io(mdev, sector)) {
+			/* we have been interrupted,
+			 * probably connection lost! */
+			D_ASSERT(signal_pending(current));
+			goto out_free_e;
+		}
+		break;
+
+
+	default:
+		dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+		    cmdname(h->command));
+		fault_type = DRBD_FAULT_MAX;
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	list_add(&e->w.list, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	inc_unacked(mdev);
+
+	drbd_generic_make_request(mdev, fault_type, e->private_bio);
+	maybe_kick_lo(mdev);
+
+	return TRUE;
+
+out_free_e:
+	kfree(di);
+	put_ldev(mdev);
+	drbd_free_ee(mdev, e);
+	return FALSE;
+}
+
+static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
+{
+	int self, peer, rv = -100;
+	unsigned long ch_self, ch_peer;
+
+	self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+	peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+	ch_peer = mdev->p_uuid[UI_SIZE];
+	ch_self = mdev->comm_bm_set;
+
+	switch (mdev->net_conf->after_sb_0p) {
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+	case ASB_CALL_HELPER:
+		dev_err(DEV, "Configuration error.\n");
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_DISCARD_YOUNGER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = -1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv =  1;
+			break;
+		}
+		/* Else fall through to one of the other strategies... */
+	case ASB_DISCARD_OLDER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = 1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv = -1;
+			break;
+		}
+		/* Else fall through to one of the other strategies... */
+		dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
+		     "Using discard-least-changes instead\n");
+	case ASB_DISCARD_ZERO_CHG:
+		if (ch_peer == 0 && ch_self == 0) {
+			rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+				? -1 : 1;
+			break;
+		} else {
+			if (ch_peer == 0) { rv =  1; break; }
+			if (ch_self == 0) { rv = -1; break; }
+		}
+		if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
+			break;
+	case ASB_DISCARD_LEAST_CHG:
+		if	(ch_self < ch_peer)
+			rv = -1;
+		else if (ch_self > ch_peer)
+			rv =  1;
+		else /* ( ch_self == ch_peer ) */
+		     /* Well, then use something else. */
+			rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+				? -1 : 1;
+		break;
+	case ASB_DISCARD_LOCAL:
+		rv = -1;
+		break;
+	case ASB_DISCARD_REMOTE:
+		rv =  1;
+	}
+
+	return rv;
+}
+
+static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
+{
+	int self, peer, hg, rv = -100;
+
+	self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+	peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+	switch (mdev->net_conf->after_sb_1p) {
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+		dev_err(DEV, "Configuration error.\n");
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_CONSENSUS:
+		hg = drbd_asb_recover_0p(mdev);
+		if (hg == -1 && mdev->state.role == R_SECONDARY)
+			rv = hg;
+		if (hg == 1  && mdev->state.role == R_PRIMARY)
+			rv = hg;
+		break;
+	case ASB_VIOLENTLY:
+		rv = drbd_asb_recover_0p(mdev);
+		break;
+	case ASB_DISCARD_SECONDARY:
+		return mdev->state.role == R_PRIMARY ? 1 : -1;
+	case ASB_CALL_HELPER:
+		hg = drbd_asb_recover_0p(mdev);
+		if (hg == -1 && mdev->state.role == R_PRIMARY) {
+			self = drbd_set_role(mdev, R_SECONDARY, 0);
+			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we do not need to wait for the after state change work either. */
+			self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (self != SS_SUCCESS) {
+				drbd_khelper(mdev, "pri-lost-after-sb");
+			} else {
+				dev_warn(DEV, "Successfully gave up primary role.\n");
+				rv = hg;
+			}
+		} else
+			rv = hg;
+	}
+
+	return rv;
+}
+
+static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
+{
+	int self, peer, hg, rv = -100;
+
+	self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+	peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+	switch (mdev->net_conf->after_sb_2p) {
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+		dev_err(DEV, "Configuration error.\n");
+		break;
+	case ASB_VIOLENTLY:
+		rv = drbd_asb_recover_0p(mdev);
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_CALL_HELPER:
+		hg = drbd_asb_recover_0p(mdev);
+		if (hg == -1) {
+			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we do not need to wait for the after state change work either. */
+			self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (self != SS_SUCCESS) {
+				drbd_khelper(mdev, "pri-lost-after-sb");
+			} else {
+				dev_warn(DEV, "Successfully gave up primary role.\n");
+				rv = hg;
+			}
+		} else
+			rv = hg;
+	}
+
+	return rv;
+}
+
+static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
+			   u64 bits, u64 flags)
+{
+	if (!uuid) {
+		dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
+		return;
+	}
+	dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+	     text,
+	     (unsigned long long)uuid[UI_CURRENT],
+	     (unsigned long long)uuid[UI_BITMAP],
+	     (unsigned long long)uuid[UI_HISTORY_START],
+	     (unsigned long long)uuid[UI_HISTORY_END],
+	     (unsigned long long)bits,
+	     (unsigned long long)flags);
+}
+
+/*
+  100	after split brain try auto recover
+    2	C_SYNC_SOURCE set BitMap
+    1	C_SYNC_SOURCE use BitMap
+    0	no Sync
+   -1	C_SYNC_TARGET use BitMap
+   -2	C_SYNC_TARGET set BitMap
+ -100	after split brain, disconnect
+-1000	unrelated data
+ */
+static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
+{
+	u64 self, peer;
+	int i, j;
+
+	self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+	peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+
+	*rule_nr = 10;
+	if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
+		return 0;
+
+	*rule_nr = 20;
+	if ((self == UUID_JUST_CREATED || self == (u64)0) &&
+	     peer != UUID_JUST_CREATED)
+		return -2;
+
+	*rule_nr = 30;
+	if (self != UUID_JUST_CREATED &&
+	    (peer == UUID_JUST_CREATED || peer == (u64)0))
+		return 2;
+
+	if (self == peer) {
+		int rct, dc; /* roles at crash time */
+
+		if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+
+			if (mdev->agreed_pro_version < 91)
+				return -1001;
+
+			if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
+			    (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
+				dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
+				drbd_uuid_set_bm(mdev, 0UL);
+
+				drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
+					       mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+				*rule_nr = 34;
+			} else {
+				dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
+				*rule_nr = 36;
+			}
+
+			return 1;
+		}
+
+		if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
+
+			if (mdev->agreed_pro_version < 91)
+				return -1001;
+
+			if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
+			    (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
+				dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+
+				mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
+				mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
+				mdev->p_uuid[UI_BITMAP] = 0UL;
+
+				drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+				*rule_nr = 35;
+			} else {
+				dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
+				*rule_nr = 37;
+			}
+
+			return -1;
+		}
+
+		/* Common power [off|failure] */
+		rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
+			(mdev->p_uuid[UI_FLAGS] & 2);
+		/* lowest bit is set when we were primary,
+		 * next bit (weight 2) is set when peer was primary */
+		*rule_nr = 40;
+
+		switch (rct) {
+		case 0: /* !self_pri && !peer_pri */ return 0;
+		case 1: /*  self_pri && !peer_pri */ return 1;
+		case 2: /* !self_pri &&  peer_pri */ return -1;
+		case 3: /*  self_pri &&  peer_pri */
+			dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+			return dc ? -1 : 1;
+		}
+	}
+
+	*rule_nr = 50;
+	peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer)
+		return -1;
+
+	*rule_nr = 51;
+	peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+		peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
+		if (self == peer) {
+			/* The last P_SYNC_UUID did not get though. Undo the last start of
+			   resync as sync source modifications of the peer's UUIDs. */
+
+			if (mdev->agreed_pro_version < 91)
+				return -1001;
+
+			mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
+			mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
+			return -1;
+		}
+	}
+
+	*rule_nr = 60;
+	self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		peer = mdev->p_uuid[i] & ~((u64)1);
+		if (self == peer)
+			return -2;
+	}
+
+	*rule_nr = 70;
+	self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+	if (self == peer)
+		return 1;
+
+	*rule_nr = 71;
+	self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
+		peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
+		if (self == peer) {
+			/* The last P_SYNC_UUID did not get though. Undo the last start of
+			   resync as sync source modifications of our UUIDs. */
+
+			if (mdev->agreed_pro_version < 91)
+				return -1001;
+
+			_drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
+			_drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
+
+			dev_info(DEV, "Undid last start of resync:\n");
+
+			drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
+				       mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+
+			return 1;
+		}
+	}
+
+
+	*rule_nr = 80;
+	peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = mdev->ldev->md.uuid[i] & ~((u64)1);
+		if (self == peer)
+			return 2;
+	}
+
+	*rule_nr = 90;
+	self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer && self != ((u64)0))
+		return 100;
+
+	*rule_nr = 100;
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = mdev->ldev->md.uuid[i] & ~((u64)1);
+		for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
+			peer = mdev->p_uuid[j] & ~((u64)1);
+			if (self == peer)
+				return -100;
+		}
+	}
+
+	return -1000;
+}
+
+/* drbd_sync_handshake() returns the new conn state on success, or
+   CONN_MASK (-1) on failure.
+ */
+static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
+					   enum drbd_disk_state peer_disk) __must_hold(local)
+{
+	int hg, rule_nr;
+	enum drbd_conns rv = C_MASK;
+	enum drbd_disk_state mydisk;
+
+	mydisk = mdev->state.disk;
+	if (mydisk == D_NEGOTIATING)
+		mydisk = mdev->new_state_tmp.disk;
+
+	dev_info(DEV, "drbd_sync_handshake:\n");
+	drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
+	drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
+		       mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+
+	hg = drbd_uuid_compare(mdev, &rule_nr);
+
+	dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+
+	if (hg == -1000) {
+		dev_alert(DEV, "Unrelated data, aborting!\n");
+		return C_MASK;
+	}
+	if (hg == -1001) {
+		dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
+		return C_MASK;
+	}
+
+	if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
+	    (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
+		int f = (hg == -100) || abs(hg) == 2;
+		hg = mydisk > D_INCONSISTENT ? 1 : -1;
+		if (f)
+			hg = hg*2;
+		dev_info(DEV, "Becoming sync %s due to disk states.\n",
+		     hg > 0 ? "source" : "target");
+	}
+
+	if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
+		int pcount = (mdev->state.role == R_PRIMARY)
+			   + (peer_role == R_PRIMARY);
+		int forced = (hg == -100);
+
+		switch (pcount) {
+		case 0:
+			hg = drbd_asb_recover_0p(mdev);
+			break;
+		case 1:
+			hg = drbd_asb_recover_1p(mdev);
+			break;
+		case 2:
+			hg = drbd_asb_recover_2p(mdev);
+			break;
+		}
+		if (abs(hg) < 100) {
+			dev_warn(DEV, "Split-Brain detected, %d primaries, "
+			     "automatically solved. Sync from %s node\n",
+			     pcount, (hg < 0) ? "peer" : "this");
+			if (forced) {
+				dev_warn(DEV, "Doing a full sync, since"
+				     " UUIDs where ambiguous.\n");
+				hg = hg*2;
+			}
+		}
+	}
+
+	if (hg == -100) {
+		if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
+			hg = -1;
+		if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
+			hg = 1;
+
+		if (abs(hg) < 100)
+			dev_warn(DEV, "Split-Brain detected, manually solved. "
+			     "Sync from %s node\n",
+			     (hg < 0) ? "peer" : "this");
+	}
+
+	if (hg == -100) {
+		dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
+		drbd_khelper(mdev, "split-brain");
+		return C_MASK;
+	}
+
+	if (hg > 0 && mydisk <= D_INCONSISTENT) {
+		dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
+		return C_MASK;
+	}
+
+	if (hg < 0 && /* by intention we do not use mydisk here. */
+	    mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
+		switch (mdev->net_conf->rr_conflict) {
+		case ASB_CALL_HELPER:
+			drbd_khelper(mdev, "pri-lost");
+			/* fall through */
+		case ASB_DISCONNECT:
+			dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
+			return C_MASK;
+		case ASB_VIOLENTLY:
+			dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
+			     "assumption\n");
+		}
+	}
+
+	if (abs(hg) >= 2) {
+		dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
+			return C_MASK;
+	}
+
+	if (hg > 0) { /* become sync source. */
+		rv = C_WF_BITMAP_S;
+	} else if (hg < 0) { /* become sync target */
+		rv = C_WF_BITMAP_T;
+	} else {
+		rv = C_CONNECTED;
+		if (drbd_bm_total_weight(mdev)) {
+			dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
+			     drbd_bm_total_weight(mdev));
+		}
+	}
+
+	return rv;
+}
+
+/* returns 1 if invalid */
+static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
+{
+	/* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
+	if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
+	    (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
+		return 0;
+
+	/* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
+	if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
+	    self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
+		return 1;
+
+	/* everything else is valid if they are equal on both sides. */
+	if (peer == self)
+		return 0;
+
+	/* everything es is invalid. */
+	return 1;
+}
+
+static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_protocol *p = (struct p_protocol *)h;
+	int header_size, data_size;
+	int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+	int p_want_lose, p_two_primaries;
+	char p_integrity_alg[SHARED_SECRET_MAX] = "";
+
+	header_size = sizeof(*p) - sizeof(*h);
+	data_size   = h->length  - header_size;
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	p_proto		= be32_to_cpu(p->protocol);
+	p_after_sb_0p	= be32_to_cpu(p->after_sb_0p);
+	p_after_sb_1p	= be32_to_cpu(p->after_sb_1p);
+	p_after_sb_2p	= be32_to_cpu(p->after_sb_2p);
+	p_want_lose	= be32_to_cpu(p->want_lose);
+	p_two_primaries = be32_to_cpu(p->two_primaries);
+
+	if (p_proto != mdev->net_conf->wire_protocol) {
+		dev_err(DEV, "incompatible communication protocols\n");
+		goto disconnect;
+	}
+
+	if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
+		dev_err(DEV, "incompatible after-sb-0pri settings\n");
+		goto disconnect;
+	}
+
+	if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
+		dev_err(DEV, "incompatible after-sb-1pri settings\n");
+		goto disconnect;
+	}
+
+	if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
+		dev_err(DEV, "incompatible after-sb-2pri settings\n");
+		goto disconnect;
+	}
+
+	if (p_want_lose && mdev->net_conf->want_lose) {
+		dev_err(DEV, "both sides have the 'want_lose' flag set\n");
+		goto disconnect;
+	}
+
+	if (p_two_primaries != mdev->net_conf->two_primaries) {
+		dev_err(DEV, "incompatible setting of the two-primaries options\n");
+		goto disconnect;
+	}
+
+	if (mdev->agreed_pro_version >= 87) {
+		unsigned char *my_alg = mdev->net_conf->integrity_alg;
+
+		if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
+			return FALSE;
+
+		p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
+		if (strcmp(p_integrity_alg, my_alg)) {
+			dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
+			goto disconnect;
+		}
+		dev_info(DEV, "data-integrity-alg: %s\n",
+		     my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
+	}
+
+	return TRUE;
+
+disconnect:
+	drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	return FALSE;
+}
+
+/* helper function
+ * input: alg name, feature name
+ * return: NULL (alg name was "")
+ *         ERR_PTR(error) if something goes wrong
+ *         or the crypto hash ptr, if it worked out ok. */
+struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
+		const char *alg, const char *name)
+{
+	struct crypto_hash *tfm;
+
+	if (!alg[0])
+		return NULL;
+
+	tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm)) {
+		dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
+			alg, name, PTR_ERR(tfm));
+		return tfm;
+	}
+	if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
+		crypto_free_hash(tfm);
+		dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
+		return ERR_PTR(-EINVAL);
+	}
+	return tfm;
+}
+
+static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
+{
+	int ok = TRUE;
+	struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
+	unsigned int header_size, data_size, exp_max_sz;
+	struct crypto_hash *verify_tfm = NULL;
+	struct crypto_hash *csums_tfm = NULL;
+	const int apv = mdev->agreed_pro_version;
+
+	exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
+		    : apv == 88 ? sizeof(struct p_rs_param)
+					+ SHARED_SECRET_MAX
+		    : /* 89 */    sizeof(struct p_rs_param_89);
+
+	if (h->length > exp_max_sz) {
+		dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
+		    h->length, exp_max_sz);
+		return FALSE;
+	}
+
+	if (apv <= 88) {
+		header_size = sizeof(struct p_rs_param) - sizeof(*h);
+		data_size   = h->length  - header_size;
+	} else /* apv >= 89 */ {
+		header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
+		data_size   = h->length  - header_size;
+		D_ASSERT(data_size == 0);
+	}
+
+	/* initialize verify_alg and csums_alg */
+	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	mdev->sync_conf.rate	  = be32_to_cpu(p->rate);
+
+	if (apv >= 88) {
+		if (apv == 88) {
+			if (data_size > SHARED_SECRET_MAX) {
+				dev_err(DEV, "verify-alg too long, "
+				    "peer wants %u, accepting only %u byte\n",
+						data_size, SHARED_SECRET_MAX);
+				return FALSE;
+			}
+
+			if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
+				return FALSE;
+
+			/* we expect NUL terminated string */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(p->verify_alg[data_size-1] == 0);
+			p->verify_alg[data_size-1] = 0;
+
+		} else /* apv >= 89 */ {
+			/* we still expect NUL terminated strings */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+			D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+			p->verify_alg[SHARED_SECRET_MAX-1] = 0;
+			p->csums_alg[SHARED_SECRET_MAX-1] = 0;
+		}
+
+		if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
+			if (mdev->state.conn == C_WF_REPORT_PARAMS) {
+				dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    mdev->sync_conf.verify_alg, p->verify_alg);
+				goto disconnect;
+			}
+			verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
+					p->verify_alg, "verify-alg");
+			if (IS_ERR(verify_tfm)) {
+				verify_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+		if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
+			if (mdev->state.conn == C_WF_REPORT_PARAMS) {
+				dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    mdev->sync_conf.csums_alg, p->csums_alg);
+				goto disconnect;
+			}
+			csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
+					p->csums_alg, "csums-alg");
+			if (IS_ERR(csums_tfm)) {
+				csums_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+
+		spin_lock(&mdev->peer_seq_lock);
+		/* lock against drbd_nl_syncer_conf() */
+		if (verify_tfm) {
+			strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
+			mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
+			crypto_free_hash(mdev->verify_tfm);
+			mdev->verify_tfm = verify_tfm;
+			dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
+		}
+		if (csums_tfm) {
+			strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
+			mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
+			crypto_free_hash(mdev->csums_tfm);
+			mdev->csums_tfm = csums_tfm;
+			dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
+		}
+		spin_unlock(&mdev->peer_seq_lock);
+	}
+
+	return ok;
+disconnect:
+	/* just for completeness: actually not needed,
+	 * as this is not reached if csums_tfm was ok. */
+	crypto_free_hash(csums_tfm);
+	/* but free the verify_tfm again, if csums_tfm did not work out */
+	crypto_free_hash(verify_tfm);
+	drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	return FALSE;
+}
+
+static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
+{
+	/* sorry, we currently have no working implementation
+	 * of distributed TCQ */
+}
+
+/* warn if the arguments differ by more than 12.5% */
+static void warn_if_differ_considerably(struct drbd_conf *mdev,
+	const char *s, sector_t a, sector_t b)
+{
+	sector_t d;
+	if (a == 0 || b == 0)
+		return;
+	d = (a > b) ? (a - b) : (b - a);
+	if (d > (a>>3) || d > (b>>3))
+		dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
+		     (unsigned long long)a, (unsigned long long)b);
+}
+
+static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_sizes *p = (struct p_sizes *)h;
+	enum determine_dev_size dd = unchanged;
+	unsigned int max_seg_s;
+	sector_t p_size, p_usize, my_usize;
+	int ldsc = 0; /* local disk size changed */
+	enum drbd_conns nconn;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	p_size = be64_to_cpu(p->d_size);
+	p_usize = be64_to_cpu(p->u_size);
+
+	if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
+		dev_err(DEV, "some backing storage is needed\n");
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		return FALSE;
+	}
+
+	/* just store the peer's disk size for now.
+	 * we still need to figure out whether we accept that. */
+	mdev->p_size = p_size;
+
+#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
+	if (get_ldev(mdev)) {
+		warn_if_differ_considerably(mdev, "lower level device sizes",
+			   p_size, drbd_get_max_capacity(mdev->ldev));
+		warn_if_differ_considerably(mdev, "user requested size",
+					    p_usize, mdev->ldev->dc.disk_size);
+
+		/* if this is the first connect, or an otherwise expected
+		 * param exchange, choose the minimum */
+		if (mdev->state.conn == C_WF_REPORT_PARAMS)
+			p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
+					     p_usize);
+
+		my_usize = mdev->ldev->dc.disk_size;
+
+		if (mdev->ldev->dc.disk_size != p_usize) {
+			mdev->ldev->dc.disk_size = p_usize;
+			dev_info(DEV, "Peer sets u_size to %lu sectors\n",
+			     (unsigned long)mdev->ldev->dc.disk_size);
+		}
+
+		/* Never shrink a device with usable data during connect.
+		   But allow online shrinking if we are connected. */
+		if (drbd_new_dev_size(mdev, mdev->ldev) <
+		   drbd_get_capacity(mdev->this_bdev) &&
+		   mdev->state.disk >= D_OUTDATED &&
+		   mdev->state.conn < C_CONNECTED) {
+			dev_err(DEV, "The peer's disk size is too small!\n");
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			mdev->ldev->dc.disk_size = my_usize;
+			put_ldev(mdev);
+			return FALSE;
+		}
+		put_ldev(mdev);
+	}
+#undef min_not_zero
+
+	if (get_ldev(mdev)) {
+		dd = drbd_determin_dev_size(mdev);
+		put_ldev(mdev);
+		if (dd == dev_size_error)
+			return FALSE;
+		drbd_md_sync(mdev);
+	} else {
+		/* I am diskless, need to accept the peer's size. */
+		drbd_set_my_capacity(mdev, p_size);
+	}
+
+	if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
+		nconn = drbd_sync_handshake(mdev,
+				mdev->state.peer, mdev->state.pdsk);
+		put_ldev(mdev);
+
+		if (nconn == C_MASK) {
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			return FALSE;
+		}
+
+		if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			return FALSE;
+		}
+	}
+
+	if (get_ldev(mdev)) {
+		if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
+			mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+			ldsc = 1;
+		}
+
+		max_seg_s = be32_to_cpu(p->max_segment_size);
+		if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
+			drbd_setup_queue_param(mdev, max_seg_s);
+
+		drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
+		put_ldev(mdev);
+	}
+
+	if (mdev->state.conn > C_WF_REPORT_PARAMS) {
+		if (be64_to_cpu(p->c_size) !=
+		    drbd_get_capacity(mdev->this_bdev) || ldsc) {
+			/* we have different sizes, probably peer
+			 * needs to know my new size... */
+			drbd_send_sizes(mdev, 0);
+		}
+		if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
+		    (dd == grew && mdev->state.conn == C_CONNECTED)) {
+			if (mdev->state.pdsk >= D_INCONSISTENT &&
+			    mdev->state.disk >= D_INCONSISTENT)
+				resync_after_online_grow(mdev);
+			else
+				set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+		}
+	}
+
+	return TRUE;
+}
+
+static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_uuids *p = (struct p_uuids *)h;
+	u64 *p_uuid;
+	int i;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
+
+	for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
+		p_uuid[i] = be64_to_cpu(p->uuid[i]);
+
+	kfree(mdev->p_uuid);
+	mdev->p_uuid = p_uuid;
+
+	if (mdev->state.conn < C_CONNECTED &&
+	    mdev->state.disk < D_INCONSISTENT &&
+	    mdev->state.role == R_PRIMARY &&
+	    (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+		dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
+		    (unsigned long long)mdev->ed_uuid);
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		return FALSE;
+	}
+
+	if (get_ldev(mdev)) {
+		int skip_initial_sync =
+			mdev->state.conn == C_CONNECTED &&
+			mdev->agreed_pro_version >= 90 &&
+			mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
+			(p_uuid[UI_FLAGS] & 8);
+		if (skip_initial_sync) {
+			dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
+			drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
+					"clear_n_write from receive_uuids");
+			_drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
+			_drbd_uuid_set(mdev, UI_BITMAP, 0);
+			_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			drbd_md_sync(mdev);
+		}
+		put_ldev(mdev);
+	}
+
+	/* Before we test for the disk state, we should wait until an eventually
+	   ongoing cluster wide state change is finished. That is important if
+	   we are primary and are detaching from our disk. We need to see the
+	   new disk state... */
+	wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
+	if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
+		drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
+
+	return TRUE;
+}
+
+/**
+ * convert_state() - Converts the peer's view of the cluster state to our point of view
+ * @ps:		The state as seen by the peer.
+ */
+static union drbd_state convert_state(union drbd_state ps)
+{
+	union drbd_state ms;
+
+	static enum drbd_conns c_tab[] = {
+		[C_CONNECTED] = C_CONNECTED,
+
+		[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
+		[C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
+		[C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
+		[C_VERIFY_S]       = C_VERIFY_T,
+		[C_MASK]   = C_MASK,
+	};
+
+	ms.i = ps.i;
+
+	ms.conn = c_tab[ps.conn];
+	ms.peer = ps.role;
+	ms.role = ps.peer;
+	ms.pdsk = ps.disk;
+	ms.disk = ps.pdsk;
+	ms.peer_isp = (ps.aftr_isp | ps.user_isp);
+
+	return ms;
+}
+
+static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_req_state *p = (struct p_req_state *)h;
+	union drbd_state mask, val;
+	int rv;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	mask.i = be32_to_cpu(p->mask);
+	val.i = be32_to_cpu(p->val);
+
+	if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
+	    test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
+		drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
+		return TRUE;
+	}
+
+	mask = convert_state(mask);
+	val = convert_state(val);
+
+	rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
+
+	drbd_send_sr_reply(mdev, rv);
+	drbd_md_sync(mdev);
+
+	return TRUE;
+}
+
+static int receive_state(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_state *p = (struct p_state *)h;
+	enum drbd_conns nconn, oconn;
+	union drbd_state ns, peer_state;
+	enum drbd_disk_state real_peer_disk;
+	int rv;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
+		return FALSE;
+
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	peer_state.i = be32_to_cpu(p->state);
+
+	real_peer_disk = peer_state.disk;
+	if (peer_state.disk == D_NEGOTIATING) {
+		real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
+		dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+ retry:
+	oconn = nconn = mdev->state.conn;
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (nconn == C_WF_REPORT_PARAMS)
+		nconn = C_CONNECTED;
+
+	if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
+	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		int cr; /* consider resync */
+
+		/* if we established a new connection */
+		cr  = (oconn < C_CONNECTED);
+		/* if we had an established connection
+		 * and one of the nodes newly attaches a disk */
+		cr |= (oconn == C_CONNECTED &&
+		       (peer_state.disk == D_NEGOTIATING ||
+			mdev->state.disk == D_NEGOTIATING));
+		/* if we have both been inconsistent, and the peer has been
+		 * forced to be UpToDate with --overwrite-data */
+		cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
+		/* if we had been plain connected, and the admin requested to
+		 * start a sync by "invalidate" or "invalidate-remote" */
+		cr |= (oconn == C_CONNECTED &&
+				(peer_state.conn >= C_STARTING_SYNC_S &&
+				 peer_state.conn <= C_WF_BITMAP_T));
+
+		if (cr)
+			nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+
+		put_ldev(mdev);
+		if (nconn == C_MASK) {
+			if (mdev->state.disk == D_NEGOTIATING) {
+				drbd_force_state(mdev, NS(disk, D_DISKLESS));
+				nconn = C_CONNECTED;
+			} else if (peer_state.disk == D_NEGOTIATING) {
+				dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
+				peer_state.disk = D_DISKLESS;
+			} else {
+				D_ASSERT(oconn == C_WF_REPORT_PARAMS);
+				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+				return FALSE;
+			}
+		}
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.conn != oconn)
+		goto retry;
+	clear_bit(CONSIDER_RESYNC, &mdev->flags);
+	ns.i = mdev->state.i;
+	ns.conn = nconn;
+	ns.peer = peer_state.role;
+	ns.pdsk = real_peer_disk;
+	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
+	if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+		ns.disk = mdev->new_state_tmp.disk;
+
+	rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
+	ns = mdev->state;
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS) {
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		return FALSE;
+	}
+
+	if (oconn > C_WF_REPORT_PARAMS) {
+		if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+		    peer_state.disk != D_NEGOTIATING ) {
+			/* we want resync, peer has not yet decided to sync... */
+			/* Nowadays only used when forcing a node into primary role and
+			   setting its disk to UpToDate with that */
+			drbd_send_uuids(mdev);
+			drbd_send_state(mdev);
+		}
+	}
+
+	mdev->net_conf->want_lose = 0;
+
+	drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
+
+	return TRUE;
+}
+
+static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_rs_uuid *p = (struct p_rs_uuid *)h;
+
+	wait_event(mdev->misc_wait,
+		   mdev->state.conn == C_WF_SYNC_UUID ||
+		   mdev->state.conn < C_CONNECTED ||
+		   mdev->state.disk < D_NEGOTIATING);
+
+	/* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	/* Here the _drbd_uuid_ functions are right, current should
+	   _not_ be rotated into the history */
+	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		_drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
+		_drbd_uuid_set(mdev, UI_BITMAP, 0UL);
+
+		drbd_start_resync(mdev, C_SYNC_TARGET);
+
+		put_ldev(mdev);
+	} else
+		dev_err(DEV, "Ignoring SyncUUID packet!\n");
+
+	return TRUE;
+}
+
+enum receive_bitmap_ret { OK, DONE, FAILED };
+
+static enum receive_bitmap_ret
+receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
+	unsigned long *buffer, struct bm_xfer_ctx *c)
+{
+	unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
+	unsigned want = num_words * sizeof(long);
+
+	if (want != h->length) {
+		dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
+		return FAILED;
+	}
+	if (want == 0)
+		return DONE;
+	if (drbd_recv(mdev, buffer, want) != want)
+		return FAILED;
+
+	drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
+
+	c->word_offset += num_words;
+	c->bit_offset = c->word_offset * BITS_PER_LONG;
+	if (c->bit_offset > c->bm_bits)
+		c->bit_offset = c->bm_bits;
+
+	return OK;
+}
+
+static enum receive_bitmap_ret
+recv_bm_rle_bits(struct drbd_conf *mdev,
+		struct p_compressed_bm *p,
+		struct bm_xfer_ctx *c)
+{
+	struct bitstream bs;
+	u64 look_ahead;
+	u64 rl;
+	u64 tmp;
+	unsigned long s = c->bit_offset;
+	unsigned long e;
+	int len = p->head.length - (sizeof(*p) - sizeof(p->head));
+	int toggle = DCBP_get_start(p);
+	int have;
+	int bits;
+
+	bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
+
+	bits = bitstream_get_bits(&bs, &look_ahead, 64);
+	if (bits < 0)
+		return FAILED;
+
+	for (have = bits; have > 0; s += rl, toggle = !toggle) {
+		bits = vli_decode_bits(&rl, look_ahead);
+		if (bits <= 0)
+			return FAILED;
+
+		if (toggle) {
+			e = s + rl -1;
+			if (e >= c->bm_bits) {
+				dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+				return FAILED;
+			}
+			_drbd_bm_set_bits(mdev, s, e);
+		}
+
+		if (have < bits) {
+			dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+				have, bits, look_ahead,
+				(unsigned int)(bs.cur.b - p->code),
+				(unsigned int)bs.buf_len);
+			return FAILED;
+		}
+		look_ahead >>= bits;
+		have -= bits;
+
+		bits = bitstream_get_bits(&bs, &tmp, 64 - have);
+		if (bits < 0)
+			return FAILED;
+		look_ahead |= tmp << have;
+		have += bits;
+	}
+
+	c->bit_offset = s;
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	return (s == c->bm_bits) ? DONE : OK;
+}
+
+static enum receive_bitmap_ret
+decode_bitmap_c(struct drbd_conf *mdev,
+		struct p_compressed_bm *p,
+		struct bm_xfer_ctx *c)
+{
+	if (DCBP_get_code(p) == RLE_VLI_Bits)
+		return recv_bm_rle_bits(mdev, p, c);
+
+	/* other variants had been implemented for evaluation,
+	 * but have been dropped as this one turned out to be "best"
+	 * during all our tests. */
+
+	dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+	return FAILED;
+}
+
+void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+		const char *direction, struct bm_xfer_ctx *c)
+{
+	/* what would it take to transfer it "plaintext" */
+	unsigned plain = sizeof(struct p_header) *
+		((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
+		+ c->bm_words * sizeof(long);
+	unsigned total = c->bytes[0] + c->bytes[1];
+	unsigned r;
+
+	/* total can not be zero. but just in case: */
+	if (total == 0)
+		return;
+
+	/* don't report if not compressed */
+	if (total >= plain)
+		return;
+
+	/* total < plain. check for overflow, still */
+	r = (total > UINT_MAX/1000) ? (total / (plain/1000))
+		                    : (1000 * total / plain);
+
+	if (r > 1000)
+		r = 1000;
+
+	r = 1000 - r;
+	dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+	     "total %u; compression: %u.%u%%\n",
+			direction,
+			c->bytes[1], c->packets[1],
+			c->bytes[0], c->packets[0],
+			total, r/10, r % 10);
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+   it does not matter if the process it in 32 bit chunks or 64 bit
+   chunks as long as it is little endian. (Understand it as byte stream,
+   beginning with the lowest byte...) If we would use big endian
+   we would need to process it from the highest address to the lowest,
+   in order to be agnostic to the 32 vs 64 bits issue.
+
+   returns 0 on failure, 1 if we successfully received it. */
+static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct bm_xfer_ctx c;
+	void *buffer;
+	enum receive_bitmap_ret ret;
+	int ok = FALSE;
+
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+
+	drbd_bm_lock(mdev, "receive bitmap");
+
+	/* maybe we should use some per thread scratch page,
+	 * and allocate that during initial device creation? */
+	buffer	 = (unsigned long *) __get_free_page(GFP_NOIO);
+	if (!buffer) {
+		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+		goto out;
+	}
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(mdev),
+		.bm_words = drbd_bm_words(mdev),
+	};
+
+	do {
+		if (h->command == P_BITMAP) {
+			ret = receive_bitmap_plain(mdev, h, buffer, &c);
+		} else if (h->command == P_COMPRESSED_BITMAP) {
+			/* MAYBE: sanity check that we speak proto >= 90,
+			 * and the feature is enabled! */
+			struct p_compressed_bm *p;
+
+			if (h->length > BM_PACKET_PAYLOAD_BYTES) {
+				dev_err(DEV, "ReportCBitmap packet too large\n");
+				goto out;
+			}
+			/* use the page buff */
+			p = buffer;
+			memcpy(p, h, sizeof(*h));
+			if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
+				goto out;
+			if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
+				dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
+				return FAILED;
+			}
+			ret = decode_bitmap_c(mdev, p, &c);
+		} else {
+			dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
+			goto out;
+		}
+
+		c.packets[h->command == P_BITMAP]++;
+		c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
+
+		if (ret != OK)
+			break;
+
+		if (!drbd_recv_header(mdev, h))
+			goto out;
+	} while (ret == OK);
+	if (ret == FAILED)
+		goto out;
+
+	INFO_bm_xfer_stats(mdev, "receive", &c);
+
+	if (mdev->state.conn == C_WF_BITMAP_T) {
+		ok = !drbd_send_bitmap(mdev);
+		if (!ok)
+			goto out;
+		/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
+		ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		D_ASSERT(ok == SS_SUCCESS);
+	} else if (mdev->state.conn != C_WF_BITMAP_S) {
+		/* admin may have requested C_DISCONNECTING,
+		 * other threads may have noticed network errors */
+		dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
+		    drbd_conn_str(mdev->state.conn));
+	}
+
+	ok = TRUE;
+ out:
+	drbd_bm_unlock(mdev);
+	if (ok && mdev->state.conn == C_WF_BITMAP_S)
+		drbd_start_resync(mdev, C_SYNC_SOURCE);
+	free_page((unsigned long) buffer);
+	return ok;
+}
+
+static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
+{
+	/* TODO zero copy sink :) */
+	static char sink[128];
+	int size, want, r;
+
+	dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
+	     h->command, h->length);
+
+	size = h->length;
+	while (size > 0) {
+		want = min_t(int, size, sizeof(sink));
+		r = drbd_recv(mdev, sink, want);
+		ERR_IF(r <= 0) break;
+		size -= r;
+	}
+	return size == 0;
+}
+
+static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
+{
+	if (mdev->state.disk >= D_INCONSISTENT)
+		drbd_kick_lo(mdev);
+
+	/* Make sure we've acked all the TCP data associated
+	 * with the data requests being unplugged */
+	drbd_tcp_quickack(mdev->data.socket);
+
+	return TRUE;
+}
+
+typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
+
+static drbd_cmd_handler_f drbd_default_handler[] = {
+	[P_DATA]	    = receive_Data,
+	[P_DATA_REPLY]	    = receive_DataReply,
+	[P_RS_DATA_REPLY]   = receive_RSDataReply,
+	[P_BARRIER]	    = receive_Barrier,
+	[P_BITMAP]	    = receive_bitmap,
+	[P_COMPRESSED_BITMAP]    = receive_bitmap,
+	[P_UNPLUG_REMOTE]   = receive_UnplugRemote,
+	[P_DATA_REQUEST]    = receive_DataRequest,
+	[P_RS_DATA_REQUEST] = receive_DataRequest,
+	[P_SYNC_PARAM]	    = receive_SyncParam,
+	[P_SYNC_PARAM89]	   = receive_SyncParam,
+	[P_PROTOCOL]        = receive_protocol,
+	[P_UUIDS]	    = receive_uuids,
+	[P_SIZES]	    = receive_sizes,
+	[P_STATE]	    = receive_state,
+	[P_STATE_CHG_REQ]   = receive_req_state,
+	[P_SYNC_UUID]       = receive_sync_uuid,
+	[P_OV_REQUEST]      = receive_DataRequest,
+	[P_OV_REPLY]        = receive_DataRequest,
+	[P_CSUM_RS_REQUEST]    = receive_DataRequest,
+	/* anything missing from this table is in
+	 * the asender_tbl, see get_asender_cmd */
+	[P_MAX_CMD]	    = NULL,
+};
+
+static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
+static drbd_cmd_handler_f *drbd_opt_cmd_handler;
+
+static void drbdd(struct drbd_conf *mdev)
+{
+	drbd_cmd_handler_f handler;
+	struct p_header *header = &mdev->data.rbuf.header;
+
+	while (get_t_state(&mdev->receiver) == Running) {
+		drbd_thread_current_set_cpu(mdev);
+		if (!drbd_recv_header(mdev, header)) {
+			drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+			break;
+		}
+
+		if (header->command < P_MAX_CMD)
+			handler = drbd_cmd_handler[header->command];
+		else if (P_MAY_IGNORE < header->command
+		     && header->command < P_MAX_OPT_CMD)
+			handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
+		else if (header->command > P_MAX_OPT_CMD)
+			handler = receive_skip;
+		else
+			handler = NULL;
+
+		if (unlikely(!handler)) {
+			dev_err(DEV, "unknown packet type %d, l: %d!\n",
+			    header->command, header->length);
+			drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+			break;
+		}
+		if (unlikely(!handler(mdev, header))) {
+			dev_err(DEV, "error receiving %s, l: %d!\n",
+			    cmdname(header->command), header->length);
+			drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+			break;
+		}
+	}
+}
+
+static void drbd_fail_pending_reads(struct drbd_conf *mdev)
+{
+	struct hlist_head *slot;
+	struct hlist_node *pos;
+	struct hlist_node *tmp;
+	struct drbd_request *req;
+	int i;
+
+	/*
+	 * Application READ requests
+	 */
+	spin_lock_irq(&mdev->req_lock);
+	for (i = 0; i < APP_R_HSIZE; i++) {
+		slot = mdev->app_reads_hash+i;
+		hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
+			/* it may (but should not any longer!)
+			 * be on the work queue; if that assert triggers,
+			 * we need to also grab the
+			 * spin_lock_irq(&mdev->data.work.q_lock);
+			 * and list_del_init here. */
+			D_ASSERT(list_empty(&req->w.list));
+			/* It would be nice to complete outside of spinlock.
+			 * But this is easier for now. */
+			_req_mod(req, connection_lost_while_pending);
+		}
+	}
+	for (i = 0; i < APP_R_HSIZE; i++)
+		if (!hlist_empty(mdev->app_reads_hash+i))
+			dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
+				"%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
+
+	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+void drbd_flush_workqueue(struct drbd_conf *mdev)
+{
+	struct drbd_wq_barrier barr;
+
+	barr.w.cb = w_prev_work_done;
+	init_completion(&barr.done);
+	drbd_queue_work(&mdev->data.work, &barr.w);
+	wait_for_completion(&barr.done);
+}
+
+static void drbd_disconnect(struct drbd_conf *mdev)
+{
+	enum drbd_fencing_p fp;
+	union drbd_state os, ns;
+	int rv = SS_UNKNOWN_ERROR;
+	unsigned int i;
+
+	if (mdev->state.conn == C_STANDALONE)
+		return;
+	if (mdev->state.conn >= C_WF_CONNECTION)
+		dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
+				drbd_conn_str(mdev->state.conn));
+
+	/* asender does not clean up anything. it must not interfere, either */
+	drbd_thread_stop(&mdev->asender);
+
+	mutex_lock(&mdev->data.mutex);
+	drbd_free_sock(mdev);
+	mutex_unlock(&mdev->data.mutex);
+
+	spin_lock_irq(&mdev->req_lock);
+	_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+	_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
+	_drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	/* We do not have data structures that would allow us to
+	 * get the rs_pending_cnt down to 0 again.
+	 *  * On C_SYNC_TARGET we do not have any data structures describing
+	 *    the pending RSDataRequest's we have sent.
+	 *  * On C_SYNC_SOURCE there is no data structure that tracks
+	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
+	 *  And no, it is not the sum of the reference counts in the
+	 *  resync_LRU. The resync_LRU tracks the whole operation including
+	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
+	 *  on the fly. */
+	drbd_rs_cancel_all(mdev);
+	mdev->rs_total = 0;
+	mdev->rs_failed = 0;
+	atomic_set(&mdev->rs_pending_cnt, 0);
+	wake_up(&mdev->misc_wait);
+
+	/* make sure syncer is stopped and w_resume_next_sg queued */
+	del_timer_sync(&mdev->resync_timer);
+	set_bit(STOP_SYNC_TIMER, &mdev->flags);
+	resync_timer_fn((unsigned long)mdev);
+
+	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
+	 * w_make_resync_request etc. which may still be on the worker queue
+	 * to be "canceled" */
+	drbd_flush_workqueue(mdev);
+
+	/* This also does reclaim_net_ee().  If we do this too early, we might
+	 * miss some resync ee and pages.*/
+	drbd_process_done_ee(mdev);
+
+	kfree(mdev->p_uuid);
+	mdev->p_uuid = NULL;
+
+	if (!mdev->state.susp)
+		tl_clear(mdev);
+
+	drbd_fail_pending_reads(mdev);
+
+	dev_info(DEV, "Connection closed\n");
+
+	drbd_md_sync(mdev);
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	if (mdev->state.role == R_PRIMARY) {
+		if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
+			enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
+			drbd_request_state(mdev, NS(pdsk, nps));
+		}
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+	if (os.conn >= C_UNCONNECTED) {
+		/* Do not restart in case we are C_DISCONNECTING */
+		ns = os;
+		ns.conn = C_UNCONNECTED;
+		rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (os.conn == C_DISCONNECTING) {
+		struct hlist_head *h;
+		wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
+
+		/* we must not free the tl_hash
+		 * while application io is still on the fly */
+		wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
+
+		spin_lock_irq(&mdev->req_lock);
+		/* paranoia code */
+		for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
+			if (h->first)
+				dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
+						(int)(h - mdev->ee_hash), h->first);
+		kfree(mdev->ee_hash);
+		mdev->ee_hash = NULL;
+		mdev->ee_hash_s = 0;
+
+		/* paranoia code */
+		for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
+			if (h->first)
+				dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
+						(int)(h - mdev->tl_hash), h->first);
+		kfree(mdev->tl_hash);
+		mdev->tl_hash = NULL;
+		mdev->tl_hash_s = 0;
+		spin_unlock_irq(&mdev->req_lock);
+
+		crypto_free_hash(mdev->cram_hmac_tfm);
+		mdev->cram_hmac_tfm = NULL;
+
+		kfree(mdev->net_conf);
+		mdev->net_conf = NULL;
+		drbd_request_state(mdev, NS(conn, C_STANDALONE));
+	}
+
+	/* tcp_close and release of sendpage pages can be deferred.  I don't
+	 * want to use SO_LINGER, because apparently it can be deferred for
+	 * more than 20 seconds (longest time I checked).
+	 *
+	 * Actually we don't care for exactly when the network stack does its
+	 * put_page(), but release our reference on these pages right here.
+	 */
+	i = drbd_release_ee(mdev, &mdev->net_ee);
+	if (i)
+		dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
+	i = atomic_read(&mdev->pp_in_use);
+	if (i)
+		dev_info(DEV, "pp_in_use = %u, expected 0\n", i);
+
+	D_ASSERT(list_empty(&mdev->read_ee));
+	D_ASSERT(list_empty(&mdev->active_ee));
+	D_ASSERT(list_empty(&mdev->sync_ee));
+	D_ASSERT(list_empty(&mdev->done_ee));
+
+	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+	atomic_set(&mdev->current_epoch->epoch_size, 0);
+	D_ASSERT(list_empty(&mdev->current_epoch->list));
+}
+
+/*
+ * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
+ * we can agree on is stored in agreed_pro_version.
+ *
+ * feature flags and the reserved array should be enough room for future
+ * enhancements of the handshake protocol, and possible plugins...
+ *
+ * for now, they are expected to be zero, but ignored.
+ */
+static int drbd_send_handshake(struct drbd_conf *mdev)
+{
+	/* ASSERT current == mdev->receiver ... */
+	struct p_handshake *p = &mdev->data.sbuf.handshake;
+	int ok;
+
+	if (mutex_lock_interruptible(&mdev->data.mutex)) {
+		dev_err(DEV, "interrupted during initial handshake\n");
+		return 0; /* interrupted. not ok. */
+	}
+
+	if (mdev->data.socket == NULL) {
+		mutex_unlock(&mdev->data.mutex);
+		return 0;
+	}
+
+	memset(p, 0, sizeof(*p));
+	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
+	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+	ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
+			     (struct p_header *)p, sizeof(*p), 0 );
+	mutex_unlock(&mdev->data.mutex);
+	return ok;
+}
+
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ */
+static int drbd_do_handshake(struct drbd_conf *mdev)
+{
+	/* ASSERT current == mdev->receiver ... */
+	struct p_handshake *p = &mdev->data.rbuf.handshake;
+	const int expect = sizeof(struct p_handshake)
+			  -sizeof(struct p_header);
+	int rv;
+
+	rv = drbd_send_handshake(mdev);
+	if (!rv)
+		return 0;
+
+	rv = drbd_recv_header(mdev, &p->head);
+	if (!rv)
+		return 0;
+
+	if (p->head.command != P_HAND_SHAKE) {
+		dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
+		     cmdname(p->head.command), p->head.command);
+		return -1;
+	}
+
+	if (p->head.length != expect) {
+		dev_err(DEV, "expected HandShake length: %u, received: %u\n",
+		     expect, p->head.length);
+		return -1;
+	}
+
+	rv = drbd_recv(mdev, &p->head.payload, expect);
+
+	if (rv != expect) {
+		dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
+		return 0;
+	}
+
+	p->protocol_min = be32_to_cpu(p->protocol_min);
+	p->protocol_max = be32_to_cpu(p->protocol_max);
+	if (p->protocol_max == 0)
+		p->protocol_max = p->protocol_min;
+
+	if (PRO_VERSION_MAX < p->protocol_min ||
+	    PRO_VERSION_MIN > p->protocol_max)
+		goto incompat;
+
+	mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+
+	dev_info(DEV, "Handshake successful: "
+	     "Agreed network protocol version %d\n", mdev->agreed_pro_version);
+
+	return 1;
+
+ incompat:
+	dev_err(DEV, "incompatible DRBD dialects: "
+	    "I support %d-%d, peer supports %d-%d\n",
+	    PRO_VERSION_MIN, PRO_VERSION_MAX,
+	    p->protocol_min, p->protocol_max);
+	return -1;
+}
+
+#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
+static int drbd_do_auth(struct drbd_conf *mdev)
+{
+	dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
+	dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+	return 0;
+}
+#else
+#define CHALLENGE_LEN 64
+static int drbd_do_auth(struct drbd_conf *mdev)
+{
+	char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
+	struct scatterlist sg;
+	char *response = NULL;
+	char *right_response = NULL;
+	char *peers_ch = NULL;
+	struct p_header p;
+	unsigned int key_len = strlen(mdev->net_conf->shared_secret);
+	unsigned int resp_size;
+	struct hash_desc desc;
+	int rv;
+
+	desc.tfm = mdev->cram_hmac_tfm;
+	desc.flags = 0;
+
+	rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
+				(u8 *)mdev->net_conf->shared_secret, key_len);
+	if (rv) {
+		dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	get_random_bytes(my_challenge, CHALLENGE_LEN);
+
+	rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
+	if (!rv)
+		goto fail;
+
+	rv = drbd_recv_header(mdev, &p);
+	if (!rv)
+		goto fail;
+
+	if (p.command != P_AUTH_CHALLENGE) {
+		dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+		    cmdname(p.command), p.command);
+		rv = 0;
+		goto fail;
+	}
+
+	if (p.length > CHALLENGE_LEN*2) {
+		dev_err(DEV, "expected AuthChallenge payload too big.\n");
+		rv = 0;
+		goto fail;
+	}
+
+	peers_ch = kmalloc(p.length, GFP_NOIO);
+	if (peers_ch == NULL) {
+		dev_err(DEV, "kmalloc of peers_ch failed\n");
+		rv = 0;
+		goto fail;
+	}
+
+	rv = drbd_recv(mdev, peers_ch, p.length);
+
+	if (rv != p.length) {
+		dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
+	response = kmalloc(resp_size, GFP_NOIO);
+	if (response == NULL) {
+		dev_err(DEV, "kmalloc of response failed\n");
+		rv = 0;
+		goto fail;
+	}
+
+	sg_init_table(&sg, 1);
+	sg_set_buf(&sg, peers_ch, p.length);
+
+	rv = crypto_hash_digest(&desc, &sg, sg.length, response);
+	if (rv) {
+		dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
+	if (!rv)
+		goto fail;
+
+	rv = drbd_recv_header(mdev, &p);
+	if (!rv)
+		goto fail;
+
+	if (p.command != P_AUTH_RESPONSE) {
+		dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
+		    cmdname(p.command), p.command);
+		rv = 0;
+		goto fail;
+	}
+
+	if (p.length != resp_size) {
+		dev_err(DEV, "expected AuthResponse payload of wrong size\n");
+		rv = 0;
+		goto fail;
+	}
+
+	rv = drbd_recv(mdev, response , resp_size);
+
+	if (rv != resp_size) {
+		dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	right_response = kmalloc(resp_size, GFP_NOIO);
+	if (response == NULL) {
+		dev_err(DEV, "kmalloc of right_response failed\n");
+		rv = 0;
+		goto fail;
+	}
+
+	sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
+
+	rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
+	if (rv) {
+		dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	rv = !memcmp(response, right_response, resp_size);
+
+	if (rv)
+		dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
+		     resp_size, mdev->net_conf->cram_hmac_alg);
+
+ fail:
+	kfree(peers_ch);
+	kfree(response);
+	kfree(right_response);
+
+	return rv;
+}
+#endif
+
+int drbdd_init(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	unsigned int minor = mdev_to_minor(mdev);
+	int h;
+
+	sprintf(current->comm, "drbd%d_receiver", minor);
+
+	dev_info(DEV, "receiver (re)started\n");
+
+	do {
+		h = drbd_connect(mdev);
+		if (h == 0) {
+			drbd_disconnect(mdev);
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ);
+		}
+		if (h == -1) {
+			dev_warn(DEV, "Discarding network configuration.\n");
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		}
+	} while (h == 0);
+
+	if (h > 0) {
+		if (get_net_conf(mdev)) {
+			drbdd(mdev);
+			put_net_conf(mdev);
+		}
+	}
+
+	drbd_disconnect(mdev);
+
+	dev_info(DEV, "receiver terminated\n");
+	return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_req_state_reply *p = (struct p_req_state_reply *)h;
+
+	int retcode = be32_to_cpu(p->retcode);
+
+	if (retcode >= SS_SUCCESS) {
+		set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
+	} else {
+		set_bit(CL_ST_CHG_FAIL, &mdev->flags);
+		dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
+		    drbd_set_st_err_str(retcode), retcode);
+	}
+	wake_up(&mdev->state_wait);
+
+	return TRUE;
+}
+
+static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
+{
+	return drbd_send_ping_ack(mdev);
+
+}
+
+static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
+{
+	/* restore idle timeout */
+	mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+
+	return TRUE;
+}
+
+static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+	int blksize = be32_to_cpu(p->blksize);
+
+	D_ASSERT(mdev->agreed_pro_version >= 89);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	drbd_rs_complete_io(mdev, sector);
+	drbd_set_in_sync(mdev, sector, blksize);
+	/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+	mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+	dec_rs_pending(mdev);
+
+	return TRUE;
+}
+
+/* when we receive the ACK for a write request,
+ * verify that we actually know about it */
+static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
+	u64 id, sector_t sector)
+{
+	struct hlist_head *slot = tl_hash_slot(mdev, sector);
+	struct hlist_node *n;
+	struct drbd_request *req;
+
+	hlist_for_each_entry(req, n, slot, colision) {
+		if ((unsigned long)req == (unsigned long)id) {
+			if (req->sector != sector) {
+				dev_err(DEV, "_ack_id_to_req: found req %p but it has "
+				    "wrong sector (%llus versus %llus)\n", req,
+				    (unsigned long long)req->sector,
+				    (unsigned long long)sector);
+				break;
+			}
+			return req;
+		}
+	}
+	dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
+		(void *)(unsigned long)id, (unsigned long long)sector);
+	return NULL;
+}
+
+typedef struct drbd_request *(req_validator_fn)
+	(struct drbd_conf *mdev, u64 id, sector_t sector);
+
+static int validate_req_change_req_state(struct drbd_conf *mdev,
+	u64 id, sector_t sector, req_validator_fn validator,
+	const char *func, enum drbd_req_event what)
+{
+	struct drbd_request *req;
+	struct bio_and_error m;
+
+	spin_lock_irq(&mdev->req_lock);
+	req = validator(mdev, id, sector);
+	if (unlikely(!req)) {
+		spin_unlock_irq(&mdev->req_lock);
+		dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
+		return FALSE;
+	}
+	__req_mod(req, what, &m);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+	return TRUE;
+}
+
+static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+	int blksize = be32_to_cpu(p->blksize);
+	enum drbd_req_event what;
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (is_syncer_block_id(p->block_id)) {
+		drbd_set_in_sync(mdev, sector, blksize);
+		dec_rs_pending(mdev);
+		return TRUE;
+	}
+	switch (be16_to_cpu(h->command)) {
+	case P_RS_WRITE_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		what = write_acked_by_peer_and_sis;
+		break;
+	case P_WRITE_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		what = write_acked_by_peer;
+		break;
+	case P_RECV_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
+		what = recv_acked_by_peer;
+		break;
+	case P_DISCARD_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		what = conflict_discarded_by_peer;
+		break;
+	default:
+		D_ASSERT(0);
+		return FALSE;
+	}
+
+	return validate_req_change_req_state(mdev, p->block_id, sector,
+		_ack_id_to_req, __func__ , what);
+}
+
+static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+
+	if (__ratelimit(&drbd_ratelimit_state))
+		dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (is_syncer_block_id(p->block_id)) {
+		int size = be32_to_cpu(p->blksize);
+		dec_rs_pending(mdev);
+		drbd_rs_failed_io(mdev, sector, size);
+		return TRUE;
+	}
+	return validate_req_change_req_state(mdev, p->block_id, sector,
+		_ack_id_to_req, __func__ , neg_acked);
+}
+
+static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+	dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
+	    (unsigned long long)sector, be32_to_cpu(p->blksize));
+
+	return validate_req_change_req_state(mdev, p->block_id, sector,
+		_ar_id_to_req, __func__ , neg_acked);
+}
+
+static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	sector_t sector;
+	int size;
+	struct p_block_ack *p = (struct p_block_ack *)h;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+	D_ASSERT(p->block_id == ID_SYNCER);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	dec_rs_pending(mdev);
+
+	if (get_ldev_if_state(mdev, D_FAILED)) {
+		drbd_rs_complete_io(mdev, sector);
+		drbd_rs_failed_io(mdev, sector, size);
+		put_ldev(mdev);
+	}
+
+	return TRUE;
+}
+
+static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_barrier_ack *p = (struct p_barrier_ack *)h;
+
+	tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
+
+	return TRUE;
+}
+
+static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	struct drbd_work *w;
+	sector_t sector;
+	int size;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
+		drbd_ov_oos_found(mdev, sector, size);
+	else
+		ov_oos_print(mdev);
+
+	drbd_rs_complete_io(mdev, sector);
+	dec_rs_pending(mdev);
+
+	if (--mdev->ov_left == 0) {
+		w = kmalloc(sizeof(*w), GFP_NOIO);
+		if (w) {
+			w->cb = w_ov_finished;
+			drbd_queue_work_front(&mdev->data.work, w);
+		} else {
+			dev_err(DEV, "kmalloc(w) failed.");
+			ov_oos_print(mdev);
+			drbd_resync_finished(mdev);
+		}
+	}
+	return TRUE;
+}
+
+struct asender_cmd {
+	size_t pkt_size;
+	int (*process)(struct drbd_conf *mdev, struct p_header *h);
+};
+
+static struct asender_cmd *get_asender_cmd(int cmd)
+{
+	static struct asender_cmd asender_tbl[] = {
+		/* anything missing from this table is in
+		 * the drbd_cmd_handler (drbd_default_handler) table,
+		 * see the beginning of drbdd() */
+	[P_PING]	    = { sizeof(struct p_header), got_Ping },
+	[P_PING_ACK]	    = { sizeof(struct p_header), got_PingAck },
+	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_WRITE_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_DISCARD_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_NEG_ACK]	    = { sizeof(struct p_block_ack), got_NegAck },
+	[P_NEG_DREPLY]	    = { sizeof(struct p_block_ack), got_NegDReply },
+	[P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply},
+	[P_OV_RESULT]	    = { sizeof(struct p_block_ack), got_OVResult },
+	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
+	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
+	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
+	[P_MAX_CMD]	    = { 0, NULL },
+	};
+	if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
+		return NULL;
+	return &asender_tbl[cmd];
+}
+
+int drbd_asender(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	struct p_header *h = &mdev->meta.rbuf.header;
+	struct asender_cmd *cmd = NULL;
+
+	int rv, len;
+	void *buf    = h;
+	int received = 0;
+	int expect   = sizeof(struct p_header);
+	int empty;
+
+	sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
+
+	current->policy = SCHED_RR;  /* Make this a realtime task! */
+	current->rt_priority = 2;    /* more important than all other tasks */
+
+	while (get_t_state(thi) == Running) {
+		drbd_thread_current_set_cpu(mdev);
+		if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
+			ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
+			mdev->meta.socket->sk->sk_rcvtimeo =
+				mdev->net_conf->ping_timeo*HZ/10;
+		}
+
+		/* conditionally cork;
+		 * it may hurt latency if we cork without much to send */
+		if (!mdev->net_conf->no_cork &&
+			3 < atomic_read(&mdev->unacked_cnt))
+			drbd_tcp_cork(mdev->meta.socket);
+		while (1) {
+			clear_bit(SIGNAL_ASENDER, &mdev->flags);
+			flush_signals(current);
+			if (!drbd_process_done_ee(mdev)) {
+				dev_err(DEV, "process_done_ee() = NOT_OK\n");
+				goto reconnect;
+			}
+			/* to avoid race with newly queued ACKs */
+			set_bit(SIGNAL_ASENDER, &mdev->flags);
+			spin_lock_irq(&mdev->req_lock);
+			empty = list_empty(&mdev->done_ee);
+			spin_unlock_irq(&mdev->req_lock);
+			/* new ack may have been queued right here,
+			 * but then there is also a signal pending,
+			 * and we start over... */
+			if (empty)
+				break;
+		}
+		/* but unconditionally uncork unless disabled */
+		if (!mdev->net_conf->no_cork)
+			drbd_tcp_uncork(mdev->meta.socket);
+
+		/* short circuit, recv_msg would return EINTR anyways. */
+		if (signal_pending(current))
+			continue;
+
+		rv = drbd_recv_short(mdev, mdev->meta.socket,
+				     buf, expect-received, 0);
+		clear_bit(SIGNAL_ASENDER, &mdev->flags);
+
+		flush_signals(current);
+
+		/* Note:
+		 * -EINTR	 (on meta) we got a signal
+		 * -EAGAIN	 (on meta) rcvtimeo expired
+		 * -ECONNRESET	 other side closed the connection
+		 * -ERESTARTSYS  (on data) we got a signal
+		 * rv <  0	 other than above: unexpected error!
+		 * rv == expected: full header or command
+		 * rv <  expected: "woken" by signal during receive
+		 * rv == 0	 : "connection shut down by peer"
+		 */
+		if (likely(rv > 0)) {
+			received += rv;
+			buf	 += rv;
+		} else if (rv == 0) {
+			dev_err(DEV, "meta connection shut down by peer.\n");
+			goto reconnect;
+		} else if (rv == -EAGAIN) {
+			if (mdev->meta.socket->sk->sk_rcvtimeo ==
+			    mdev->net_conf->ping_timeo*HZ/10) {
+				dev_err(DEV, "PingAck did not arrive in time.\n");
+				goto reconnect;
+			}
+			set_bit(SEND_PING, &mdev->flags);
+			continue;
+		} else if (rv == -EINTR) {
+			continue;
+		} else {
+			dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+			goto reconnect;
+		}
+
+		if (received == expect && cmd == NULL) {
+			if (unlikely(h->magic != BE_DRBD_MAGIC)) {
+				dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
+				    (long)be32_to_cpu(h->magic),
+				    h->command, h->length);
+				goto reconnect;
+			}
+			cmd = get_asender_cmd(be16_to_cpu(h->command));
+			len = be16_to_cpu(h->length);
+			if (unlikely(cmd == NULL)) {
+				dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
+				    (long)be32_to_cpu(h->magic),
+				    h->command, h->length);
+				goto disconnect;
+			}
+			expect = cmd->pkt_size;
+			ERR_IF(len != expect-sizeof(struct p_header))
+				goto reconnect;
+		}
+		if (received == expect) {
+			D_ASSERT(cmd != NULL);
+			if (!cmd->process(mdev, h))
+				goto reconnect;
+
+			buf	 = h;
+			received = 0;
+			expect	 = sizeof(struct p_header);
+			cmd	 = NULL;
+		}
+	}
+
+	if (0) {
+reconnect:
+		drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+	}
+	if (0) {
+disconnect:
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	}
+	clear_bit(SIGNAL_ASENDER, &mdev->flags);
+
+	D_ASSERT(mdev->state.conn < C_CONNECTED);
+	dev_info(DEV, "asender terminated\n");
+
+	return 0;
+}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 0000000..de81ab7
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1125 @@
+/*
+   drbd_req.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+
+/* Update disk stats at start of I/O request */
+static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
+{
+	const int rw = bio_data_dir(bio);
+	int cpu;
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
+	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+	part_inc_in_flight(&mdev->vdisk->part0, rw);
+	part_stat_unlock();
+}
+
+/* Update disk stats when completing request upwards */
+static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
+{
+	int rw = bio_data_dir(req->master_bio);
+	unsigned long duration = jiffies - req->start_time;
+	int cpu;
+	cpu = part_stat_lock();
+	part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
+	part_round_stats(cpu, &mdev->vdisk->part0);
+	part_dec_in_flight(&mdev->vdisk->part0, rw);
+	part_stat_unlock();
+}
+
+static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
+{
+	const unsigned long s = req->rq_state;
+	/* if it was a write, we may have to set the corresponding
+	 * bit(s) out-of-sync first. If it had a local part, we need to
+	 * release the reference to the activity log. */
+	if (rw == WRITE) {
+		/* remove it from the transfer log.
+		 * well, only if it had been there in the first
+		 * place... if it had not (local only or conflicting
+		 * and never sent), it should still be "empty" as
+		 * initialized in drbd_req_new(), so we can list_del() it
+		 * here unconditionally */
+		list_del(&req->tl_requests);
+		/* Set out-of-sync unless both OK flags are set
+		 * (local only or remote failed).
+		 * Other places where we set out-of-sync:
+		 * READ with local io-error */
+		if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+			drbd_set_out_of_sync(mdev, req->sector, req->size);
+
+		if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+			drbd_set_in_sync(mdev, req->sector, req->size);
+
+		/* one might be tempted to move the drbd_al_complete_io
+		 * to the local io completion callback drbd_endio_pri.
+		 * but, if this was a mirror write, we may only
+		 * drbd_al_complete_io after this is RQ_NET_DONE,
+		 * otherwise the extent could be dropped from the al
+		 * before it has actually been written on the peer.
+		 * if we crash before our peer knows about the request,
+		 * but after the extent has been dropped from the al,
+		 * we would forget to resync the corresponding extent.
+		 */
+		if (s & RQ_LOCAL_MASK) {
+			if (get_ldev_if_state(mdev, D_FAILED)) {
+				drbd_al_complete_io(mdev, req->sector);
+				put_ldev(mdev);
+			} else if (__ratelimit(&drbd_ratelimit_state)) {
+				dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
+				     "but my Disk seems to have failed :(\n",
+				     (unsigned long long) req->sector);
+			}
+		}
+	}
+
+	/* if it was a local io error, we want to notify our
+	 * peer about that, and see if we need to
+	 * detach the disk and stuff.
+	 * to avoid allocating some special work
+	 * struct, reuse the request. */
+
+	/* THINK
+	 * why do we do this not when we detect the error,
+	 * but delay it until it is "done", i.e. possibly
+	 * until the next barrier ack? */
+
+	if (rw == WRITE &&
+	    ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
+		if (!(req->w.list.next == LIST_POISON1 ||
+		      list_empty(&req->w.list))) {
+			/* DEBUG ASSERT only; if this triggers, we
+			 * probably corrupt the worker list here */
+			dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
+			dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
+		}
+		req->w.cb = w_io_error;
+		drbd_queue_work(&mdev->data.work, &req->w);
+		/* drbd_req_free() is done in w_io_error */
+	} else {
+		drbd_req_free(req);
+	}
+}
+
+static void queue_barrier(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+
+	/* We are within the req_lock. Once we queued the barrier for sending,
+	 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
+	 * barrier/epoch object is added. This is the only place this bit is
+	 * set. It indicates that the barrier for this epoch is already queued,
+	 * and no new epoch has been created yet. */
+	if (test_bit(CREATE_BARRIER, &mdev->flags))
+		return;
+
+	b = mdev->newest_tle;
+	b->w.cb = w_send_barrier;
+	/* inc_ap_pending done here, so we won't
+	 * get imbalanced on connection loss.
+	 * dec_ap_pending will be done in got_BarrierAck
+	 * or (on connection loss) in tl_clear.  */
+	inc_ap_pending(mdev);
+	drbd_queue_work(&mdev->data.work, &b->w);
+	set_bit(CREATE_BARRIER, &mdev->flags);
+}
+
+static void _about_to_complete_local_write(struct drbd_conf *mdev,
+	struct drbd_request *req)
+{
+	const unsigned long s = req->rq_state;
+	struct drbd_request *i;
+	struct drbd_epoch_entry *e;
+	struct hlist_node *n;
+	struct hlist_head *slot;
+
+	/* before we can signal completion to the upper layers,
+	 * we may need to close the current epoch */
+	if (mdev->state.conn >= C_CONNECTED &&
+	    req->epoch == mdev->newest_tle->br_number)
+		queue_barrier(mdev);
+
+	/* we need to do the conflict detection stuff,
+	 * if we have the ee_hash (two_primaries) and
+	 * this has been on the network */
+	if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
+		const sector_t sector = req->sector;
+		const int size = req->size;
+
+		/* ASSERT:
+		 * there must be no conflicting requests, since
+		 * they must have been failed on the spot */
+#define OVERLAPS overlaps(sector, size, i->sector, i->size)
+		slot = tl_hash_slot(mdev, sector);
+		hlist_for_each_entry(i, n, slot, colision) {
+			if (OVERLAPS) {
+				dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
+				      "other: %p %llus +%u\n",
+				      req, (unsigned long long)sector, size,
+				      i, (unsigned long long)i->sector, i->size);
+			}
+		}
+
+		/* maybe "wake" those conflicting epoch entries
+		 * that wait for this request to finish.
+		 *
+		 * currently, there can be only _one_ such ee
+		 * (well, or some more, which would be pending
+		 * P_DISCARD_ACK not yet sent by the asender...),
+		 * since we block the receiver thread upon the
+		 * first conflict detection, which will wait on
+		 * misc_wait.  maybe we want to assert that?
+		 *
+		 * anyways, if we found one,
+		 * we just have to do a wake_up.  */
+#undef OVERLAPS
+#define OVERLAPS overlaps(sector, size, e->sector, e->size)
+		slot = ee_hash_slot(mdev, req->sector);
+		hlist_for_each_entry(e, n, slot, colision) {
+			if (OVERLAPS) {
+				wake_up(&mdev->misc_wait);
+				break;
+			}
+		}
+	}
+#undef OVERLAPS
+}
+
+void complete_master_bio(struct drbd_conf *mdev,
+		struct bio_and_error *m)
+{
+	bio_endio(m->bio, m->error);
+	dec_ap_bio(mdev);
+}
+
+/* Helper for __req_mod().
+ * Set m->bio to the master bio, if it is fit to be completed,
+ * or leave it alone (it is initialized to NULL in __req_mod),
+ * if it has already been completed, or cannot be completed yet.
+ * If m->bio is set, the error status to be returned is placed in m->error.
+ */
+void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
+{
+	const unsigned long s = req->rq_state;
+	struct drbd_conf *mdev = req->mdev;
+	/* only WRITES may end up here without a master bio (on barrier ack) */
+	int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
+
+	/* we must not complete the master bio, while it is
+	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
+	 *	not yet acknowledged by the peer
+	 *	not yet completed by the local io subsystem
+	 * these flags may get cleared in any order by
+	 *	the worker,
+	 *	the receiver,
+	 *	the bio_endio completion callbacks.
+	 */
+	if (s & RQ_NET_QUEUED)
+		return;
+	if (s & RQ_NET_PENDING)
+		return;
+	if (s & RQ_LOCAL_PENDING)
+		return;
+
+	if (req->master_bio) {
+		/* this is data_received (remote read)
+		 * or protocol C P_WRITE_ACK
+		 * or protocol B P_RECV_ACK
+		 * or protocol A "handed_over_to_network" (SendAck)
+		 * or canceled or failed,
+		 * or killed from the transfer log due to connection loss.
+		 */
+
+		/*
+		 * figure out whether to report success or failure.
+		 *
+		 * report success when at least one of the operations succeeded.
+		 * or, to put the other way,
+		 * only report failure, when both operations failed.
+		 *
+		 * what to do about the failures is handled elsewhere.
+		 * what we need to do here is just: complete the master_bio.
+		 *
+		 * local completion error, if any, has been stored as ERR_PTR
+		 * in private_bio within drbd_endio_pri.
+		 */
+		int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+		int error = PTR_ERR(req->private_bio);
+
+		/* remove the request from the conflict detection
+		 * respective block_id verification hash */
+		if (!hlist_unhashed(&req->colision))
+			hlist_del(&req->colision);
+		else
+			D_ASSERT((s & RQ_NET_MASK) == 0);
+
+		/* for writes we need to do some extra housekeeping */
+		if (rw == WRITE)
+			_about_to_complete_local_write(mdev, req);
+
+		/* Update disk stats */
+		_drbd_end_io_acct(mdev, req);
+
+		m->error = ok ? 0 : (error ?: -EIO);
+		m->bio = req->master_bio;
+		req->master_bio = NULL;
+	}
+
+	if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
+		/* this is disconnected (local only) operation,
+		 * or protocol C P_WRITE_ACK,
+		 * or protocol A or B P_BARRIER_ACK,
+		 * or killed from the transfer log due to connection loss. */
+		_req_is_done(mdev, req, rw);
+	}
+	/* else: network part and not DONE yet. that is
+	 * protocol A or B, barrier ack still pending... */
+}
+
+/*
+ * checks whether there was an overlapping request
+ * or ee already registered.
+ *
+ * if so, return 1, in which case this request is completed on the spot,
+ * without ever being submitted or send.
+ *
+ * return 0 if it is ok to submit this request.
+ *
+ * NOTE:
+ * paranoia: assume something above us is broken, and issues different write
+ * requests for the same block simultaneously...
+ *
+ * To ensure these won't be reordered differently on both nodes, resulting in
+ * diverging data sets, we discard the later one(s). Not that this is supposed
+ * to happen, but this is the rationale why we also have to check for
+ * conflicting requests with local origin, and why we have to do so regardless
+ * of whether we allowed multiple primaries.
+ *
+ * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
+ * second hlist_for_each_entry becomes a noop. This is even simpler than to
+ * grab a reference on the net_conf, and check for the two_primaries flag...
+ */
+static int _req_conflicts(struct drbd_request *req)
+{
+	struct drbd_conf *mdev = req->mdev;
+	const sector_t sector = req->sector;
+	const int size = req->size;
+	struct drbd_request *i;
+	struct drbd_epoch_entry *e;
+	struct hlist_node *n;
+	struct hlist_head *slot;
+
+	D_ASSERT(hlist_unhashed(&req->colision));
+
+	if (!get_net_conf(mdev))
+		return 0;
+
+	/* BUG_ON */
+	ERR_IF (mdev->tl_hash_s == 0)
+		goto out_no_conflict;
+	BUG_ON(mdev->tl_hash == NULL);
+
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+	slot = tl_hash_slot(mdev, sector);
+	hlist_for_each_entry(i, n, slot, colision) {
+		if (OVERLAPS) {
+			dev_alert(DEV, "%s[%u] Concurrent local write detected! "
+			      "[DISCARD L] new: %llus +%u; "
+			      "pending: %llus +%u\n",
+			      current->comm, current->pid,
+			      (unsigned long long)sector, size,
+			      (unsigned long long)i->sector, i->size);
+			goto out_conflict;
+		}
+	}
+
+	if (mdev->ee_hash_s) {
+		/* now, check for overlapping requests with remote origin */
+		BUG_ON(mdev->ee_hash == NULL);
+#undef OVERLAPS
+#define OVERLAPS overlaps(e->sector, e->size, sector, size)
+		slot = ee_hash_slot(mdev, sector);
+		hlist_for_each_entry(e, n, slot, colision) {
+			if (OVERLAPS) {
+				dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
+				      " [DISCARD L] new: %llus +%u; "
+				      "pending: %llus +%u\n",
+				      current->comm, current->pid,
+				      (unsigned long long)sector, size,
+				      (unsigned long long)e->sector, e->size);
+				goto out_conflict;
+			}
+		}
+	}
+#undef OVERLAPS
+
+out_no_conflict:
+	/* this is like it should be, and what we expected.
+	 * our users do behave after all... */
+	put_net_conf(mdev);
+	return 0;
+
+out_conflict:
+	put_net_conf(mdev);
+	return 1;
+}
+
+/* obviously this could be coded as many single functions
+ * instead of one huge switch,
+ * or by putting the code directly in the respective locations
+ * (as it has been before).
+ *
+ * but having it this way
+ *  enforces that it is all in this one place, where it is easier to audit,
+ *  it makes it obvious that whatever "event" "happens" to a request should
+ *  happen "atomically" within the req_lock,
+ *  and it enforces that we have to think in a very structured manner
+ *  about the "events" that may happen to a request during its life time ...
+ */
+void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m)
+{
+	struct drbd_conf *mdev = req->mdev;
+	m->bio = NULL;
+
+	switch (what) {
+	default:
+		dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
+		break;
+
+	/* does not happen...
+	 * initialization done in drbd_req_new
+	case created:
+		break;
+		*/
+
+	case to_be_send: /* via network */
+		/* reached via drbd_make_request_common
+		 * and from w_read_retry_remote */
+		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+		req->rq_state |= RQ_NET_PENDING;
+		inc_ap_pending(mdev);
+		break;
+
+	case to_be_submitted: /* locally */
+		/* reached via drbd_make_request_common */
+		D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
+		req->rq_state |= RQ_LOCAL_PENDING;
+		break;
+
+	case completed_ok:
+		if (bio_data_dir(req->master_bio) == WRITE)
+			mdev->writ_cnt += req->size>>9;
+		else
+			mdev->read_cnt += req->size>>9;
+
+		req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+
+		_req_may_be_done(req, m);
+		put_ldev(mdev);
+		break;
+
+	case write_completed_with_error:
+		req->rq_state |= RQ_LOCAL_COMPLETED;
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+
+		dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
+		      (unsigned long long)req->sector, req->size);
+		/* and now: check how to handle local io error. */
+		__drbd_chk_io_error(mdev, FALSE);
+		_req_may_be_done(req, m);
+		put_ldev(mdev);
+		break;
+
+	case read_ahead_completed_with_error:
+		/* it is legal to fail READA */
+		req->rq_state |= RQ_LOCAL_COMPLETED;
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+		_req_may_be_done(req, m);
+		put_ldev(mdev);
+		break;
+
+	case read_completed_with_error:
+		drbd_set_out_of_sync(mdev, req->sector, req->size);
+
+		req->rq_state |= RQ_LOCAL_COMPLETED;
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+
+		dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
+		      (unsigned long long)req->sector, req->size);
+		/* _req_mod(req,to_be_send); oops, recursion... */
+		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+		req->rq_state |= RQ_NET_PENDING;
+		inc_ap_pending(mdev);
+
+		__drbd_chk_io_error(mdev, FALSE);
+		put_ldev(mdev);
+		/* NOTE: if we have no connection,
+		 * or know the peer has no good data either,
+		 * then we don't actually need to "queue_for_net_read",
+		 * but we do so anyways, since the drbd_io_error()
+		 * and the potential state change to "Diskless"
+		 * needs to be done from process context */
+
+		/* fall through: _req_mod(req,queue_for_net_read); */
+
+	case queue_for_net_read:
+		/* READ or READA, and
+		 * no local disk,
+		 * or target area marked as invalid,
+		 * or just got an io-error. */
+		/* from drbd_make_request_common
+		 * or from bio_endio during read io-error recovery */
+
+		/* so we can verify the handle in the answer packet
+		 * corresponding hlist_del is in _req_may_be_done() */
+		hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector));
+
+		set_bit(UNPLUG_REMOTE, &mdev->flags);
+
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		req->rq_state |= RQ_NET_QUEUED;
+		req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
+			? w_read_retry_remote
+			: w_send_read_req;
+		drbd_queue_work(&mdev->data.work, &req->w);
+		break;
+
+	case queue_for_net_write:
+		/* assert something? */
+		/* from drbd_make_request_common only */
+
+		hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector));
+		/* corresponding hlist_del is in _req_may_be_done() */
+
+		/* NOTE
+		 * In case the req ended up on the transfer log before being
+		 * queued on the worker, it could lead to this request being
+		 * missed during cleanup after connection loss.
+		 * So we have to do both operations here,
+		 * within the same lock that protects the transfer log.
+		 *
+		 * _req_add_to_epoch(req); this has to be after the
+		 * _maybe_start_new_epoch(req); which happened in
+		 * drbd_make_request_common, because we now may set the bit
+		 * again ourselves to close the current epoch.
+		 *
+		 * Add req to the (now) current epoch (barrier). */
+
+		/* otherwise we may lose an unplug, which may cause some remote
+		 * io-scheduler timeout to expire, increasing maximum latency,
+		 * hurting performance. */
+		set_bit(UNPLUG_REMOTE, &mdev->flags);
+
+		/* see drbd_make_request_common,
+		 * just after it grabs the req_lock */
+		D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
+
+		req->epoch = mdev->newest_tle->br_number;
+		list_add_tail(&req->tl_requests,
+				&mdev->newest_tle->requests);
+
+		/* increment size of current epoch */
+		mdev->newest_tle->n_req++;
+
+		/* queue work item to send data */
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		req->rq_state |= RQ_NET_QUEUED;
+		req->w.cb =  w_send_dblock;
+		drbd_queue_work(&mdev->data.work, &req->w);
+
+		/* close the epoch, in case it outgrew the limit */
+		if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
+			queue_barrier(mdev);
+
+		break;
+
+	case send_canceled:
+		/* treat it the same */
+	case send_failed:
+		/* real cleanup will be done from tl_clear.  just update flags
+		 * so it is no longer marked as on the worker queue */
+		req->rq_state &= ~RQ_NET_QUEUED;
+		/* if we did it right, tl_clear should be scheduled only after
+		 * this, so this should not be necessary! */
+		_req_may_be_done(req, m);
+		break;
+
+	case handed_over_to_network:
+		/* assert something? */
+		if (bio_data_dir(req->master_bio) == WRITE &&
+		    mdev->net_conf->wire_protocol == DRBD_PROT_A) {
+			/* this is what is dangerous about protocol A:
+			 * pretend it was successfully written on the peer. */
+			if (req->rq_state & RQ_NET_PENDING) {
+				dec_ap_pending(mdev);
+				req->rq_state &= ~RQ_NET_PENDING;
+				req->rq_state |= RQ_NET_OK;
+			} /* else: neg-ack was faster... */
+			/* it is still not yet RQ_NET_DONE until the
+			 * corresponding epoch barrier got acked as well,
+			 * so we know what to dirty on connection loss */
+		}
+		req->rq_state &= ~RQ_NET_QUEUED;
+		req->rq_state |= RQ_NET_SENT;
+		/* because _drbd_send_zc_bio could sleep, and may want to
+		 * dereference the bio even after the "write_acked_by_peer" and
+		 * "completed_ok" events came in, once we return from
+		 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
+		 * whether it is done already, and end it.  */
+		_req_may_be_done(req, m);
+		break;
+
+	case connection_lost_while_pending:
+		/* transfer log cleanup after connection loss */
+		/* assert something? */
+		if (req->rq_state & RQ_NET_PENDING)
+			dec_ap_pending(mdev);
+		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+		req->rq_state |= RQ_NET_DONE;
+		/* if it is still queued, we may not complete it here.
+		 * it will be canceled soon. */
+		if (!(req->rq_state & RQ_NET_QUEUED))
+			_req_may_be_done(req, m);
+		break;
+
+	case write_acked_by_peer_and_sis:
+		req->rq_state |= RQ_NET_SIS;
+	case conflict_discarded_by_peer:
+		/* for discarded conflicting writes of multiple primaries,
+		 * there is no need to keep anything in the tl, potential
+		 * node crashes are covered by the activity log. */
+		if (what == conflict_discarded_by_peer)
+			dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
+			      " DRBD is not a random data generator!\n",
+			      (unsigned long long)req->sector, req->size);
+		req->rq_state |= RQ_NET_DONE;
+		/* fall through */
+	case write_acked_by_peer:
+		/* protocol C; successfully written on peer.
+		 * Nothing to do here.
+		 * We want to keep the tl in place for all protocols, to cater
+		 * for volatile write-back caches on lower level devices.
+		 *
+		 * A barrier request is expected to have forced all prior
+		 * requests onto stable storage, so completion of a barrier
+		 * request could set NET_DONE right here, and not wait for the
+		 * P_BARRIER_ACK, but that is an unnecessary optimization. */
+
+		/* this makes it effectively the same as for: */
+	case recv_acked_by_peer:
+		/* protocol B; pretends to be successfully written on peer.
+		 * see also notes above in handed_over_to_network about
+		 * protocol != C */
+		req->rq_state |= RQ_NET_OK;
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		dec_ap_pending(mdev);
+		req->rq_state &= ~RQ_NET_PENDING;
+		_req_may_be_done(req, m);
+		break;
+
+	case neg_acked:
+		/* assert something? */
+		if (req->rq_state & RQ_NET_PENDING)
+			dec_ap_pending(mdev);
+		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done(req, m);
+		/* else: done by handed_over_to_network */
+		break;
+
+	case barrier_acked:
+		if (req->rq_state & RQ_NET_PENDING) {
+			/* barrier came in before all requests have been acked.
+			 * this is bad, because if the connection is lost now,
+			 * we won't be able to clean them up... */
+			dev_err(DEV, "FIXME (barrier_acked but pending)\n");
+			list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
+		}
+		D_ASSERT(req->rq_state & RQ_NET_SENT);
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done(req, m);
+		break;
+
+	case data_received:
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		dec_ap_pending(mdev);
+		req->rq_state &= ~RQ_NET_PENDING;
+		req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
+		_req_may_be_done(req, m);
+		break;
+	};
+}
+
+/* we may do a local read if:
+ * - we are consistent (of course),
+ * - or we are generally inconsistent,
+ *   BUT we are still/already IN SYNC for this area.
+ *   since size may be bigger than BM_BLOCK_SIZE,
+ *   we may need to check several bits.
+ */
+static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	unsigned long sbnr, ebnr;
+	sector_t esector, nr_sectors;
+
+	if (mdev->state.disk == D_UP_TO_DATE)
+		return 1;
+	if (mdev->state.disk >= D_OUTDATED)
+		return 0;
+	if (mdev->state.disk <  D_INCONSISTENT)
+		return 0;
+	/* state.disk == D_INCONSISTENT   We will have a look at the BitMap */
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	D_ASSERT(sector  < nr_sectors);
+	D_ASSERT(esector < nr_sectors);
+
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
+}
+
+static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
+{
+	const int rw = bio_rw(bio);
+	const int size = bio->bi_size;
+	const sector_t sector = bio->bi_sector;
+	struct drbd_tl_epoch *b = NULL;
+	struct drbd_request *req;
+	int local, remote;
+	int err = -EIO;
+
+	/* allocate outside of all locks; */
+	req = drbd_req_new(mdev, bio);
+	if (!req) {
+		dec_ap_bio(mdev);
+		/* only pass the error to the upper layers.
+		 * if user cannot handle io errors, that's not our business. */
+		dev_err(DEV, "could not kmalloc() req\n");
+		bio_endio(bio, -ENOMEM);
+		return 0;
+	}
+
+	local = get_ldev(mdev);
+	if (!local) {
+		bio_put(req->private_bio); /* or we get a bio leak */
+		req->private_bio = NULL;
+	}
+	if (rw == WRITE) {
+		remote = 1;
+	} else {
+		/* READ || READA */
+		if (local) {
+			if (!drbd_may_do_local_read(mdev, sector, size)) {
+				/* we could kick the syncer to
+				 * sync this extent asap, wait for
+				 * it, then continue locally.
+				 * Or just issue the request remotely.
+				 */
+				local = 0;
+				bio_put(req->private_bio);
+				req->private_bio = NULL;
+				put_ldev(mdev);
+			}
+		}
+		remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
+	}
+
+	/* If we have a disk, but a READA request is mapped to remote,
+	 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
+	 * Just fail that READA request right here.
+	 *
+	 * THINK: maybe fail all READA when not local?
+	 *        or make this configurable...
+	 *        if network is slow, READA won't do any good.
+	 */
+	if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
+		err = -EWOULDBLOCK;
+		goto fail_and_free_req;
+	}
+
+	/* For WRITES going to the local disk, grab a reference on the target
+	 * extent.  This waits for any resync activity in the corresponding
+	 * resync extent to finish, and, if necessary, pulls in the target
+	 * extent into the activity log, which involves further disk io because
+	 * of transactional on-disk meta data updates. */
+	if (rw == WRITE && local)
+		drbd_al_begin_io(mdev, sector);
+
+	remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
+			    (mdev->state.pdsk == D_INCONSISTENT &&
+			     mdev->state.conn >= C_CONNECTED));
+
+	if (!(local || remote)) {
+		dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+		goto fail_free_complete;
+	}
+
+	/* For WRITE request, we have to make sure that we have an
+	 * unused_spare_tle, in case we need to start a new epoch.
+	 * I try to be smart and avoid to pre-allocate always "just in case",
+	 * but there is a race between testing the bit and pointer outside the
+	 * spinlock, and grabbing the spinlock.
+	 * if we lost that race, we retry.  */
+	if (rw == WRITE && remote &&
+	    mdev->unused_spare_tle == NULL &&
+	    test_bit(CREATE_BARRIER, &mdev->flags)) {
+allocate_barrier:
+		b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
+		if (!b) {
+			dev_err(DEV, "Failed to alloc barrier.\n");
+			err = -ENOMEM;
+			goto fail_free_complete;
+		}
+	}
+
+	/* GOOD, everything prepared, grab the spin_lock */
+	spin_lock_irq(&mdev->req_lock);
+
+	if (remote) {
+		remote = (mdev->state.pdsk == D_UP_TO_DATE ||
+			    (mdev->state.pdsk == D_INCONSISTENT &&
+			     mdev->state.conn >= C_CONNECTED));
+		if (!remote)
+			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
+		if (!(local || remote)) {
+			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+			spin_unlock_irq(&mdev->req_lock);
+			goto fail_free_complete;
+		}
+	}
+
+	if (b && mdev->unused_spare_tle == NULL) {
+		mdev->unused_spare_tle = b;
+		b = NULL;
+	}
+	if (rw == WRITE && remote &&
+	    mdev->unused_spare_tle == NULL &&
+	    test_bit(CREATE_BARRIER, &mdev->flags)) {
+		/* someone closed the current epoch
+		 * while we were grabbing the spinlock */
+		spin_unlock_irq(&mdev->req_lock);
+		goto allocate_barrier;
+	}
+
+
+	/* Update disk stats */
+	_drbd_start_io_acct(mdev, req, bio);
+
+	/* _maybe_start_new_epoch(mdev);
+	 * If we need to generate a write barrier packet, we have to add the
+	 * new epoch (barrier) object, and queue the barrier packet for sending,
+	 * and queue the req's data after it _within the same lock_, otherwise
+	 * we have race conditions were the reorder domains could be mixed up.
+	 *
+	 * Even read requests may start a new epoch and queue the corresponding
+	 * barrier packet.  To get the write ordering right, we only have to
+	 * make sure that, if this is a write request and it triggered a
+	 * barrier packet, this request is queued within the same spinlock. */
+	if (remote && mdev->unused_spare_tle &&
+	    test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
+		_tl_add_barrier(mdev, mdev->unused_spare_tle);
+		mdev->unused_spare_tle = NULL;
+	} else {
+		D_ASSERT(!(remote && rw == WRITE &&
+			   test_bit(CREATE_BARRIER, &mdev->flags)));
+	}
+
+	/* NOTE
+	 * Actually, 'local' may be wrong here already, since we may have failed
+	 * to write to the meta data, and may become wrong anytime because of
+	 * local io-error for some other request, which would lead to us
+	 * "detaching" the local disk.
+	 *
+	 * 'remote' may become wrong any time because the network could fail.
+	 *
+	 * This is a harmless race condition, though, since it is handled
+	 * correctly at the appropriate places; so it just defers the failure
+	 * of the respective operation.
+	 */
+
+	/* mark them early for readability.
+	 * this just sets some state flags. */
+	if (remote)
+		_req_mod(req, to_be_send);
+	if (local)
+		_req_mod(req, to_be_submitted);
+
+	/* check this request on the collision detection hash tables.
+	 * if we have a conflict, just complete it here.
+	 * THINK do we want to check reads, too? (I don't think so...) */
+	if (rw == WRITE && _req_conflicts(req)) {
+		/* this is a conflicting request.
+		 * even though it may have been only _partially_
+		 * overlapping with one of the currently pending requests,
+		 * without even submitting or sending it, we will
+		 * pretend that it was successfully served right now.
+		 */
+		if (local) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			drbd_al_complete_io(mdev, req->sector);
+			put_ldev(mdev);
+			local = 0;
+		}
+		if (remote)
+			dec_ap_pending(mdev);
+		_drbd_end_io_acct(mdev, req);
+		/* THINK: do we want to fail it (-EIO), or pretend success? */
+		bio_endio(req->master_bio, 0);
+		req->master_bio = NULL;
+		dec_ap_bio(mdev);
+		drbd_req_free(req);
+		remote = 0;
+	}
+
+	/* NOTE remote first: to get the concurrent write detection right,
+	 * we must register the request before start of local IO.  */
+	if (remote) {
+		/* either WRITE and C_CONNECTED,
+		 * or READ, and no local disk,
+		 * or READ, but not in sync.
+		 */
+		_req_mod(req, (rw == WRITE)
+				? queue_for_net_write
+				: queue_for_net_read);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+	kfree(b); /* if someone else has beaten us to it... */
+
+	if (local) {
+		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+
+		if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
+				     : rw == READ  ? DRBD_FAULT_DT_RD
+				     :               DRBD_FAULT_DT_RA))
+			bio_endio(req->private_bio, -EIO);
+		else
+			generic_make_request(req->private_bio);
+	}
+
+	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
+	 * we plug after submit, so we won't miss an unplug event */
+	drbd_plug_device(mdev);
+
+	return 0;
+
+fail_free_complete:
+	if (rw == WRITE && local)
+		drbd_al_complete_io(mdev, sector);
+fail_and_free_req:
+	if (local) {
+		bio_put(req->private_bio);
+		req->private_bio = NULL;
+		put_ldev(mdev);
+	}
+	bio_endio(bio, err);
+	drbd_req_free(req);
+	dec_ap_bio(mdev);
+	kfree(b);
+
+	return 0;
+}
+
+/* helper function for drbd_make_request
+ * if we can determine just by the mdev (state) that this request will fail,
+ * return 1
+ * otherwise return 0
+ */
+static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
+{
+	/* Unconfigured */
+	if (mdev->state.conn == C_DISCONNECTING &&
+	    mdev->state.disk == D_DISKLESS)
+		return 1;
+
+	if (mdev->state.role != R_PRIMARY &&
+		(!allow_oos || is_write)) {
+		if (__ratelimit(&drbd_ratelimit_state)) {
+			dev_err(DEV, "Process %s[%u] tried to %s; "
+			    "since we are not in Primary state, "
+			    "we cannot allow this\n",
+			    current->comm, current->pid,
+			    is_write ? "WRITE" : "READ");
+		}
+		return 1;
+	}
+
+	/*
+	 * Paranoia: we might have been primary, but sync target, or
+	 * even diskless, then lost the connection.
+	 * This should have been handled (panic? suspend?) somewhere
+	 * else. But maybe it was not, so check again here.
+	 * Caution: as long as we do not have a read/write lock on mdev,
+	 * to serialize state changes, this is racy, since we may lose
+	 * the connection *after* we test for the cstate.
+	 */
+	if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+int drbd_make_request_26(struct request_queue *q, struct bio *bio)
+{
+	unsigned int s_enr, e_enr;
+	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+
+	if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
+		bio_endio(bio, -EPERM);
+		return 0;
+	}
+
+	/* Reject barrier requests if we know the underlying device does
+	 * not support them.
+	 * XXX: Need to get this info from peer as well some how so we
+	 * XXX: reject if EITHER side/data/metadata area does not support them.
+	 *
+	 * because of those XXX, this is not yet enabled,
+	 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
+	 */
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
+		/* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
+		bio_endio(bio, -EOPNOTSUPP);
+		return 0;
+	}
+
+	/*
+	 * what we "blindly" assume:
+	 */
+	D_ASSERT(bio->bi_size > 0);
+	D_ASSERT((bio->bi_size & 0x1ff) == 0);
+	D_ASSERT(bio->bi_idx == 0);
+
+	/* to make some things easier, force alignment of requests within the
+	 * granularity of our hash tables */
+	s_enr = bio->bi_sector >> HT_SHIFT;
+	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
+
+	if (likely(s_enr == e_enr)) {
+		inc_ap_bio(mdev, 1);
+		return drbd_make_request_common(mdev, bio);
+	}
+
+	/* can this bio be split generically?
+	 * Maybe add our own split-arbitrary-bios function. */
+	if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
+		/* rather error out here than BUG in bio_split */
+		dev_err(DEV, "bio would need to, but cannot, be split: "
+		    "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
+		    bio->bi_vcnt, bio->bi_idx, bio->bi_size,
+		    (unsigned long long)bio->bi_sector);
+		bio_endio(bio, -EINVAL);
+	} else {
+		/* This bio crosses some boundary, so we have to split it. */
+		struct bio_pair *bp;
+		/* works for the "do not cross hash slot boundaries" case
+		 * e.g. sector 262269, size 4096
+		 * s_enr = 262269 >> 6 = 4097
+		 * e_enr = (262269+8-1) >> 6 = 4098
+		 * HT_SHIFT = 6
+		 * sps = 64, mask = 63
+		 * first_sectors = 64 - (262269 & 63) = 3
+		 */
+		const sector_t sect = bio->bi_sector;
+		const int sps = 1 << HT_SHIFT; /* sectors per slot */
+		const int mask = sps - 1;
+		const sector_t first_sectors = sps - (sect & mask);
+		bp = bio_split(bio,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+				bio_split_pool,
+#endif
+				first_sectors);
+
+		/* we need to get a "reference count" (ap_bio_cnt)
+		 * to avoid races with the disconnect/reconnect/suspend code.
+		 * In case we need to split the bio here, we need to get two references
+		 * atomically, otherwise we might deadlock when trying to submit the
+		 * second one! */
+		inc_ap_bio(mdev, 2);
+
+		D_ASSERT(e_enr == s_enr + 1);
+
+		drbd_make_request_common(mdev, &bp->bio1);
+		drbd_make_request_common(mdev, &bp->bio2);
+		bio_pair_release(bp);
+	}
+	return 0;
+}
+
+/* This is called by bio_add_page().  With this function we reduce
+ * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
+ * units (was AL_EXTENTs).
+ *
+ * we do the calculation within the lower 32bit of the byte offsets,
+ * since we don't care for actual offset, but only check whether it
+ * would cross "activity log extent" boundaries.
+ *
+ * As long as the BIO is empty we have to allow at least one bvec,
+ * regardless of size and offset.  so the resulting bio may still
+ * cross extent boundaries.  those are dealt with (bio_split) in
+ * drbd_make_request_26.
+ */
+int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+	unsigned int bio_offset =
+		(unsigned int)bvm->bi_sector << 9; /* 32 bit */
+	unsigned int bio_size = bvm->bi_size;
+	int limit, backing_limit;
+
+	limit = DRBD_MAX_SEGMENT_SIZE
+	      - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
+	if (limit < 0)
+		limit = 0;
+	if (bio_size == 0) {
+		if (limit <= bvec->bv_len)
+			limit = bvec->bv_len;
+	} else if (limit && get_ldev(mdev)) {
+		struct request_queue * const b =
+			mdev->ldev->backing_bdev->bd_disk->queue;
+		if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
+			backing_limit = b->merge_bvec_fn(b, bvm, bvec);
+			limit = min(limit, backing_limit);
+		}
+		put_ldev(mdev);
+	}
+	return limit;
+}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 0000000..f22c1bc
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,326 @@
+/*
+   drbd_req.h
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+   Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+
+   DRBD is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   DRBD is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_REQ_H
+#define _DRBD_REQ_H
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_wrappers.h"
+
+/* The request callbacks will be called in irq context by the IDE drivers,
+   and in Softirqs/Tasklets/BH context by the SCSI drivers,
+   and by the receiver and worker in kernel-thread context.
+   Try to get the locking right :) */
+
+/*
+ * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
+ * associated with IO requests originating from the block layer above us.
+ *
+ * There are quite a few things that may happen to a drbd request
+ * during its lifetime.
+ *
+ *  It will be created.
+ *  It will be marked with the intention to be
+ *    submitted to local disk and/or
+ *    send via the network.
+ *
+ *  It has to be placed on the transfer log and other housekeeping lists,
+ *  In case we have a network connection.
+ *
+ *  It may be identified as a concurrent (write) request
+ *    and be handled accordingly.
+ *
+ *  It may me handed over to the local disk subsystem.
+ *  It may be completed by the local disk subsystem,
+ *    either sucessfully or with io-error.
+ *  In case it is a READ request, and it failed locally,
+ *    it may be retried remotely.
+ *
+ *  It may be queued for sending.
+ *  It may be handed over to the network stack,
+ *    which may fail.
+ *  It may be acknowledged by the "peer" according to the wire_protocol in use.
+ *    this may be a negative ack.
+ *  It may receive a faked ack when the network connection is lost and the
+ *  transfer log is cleaned up.
+ *  Sending may be canceled due to network connection loss.
+ *  When it finally has outlived its time,
+ *    corresponding dirty bits in the resync-bitmap may be cleared or set,
+ *    it will be destroyed,
+ *    and completion will be signalled to the originator,
+ *      with or without "success".
+ */
+
+enum drbd_req_event {
+	created,
+	to_be_send,
+	to_be_submitted,
+
+	/* XXX yes, now I am inconsistent...
+	 * these two are not "events" but "actions"
+	 * oh, well... */
+	queue_for_net_write,
+	queue_for_net_read,
+
+	send_canceled,
+	send_failed,
+	handed_over_to_network,
+	connection_lost_while_pending,
+	recv_acked_by_peer,
+	write_acked_by_peer,
+	write_acked_by_peer_and_sis, /* and set_in_sync */
+	conflict_discarded_by_peer,
+	neg_acked,
+	barrier_acked, /* in protocol A and B */
+	data_received, /* (remote read) */
+
+	read_completed_with_error,
+	read_ahead_completed_with_error,
+	write_completed_with_error,
+	completed_ok,
+	nothing, /* for tracing only */
+};
+
+/* encoding of request states for now.  we don't actually need that many bits.
+ * we don't need to do atomic bit operations either, since most of the time we
+ * need to look at the connection state and/or manipulate some lists at the
+ * same time, so we should hold the request lock anyways.
+ */
+enum drbd_req_state_bits {
+	/* 210
+	 * 000: no local possible
+	 * 001: to be submitted
+	 *    UNUSED, we could map: 011: submitted, completion still pending
+	 * 110: completed ok
+	 * 010: completed with error
+	 */
+	__RQ_LOCAL_PENDING,
+	__RQ_LOCAL_COMPLETED,
+	__RQ_LOCAL_OK,
+
+	/* 76543
+	 * 00000: no network possible
+	 * 00001: to be send
+	 * 00011: to be send, on worker queue
+	 * 00101: sent, expecting recv_ack (B) or write_ack (C)
+	 * 11101: sent,
+	 *        recv_ack (B) or implicit "ack" (A),
+	 *        still waiting for the barrier ack.
+	 *        master_bio may already be completed and invalidated.
+	 * 11100: write_acked (C),
+	 *        data_received (for remote read, any protocol)
+	 *        or finally the barrier ack has arrived (B,A)...
+	 *        request can be freed
+	 * 01100: neg-acked (write, protocol C)
+	 *        or neg-d-acked (read, any protocol)
+	 *        or killed from the transfer log
+	 *        during cleanup after connection loss
+	 *        request can be freed
+	 * 01000: canceled or send failed...
+	 *        request can be freed
+	 */
+
+	/* if "SENT" is not set, yet, this can still fail or be canceled.
+	 * if "SENT" is set already, we still wait for an Ack packet.
+	 * when cleared, the master_bio may be completed.
+	 * in (B,A) the request object may still linger on the transaction log
+	 * until the corresponding barrier ack comes in */
+	__RQ_NET_PENDING,
+
+	/* If it is QUEUED, and it is a WRITE, it is also registered in the
+	 * transfer log. Currently we need this flag to avoid conflicts between
+	 * worker canceling the request and tl_clear_barrier killing it from
+	 * transfer log.  We should restructure the code so this conflict does
+	 * no longer occur. */
+	__RQ_NET_QUEUED,
+
+	/* well, actually only "handed over to the network stack".
+	 *
+	 * TODO can potentially be dropped because of the similar meaning
+	 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
+	 * however it is not exactly the same. before we drop it
+	 * we must ensure that we can tell a request with network part
+	 * from a request without, regardless of what happens to it. */
+	__RQ_NET_SENT,
+
+	/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
+	 * basically this means the corresponding P_BARRIER_ACK was received */
+	__RQ_NET_DONE,
+
+	/* whether or not we know (C) or pretend (B,A) that the write
+	 * was successfully written on the peer.
+	 */
+	__RQ_NET_OK,
+
+	/* peer called drbd_set_in_sync() for this write */
+	__RQ_NET_SIS,
+
+	/* keep this last, its for the RQ_NET_MASK */
+	__RQ_NET_MAX,
+};
+
+#define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)
+
+#define RQ_LOCAL_MASK      ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
+
+#define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
+#define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_SENT        (1UL << __RQ_NET_SENT)
+#define RQ_NET_DONE        (1UL << __RQ_NET_DONE)
+#define RQ_NET_OK          (1UL << __RQ_NET_OK)
+#define RQ_NET_SIS         (1UL << __RQ_NET_SIS)
+
+/* 0x1f8 */
+#define RQ_NET_MASK        (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+
+/* epoch entries */
+static inline
+struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+	BUG_ON(mdev->ee_hash_s == 0);
+	return mdev->ee_hash +
+		((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
+}
+
+/* transfer log (drbd_request objects) */
+static inline
+struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+	BUG_ON(mdev->tl_hash_s == 0);
+	return mdev->tl_hash +
+		((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
+}
+
+/* application reads (drbd_request objects) */
+static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+	return mdev->app_reads_hash
+		+ ((unsigned int)(sector) % APP_R_HSIZE);
+}
+
+/* when we receive the answer for a read request,
+ * verify that we actually know about it */
+static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
+	u64 id, sector_t sector)
+{
+	struct hlist_head *slot = ar_hash_slot(mdev, sector);
+	struct hlist_node *n;
+	struct drbd_request *req;
+
+	hlist_for_each_entry(req, n, slot, colision) {
+		if ((unsigned long)req == (unsigned long)id) {
+			D_ASSERT(req->sector == sector);
+			return req;
+		}
+	}
+	return NULL;
+}
+
+static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
+	struct bio *bio_src)
+{
+	struct bio *bio;
+	struct drbd_request *req =
+		mempool_alloc(drbd_request_mempool, GFP_NOIO);
+	if (likely(req)) {
+		bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+		req->rq_state    = 0;
+		req->mdev        = mdev;
+		req->master_bio  = bio_src;
+		req->private_bio = bio;
+		req->epoch       = 0;
+		req->sector      = bio->bi_sector;
+		req->size        = bio->bi_size;
+		req->start_time  = jiffies;
+		INIT_HLIST_NODE(&req->colision);
+		INIT_LIST_HEAD(&req->tl_requests);
+		INIT_LIST_HEAD(&req->w.list);
+
+		bio->bi_private  = req;
+		bio->bi_end_io   = drbd_endio_pri;
+		bio->bi_next     = NULL;
+	}
+	return req;
+}
+
+static inline void drbd_req_free(struct drbd_request *req)
+{
+	mempool_free(req, drbd_request_mempool);
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+	return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
+
+/* Short lived temporary struct on the stack.
+ * We could squirrel the error to be returned into
+ * bio->bi_size, or similar. But that would be too ugly. */
+struct bio_and_error {
+	struct bio *bio;
+	int error;
+};
+
+extern void _req_may_be_done(struct drbd_request *req,
+		struct bio_and_error *m);
+extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m);
+extern void complete_master_bio(struct drbd_conf *mdev,
+		struct bio_and_error *m);
+
+/* use this if you don't want to deal with calling complete_master_bio()
+ * outside the spinlock, e.g. when walking some list on cleanup. */
+static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
+{
+	struct drbd_conf *mdev = req->mdev;
+	struct bio_and_error m;
+
+	/* __req_mod possibly frees req, do not touch req after that! */
+	__req_mod(req, what, &m);
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+}
+
+/* completion of master bio is outside of spinlock.
+ * If you need it irqsave, do it your self! */
+static inline void req_mod(struct drbd_request *req,
+		enum drbd_req_event what)
+{
+	struct drbd_conf *mdev = req->mdev;
+	struct bio_and_error m;
+	spin_lock_irq(&mdev->req_lock);
+	__req_mod(req, what, &m);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+}
+#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 0000000..76863e3
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,113 @@
+/*
+  drbd.h
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/drbd.h>
+
+static const char *drbd_conn_s_names[] = {
+	[C_STANDALONE]       = "StandAlone",
+	[C_DISCONNECTING]    = "Disconnecting",
+	[C_UNCONNECTED]      = "Unconnected",
+	[C_TIMEOUT]          = "Timeout",
+	[C_BROKEN_PIPE]      = "BrokenPipe",
+	[C_NETWORK_FAILURE]  = "NetworkFailure",
+	[C_PROTOCOL_ERROR]   = "ProtocolError",
+	[C_WF_CONNECTION]    = "WFConnection",
+	[C_WF_REPORT_PARAMS] = "WFReportParams",
+	[C_TEAR_DOWN]        = "TearDown",
+	[C_CONNECTED]        = "Connected",
+	[C_STARTING_SYNC_S]  = "StartingSyncS",
+	[C_STARTING_SYNC_T]  = "StartingSyncT",
+	[C_WF_BITMAP_S]      = "WFBitMapS",
+	[C_WF_BITMAP_T]      = "WFBitMapT",
+	[C_WF_SYNC_UUID]     = "WFSyncUUID",
+	[C_SYNC_SOURCE]      = "SyncSource",
+	[C_SYNC_TARGET]      = "SyncTarget",
+	[C_PAUSED_SYNC_S]    = "PausedSyncS",
+	[C_PAUSED_SYNC_T]    = "PausedSyncT",
+	[C_VERIFY_S]         = "VerifyS",
+	[C_VERIFY_T]         = "VerifyT",
+};
+
+static const char *drbd_role_s_names[] = {
+	[R_PRIMARY]   = "Primary",
+	[R_SECONDARY] = "Secondary",
+	[R_UNKNOWN]   = "Unknown"
+};
+
+static const char *drbd_disk_s_names[] = {
+	[D_DISKLESS]     = "Diskless",
+	[D_ATTACHING]    = "Attaching",
+	[D_FAILED]       = "Failed",
+	[D_NEGOTIATING]  = "Negotiating",
+	[D_INCONSISTENT] = "Inconsistent",
+	[D_OUTDATED]     = "Outdated",
+	[D_UNKNOWN]      = "DUnknown",
+	[D_CONSISTENT]   = "Consistent",
+	[D_UP_TO_DATE]   = "UpToDate",
+};
+
+static const char *drbd_state_sw_errors[] = {
+	[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
+	[-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
+	[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
+	[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
+	[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
+	[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
+	[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
+	[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
+	[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
+	[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
+	[-SS_DEVICE_IN_USE] = "Device is held open by someone",
+	[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
+	[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
+	[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
+	[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
+	[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
+	[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
+	[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+};
+
+const char *drbd_conn_str(enum drbd_conns s)
+{
+	/* enums are unsigned... */
+	return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
+}
+
+const char *drbd_role_str(enum drbd_role s)
+{
+	return s > R_SECONDARY   ? "TOO_LARGE" : drbd_role_s_names[s];
+}
+
+const char *drbd_disk_str(enum drbd_disk_state s)
+{
+	return s > D_UP_TO_DATE    ? "TOO_LARGE" : drbd_disk_s_names[s];
+}
+
+const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
+{
+	return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
+	       err > SS_TWO_PRIMARIES ? "TOO_LARGE"
+			: drbd_state_sw_errors[-err];
+}
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 0000000..fc82400
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
+/*
+-*- linux-c -*-
+   drbd_receiver.c
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_VLI_H
+#define _DRBD_VLI_H
+
+/*
+ * At a granularity of 4KiB storage represented per bit,
+ * and stroage sizes of several TiB,
+ * and possibly small-bandwidth replication,
+ * the bitmap transfer time can take much too long,
+ * if transmitted in plain text.
+ *
+ * We try to reduce the transfered bitmap information
+ * by encoding runlengths of bit polarity.
+ *
+ * We never actually need to encode a "zero" (runlengths are positive).
+ * But then we have to store the value of the first bit.
+ * The first bit of information thus shall encode if the first runlength
+ * gives the number of set or unset bits.
+ *
+ * We assume that large areas are either completely set or unset,
+ * which gives good compression with any runlength method,
+ * even when encoding the runlength as fixed size 32bit/64bit integers.
+ *
+ * Still, there may be areas where the polarity flips every few bits,
+ * and encoding the runlength sequence of those areas with fix size
+ * integers would be much worse than plaintext.
+ *
+ * We want to encode small runlength values with minimum code length,
+ * while still being able to encode a Huge run of all zeros.
+ *
+ * Thus we need a Variable Length Integer encoding, VLI.
+ *
+ * For some cases, we produce more code bits than plaintext input.
+ * We need to send incompressible chunks as plaintext, skip over them
+ * and then see if the next chunk compresses better.
+ *
+ * We don't care too much about "excellent" compression ratio for large
+ * runlengths (all set/all clear): whether we achieve a factor of 100
+ * or 1000 is not that much of an issue.
+ * We do not want to waste too much on short runlengths in the "noisy"
+ * parts of the bitmap, though.
+ *
+ * There are endless variants of VLI, we experimented with:
+ *  * simple byte-based
+ *  * various bit based with different code word length.
+ *
+ * To avoid yet an other configuration parameter (choice of bitmap compression
+ * algorithm) which was difficult to explain and tune, we just chose the one
+ * variant that turned out best in all test cases.
+ * Based on real world usage patterns, with device sizes ranging from a few GiB
+ * to several TiB, file server/mailserver/webserver/mysql/postgress,
+ * mostly idle to really busy, the all time winner (though sometimes only
+ * marginally better) is:
+ */
+
+/*
+ * encoding is "visualised" as
+ * __little endian__ bitstream, least significant bit first (left most)
+ *
+ * this particular encoding is chosen so that the prefix code
+ * starts as unary encoding the level, then modified so that
+ * 10 levels can be described in 8bit, with minimal overhead
+ * for the smaller levels.
+ *
+ * Number of data bits follow fibonacci sequence, with the exception of the
+ * last level (+1 data bit, so it makes 64bit total).  The only worse code when
+ * encoding bit polarity runlength is 1 plain bits => 2 code bits.
+prefix    data bits                                    max val  Nº data bits
+0 x                                                         0x2            1
+10 x                                                        0x4            1
+110 xx                                                      0x8            2
+1110 xxx                                                   0x10            3
+11110 xxx xx                                               0x30            5
+111110 xx xxxxxx                                          0x130            8
+11111100  xxxxxxxx xxxxx                                 0x2130           13
+11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
+11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
+11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
+ * maximum encodable value: 0x100000400202130 == 2**56 + some */
+
+/* compression "table":
+ transmitted   x                                0.29
+ as plaintext x                                  ........................
+             x                                   ........................
+            x                                    ........................
+           x    0.59                         0.21........................
+          x      ........................................................
+         x       .. c ...................................................
+        x    0.44.. o ...................................................
+       x .......... d ...................................................
+      x  .......... e ...................................................
+     X.............   ...................................................
+    x.............. b ...................................................
+2.0x............... i ...................................................
+ #X................ t ...................................................
+ #................. s ...........................  plain bits  ..........
+-+-----------------------------------------------------------------------
+ 1             16              32                              64
+*/
+
+/* LEVEL: (total bits, prefix bits, prefix value),
+ * sorted ascending by number of total bits.
+ * The rest of the code table is calculated at compiletime from this. */
+
+/* fibonacci data 1, 1, ... */
+#define VLI_L_1_1() do { \
+	LEVEL( 2, 1, 0x00); \
+	LEVEL( 3, 2, 0x01); \
+	LEVEL( 5, 3, 0x03); \
+	LEVEL( 7, 4, 0x07); \
+	LEVEL(10, 5, 0x0f); \
+	LEVEL(14, 6, 0x1f); \
+	LEVEL(21, 8, 0x3f); \
+	LEVEL(29, 8, 0x7f); \
+	LEVEL(42, 8, 0xbf); \
+	LEVEL(64, 8, 0xff); \
+	} while (0)
+
+/* finds a suitable level to decode the least significant part of in.
+ * returns number of bits consumed.
+ *
+ * BUG() for bad input, as that would mean a buggy code table. */
+static inline int vli_decode_bits(u64 *out, const u64 in)
+{
+	u64 adj = 1;
+
+#define LEVEL(t,b,v)					\
+	do {						\
+		if ((in & ((1 << b) -1)) == v) {	\
+			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
+			return t;			\
+		}					\
+		adj += 1ULL << (t - b);			\
+	} while (0)
+
+	VLI_L_1_1();
+
+	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
+	BUG();
+#undef LEVEL
+}
+
+/* return number of code bits needed,
+ * or negative error number */
+static inline int __vli_encode_bits(u64 *out, const u64 in)
+{
+	u64 max = 0;
+	u64 adj = 1;
+
+	if (in == 0)
+		return -EINVAL;
+
+#define LEVEL(t,b,v) do {		\
+		max += 1ULL << (t - b);	\
+		if (in <= max) {	\
+			if (out)	\
+				*out = ((in - adj) << b) | v;	\
+			return t;	\
+		}			\
+		adj = max + 1;		\
+	} while (0)
+
+	VLI_L_1_1();
+
+	return -EOVERFLOW;
+#undef LEVEL
+}
+
+#undef VLI_L_1_1
+
+/* code from here down is independend of actually used bit code */
+
+/*
+ * Code length is determined by some unique (e.g. unary) prefix.
+ * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
+ * not a byte stream.
+ */
+
+/* for the bitstream, we need a cursor */
+struct bitstream_cursor {
+	/* the current byte */
+	u8 *b;
+	/* the current bit within *b, nomalized: 0..7 */
+	unsigned int bit;
+};
+
+/* initialize cursor to point to first bit of stream */
+static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
+{
+	cur->b = s;
+	cur->bit = 0;
+}
+
+/* advance cursor by that many bits; maximum expected input value: 64,
+ * but depending on VLI implementation, it may be more. */
+static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
+{
+	bits += cur->bit;
+	cur->b = cur->b + (bits >> 3);
+	cur->bit = bits & 7;
+}
+
+/* the bitstream itself knows its length */
+struct bitstream {
+	struct bitstream_cursor cur;
+	unsigned char *buf;
+	size_t buf_len;		/* in bytes */
+
+	/* for input stream:
+	 * number of trailing 0 bits for padding
+	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
+	unsigned int pad_bits;
+};
+
+static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
+{
+	bs->buf = s;
+	bs->buf_len = len;
+	bs->pad_bits = pad_bits;
+	bitstream_cursor_reset(&bs->cur, bs->buf);
+}
+
+static inline void bitstream_rewind(struct bitstream *bs)
+{
+	bitstream_cursor_reset(&bs->cur, bs->buf);
+	memset(bs->buf, 0, bs->buf_len);
+}
+
+/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
+ * Ignores "pad_bits".
+ * Returns zero if bits == 0 (nothing to do).
+ * Returns number of bits used if successful.
+ *
+ * If there is not enough room left in bitstream,
+ * leaves bitstream unchanged and returns -ENOBUFS.
+ */
+static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
+{
+	unsigned char *b = bs->cur.b;
+	unsigned int tmp;
+
+	if (bits == 0)
+		return 0;
+
+	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
+		return -ENOBUFS;
+
+	/* paranoia: strip off hi bits; they should not be set anyways. */
+	if (bits < 64)
+		val &= ~0ULL >> (64 - bits);
+
+	*b++ |= (val & 0xff) << bs->cur.bit;
+
+	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
+		*b++ |= (val >> tmp) & 0xff;
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	return bits;
+}
+
+/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
+ *
+ * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
+ *
+ * If there are less than the requested number of valid bits left in the
+ * bitstream, still fetches all available bits.
+ *
+ * Returns number of actually fetched bits.
+ */
+static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
+{
+	u64 val;
+	unsigned int n;
+
+	if (bits > 64)
+		return -EINVAL;
+
+	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
+		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
+			- bs->cur.bit - bs->pad_bits;
+
+	if (bits == 0) {
+		*out = 0;
+		return 0;
+	}
+
+	/* get the high bits */
+	val = 0;
+	n = (bs->cur.bit + bits + 7) >> 3;
+	/* n may be at most 9, if cur.bit + bits > 64 */
+	/* which means this copies at most 8 byte */
+	if (n) {
+		memcpy(&val, bs->cur.b+1, n - 1);
+		val = le64_to_cpu(val) << (8 - bs->cur.bit);
+	}
+
+	/* we still need the low bits */
+	val |= bs->cur.b[0] >> bs->cur.bit;
+
+	/* and mask out bits we don't want */
+	val &= ~0ULL >> (64 - bits);
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	*out = val;
+
+	return bits;
+}
+
+/* encodes @in as vli into @bs;
+
+ * return values
+ *  > 0: number of bits successfully stored in bitstream
+ * -ENOBUFS @bs is full
+ * -EINVAL input zero (invalid)
+ * -EOVERFLOW input too large for this vli code (invalid)
+ */
+static inline int vli_encode_bits(struct bitstream *bs, u64 in)
+{
+	u64 code = code;
+	int bits = __vli_encode_bits(&code, in);
+
+	if (bits <= 0)
+		return bits;
+
+	return bitstream_put_bits(bs, code, bits);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 0000000..ed8796f
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,1512 @@
+/*
+   drbd_worker.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/drbd.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+#define SLEEP_TIME (HZ/10)
+
+static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
+
+
+
+/* defined here:
+   drbd_md_io_complete
+   drbd_endio_write_sec
+   drbd_endio_read_sec
+   drbd_endio_pri
+
+ * more endio handlers:
+   atodb_endio in drbd_actlog.c
+   drbd_bm_async_io_complete in drbd_bitmap.c
+
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
+
+
+/* About the global_state_lock
+   Each state transition on an device holds a read lock. In case we have
+   to evaluate the sync after dependencies, we grab a write lock, because
+   we need stable states on all devices for that.  */
+rwlock_t global_state_lock;
+
+/* used for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io()
+ */
+void drbd_md_io_complete(struct bio *bio, int error)
+{
+	struct drbd_md_io *md_io;
+
+	md_io = (struct drbd_md_io *)bio->bi_private;
+	md_io->error = error;
+
+	complete(&md_io->event);
+}
+
+/* reads on behalf of the partner,
+ * "submitted" by the receiver
+ */
+void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
+{
+	unsigned long flags = 0;
+	struct drbd_epoch_entry *e = NULL;
+	struct drbd_conf *mdev;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+	e = bio->bi_private;
+	mdev = e->mdev;
+
+	if (error)
+		dev_warn(DEV, "read: error=%d s=%llus\n", error,
+				(unsigned long long)e->sector);
+	if (!error && !uptodate) {
+		dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
+				(unsigned long long)e->sector);
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+	D_ASSERT(e->block_id != ID_VACANT);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	mdev->read_cnt += e->size >> 9;
+	list_del(&e->w.list);
+	if (list_empty(&mdev->read_ee))
+		wake_up(&mdev->ee_wait);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	drbd_chk_io_error(mdev, error, FALSE);
+	drbd_queue_work(&mdev->data.work, &e->w);
+	put_ldev(mdev);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
+{
+	unsigned long flags = 0;
+	struct drbd_epoch_entry *e = NULL;
+	struct drbd_conf *mdev;
+	sector_t e_sector;
+	int do_wake;
+	int is_syncer_req;
+	int do_al_complete_io;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+	int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
+
+	e = bio->bi_private;
+	mdev = e->mdev;
+
+	if (error)
+		dev_warn(DEV, "write: error=%d s=%llus\n", error,
+				(unsigned long long)e->sector);
+	if (!error && !uptodate) {
+		dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
+				(unsigned long long)e->sector);
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+	/* error == -ENOTSUPP would be a better test,
+	 * alas it is not reliable */
+	if (error && is_barrier && e->flags & EE_IS_BARRIER) {
+		drbd_bump_write_ordering(mdev, WO_bdev_flush);
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		list_del(&e->w.list);
+		e->w.cb = w_e_reissue;
+		/* put_ldev actually happens below, once we come here again. */
+		__release(local);
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+		drbd_queue_work(&mdev->data.work, &e->w);
+		return;
+	}
+
+	D_ASSERT(e->block_id != ID_VACANT);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	mdev->writ_cnt += e->size >> 9;
+	is_syncer_req = is_syncer_block_id(e->block_id);
+
+	/* after we moved e to done_ee,
+	 * we may no longer access it,
+	 * it may be freed/reused already!
+	 * (as soon as we release the req_lock) */
+	e_sector = e->sector;
+	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
+
+	list_del(&e->w.list); /* has been on active_ee or sync_ee */
+	list_add_tail(&e->w.list, &mdev->done_ee);
+
+	/* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
+	 * neither did we wake possibly waiting conflicting requests.
+	 * done from "drbd_process_done_ee" within the appropriate w.cb
+	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
+
+	do_wake = is_syncer_req
+		? list_empty(&mdev->sync_ee)
+		: list_empty(&mdev->active_ee);
+
+	if (error)
+		__drbd_chk_io_error(mdev, FALSE);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	if (is_syncer_req)
+		drbd_rs_complete_io(mdev, e_sector);
+
+	if (do_wake)
+		wake_up(&mdev->ee_wait);
+
+	if (do_al_complete_io)
+		drbd_al_complete_io(mdev, e_sector);
+
+	wake_asender(mdev);
+	put_ldev(mdev);
+
+}
+
+/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
+ */
+void drbd_endio_pri(struct bio *bio, int error)
+{
+	unsigned long flags;
+	struct drbd_request *req = bio->bi_private;
+	struct drbd_conf *mdev = req->mdev;
+	struct bio_and_error m;
+	enum drbd_req_event what;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+	if (error)
+		dev_warn(DEV, "p %s: error=%d\n",
+			 bio_data_dir(bio) == WRITE ? "write" : "read", error);
+	if (!error && !uptodate) {
+		dev_warn(DEV, "p %s: setting error to -EIO\n",
+			 bio_data_dir(bio) == WRITE ? "write" : "read");
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+	/* to avoid recursion in __req_mod */
+	if (unlikely(error)) {
+		what = (bio_data_dir(bio) == WRITE)
+			? write_completed_with_error
+			: (bio_rw(bio) == READA)
+			  ? read_completed_with_error
+			  : read_ahead_completed_with_error;
+	} else
+		what = completed_ok;
+
+	bio_put(req->private_bio);
+	req->private_bio = ERR_PTR(error);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	__req_mod(req, what, &m);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+}
+
+int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+	/* NOTE: mdev->ldev can be NULL by the time we get here! */
+	/* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
+
+	/* the only way this callback is scheduled is from _req_may_be_done,
+	 * when it is done and had a local write error, see comments there */
+	drbd_req_free(req);
+
+	return TRUE;
+}
+
+int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+	/* We should not detach for read io-error,
+	 * but try to WRITE the P_DATA_REPLY to the failed location,
+	 * to give the disk the chance to relocate that block */
+
+	spin_lock_irq(&mdev->req_lock);
+	if (cancel ||
+	    mdev->state.conn < C_CONNECTED ||
+	    mdev->state.pdsk <= D_INCONSISTENT) {
+		_req_mod(req, send_canceled);
+		spin_unlock_irq(&mdev->req_lock);
+		dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
+		return 1;
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	return w_send_read_req(mdev, w, 0);
+}
+
+int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	ERR_IF(cancel) return 1;
+	dev_err(DEV, "resync inactive, but callback triggered??\n");
+	return 1; /* Simply ignore this! */
+}
+
+void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
+{
+	struct hash_desc desc;
+	struct scatterlist sg;
+	struct bio_vec *bvec;
+	int i;
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	sg_init_table(&sg, 1);
+	crypto_hash_init(&desc);
+
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
+		crypto_hash_update(&desc, &sg, sg.length);
+	}
+	crypto_hash_final(&desc, digest);
+}
+
+static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int digest_size;
+	void *digest;
+	int ok;
+
+	D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		return 1;
+	}
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+		digest = kmalloc(digest_size, GFP_NOIO);
+		if (digest) {
+			drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+
+			inc_rs_pending(mdev);
+			ok = drbd_send_drequest_csum(mdev,
+						     e->sector,
+						     e->size,
+						     digest,
+						     digest_size,
+						     P_CSUM_RS_REQUEST);
+			kfree(digest);
+		} else {
+			dev_err(DEV, "kmalloc() of digest failed.\n");
+			ok = 0;
+		}
+	} else
+		ok = 1;
+
+	drbd_free_ee(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
+	return ok;
+}
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	struct drbd_epoch_entry *e;
+
+	if (!get_ldev(mdev))
+		return 0;
+
+	/* GFP_TRY, because if there is no memory available right now, this may
+	 * be rescheduled for later. It is "only" background resync, after all. */
+	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
+	if (!e) {
+		put_ldev(mdev);
+		return 2;
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	list_add(&e->w.list, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	e->private_bio->bi_end_io = drbd_endio_read_sec;
+	e->private_bio->bi_rw = READ;
+	e->w.cb = w_e_send_csum;
+
+	mdev->read_cnt += size >> 9;
+	drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
+
+	return 1;
+}
+
+void resync_timer_fn(unsigned long data)
+{
+	unsigned long flags;
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+	int queue;
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+
+	if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
+		queue = 1;
+		if (mdev->state.conn == C_VERIFY_S)
+			mdev->resync_work.cb = w_make_ov_request;
+		else
+			mdev->resync_work.cb = w_make_resync_request;
+	} else {
+		queue = 0;
+		mdev->resync_work.cb = w_resync_inactive;
+	}
+
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	/* harmless race: list_empty outside data.work.q_lock */
+	if (list_empty(&mdev->resync_work.list) && queue)
+		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
+}
+
+int w_make_resync_request(struct drbd_conf *mdev,
+		struct drbd_work *w, int cancel)
+{
+	unsigned long bit;
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+	int max_segment_size = queue_max_segment_size(mdev->rq_queue);
+	int number, i, size, pe, mx;
+	int align, queued, sndbuf;
+
+	if (unlikely(cancel))
+		return 1;
+
+	if (unlikely(mdev->state.conn < C_CONNECTED)) {
+		dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
+		return 0;
+	}
+
+	if (mdev->state.conn != C_SYNC_TARGET)
+		dev_err(DEV, "%s in w_make_resync_request\n",
+			drbd_conn_str(mdev->state.conn));
+
+	if (!get_ldev(mdev)) {
+		/* Since we only need to access mdev->rsync a
+		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
+		   to continue resync with a broken disk makes no sense at
+		   all */
+		dev_err(DEV, "Disk broke down during resync!\n");
+		mdev->resync_work.cb = w_resync_inactive;
+		return 1;
+	}
+
+	number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
+	pe = atomic_read(&mdev->rs_pending_cnt);
+
+	mutex_lock(&mdev->data.mutex);
+	if (mdev->data.socket)
+		mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
+	else
+		mx = 1;
+	mutex_unlock(&mdev->data.mutex);
+
+	/* For resync rates >160MB/sec, allow more pending RS requests */
+	if (number > mx)
+		mx = number;
+
+	/* Limit the number of pending RS requests to no more than the peer's receive buffer */
+	if ((pe + number) > mx) {
+		number = mx - pe;
+	}
+
+	for (i = 0; i < number; i++) {
+		/* Stop generating RS requests, when half of the send buffer is filled */
+		mutex_lock(&mdev->data.mutex);
+		if (mdev->data.socket) {
+			queued = mdev->data.socket->sk->sk_wmem_queued;
+			sndbuf = mdev->data.socket->sk->sk_sndbuf;
+		} else {
+			queued = 1;
+			sndbuf = 0;
+		}
+		mutex_unlock(&mdev->data.mutex);
+		if (queued > sndbuf / 2)
+			goto requeue;
+
+next_sector:
+		size = BM_BLOCK_SIZE;
+		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
+
+		if (bit == -1UL) {
+			mdev->bm_resync_fo = drbd_bm_bits(mdev);
+			mdev->resync_work.cb = w_resync_inactive;
+			put_ldev(mdev);
+			return 1;
+		}
+
+		sector = BM_BIT_TO_SECT(bit);
+
+		if (drbd_try_rs_begin_io(mdev, sector)) {
+			mdev->bm_resync_fo = bit;
+			goto requeue;
+		}
+		mdev->bm_resync_fo = bit + 1;
+
+		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
+			drbd_rs_complete_io(mdev, sector);
+			goto next_sector;
+		}
+
+#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
+		/* try to find some adjacent bits.
+		 * we stop if we have already the maximum req size.
+		 *
+		 * Additionally always align bigger requests, in order to
+		 * be prepared for all stripe sizes of software RAIDs.
+		 *
+		 * we _do_ care about the agreed-upon q->max_segment_size
+		 * here, as splitting up the requests on the other side is more
+		 * difficult.  the consequence is, that on lvm and md and other
+		 * "indirect" devices, this is dead code, since
+		 * q->max_segment_size will be PAGE_SIZE.
+		 */
+		align = 1;
+		for (;;) {
+			if (size + BM_BLOCK_SIZE > max_segment_size)
+				break;
+
+			/* Be always aligned */
+			if (sector & ((1<<(align+3))-1))
+				break;
+
+			/* do not cross extent boundaries */
+			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
+				break;
+			/* now, is it actually dirty, after all?
+			 * caution, drbd_bm_test_bit is tri-state for some
+			 * obscure reason; ( b == 0 ) would get the out-of-band
+			 * only accidentally right because of the "oddly sized"
+			 * adjustment below */
+			if (drbd_bm_test_bit(mdev, bit+1) != 1)
+				break;
+			bit++;
+			size += BM_BLOCK_SIZE;
+			if ((BM_BLOCK_SIZE << align) <= size)
+				align++;
+			i++;
+		}
+		/* if we merged some,
+		 * reset the offset to start the next drbd_bm_find_next from */
+		if (size > BM_BLOCK_SIZE)
+			mdev->bm_resync_fo = bit + 1;
+#endif
+
+		/* adjust very last sectors, in case we are oddly sized */
+		if (sector + (size>>9) > capacity)
+			size = (capacity-sector)<<9;
+		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
+			switch (read_for_csum(mdev, sector, size)) {
+			case 0: /* Disk failure*/
+				put_ldev(mdev);
+				return 0;
+			case 2: /* Allocation failed */
+				drbd_rs_complete_io(mdev, sector);
+				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+				goto requeue;
+			/* case 1: everything ok */
+			}
+		} else {
+			inc_rs_pending(mdev);
+			if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
+					       sector, size, ID_SYNCER)) {
+				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
+				dec_rs_pending(mdev);
+				put_ldev(mdev);
+				return 0;
+			}
+		}
+	}
+
+	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
+		/* last syncer _request_ was sent,
+		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
+		 * next sync group will resume), as soon as we receive the last
+		 * resync data block, and the last bit is cleared.
+		 * until then resync "work" is "inactive" ...
+		 */
+		mdev->resync_work.cb = w_resync_inactive;
+		put_ldev(mdev);
+		return 1;
+	}
+
+ requeue:
+	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
+	put_ldev(mdev);
+	return 1;
+}
+
+static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	int number, i, size;
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+
+	if (unlikely(cancel))
+		return 1;
+
+	if (unlikely(mdev->state.conn < C_CONNECTED)) {
+		dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
+		return 0;
+	}
+
+	number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
+	if (atomic_read(&mdev->rs_pending_cnt) > number)
+		goto requeue;
+
+	number -= atomic_read(&mdev->rs_pending_cnt);
+
+	sector = mdev->ov_position;
+	for (i = 0; i < number; i++) {
+		if (sector >= capacity) {
+			mdev->resync_work.cb = w_resync_inactive;
+			return 1;
+		}
+
+		size = BM_BLOCK_SIZE;
+
+		if (drbd_try_rs_begin_io(mdev, sector)) {
+			mdev->ov_position = sector;
+			goto requeue;
+		}
+
+		if (sector + (size>>9) > capacity)
+			size = (capacity-sector)<<9;
+
+		inc_rs_pending(mdev);
+		if (!drbd_send_ov_request(mdev, sector, size)) {
+			dec_rs_pending(mdev);
+			return 0;
+		}
+		sector += BM_SECT_PER_BIT;
+	}
+	mdev->ov_position = sector;
+
+ requeue:
+	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
+	return 1;
+}
+
+
+int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	kfree(w);
+	ov_oos_print(mdev);
+	drbd_resync_finished(mdev);
+
+	return 1;
+}
+
+static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	kfree(w);
+
+	drbd_resync_finished(mdev);
+
+	return 1;
+}
+
+int drbd_resync_finished(struct drbd_conf *mdev)
+{
+	unsigned long db, dt, dbdt;
+	unsigned long n_oos;
+	union drbd_state os, ns;
+	struct drbd_work *w;
+	char *khelper_cmd = NULL;
+
+	/* Remove all elements from the resync LRU. Since future actions
+	 * might set bits in the (main) bitmap, then the entries in the
+	 * resync LRU would be wrong. */
+	if (drbd_rs_del_all(mdev)) {
+		/* In case this is not possible now, most probably because
+		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
+		 * queue (or even the read operations for those packets
+		 * is not finished by now).   Retry in 100ms. */
+
+		drbd_kick_lo(mdev);
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ / 10);
+		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
+		if (w) {
+			w->cb = w_resync_finished;
+			drbd_queue_work(&mdev->data.work, w);
+			return 1;
+		}
+		dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
+	}
+
+	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+	if (dt <= 0)
+		dt = 1;
+	db = mdev->rs_total;
+	dbdt = Bit2KB(db/dt);
+	mdev->rs_paused /= HZ;
+
+	if (!get_ldev(mdev))
+		goto out;
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+
+	/* This protects us against multiple calls (that can happen in the presence
+	   of application IO), and against connectivity loss just before we arrive here. */
+	if (os.conn <= C_CONNECTED)
+		goto out_unlock;
+
+	ns = os;
+	ns.conn = C_CONNECTED;
+
+	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+	     (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
+	     "Online verify " : "Resync",
+	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);
+
+	n_oos = drbd_bm_total_weight(mdev);
+
+	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
+		if (n_oos) {
+			dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
+			      n_oos, Bit2KB(1));
+			khelper_cmd = "out-of-sync";
+		}
+	} else {
+		D_ASSERT((n_oos - mdev->rs_failed) == 0);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
+			khelper_cmd = "after-resync-target";
+
+		if (mdev->csums_tfm && mdev->rs_total) {
+			const unsigned long s = mdev->rs_same_csum;
+			const unsigned long t = mdev->rs_total;
+			const int ratio =
+				(t == 0)     ? 0 :
+			(t < 100000) ? ((s*100)/t) : (s/(t/100));
+			dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
+			     "transferred %luK total %luK\n",
+			     ratio,
+			     Bit2KB(mdev->rs_same_csum),
+			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
+			     Bit2KB(mdev->rs_total));
+		}
+	}
+
+	if (mdev->rs_failed) {
+		dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			ns.disk = D_INCONSISTENT;
+			ns.pdsk = D_UP_TO_DATE;
+		} else {
+			ns.disk = D_UP_TO_DATE;
+			ns.pdsk = D_INCONSISTENT;
+		}
+	} else {
+		ns.disk = D_UP_TO_DATE;
+		ns.pdsk = D_UP_TO_DATE;
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			if (mdev->p_uuid) {
+				int i;
+				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
+					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
+				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
+				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
+			} else {
+				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
+			}
+		}
+
+		drbd_uuid_set_bm(mdev, 0UL);
+
+		if (mdev->p_uuid) {
+			/* Now the two UUID sets are equal, update what we
+			 * know of the peer. */
+			int i;
+			for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+				mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
+		}
+	}
+
+	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+out_unlock:
+	spin_unlock_irq(&mdev->req_lock);
+	put_ldev(mdev);
+out:
+	mdev->rs_total  = 0;
+	mdev->rs_failed = 0;
+	mdev->rs_paused = 0;
+	mdev->ov_start_sector = 0;
+
+	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
+		dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
+		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
+	}
+
+	if (khelper_cmd)
+		drbd_khelper(mdev, khelper_cmd);
+
+	return 1;
+}
+
+/* helper */
+static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+	if (drbd_bio_has_active_page(e->private_bio)) {
+		/* This might happen if sendpage() has not finished */
+		spin_lock_irq(&mdev->req_lock);
+		list_add_tail(&e->w.list, &mdev->net_ee);
+		spin_unlock_irq(&mdev->req_lock);
+	} else
+		drbd_free_ee(mdev, e);
+}
+
+/**
+ * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		ok = drbd_send_block(mdev, P_DATA_REPLY, e);
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
+			    (unsigned long long)e->sector);
+
+		ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
+	}
+
+	dec_unacked(mdev);
+
+	move_to_net_ee_or_free(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_block() failed\n");
+	return ok;
+}
+
+/**
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	if (get_ldev_if_state(mdev, D_FAILED)) {
+		drbd_rs_complete_io(mdev, e->sector);
+		put_ldev(mdev);
+	}
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
+			inc_rs_pending(mdev);
+			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+		} else {
+			if (__ratelimit(&drbd_ratelimit_state))
+				dev_err(DEV, "Not sending RSDataReply, "
+				    "partner DISKLESS!\n");
+			ok = 1;
+		}
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
+			    (unsigned long long)e->sector);
+
+		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+
+		/* update resync data with failure */
+		drbd_rs_failed_io(mdev, e->sector, e->size);
+	}
+
+	dec_unacked(mdev);
+
+	move_to_net_ee_or_free(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_block() failed\n");
+	return ok;
+}
+
+int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	struct digest_info *di;
+	int digest_size;
+	void *digest = NULL;
+	int ok, eq = 0;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	drbd_rs_complete_io(mdev, e->sector);
+
+	di = (struct digest_info *)(unsigned long)e->block_id;
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		/* quick hack to try to avoid a race against reconfiguration.
+		 * a real fix would be much more involved,
+		 * introducing more locking mechanisms */
+		if (mdev->csums_tfm) {
+			digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+			D_ASSERT(digest_size == di->digest_size);
+			digest = kmalloc(digest_size, GFP_NOIO);
+		}
+		if (digest) {
+			drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+
+		if (eq) {
+			drbd_set_in_sync(mdev, e->sector, e->size);
+			mdev->rs_same_csum++;
+			ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
+		} else {
+			inc_rs_pending(mdev);
+			e->block_id = ID_SYNCER;
+			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+		}
+	} else {
+		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
+	}
+
+	dec_unacked(mdev);
+
+	kfree(di);
+
+	move_to_net_ee_or_free(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_block/ack() failed\n");
+	return ok;
+}
+
+int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int digest_size;
+	void *digest;
+	int ok = 1;
+
+	if (unlikely(cancel))
+		goto out;
+
+	if (unlikely(!drbd_bio_uptodate(e->private_bio)))
+		goto out;
+
+	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+	/* FIXME if this allocation fails, online verify will not terminate! */
+	digest = kmalloc(digest_size, GFP_NOIO);
+	if (digest) {
+		drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+		inc_rs_pending(mdev);
+		ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
+					     digest, digest_size, P_OV_REPLY);
+		if (!ok)
+			dec_rs_pending(mdev);
+		kfree(digest);
+	}
+
+out:
+	drbd_free_ee(mdev, e);
+
+	dec_unacked(mdev);
+
+	return ok;
+}
+
+void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
+		mdev->ov_last_oos_size += size>>9;
+	} else {
+		mdev->ov_last_oos_start = sector;
+		mdev->ov_last_oos_size = size>>9;
+	}
+	drbd_set_out_of_sync(mdev, sector, size);
+	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+}
+
+int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	struct digest_info *di;
+	int digest_size;
+	void *digest;
+	int ok, eq = 0;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
+	 * the resync lru has been cleaned up already */
+	drbd_rs_complete_io(mdev, e->sector);
+
+	di = (struct digest_info *)(unsigned long)e->block_id;
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+		digest = kmalloc(digest_size, GFP_NOIO);
+		if (digest) {
+			drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+
+			D_ASSERT(digest_size == di->digest_size);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+	} else {
+		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
+	}
+
+	dec_unacked(mdev);
+
+	kfree(di);
+
+	if (!eq)
+		drbd_ov_oos_found(mdev, e->sector, e->size);
+	else
+		ov_oos_print(mdev);
+
+	ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
+			      eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+
+	drbd_free_ee(mdev, e);
+
+	if (--mdev->ov_left == 0) {
+		ov_oos_print(mdev);
+		drbd_resync_finished(mdev);
+	}
+
+	return ok;
+}
+
+int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
+	complete(&b->done);
+	return 1;
+}
+
+int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
+	struct p_barrier *p = &mdev->data.sbuf.barrier;
+	int ok = 1;
+
+	/* really avoid racing with tl_clear.  w.cb may have been referenced
+	 * just before it was reassigned and re-queued, so double check that.
+	 * actually, this race was harmless, since we only try to send the
+	 * barrier packet here, and otherwise do nothing with the object.
+	 * but compare with the head of w_clear_epoch */
+	spin_lock_irq(&mdev->req_lock);
+	if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
+		cancel = 1;
+	spin_unlock_irq(&mdev->req_lock);
+	if (cancel)
+		return 1;
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+	p->barrier = b->br_number;
+	/* inc_ap_pending was done where this was queued.
+	 * dec_ap_pending will be done in got_BarrierAck
+	 * or (on connection loss) in w_clear_epoch.  */
+	ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
+				(struct p_header *)p, sizeof(*p), 0);
+	drbd_put_data_sock(mdev);
+
+	return ok;
+}
+
+int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	if (cancel)
+		return 1;
+	return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
+}
+
+/**
+ * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled);
+		return 1;
+	}
+
+	ok = drbd_send_dblock(mdev, req);
+	req_mod(req, ok ? handed_over_to_network : send_failed);
+
+	return ok;
+}
+
+/**
+ * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled);
+		return 1;
+	}
+
+	ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
+				(unsigned long)req);
+
+	if (!ok) {
+		/* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
+		 * so this is probably redundant */
+		if (mdev->state.conn >= C_CONNECTED)
+			drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+	}
+	req_mod(req, ok ? handed_over_to_network : send_failed);
+
+	return ok;
+}
+
+static int _drbd_may_sync_now(struct drbd_conf *mdev)
+{
+	struct drbd_conf *odev = mdev;
+
+	while (1) {
+		if (odev->sync_conf.after == -1)
+			return 1;
+		odev = minor_to_mdev(odev->sync_conf.after);
+		ERR_IF(!odev) return 1;
+		if ((odev->state.conn >= C_SYNC_SOURCE &&
+		     odev->state.conn <= C_PAUSED_SYNC_T) ||
+		    odev->state.aftr_isp || odev->state.peer_isp ||
+		    odev->state.user_isp)
+			return 0;
+	}
+}
+
+/**
+ * _drbd_pause_after() - Pause resync on all devices that may not resync now
+ * @mdev:	DRBD device.
+ *
+ * Called from process context only (admin command and after_state_ch).
+ */
+static int _drbd_pause_after(struct drbd_conf *mdev)
+{
+	struct drbd_conf *odev;
+	int i, rv = 0;
+
+	for (i = 0; i < minor_count; i++) {
+		odev = minor_to_mdev(i);
+		if (!odev)
+			continue;
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (!_drbd_may_sync_now(odev))
+			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
+			       != SS_NOTHING_TO_DO);
+	}
+
+	return rv;
+}
+
+/**
+ * _drbd_resume_next() - Resume resync on all devices that may resync now
+ * @mdev:	DRBD device.
+ *
+ * Called from process context only (admin command and worker).
+ */
+static int _drbd_resume_next(struct drbd_conf *mdev)
+{
+	struct drbd_conf *odev;
+	int i, rv = 0;
+
+	for (i = 0; i < minor_count; i++) {
+		odev = minor_to_mdev(i);
+		if (!odev)
+			continue;
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (odev->state.aftr_isp) {
+			if (_drbd_may_sync_now(odev))
+				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
+							CS_HARD, NULL)
+				       != SS_NOTHING_TO_DO) ;
+		}
+	}
+	return rv;
+}
+
+void resume_next_sg(struct drbd_conf *mdev)
+{
+	write_lock_irq(&global_state_lock);
+	_drbd_resume_next(mdev);
+	write_unlock_irq(&global_state_lock);
+}
+
+void suspend_other_sg(struct drbd_conf *mdev)
+{
+	write_lock_irq(&global_state_lock);
+	_drbd_pause_after(mdev);
+	write_unlock_irq(&global_state_lock);
+}
+
+static int sync_after_error(struct drbd_conf *mdev, int o_minor)
+{
+	struct drbd_conf *odev;
+
+	if (o_minor == -1)
+		return NO_ERROR;
+	if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
+		return ERR_SYNC_AFTER;
+
+	/* check for loops */
+	odev = minor_to_mdev(o_minor);
+	while (1) {
+		if (odev == mdev)
+			return ERR_SYNC_AFTER_CYCLE;
+
+		/* dependency chain ends here, no cycles. */
+		if (odev->sync_conf.after == -1)
+			return NO_ERROR;
+
+		/* follow the dependency chain */
+		odev = minor_to_mdev(odev->sync_conf.after);
+	}
+}
+
+int drbd_alter_sa(struct drbd_conf *mdev, int na)
+{
+	int changes;
+	int retcode;
+
+	write_lock_irq(&global_state_lock);
+	retcode = sync_after_error(mdev, na);
+	if (retcode == NO_ERROR) {
+		mdev->sync_conf.after = na;
+		do {
+			changes  = _drbd_pause_after(mdev);
+			changes |= _drbd_resume_next(mdev);
+		} while (changes);
+	}
+	write_unlock_irq(&global_state_lock);
+	return retcode;
+}
+
+/**
+ * drbd_start_resync() - Start the resync process
+ * @mdev:	DRBD device.
+ * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
+ *
+ * This function might bring you directly into one of the
+ * C_PAUSED_SYNC_* states.
+ */
+void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
+{
+	union drbd_state ns;
+	int r;
+
+	if (mdev->state.conn >= C_SYNC_SOURCE) {
+		dev_err(DEV, "Resync already running!\n");
+		return;
+	}
+
+	/* In case a previous resync run was aborted by an IO error/detach on the peer. */
+	drbd_rs_cancel_all(mdev);
+
+	if (side == C_SYNC_TARGET) {
+		/* Since application IO was locked out during C_WF_BITMAP_T and
+		   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
+		   we check that we might make the data inconsistent. */
+		r = drbd_khelper(mdev, "before-resync-target");
+		r = (r >> 8) & 0xff;
+		if (r > 0) {
+			dev_info(DEV, "before-resync-target handler returned %d, "
+			     "dropping connection.\n", r);
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			return;
+		}
+	}
+
+	drbd_state_lock(mdev);
+
+	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		drbd_state_unlock(mdev);
+		return;
+	}
+
+	if (side == C_SYNC_TARGET) {
+		mdev->bm_resync_fo = 0;
+	} else /* side == C_SYNC_SOURCE */ {
+		u64 uuid;
+
+		get_random_bytes(&uuid, sizeof(u64));
+		drbd_uuid_set(mdev, UI_BITMAP, uuid);
+		drbd_send_sync_uuid(mdev, uuid);
+
+		D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
+	}
+
+	write_lock_irq(&global_state_lock);
+	ns = mdev->state;
+
+	ns.aftr_isp = !_drbd_may_sync_now(mdev);
+
+	ns.conn = side;
+
+	if (side == C_SYNC_TARGET)
+		ns.disk = D_INCONSISTENT;
+	else /* side == C_SYNC_SOURCE */
+		ns.pdsk = D_INCONSISTENT;
+
+	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	ns = mdev->state;
+
+	if (ns.conn < C_CONNECTED)
+		r = SS_UNKNOWN_ERROR;
+
+	if (r == SS_SUCCESS) {
+		mdev->rs_total     =
+		mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+		mdev->rs_failed    = 0;
+		mdev->rs_paused    = 0;
+		mdev->rs_start     =
+		mdev->rs_mark_time = jiffies;
+		mdev->rs_same_csum = 0;
+		_drbd_pause_after(mdev);
+	}
+	write_unlock_irq(&global_state_lock);
+	drbd_state_unlock(mdev);
+	put_ldev(mdev);
+
+	if (r == SS_SUCCESS) {
+		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+		     drbd_conn_str(ns.conn),
+		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
+		     (unsigned long) mdev->rs_total);
+
+		if (mdev->rs_total == 0) {
+			/* Peer still reachable? Beware of failing before-resync-target handlers! */
+			request_ping(mdev);
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */
+			drbd_resync_finished(mdev);
+			return;
+		}
+
+		/* ns.conn may already be != mdev->state.conn,
+		 * we may have been paused in between, or become paused until
+		 * the timer triggers.
+		 * No matter, that is handled in resync_timer_fn() */
+		if (ns.conn == C_SYNC_TARGET)
+			mod_timer(&mdev->resync_timer, jiffies);
+
+		drbd_md_sync(mdev);
+	}
+}
+
+int drbd_worker(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	struct drbd_work *w = NULL;
+	LIST_HEAD(work_list);
+	int intr = 0, i;
+
+	sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
+
+	while (get_t_state(thi) == Running) {
+		drbd_thread_current_set_cpu(mdev);
+
+		if (down_trylock(&mdev->data.work.s)) {
+			mutex_lock(&mdev->data.mutex);
+			if (mdev->data.socket && !mdev->net_conf->no_cork)
+				drbd_tcp_uncork(mdev->data.socket);
+			mutex_unlock(&mdev->data.mutex);
+
+			intr = down_interruptible(&mdev->data.work.s);
+
+			mutex_lock(&mdev->data.mutex);
+			if (mdev->data.socket  && !mdev->net_conf->no_cork)
+				drbd_tcp_cork(mdev->data.socket);
+			mutex_unlock(&mdev->data.mutex);
+		}
+
+		if (intr) {
+			D_ASSERT(intr == -EINTR);
+			flush_signals(current);
+			ERR_IF (get_t_state(thi) == Running)
+				continue;
+			break;
+		}
+
+		if (get_t_state(thi) != Running)
+			break;
+		/* With this break, we have done a down() but not consumed
+		   the entry from the list. The cleanup code takes care of
+		   this...   */
+
+		w = NULL;
+		spin_lock_irq(&mdev->data.work.q_lock);
+		ERR_IF(list_empty(&mdev->data.work.q)) {
+			/* something terribly wrong in our logic.
+			 * we were able to down() the semaphore,
+			 * but the list is empty... doh.
+			 *
+			 * what is the best thing to do now?
+			 * try again from scratch, restarting the receiver,
+			 * asender, whatnot? could break even more ugly,
+			 * e.g. when we are primary, but no good local data.
+			 *
+			 * I'll try to get away just starting over this loop.
+			 */
+			spin_unlock_irq(&mdev->data.work.q_lock);
+			continue;
+		}
+		w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
+		list_del_init(&w->list);
+		spin_unlock_irq(&mdev->data.work.q_lock);
+
+		if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
+			/* dev_warn(DEV, "worker: a callback failed! \n"); */
+			if (mdev->state.conn >= C_CONNECTED)
+				drbd_force_state(mdev,
+						NS(conn, C_NETWORK_FAILURE));
+		}
+	}
+	D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
+	D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
+
+	spin_lock_irq(&mdev->data.work.q_lock);
+	i = 0;
+	while (!list_empty(&mdev->data.work.q)) {
+		list_splice_init(&mdev->data.work.q, &work_list);
+		spin_unlock_irq(&mdev->data.work.q_lock);
+
+		while (!list_empty(&work_list)) {
+			w = list_entry(work_list.next, struct drbd_work, list);
+			list_del_init(&w->list);
+			w->cb(mdev, w, 1);
+			i++; /* dead debugging code */
+		}
+
+		spin_lock_irq(&mdev->data.work.q_lock);
+	}
+	sema_init(&mdev->data.work.s, 0);
+	/* DANGEROUS race: if someone did queue his work within the spinlock,
+	 * but up() ed outside the spinlock, we could get an up() on the
+	 * semaphore without corresponding list entry.
+	 * So don't do that.
+	 */
+	spin_unlock_irq(&mdev->data.work.q_lock);
+
+	D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
+	/* _drbd_set_state only uses stop_nowait.
+	 * wait here for the Exiting receiver. */
+	drbd_thread_stop(&mdev->receiver);
+	drbd_mdev_cleanup(mdev);
+
+	dev_info(DEV, "worker terminated\n");
+
+	clear_bit(DEVICE_DYING, &mdev->flags);
+	clear_bit(CONFIG_PENDING, &mdev->flags);
+	wake_up(&mdev->state_wait);
+
+	return 0;
+}
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
new file mode 100644
index 0000000..f93fa11
--- /dev/null
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -0,0 +1,91 @@
+#ifndef _DRBD_WRAPPERS_H
+#define _DRBD_WRAPPERS_H
+
+#include <linux/ctype.h>
+#include <linux/mm.h>
+
+/* see get_sb_bdev and bd_claim */
+extern char *drbd_sec_holder;
+
+/* sets the number of 512 byte sectors of our virtual device */
+static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
+					sector_t size)
+{
+	/* set_capacity(mdev->this_bdev->bd_disk, size); */
+	set_capacity(mdev->vdisk, size);
+	mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
+}
+
+#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
+
+static inline int drbd_bio_has_active_page(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		if (page_count(bvec->bv_page) > 1)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* bi_end_io handlers */
+extern void drbd_md_io_complete(struct bio *bio, int error);
+extern void drbd_endio_read_sec(struct bio *bio, int error);
+extern void drbd_endio_write_sec(struct bio *bio, int error);
+extern void drbd_endio_pri(struct bio *bio, int error);
+
+/*
+ * used to submit our private bio
+ */
+static inline void drbd_generic_make_request(struct drbd_conf *mdev,
+					     int fault_type, struct bio *bio)
+{
+	__release(local);
+	if (!bio->bi_bdev) {
+		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
+				"bio->bi_bdev == NULL\n",
+		       mdev_to_minor(mdev));
+		dump_stack();
+		bio_endio(bio, -ENODEV);
+		return;
+	}
+
+	if (FAULT_ACTIVE(mdev, fault_type))
+		bio_endio(bio, -EIO);
+	else
+		generic_make_request(bio);
+}
+
+static inline void drbd_plug_device(struct drbd_conf *mdev)
+{
+	struct request_queue *q;
+	q = bdev_get_queue(mdev->this_bdev);
+
+	spin_lock_irq(q->queue_lock);
+
+/* XXX the check on !blk_queue_plugged is redundant,
+ * implicitly checked in blk_plug_device */
+
+	if (!blk_queue_plugged(q)) {
+		blk_plug_device(q);
+		del_timer(&q->unplug_timer);
+		/* unplugging should not happen automatically... */
+	}
+	spin_unlock_irq(q->queue_lock);
+}
+
+static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
+{
+        return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
+                == CRYPTO_ALG_TYPE_HASH;
+}
+
+#ifndef __CHECKER__
+# undef __cond_lock
+# define __cond_lock(x,c) (c)
+#endif
+
+#endif
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 3bb7c47..1fb6c31 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -123,7 +123,15 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev,
 {
 	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
-	unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms);
+	unsigned long timeout;
+
+	for (timeout = 20; timeout; timeout--) {
+		if (!notify[3])
+			return 0;
+		udelay(10);
+	}
+
+	timeout = jiffies + msecs_to_jiffies(timeout_ms);
 
 	do {
 		if (!notify[3])
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 614da5b..e3749d0 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3557,67 +3557,65 @@ static ctl_table cdrom_table[] = {
 		.data		= &cdrom_sysctl_settings.info, 
 		.maxlen		= CDROM_STR_SIZE,
 		.mode		= 0444,
-		.proc_handler	= &cdrom_sysctl_info,
+		.proc_handler	= cdrom_sysctl_info,
 	},
 	{
 		.procname	= "autoclose",
 		.data		= &cdrom_sysctl_settings.autoclose,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler,
+		.proc_handler	= cdrom_sysctl_handler,
 	},
 	{
 		.procname	= "autoeject",
 		.data		= &cdrom_sysctl_settings.autoeject,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler,
+		.proc_handler	= cdrom_sysctl_handler,
 	},
 	{
 		.procname	= "debug",
 		.data		= &cdrom_sysctl_settings.debug,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler,
+		.proc_handler	= cdrom_sysctl_handler,
 	},
 	{
 		.procname	= "lock",
 		.data		= &cdrom_sysctl_settings.lock,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler,
+		.proc_handler	= cdrom_sysctl_handler,
 	},
 	{
 		.procname	= "check_media",
 		.data		= &cdrom_sysctl_settings.check,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &cdrom_sysctl_handler
+		.proc_handler	= cdrom_sysctl_handler
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table cdrom_cdrom_table[] = {
 	{
-		.ctl_name	= DEV_CDROM,
 		.procname	= "cdrom",
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= cdrom_table,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 /* Make sure that /proc/sys/dev is there */
 static ctl_table cdrom_root_table[] = {
 	{
-		.ctl_name	= CTL_DEV,
 		.procname	= "dev",
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= cdrom_cdrom_table,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 static struct ctl_table_header *cdrom_sysctl_header;
 
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 70a770a..e481c59 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -675,36 +675,33 @@ static int hpet_is_known(struct hpet_data *hdp)
 
 static ctl_table hpet_table[] = {
 	{
-	 .ctl_name = CTL_UNNUMBERED,
 	 .procname = "max-user-freq",
 	 .data = &hpet_max_freq,
 	 .maxlen = sizeof(int),
 	 .mode = 0644,
-	 .proc_handler = &proc_dointvec,
+	 .proc_handler = proc_dointvec,
 	 },
-	{.ctl_name = 0}
+	{}
 };
 
 static ctl_table hpet_root[] = {
 	{
-	 .ctl_name = CTL_UNNUMBERED,
 	 .procname = "hpet",
 	 .maxlen = 0,
 	 .mode = 0555,
 	 .child = hpet_table,
 	 },
-	{.ctl_name = 0}
+	{}
 };
 
 static ctl_table dev_root[] = {
 	{
-	 .ctl_name = CTL_DEV,
 	 .procname = "dev",
 	 .maxlen = 0,
 	 .mode = 0555,
 	 .child = hpet_root,
 	 },
-	{.ctl_name = 0}
+	{}
 };
 
 static struct ctl_table_header *sysctl_header;
diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index 2e66b5f..0dec5da 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c
@@ -660,26 +660,23 @@ static struct ipmi_smi_watcher smi_watcher = {
 #include <linux/sysctl.h>
 
 static ctl_table ipmi_table[] = {
-	{ .ctl_name	= DEV_IPMI_POWEROFF_POWERCYCLE,
-	  .procname	= "poweroff_powercycle",
+	{ .procname	= "poweroff_powercycle",
 	  .data		= &poweroff_powercycle,
 	  .maxlen	= sizeof(poweroff_powercycle),
 	  .mode		= 0644,
-	  .proc_handler	= &proc_dointvec },
+	  .proc_handler	= proc_dointvec },
 	{ }
 };
 
 static ctl_table ipmi_dir_table[] = {
-	{ .ctl_name	= DEV_IPMI,
-	  .procname	= "ipmi",
+	{ .procname	= "ipmi",
 	  .mode		= 0555,
 	  .child	= ipmi_table },
 	{ }
 };
 
 static ctl_table ipmi_root_table[] = {
-	{ .ctl_name	= CTL_DEV,
-	  .procname	= "dev",
+	{ .procname	= "dev",
 	  .mode		= 0555,
 	  .child	= ipmi_dir_table },
 	{ }
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 62f282e..d86c0bc 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -431,30 +431,25 @@ static struct cdev ptmx_cdev;
 
 static struct ctl_table pty_table[] = {
 	{
-		.ctl_name	= PTY_MAX,
 		.procname	= "max",
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.data		= &pty_limit,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &pty_limit_min,
 		.extra2		= &pty_limit_max,
 	}, {
-		.ctl_name	= PTY_NR,
 		.procname	= "nr",
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
 		.data		= &pty_count,
-		.proc_handler	= &proc_dointvec,
-	}, {
-		.ctl_name	= 0
-	}
+		.proc_handler	= proc_dointvec,
+	}, 
+	{}
 };
 
 static struct ctl_table pty_kern_table[] = {
 	{
-		.ctl_name	= KERN_PTY,
 		.procname	= "pty",
 		.mode		= 0555,
 		.child		= pty_table,
@@ -464,7 +459,6 @@ static struct ctl_table pty_kern_table[] = {
 
 static struct ctl_table pty_root_table[] = {
 	{
-		.ctl_name	= CTL_KERN,
 		.procname	= "kernel",
 		.mode		= 0555,
 		.child		= pty_kern_table,
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 04b505e..dcd08635 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1257,94 +1257,54 @@ static int proc_do_uuid(ctl_table *table, int write,
 	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
 }
 
-static int uuid_strategy(ctl_table *table,
-			 void __user *oldval, size_t __user *oldlenp,
-			 void __user *newval, size_t newlen)
-{
-	unsigned char tmp_uuid[16], *uuid;
-	unsigned int len;
-
-	if (!oldval || !oldlenp)
-		return 1;
-
-	uuid = table->data;
-	if (!uuid) {
-		uuid = tmp_uuid;
-		uuid[8] = 0;
-	}
-	if (uuid[8] == 0)
-		generate_random_uuid(uuid);
-
-	if (get_user(len, oldlenp))
-		return -EFAULT;
-	if (len) {
-		if (len > 16)
-			len = 16;
-		if (copy_to_user(oldval, uuid, len) ||
-		    put_user(len, oldlenp))
-			return -EFAULT;
-	}
-	return 1;
-}
-
 static int sysctl_poolsize = INPUT_POOL_WORDS * 32;
 ctl_table random_table[] = {
 	{
-		.ctl_name 	= RANDOM_POOLSIZE,
 		.procname	= "poolsize",
 		.data		= &sysctl_poolsize,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= RANDOM_ENTROPY_COUNT,
 		.procname	= "entropy_avail",
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 		.data		= &input_pool.entropy_count,
 	},
 	{
-		.ctl_name	= RANDOM_READ_THRESH,
 		.procname	= "read_wakeup_threshold",
 		.data		= &random_read_wakeup_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_read_thresh,
 		.extra2		= &max_read_thresh,
 	},
 	{
-		.ctl_name	= RANDOM_WRITE_THRESH,
 		.procname	= "write_wakeup_threshold",
 		.data		= &random_write_wakeup_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_write_thresh,
 		.extra2		= &max_write_thresh,
 	},
 	{
-		.ctl_name	= RANDOM_BOOT_ID,
 		.procname	= "boot_id",
 		.data		= &sysctl_bootid,
 		.maxlen		= 16,
 		.mode		= 0444,
-		.proc_handler	= &proc_do_uuid,
-		.strategy	= &uuid_strategy,
+		.proc_handler	= proc_do_uuid,
 	},
 	{
-		.ctl_name	= RANDOM_UUID,
 		.procname	= "uuid",
 		.maxlen		= 16,
 		.mode		= 0444,
-		.proc_handler	= &proc_do_uuid,
-		.strategy	= &uuid_strategy,
+		.proc_handler	= proc_do_uuid,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 #endif 	/* CONFIG_SYSCTL */
 
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index bc4ab3e..95acb8c 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -282,34 +282,31 @@ static irqreturn_t rtc_interrupt(int irq, void *dev_id)
  */
 static ctl_table rtc_table[] = {
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "max-user-freq",
 		.data		= &rtc_max_user_freq,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table rtc_root[] = {
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "rtc",
 		.mode		= 0555,
 		.child		= rtc_table,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table dev_root[] = {
 	{
-		.ctl_name	= CTL_DEV,
 		.procname	= "dev",
 		.mode		= 0555,
 		.child		= rtc_root,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static struct ctl_table_header *sysctl_header;
diff --git a/drivers/dio/dio-driver.c b/drivers/dio/dio-driver.c
index 9c0c9af..a7b174e 100644
--- a/drivers/dio/dio-driver.c
+++ b/drivers/dio/dio-driver.c
@@ -140,5 +140,4 @@ postcore_initcall(dio_driver_init);
 EXPORT_SYMBOL(dio_match_device);
 EXPORT_SYMBOL(dio_register_driver);
 EXPORT_SYMBOL(dio_unregister_driver);
-EXPORT_SYMBOL(dio_dev_driver);
 EXPORT_SYMBOL(dio_bus_type);
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index b401dad..eb140ff 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -54,7 +54,7 @@ config DW_DMAC
 
 config AT_HDMAC
 	tristate "Atmel AHB DMA support"
-	depends on ARCH_AT91SAM9RL
+	depends on ARCH_AT91SAM9RL || ARCH_AT91SAM9G45
 	select DMA_ENGINE
 	help
 	  Support the Atmel AHB DMA controller.  This can be integrated in
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 713ed7d..689cc6a 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -3,7 +3,6 @@
 
 static bool report_gart_errors;
 static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
-static void (*orig_mce_callback)(struct mce *m);
 
 void amd_report_gart_errors(bool v)
 {
@@ -363,8 +362,10 @@ static inline void amd_decode_err_code(unsigned int ec)
 		pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
 }
 
-static void amd_decode_mce(struct mce *m)
+static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
+			   void *data)
 {
+	struct mce *m = (struct mce *)data;
 	struct err_regs regs;
 	int node, ecc;
 
@@ -420,20 +421,22 @@ static void amd_decode_mce(struct mce *m)
 	}
 
 	amd_decode_err_code(m->status & 0xffff);
+
+	return NOTIFY_STOP;
 }
 
+static struct notifier_block amd_mce_dec_nb = {
+	.notifier_call	= amd_decode_mce,
+};
+
 static int __init mce_amd_init(void)
 {
 	/*
 	 * We can decode MCEs for Opteron and later CPUs:
 	 */
 	if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
-	    (boot_cpu_data.x86 >= 0xf)) {
-		/* safe the default decode mce callback */
-		orig_mce_callback = x86_mce_decode_callback;
-
-		x86_mce_decode_callback = amd_decode_mce;
-	}
+	    (boot_cpu_data.x86 >= 0xf))
+		atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
 
 	return 0;
 }
@@ -442,7 +445,7 @@ early_initcall(mce_amd_init);
 #ifdef MODULE
 static void __exit mce_amd_exit(void)
 {
-	x86_mce_decode_callback = orig_mce_callback;
+	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
 }
 
 MODULE_DESCRIPTION("AMD MCE decoder");
diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index e4864e8..7083bcc 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -38,15 +38,14 @@
 
 #include "core.h"
 
-int fw_compute_block_crc(u32 *block)
+int fw_compute_block_crc(__be32 *block)
 {
-	__be32 be32_block[256];
-	int i, length;
+	int length;
+	u16 crc;
 
-	length = (*block >> 16) & 0xff;
-	for (i = 0; i < length; i++)
-		be32_block[i] = cpu_to_be32(block[i + 1]);
-	*block |= crc_itu_t(0, (u8 *) be32_block, length * 4);
+	length = (be32_to_cpu(block[0]) >> 16) & 0xff;
+	crc = crc_itu_t(0, (u8 *)&block[1], length * 4);
+	*block |= cpu_to_be32(crc);
 
 	return length;
 }
@@ -57,6 +56,8 @@ static LIST_HEAD(card_list);
 static LIST_HEAD(descriptor_list);
 static int descriptor_count;
 
+static __be32 tmp_config_rom[256];
+
 #define BIB_CRC(v)		((v) <<  0)
 #define BIB_CRC_LENGTH(v)	((v) << 16)
 #define BIB_INFO_LENGTH(v)	((v) << 24)
@@ -72,11 +73,10 @@ static int descriptor_count;
 #define BIB_CMC			((1) << 30)
 #define BIB_IMC			((1) << 31)
 
-static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length)
+static size_t generate_config_rom(struct fw_card *card, __be32 *config_rom)
 {
 	struct fw_descriptor *desc;
-	static u32 config_rom[256];
-	int i, j, length;
+	int i, j, k, length;
 
 	/*
 	 * Initialize contents of config rom buffer.  On the OHCI
@@ -87,40 +87,39 @@ static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length)
 	 * the version stored in the OHCI registers.
 	 */
 
-	memset(config_rom, 0, sizeof(config_rom));
-	config_rom[0] = BIB_CRC_LENGTH(4) | BIB_INFO_LENGTH(4) | BIB_CRC(0);
-	config_rom[1] = 0x31333934;
-
-	config_rom[2] =
+	config_rom[0] = cpu_to_be32(
+		BIB_CRC_LENGTH(4) | BIB_INFO_LENGTH(4) | BIB_CRC(0));
+	config_rom[1] = cpu_to_be32(0x31333934);
+	config_rom[2] = cpu_to_be32(
 		BIB_LINK_SPEED(card->link_speed) |
 		BIB_GENERATION(card->config_rom_generation++ % 14 + 2) |
 		BIB_MAX_ROM(2) |
 		BIB_MAX_RECEIVE(card->max_receive) |
-		BIB_BMC | BIB_ISC | BIB_CMC | BIB_IMC;
-	config_rom[3] = card->guid >> 32;
-	config_rom[4] = card->guid;
+		BIB_BMC | BIB_ISC | BIB_CMC | BIB_IMC);
+	config_rom[3] = cpu_to_be32(card->guid >> 32);
+	config_rom[4] = cpu_to_be32(card->guid);
 
 	/* Generate root directory. */
-	i = 5;
-	config_rom[i++] = 0;
-	config_rom[i++] = 0x0c0083c0; /* node capabilities */
-	j = i + descriptor_count;
+	config_rom[6] = cpu_to_be32(0x0c0083c0); /* node capabilities */
+	i = 7;
+	j = 7 + descriptor_count;
 
 	/* Generate root directory entries for descriptors. */
 	list_for_each_entry (desc, &descriptor_list, link) {
 		if (desc->immediate > 0)
-			config_rom[i++] = desc->immediate;
-		config_rom[i] = desc->key | (j - i);
+			config_rom[i++] = cpu_to_be32(desc->immediate);
+		config_rom[i] = cpu_to_be32(desc->key | (j - i));
 		i++;
 		j += desc->length;
 	}
 
 	/* Update root directory length. */
-	config_rom[5] = (i - 5 - 1) << 16;
+	config_rom[5] = cpu_to_be32((i - 5 - 1) << 16);
 
 	/* End of root directory, now copy in descriptors. */
 	list_for_each_entry (desc, &descriptor_list, link) {
-		memcpy(&config_rom[i], desc->data, desc->length * 4);
+		for (k = 0; k < desc->length; k++)
+			config_rom[i + k] = cpu_to_be32(desc->data[k]);
 		i += desc->length;
 	}
 
@@ -131,20 +130,17 @@ static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length)
 	for (i = 0; i < j; i += length + 1)
 		length = fw_compute_block_crc(config_rom + i);
 
-	*config_rom_length = j;
-
-	return config_rom;
+	return j;
 }
 
 static void update_config_roms(void)
 {
 	struct fw_card *card;
-	u32 *config_rom;
 	size_t length;
 
 	list_for_each_entry (card, &card_list, link) {
-		config_rom = generate_config_rom(card, &length);
-		card->driver->set_config_rom(card, config_rom, length);
+		length = generate_config_rom(card, tmp_config_rom);
+		card->driver->set_config_rom(card, tmp_config_rom, length);
 	}
 }
 
@@ -211,11 +207,8 @@ static const char gap_count_table[] = {
 
 void fw_schedule_bm_work(struct fw_card *card, unsigned long delay)
 {
-	int scheduled;
-
 	fw_card_get(card);
-	scheduled = schedule_delayed_work(&card->work, delay);
-	if (!scheduled)
+	if (!schedule_delayed_work(&card->work, delay))
 		fw_card_put(card);
 }
 
@@ -435,7 +428,6 @@ EXPORT_SYMBOL(fw_card_initialize);
 int fw_card_add(struct fw_card *card,
 		u32 max_receive, u32 link_speed, u64 guid)
 {
-	u32 *config_rom;
 	size_t length;
 	int ret;
 
@@ -445,8 +437,8 @@ int fw_card_add(struct fw_card *card,
 
 	mutex_lock(&card_mutex);
 
-	config_rom = generate_config_rom(card, &length);
-	ret = card->driver->enable(card, config_rom, length);
+	length = generate_config_rom(card, tmp_config_rom);
+	ret = card->driver->enable(card, tmp_config_rom, length);
 	if (ret == 0)
 		list_add_tail(&card->link, &card_list);
 
@@ -465,7 +457,8 @@ EXPORT_SYMBOL(fw_card_add);
  * shutdown still need to be provided by the card driver.
  */
 
-static int dummy_enable(struct fw_card *card, u32 *config_rom, size_t length)
+static int dummy_enable(struct fw_card *card,
+			const __be32 *config_rom, size_t length)
 {
 	BUG();
 	return -1;
@@ -478,7 +471,7 @@ static int dummy_update_phy_reg(struct fw_card *card, int address,
 }
 
 static int dummy_set_config_rom(struct fw_card *card,
-				u32 *config_rom, size_t length)
+				const __be32 *config_rom, size_t length)
 {
 	/*
 	 * We take the card out of card_list before setting the dummy
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 5089331..231e6ee 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -130,9 +130,22 @@ struct iso_resource {
 	struct iso_resource_event *e_alloc, *e_dealloc;
 };
 
-static void schedule_iso_resource(struct iso_resource *);
 static void release_iso_resource(struct client *, struct client_resource *);
 
+static void schedule_iso_resource(struct iso_resource *r, unsigned long delay)
+{
+	client_get(r->client);
+	if (!schedule_delayed_work(&r->work, delay))
+		client_put(r->client);
+}
+
+static void schedule_if_iso_resource(struct client_resource *resource)
+{
+	if (resource->release == release_iso_resource)
+		schedule_iso_resource(container_of(resource,
+					struct iso_resource, resource), 0);
+}
+
 /*
  * dequeue_event() just kfree()'s the event, so the event has to be
  * the first field in a struct XYZ_event.
@@ -166,7 +179,7 @@ struct iso_interrupt_event {
 
 struct iso_resource_event {
 	struct event event;
-	struct fw_cdev_event_iso_resource resource;
+	struct fw_cdev_event_iso_resource iso_resource;
 };
 
 static inline void __user *u64_to_uptr(__u64 value)
@@ -314,11 +327,8 @@ static void for_each_client(struct fw_device *device,
 
 static int schedule_reallocations(int id, void *p, void *data)
 {
-	struct client_resource *r = p;
+	schedule_if_iso_resource(p);
 
-	if (r->release == release_iso_resource)
-		schedule_iso_resource(container_of(r,
-					struct iso_resource, resource));
 	return 0;
 }
 
@@ -414,9 +424,7 @@ static int add_client_resource(struct client *client,
 				  &resource->handle);
 	if (ret >= 0) {
 		client_get(client);
-		if (resource->release == release_iso_resource)
-			schedule_iso_resource(container_of(resource,
-						struct iso_resource, resource));
+		schedule_if_iso_resource(resource);
 	}
 	spin_unlock_irqrestore(&client->lock, flags);
 
@@ -428,26 +436,26 @@ static int add_client_resource(struct client *client,
 
 static int release_client_resource(struct client *client, u32 handle,
 				   client_resource_release_fn_t release,
-				   struct client_resource **resource)
+				   struct client_resource **return_resource)
 {
-	struct client_resource *r;
+	struct client_resource *resource;
 
 	spin_lock_irq(&client->lock);
 	if (client->in_shutdown)
-		r = NULL;
+		resource = NULL;
 	else
-		r = idr_find(&client->resource_idr, handle);
-	if (r && r->release == release)
+		resource = idr_find(&client->resource_idr, handle);
+	if (resource && resource->release == release)
 		idr_remove(&client->resource_idr, handle);
 	spin_unlock_irq(&client->lock);
 
-	if (!(r && r->release == release))
+	if (!(resource && resource->release == release))
 		return -EINVAL;
 
-	if (resource)
-		*resource = r;
+	if (return_resource)
+		*return_resource = resource;
 	else
-		r->release(client, r);
+		resource->release(client, resource);
 
 	client_put(client);
 
@@ -699,6 +707,7 @@ static int ioctl_send_response(struct client *client, void *buffer)
 	struct fw_cdev_send_response *request = buffer;
 	struct client_resource *resource;
 	struct inbound_transaction_resource *r;
+	int ret = 0;
 
 	if (release_client_resource(client, request->handle,
 				    release_request, &resource) < 0)
@@ -708,13 +717,17 @@ static int ioctl_send_response(struct client *client, void *buffer)
 			 resource);
 	if (request->length < r->length)
 		r->length = request->length;
-	if (copy_from_user(r->data, u64_to_uptr(request->data), r->length))
-		return -EFAULT;
+
+	if (copy_from_user(r->data, u64_to_uptr(request->data), r->length)) {
+		ret = -EFAULT;
+		goto out;
+	}
 
 	fw_send_response(client->device->card, r->request, request->rcode);
+ out:
 	kfree(r);
 
-	return 0;
+	return ret;
 }
 
 static int ioctl_initiate_bus_reset(struct client *client, void *buffer)
@@ -1028,8 +1041,7 @@ static void iso_resource_work(struct work_struct *work)
 	/* Allow 1000ms grace period for other reallocations. */
 	if (todo == ISO_RES_ALLOC &&
 	    time_is_after_jiffies(client->device->card->reset_jiffies + HZ)) {
-		if (schedule_delayed_work(&r->work, DIV_ROUND_UP(HZ, 3)))
-			client_get(client);
+		schedule_iso_resource(r, DIV_ROUND_UP(HZ, 3));
 		skip = true;
 	} else {
 		/* We could be called twice within the same generation. */
@@ -1097,12 +1109,12 @@ static void iso_resource_work(struct work_struct *work)
 		e = r->e_dealloc;
 		r->e_dealloc = NULL;
 	}
-	e->resource.handle	= r->resource.handle;
-	e->resource.channel	= channel;
-	e->resource.bandwidth	= bandwidth;
+	e->iso_resource.handle    = r->resource.handle;
+	e->iso_resource.channel   = channel;
+	e->iso_resource.bandwidth = bandwidth;
 
 	queue_event(client, &e->event,
-		    &e->resource, sizeof(e->resource), NULL, 0);
+		    &e->iso_resource, sizeof(e->iso_resource), NULL, 0);
 
 	if (free) {
 		cancel_delayed_work(&r->work);
@@ -1114,13 +1126,6 @@ static void iso_resource_work(struct work_struct *work)
 	client_put(client);
 }
 
-static void schedule_iso_resource(struct iso_resource *r)
-{
-	client_get(r->client);
-	if (!schedule_delayed_work(&r->work, 0))
-		client_put(r->client);
-}
-
 static void release_iso_resource(struct client *client,
 				 struct client_resource *resource)
 {
@@ -1129,7 +1134,7 @@ static void release_iso_resource(struct client *client,
 
 	spin_lock_irq(&client->lock);
 	r->todo = ISO_RES_DEALLOC;
-	schedule_iso_resource(r);
+	schedule_iso_resource(r, 0);
 	spin_unlock_irq(&client->lock);
 }
 
@@ -1162,10 +1167,10 @@ static int init_iso_resource(struct client *client,
 	r->e_alloc	= e1;
 	r->e_dealloc	= e2;
 
-	e1->resource.closure	= request->closure;
-	e1->resource.type	= FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED;
-	e2->resource.closure	= request->closure;
-	e2->resource.type	= FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED;
+	e1->iso_resource.closure = request->closure;
+	e1->iso_resource.type    = FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED;
+	e2->iso_resource.closure = request->closure;
+	e2->iso_resource.type    = FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED;
 
 	if (todo == ISO_RES_ALLOC) {
 		r->resource.release = release_iso_resource;
@@ -1175,7 +1180,7 @@ static int init_iso_resource(struct client *client,
 	} else {
 		r->resource.release = NULL;
 		r->resource.handle = -1;
-		schedule_iso_resource(r);
+		schedule_iso_resource(r, 0);
 	}
 	request->handle = r->resource.handle;
 
@@ -1295,7 +1300,23 @@ static int (* const ioctl_handlers[])(struct client *client, void *buffer) = {
 static int dispatch_ioctl(struct client *client,
 			  unsigned int cmd, void __user *arg)
 {
-	char buffer[256];
+	char buffer[sizeof(union {
+		struct fw_cdev_get_info			_00;
+		struct fw_cdev_send_request		_01;
+		struct fw_cdev_allocate			_02;
+		struct fw_cdev_deallocate		_03;
+		struct fw_cdev_send_response		_04;
+		struct fw_cdev_initiate_bus_reset	_05;
+		struct fw_cdev_add_descriptor		_06;
+		struct fw_cdev_remove_descriptor	_07;
+		struct fw_cdev_create_iso_context	_08;
+		struct fw_cdev_queue_iso		_09;
+		struct fw_cdev_start_iso		_0a;
+		struct fw_cdev_stop_iso			_0b;
+		struct fw_cdev_get_cycle_timer		_0c;
+		struct fw_cdev_allocate_iso_resource	_0d;
+		struct fw_cdev_send_stream_packet	_13;
+	})];
 	int ret;
 
 	if (_IOC_TYPE(cmd) != '#' ||
@@ -1390,10 +1411,10 @@ static int fw_device_op_mmap(struct file *file, struct vm_area_struct *vma)
 
 static int shutdown_resource(int id, void *p, void *data)
 {
-	struct client_resource *r = p;
+	struct client_resource *resource = p;
 	struct client *client = data;
 
-	r->release(client, r);
+	resource->release(client, resource);
 	client_put(client);
 
 	return 0;
@@ -1402,7 +1423,7 @@ static int shutdown_resource(int id, void *p, void *data)
 static int fw_device_op_release(struct inode *inode, struct file *file)
 {
 	struct client *client = file->private_data;
-	struct event *e, *next_e;
+	struct event *event, *next_event;
 
 	mutex_lock(&client->device->client_list_mutex);
 	list_del(&client->link);
@@ -1423,8 +1444,8 @@ static int fw_device_op_release(struct inode *inode, struct file *file)
 	idr_remove_all(&client->resource_idr);
 	idr_destroy(&client->resource_idr);
 
-	list_for_each_entry_safe(e, next_e, &client->event_list, link)
-		kfree(e);
+	list_for_each_entry_safe(event, next_event, &client->event_list, link)
+		kfree(event);
 
 	client_put(client);
 
diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index fddf2b3..9a5f38c 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -28,9 +28,9 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/string.h>
 
 #include <asm/atomic.h>
+#include <asm/byteorder.h>
 #include <asm/system.h>
 
 #include "core.h"
@@ -510,13 +510,16 @@ static void update_tree(struct fw_card *card, struct fw_node *root)
 static void update_topology_map(struct fw_card *card,
 				u32 *self_ids, int self_id_count)
 {
-	int node_count;
+	int node_count = (card->root_node->node_id & 0x3f) + 1;
+	__be32 *map = card->topology_map;
+
+	*map++ = cpu_to_be32((self_id_count + 2) << 16);
+	*map++ = cpu_to_be32(be32_to_cpu(card->topology_map[1]) + 1);
+	*map++ = cpu_to_be32((node_count << 16) | self_id_count);
+
+	while (self_id_count--)
+		*map++ = cpu_to_be32p(self_ids++);
 
-	card->topology_map[1]++;
-	node_count = (card->root_node->node_id & 0x3f) + 1;
-	card->topology_map[2] = (node_count << 16) | self_id_count;
-	card->topology_map[0] = (self_id_count + 2) << 16;
-	memcpy(&card->topology_map[3], self_ids, self_id_count * 4);
 	fw_compute_block_crc(card->topology_map);
 }
 
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index da628c7..842739d 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -218,12 +218,15 @@ static void fw_fill_request(struct fw_packet *packet, int tcode, int tlabel,
 		packet->header_length = 16;
 		packet->payload_length = 0;
 		break;
+
+	default:
+		WARN(1, KERN_ERR "wrong tcode %d", tcode);
 	}
  common:
 	packet->speed = speed;
 	packet->generation = generation;
 	packet->ack = 0;
-	packet->payload_bus = 0;
+	packet->payload_mapped = false;
 }
 
 /**
@@ -595,11 +598,10 @@ void fw_fill_response(struct fw_packet *response, u32 *request_header,
 		break;
 
 	default:
-		BUG();
-		return;
+		WARN(1, KERN_ERR "wrong tcode %d", tcode);
 	}
 
-	response->payload_bus = 0;
+	response->payload_mapped = false;
 }
 EXPORT_SYMBOL(fw_fill_response);
 
@@ -810,8 +812,7 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request
 		int speed, unsigned long long offset,
 		void *payload, size_t length, void *callback_data)
 {
-	int i, start, end;
-	__be32 *map;
+	int start;
 
 	if (!TCODE_IS_READ_REQUEST(tcode)) {
 		fw_send_response(card, request, RCODE_TYPE_ERROR);
@@ -824,11 +825,7 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request
 	}
 
 	start = (offset - topology_map_region.start) / 4;
-	end = start + length / 4;
-	map = payload;
-
-	for (i = 0; i < length / 4; i++)
-		map[i] = cpu_to_be32(card->topology_map[start + i]);
+	memcpy(payload, &card->topology_map[start], length);
 
 	fw_send_response(card, request, RCODE_COMPLETE);
 }
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 7ff6e75..ed3b1a7 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -40,7 +40,8 @@ struct fw_card_driver {
 	 * enable the PHY or set the link_on bit and initiate a bus
 	 * reset.
 	 */
-	int (*enable)(struct fw_card *card, u32 *config_rom, size_t length);
+	int (*enable)(struct fw_card *card,
+		      const __be32 *config_rom, size_t length);
 
 	int (*update_phy_reg)(struct fw_card *card, int address,
 			      int clear_bits, int set_bits);
@@ -48,10 +49,10 @@ struct fw_card_driver {
 	/*
 	 * Update the config rom for an enabled card.  This function
 	 * should change the config rom that is presented on the bus
-	 * an initiate a bus reset.
+	 * and initiate a bus reset.
 	 */
 	int (*set_config_rom)(struct fw_card *card,
-			      u32 *config_rom, size_t length);
+			      const __be32 *config_rom, size_t length);
 
 	void (*send_request)(struct fw_card *card, struct fw_packet *packet);
 	void (*send_response)(struct fw_card *card, struct fw_packet *packet);
@@ -93,7 +94,7 @@ int fw_card_add(struct fw_card *card,
 		u32 max_receive, u32 link_speed, u64 guid);
 void fw_core_remove_card(struct fw_card *card);
 int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset);
-int fw_compute_block_crc(u32 *block);
+int fw_compute_block_crc(__be32 *block);
 void fw_schedule_bm_work(struct fw_card *card, unsigned long delay);
 
 static inline struct fw_card *fw_card_get(struct fw_card *card)
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 94260aa..ae4556f 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -205,7 +205,7 @@ struct fw_ohci {
 	dma_addr_t config_rom_bus;
 	__be32 *next_config_rom;
 	dma_addr_t next_config_rom_bus;
-	u32 next_header;
+	__be32 next_header;
 
 	struct ar_context ar_request_ctx;
 	struct ar_context ar_response_ctx;
@@ -997,7 +997,8 @@ static int at_context_queue_packet(struct context *ctx,
 			packet->ack = RCODE_SEND_ERROR;
 			return -1;
 		}
-		packet->payload_bus = payload_bus;
+		packet->payload_bus	= payload_bus;
+		packet->payload_mapped	= true;
 
 		d[2].req_count    = cpu_to_le16(packet->payload_length);
 		d[2].data_address = cpu_to_le32(payload_bus);
@@ -1025,7 +1026,7 @@ static int at_context_queue_packet(struct context *ctx,
 	 */
 	if (ohci->generation != packet->generation ||
 	    reg_read(ohci, OHCI1394_IntEventSet) & OHCI1394_busReset) {
-		if (packet->payload_length > 0)
+		if (packet->payload_mapped)
 			dma_unmap_single(ohci->card.device, payload_bus,
 					 packet->payload_length, DMA_TO_DEVICE);
 		packet->ack = RCODE_GENERATION;
@@ -1061,7 +1062,7 @@ static int handle_at_packet(struct context *context,
 		/* This packet was cancelled, just continue. */
 		return 1;
 
-	if (packet->payload_bus)
+	if (packet->payload_mapped)
 		dma_unmap_single(ohci->card.device, packet->payload_bus,
 				 packet->payload_length, DMA_TO_DEVICE);
 
@@ -1357,8 +1358,9 @@ static void bus_reset_tasklet(unsigned long data)
 		 */
 		reg_write(ohci, OHCI1394_BusOptions,
 			  be32_to_cpu(ohci->config_rom[2]));
-		ohci->config_rom[0] = cpu_to_be32(ohci->next_header);
-		reg_write(ohci, OHCI1394_ConfigROMhdr, ohci->next_header);
+		ohci->config_rom[0] = ohci->next_header;
+		reg_write(ohci, OHCI1394_ConfigROMhdr,
+			  be32_to_cpu(ohci->next_header));
 	}
 
 #ifdef CONFIG_FIREWIRE_OHCI_REMOTE_DMA
@@ -1477,7 +1479,17 @@ static int software_reset(struct fw_ohci *ohci)
 	return -EBUSY;
 }
 
-static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length)
+static void copy_config_rom(__be32 *dest, const __be32 *src, size_t length)
+{
+	size_t size = length * 4;
+
+	memcpy(dest, src, size);
+	if (size < CONFIG_ROM_SIZE)
+		memset(&dest[length], 0, CONFIG_ROM_SIZE - size);
+}
+
+static int ohci_enable(struct fw_card *card,
+		       const __be32 *config_rom, size_t length)
 {
 	struct fw_ohci *ohci = fw_ohci(card);
 	struct pci_dev *dev = to_pci_dev(card->device);
@@ -1579,8 +1591,7 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length)
 		if (ohci->next_config_rom == NULL)
 			return -ENOMEM;
 
-		memset(ohci->next_config_rom, 0, CONFIG_ROM_SIZE);
-		fw_memcpy_to_be32(ohci->next_config_rom, config_rom, length * 4);
+		copy_config_rom(ohci->next_config_rom, config_rom, length);
 	} else {
 		/*
 		 * In the suspend case, config_rom is NULL, which
@@ -1590,7 +1601,7 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length)
 		ohci->next_config_rom_bus = ohci->config_rom_bus;
 	}
 
-	ohci->next_header = be32_to_cpu(ohci->next_config_rom[0]);
+	ohci->next_header = ohci->next_config_rom[0];
 	ohci->next_config_rom[0] = 0;
 	reg_write(ohci, OHCI1394_ConfigROMhdr, 0);
 	reg_write(ohci, OHCI1394_BusOptions,
@@ -1624,7 +1635,7 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length)
 }
 
 static int ohci_set_config_rom(struct fw_card *card,
-			       u32 *config_rom, size_t length)
+			       const __be32 *config_rom, size_t length)
 {
 	struct fw_ohci *ohci;
 	unsigned long flags;
@@ -1673,9 +1684,7 @@ static int ohci_set_config_rom(struct fw_card *card,
 		ohci->next_config_rom = next_config_rom;
 		ohci->next_config_rom_bus = next_config_rom_bus;
 
-		memset(ohci->next_config_rom, 0, CONFIG_ROM_SIZE);
-		fw_memcpy_to_be32(ohci->next_config_rom, config_rom,
-				  length * 4);
+		copy_config_rom(ohci->next_config_rom, config_rom, length);
 
 		ohci->next_header = config_rom[0];
 		ohci->next_config_rom[0] = 0;
@@ -1729,7 +1738,7 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet)
 	if (packet->ack != 0)
 		goto out;
 
-	if (packet->payload_bus)
+	if (packet->payload_mapped)
 		dma_unmap_single(ohci->card.device, packet->payload_bus,
 				 packet->payload_length, DMA_TO_DEVICE);
 
diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c
index 98dbbda..d485cdd 100644
--- a/drivers/firewire/sbp2.c
+++ b/drivers/firewire/sbp2.c
@@ -820,20 +820,25 @@ static void sbp2_release_target(struct kref *kref)
 	fw_device_put(device);
 }
 
-static struct workqueue_struct *sbp2_wq;
+static void sbp2_target_get(struct sbp2_target *tgt)
+{
+	kref_get(&tgt->kref);
+}
 
 static void sbp2_target_put(struct sbp2_target *tgt)
 {
 	kref_put(&tgt->kref, sbp2_release_target);
 }
 
+static struct workqueue_struct *sbp2_wq;
+
 /*
  * Always get the target's kref when scheduling work on one its units.
  * Each workqueue job is responsible to call sbp2_target_put() upon return.
  */
 static void sbp2_queue_work(struct sbp2_logical_unit *lu, unsigned long delay)
 {
-	kref_get(&lu->tgt->kref);
+	sbp2_target_get(lu->tgt);
 	if (!queue_delayed_work(sbp2_wq, &lu->work, delay))
 		sbp2_target_put(lu->tgt);
 }
diff --git a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig
index d7ece13..8d8a00e 100644
--- a/drivers/i2c/Kconfig
+++ b/drivers/i2c/Kconfig
@@ -5,6 +5,7 @@
 menuconfig I2C
 	tristate "I2C support"
 	depends on HAS_IOMEM
+	select RT_MUTEXES
 	---help---
 	  I2C (pronounce: I-square-C) is a slow serial bus protocol used in
 	  many micro controller applications and developed by Philips.  SMBus,
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index e8fe7f1..5f318ce 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -640,22 +640,6 @@ config I2C_TINY_USB
 	  This driver can also be built as a module.  If so, the module
 	  will be called i2c-tiny-usb.
 
-comment "Graphics adapter I2C/DDC channel drivers"
-	depends on PCI
-
-config I2C_VOODOO3
-	tristate "Voodoo 3 (DEPRECATED)"
-	depends on PCI
-	select I2C_ALGOBIT
-	help
-	  If you say yes to this option, support will be included for the
-	  Voodoo 3 I2C interface. This driver is deprecated and you should
-	  use the tdfxfb driver instead, which additionally provides
-	  framebuffer support.
-
-	  This driver can also be built as a module.  If so, the module
-	  will be called i2c-voodoo3.
-
 comment "Other I2C/SMBus bus drivers"
 
 config I2C_ACORN
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index ff937ac..302c551 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -61,9 +61,6 @@ obj-$(CONFIG_I2C_PARPORT_LIGHT)	+= i2c-parport-light.o
 obj-$(CONFIG_I2C_TAOS_EVM)	+= i2c-taos-evm.o
 obj-$(CONFIG_I2C_TINY_USB)	+= i2c-tiny-usb.o
 
-# Graphics adapter I2C/DDC channel drivers
-obj-$(CONFIG_I2C_VOODOO3)	+= i2c-voodoo3.o
-
 # Other I2C/SMBus bus drivers
 obj-$(CONFIG_I2C_ACORN)		+= i2c-acorn.o
 obj-$(CONFIG_I2C_ELEKTOR)	+= i2c-elektor.o
diff --git a/drivers/i2c/busses/i2c-ali1535.c b/drivers/i2c/busses/i2c-ali1535.c
index d108450..8de7d7b 100644
--- a/drivers/i2c/busses/i2c-ali1535.c
+++ b/drivers/i2c/busses/i2c-ali1535.c
@@ -138,7 +138,7 @@ static unsigned short ali1535_smba;
    Note the differences between kernels with the old PCI BIOS interface and
    newer kernels with the real PCI interface. In compat.h some things are
    defined to make the transition easier. */
-static int ali1535_setup(struct pci_dev *dev)
+static int __devinit ali1535_setup(struct pci_dev *dev)
 {
 	int retval = -ENODEV;
 	unsigned char temp;
diff --git a/drivers/i2c/busses/i2c-ali15x3.c b/drivers/i2c/busses/i2c-ali15x3.c
index d627fce..e7e3205 100644
--- a/drivers/i2c/busses/i2c-ali15x3.c
+++ b/drivers/i2c/busses/i2c-ali15x3.c
@@ -131,7 +131,7 @@ MODULE_PARM_DESC(force_addr,
 static struct pci_driver ali15x3_driver;
 static unsigned short ali15x3_smba;
 
-static int ali15x3_setup(struct pci_dev *ALI15X3_dev)
+static int __devinit ali15x3_setup(struct pci_dev *ALI15X3_dev)
 {
 	u16 a;
 	unsigned char temp;
diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
index 55edcfe..df6ab553 100644
--- a/drivers/i2c/busses/i2c-i801.c
+++ b/drivers/i2c/busses/i2c-i801.c
@@ -767,6 +767,9 @@ static int __devinit i801_probe(struct pci_dev *dev, const struct pci_device_id
 	/* set up the sysfs linkage to our parent device */
 	i801_adapter.dev.parent = &dev->dev;
 
+	/* Retry up to 3 times on lost arbitration */
+	i801_adapter.retries = 3;
+
 	snprintf(i801_adapter.name, sizeof(i801_adapter.name),
 		"SMBus I801 adapter at %04lx", i801_smba);
 	err = i2c_add_adapter(&i801_adapter);
diff --git a/drivers/i2c/busses/i2c-iop3xx.c b/drivers/i2c/busses/i2c-iop3xx.c
index a75c75e..5901707 100644
--- a/drivers/i2c/busses/i2c-iop3xx.c
+++ b/drivers/i2c/busses/i2c-iop3xx.c
@@ -56,12 +56,6 @@ iic_cook_addr(struct i2c_msg *msg)
 	if (msg->flags & I2C_M_RD)
 		addr |= 1;
 
-	/*
-	 * Read or Write?
-	 */
-	if (msg->flags & I2C_M_REV_DIR_ADDR)
-		addr ^= 1;
-
 	return addr;   
 }
 
diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c
index bbab0e1..ed387ff 100644
--- a/drivers/i2c/busses/i2c-mv64xxx.c
+++ b/drivers/i2c/busses/i2c-mv64xxx.c
@@ -338,9 +338,6 @@ mv64xxx_i2c_prepare_for_io(struct mv64xxx_i2c_data *drv_data,
 	if (msg->flags & I2C_M_RD)
 		dir = 1;
 
-	if (msg->flags & I2C_M_REV_DIR_ADDR)
-		dir ^= 1;
-
 	if (msg->flags & I2C_M_TEN) {
 		drv_data->addr1 = 0xf0 | (((u32)msg->addr & 0x300) >> 7) | dir;
 		drv_data->addr2 = (u32)msg->addr & 0xff;
diff --git a/drivers/i2c/busses/i2c-powermac.c b/drivers/i2c/busses/i2c-powermac.c
index 3c9d71f..1c440a7 100644
--- a/drivers/i2c/busses/i2c-powermac.c
+++ b/drivers/i2c/busses/i2c-powermac.c
@@ -49,48 +49,38 @@ static s32 i2c_powermac_smbus_xfer(	struct i2c_adapter*	adap,
 	int			rc = 0;
 	int			read = (read_write == I2C_SMBUS_READ);
 	int			addrdir = (addr << 1) | read;
+	int			mode, subsize, len;
+	u32			subaddr;
+	u8			*buf;
 	u8			local[2];
 
-	rc = pmac_i2c_open(bus, 0);
-	if (rc)
-		return rc;
+	if (size == I2C_SMBUS_QUICK || size == I2C_SMBUS_BYTE) {
+		mode = pmac_i2c_mode_std;
+		subsize = 0;
+		subaddr = 0;
+	} else {
+		mode = read ? pmac_i2c_mode_combined : pmac_i2c_mode_stdsub;
+		subsize = 1;
+		subaddr = command;
+	}
 
 	switch (size) {
         case I2C_SMBUS_QUICK:
-		rc = pmac_i2c_setmode(bus, pmac_i2c_mode_std);
-		if (rc)
-			goto bail;
-		rc = pmac_i2c_xfer(bus, addrdir, 0, 0, NULL, 0);
+		buf = NULL;
+		len = 0;
 	    	break;
         case I2C_SMBUS_BYTE:
-		rc = pmac_i2c_setmode(bus, pmac_i2c_mode_std);
-		if (rc)
-			goto bail;
-		rc = pmac_i2c_xfer(bus, addrdir, 0, 0, &data->byte, 1);
-	    	break;
         case I2C_SMBUS_BYTE_DATA:
-		rc = pmac_i2c_setmode(bus, read ?
-				      pmac_i2c_mode_combined :
-				      pmac_i2c_mode_stdsub);
-		if (rc)
-			goto bail;
-		rc = pmac_i2c_xfer(bus, addrdir, 1, command, &data->byte, 1);
+		buf = &data->byte;
+		len = 1;
 	    	break;
         case I2C_SMBUS_WORD_DATA:
-		rc = pmac_i2c_setmode(bus, read ?
-				      pmac_i2c_mode_combined :
-				      pmac_i2c_mode_stdsub);
-		if (rc)
-			goto bail;
 		if (!read) {
 			local[0] = data->word & 0xff;
 			local[1] = (data->word >> 8) & 0xff;
 		}
-		rc = pmac_i2c_xfer(bus, addrdir, 1, command, local, 2);
-		if (rc == 0 && read) {
-			data->word = ((u16)local[1]) << 8;
-			data->word |= local[0];
-		}
+		buf = local;
+		len = 2;
 	    	break;
 
 	/* Note that these are broken vs. the expected smbus API where
@@ -105,28 +95,44 @@ static s32 i2c_powermac_smbus_xfer(	struct i2c_adapter*	adap,
 	 * a repeat start/addr phase (but not stop in between)
 	 */
         case I2C_SMBUS_BLOCK_DATA:
-		rc = pmac_i2c_setmode(bus, read ?
-				      pmac_i2c_mode_combined :
-				      pmac_i2c_mode_stdsub);
-		if (rc)
-			goto bail;
-		rc = pmac_i2c_xfer(bus, addrdir, 1, command, data->block,
-				   data->block[0] + 1);
-
+		buf = data->block;
+		len = data->block[0] + 1;
 		break;
 	case I2C_SMBUS_I2C_BLOCK_DATA:
-		rc = pmac_i2c_setmode(bus, read ?
-				      pmac_i2c_mode_combined :
-				      pmac_i2c_mode_stdsub);
-		if (rc)
-			goto bail;
-		rc = pmac_i2c_xfer(bus, addrdir, 1, command,
-				   &data->block[1], data->block[0]);
+		buf = &data->block[1];
+		len = data->block[0];
 		break;
 
         default:
-	    	rc = -EINVAL;
+		return -EINVAL;
+	}
+
+	rc = pmac_i2c_open(bus, 0);
+	if (rc) {
+		dev_err(&adap->dev, "Failed to open I2C, err %d\n", rc);
+		return rc;
+	}
+
+	rc = pmac_i2c_setmode(bus, mode);
+	if (rc) {
+		dev_err(&adap->dev, "Failed to set I2C mode %d, err %d\n",
+			mode, rc);
+		goto bail;
 	}
+
+	rc = pmac_i2c_xfer(bus, addrdir, subsize, subaddr, buf, len);
+	if (rc) {
+		dev_err(&adap->dev,
+			"I2C transfer at 0x%02x failed, size %d, err %d\n",
+			addrdir >> 1, size, rc);
+		goto bail;
+	}
+
+	if (size == I2C_SMBUS_WORD_DATA && read) {
+		data->word = ((u16)local[1]) << 8;
+		data->word |= local[0];
+	}
+
  bail:
 	pmac_i2c_close(bus);
 	return rc;
@@ -146,20 +152,33 @@ static int i2c_powermac_master_xfer(	struct i2c_adapter *adap,
 	int			read;
 	int			addrdir;
 
+	if (num != 1) {
+		dev_err(&adap->dev,
+			"Multi-message I2C transactions not supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	if (msgs->flags & I2C_M_TEN)
 		return -EINVAL;
 	read = (msgs->flags & I2C_M_RD) != 0;
 	addrdir = (msgs->addr << 1) | read;
-	if (msgs->flags & I2C_M_REV_DIR_ADDR)
-		addrdir ^= 1;
 
 	rc = pmac_i2c_open(bus, 0);
-	if (rc)
+	if (rc) {
+		dev_err(&adap->dev, "Failed to open I2C, err %d\n", rc);
 		return rc;
+	}
 	rc = pmac_i2c_setmode(bus, pmac_i2c_mode_std);
-	if (rc)
+	if (rc) {
+		dev_err(&adap->dev, "Failed to set I2C mode %d, err %d\n",
+			pmac_i2c_mode_std, rc);
 		goto bail;
+	}
 	rc = pmac_i2c_xfer(bus, addrdir, 0, 0, msgs->buf, msgs->len);
+	if (rc < 0)
+		dev_err(&adap->dev, "I2C %s 0x%02x failed, err %d\n",
+			addrdir & 1 ? "read from" : "write to", addrdir >> 1,
+			rc);
  bail:
 	pmac_i2c_close(bus);
 	return rc < 0 ? rc : 1;
@@ -183,19 +202,16 @@ static const struct i2c_algorithm i2c_powermac_algorithm = {
 static int __devexit i2c_powermac_remove(struct platform_device *dev)
 {
 	struct i2c_adapter	*adapter = platform_get_drvdata(dev);
-	struct pmac_i2c_bus	*bus = i2c_get_adapdata(adapter);
 	int			rc;
 
 	rc = i2c_del_adapter(adapter);
-	pmac_i2c_detach_adapter(bus, adapter);
-	i2c_set_adapdata(adapter, NULL);
 	/* We aren't that prepared to deal with this... */
 	if (rc)
 		printk(KERN_WARNING
 		       "i2c-powermac.c: Failed to remove bus %s !\n",
 		       adapter->name);
 	platform_set_drvdata(dev, NULL);
-	kfree(adapter);
+	memset(adapter, 0, sizeof(*adapter));
 
 	return 0;
 }
@@ -206,12 +222,12 @@ static int __devinit i2c_powermac_probe(struct platform_device *dev)
 	struct pmac_i2c_bus *bus = dev->dev.platform_data;
 	struct device_node *parent = NULL;
 	struct i2c_adapter *adapter;
-	char name[32];
 	const char *basename;
 	int rc;
 
 	if (bus == NULL)
 		return -EINVAL;
+	adapter = pmac_i2c_get_adapter(bus);
 
 	/* Ok, now we need to make up a name for the interface that will
 	 * match what we used to do in the past, that is basically the
@@ -237,29 +253,22 @@ static int __devinit i2c_powermac_probe(struct platform_device *dev)
 	default:
 		return -EINVAL;
 	}
-	snprintf(name, 32, "%s %d", basename, pmac_i2c_get_channel(bus));
+	snprintf(adapter->name, sizeof(adapter->name), "%s %d", basename,
+		 pmac_i2c_get_channel(bus));
 	of_node_put(parent);
 
-	adapter = kzalloc(sizeof(struct i2c_adapter), GFP_KERNEL);
-	if (adapter == NULL) {
-		printk(KERN_ERR "i2c-powermac: can't allocate inteface !\n");
-		return -ENOMEM;
-	}
 	platform_set_drvdata(dev, adapter);
-	strcpy(adapter->name, name);
 	adapter->algo = &i2c_powermac_algorithm;
 	i2c_set_adapdata(adapter, bus);
 	adapter->dev.parent = &dev->dev;
-	pmac_i2c_attach_adapter(bus, adapter);
 	rc = i2c_add_adapter(adapter);
 	if (rc) {
 		printk(KERN_ERR "i2c-powermac: Adapter %s registration "
-		       "failed\n", name);
-		i2c_set_adapdata(adapter, NULL);
-		pmac_i2c_detach_adapter(bus, adapter);
+		       "failed\n", adapter->name);
+		memset(adapter, 0, sizeof(*adapter));
 	}
 
-	printk(KERN_INFO "PowerMac i2c bus %s registered\n", name);
+	printk(KERN_INFO "PowerMac i2c bus %s registered\n", adapter->name);
 
 	if (!strncmp(basename, "uni-n", 5)) {
 		struct device_node *np;
diff --git a/drivers/i2c/busses/i2c-sis5595.c b/drivers/i2c/busses/i2c-sis5595.c
index 139f0c7..844569f 100644
--- a/drivers/i2c/busses/i2c-sis5595.c
+++ b/drivers/i2c/busses/i2c-sis5595.c
@@ -142,7 +142,7 @@ static void sis5595_write(u8 reg, u8 data)
 	outb(data, sis5595_base + SMB_DAT);
 }
 
-static int sis5595_setup(struct pci_dev *SIS5595_dev)
+static int __devinit sis5595_setup(struct pci_dev *SIS5595_dev)
 {
 	u16 a;
 	u8 val;
diff --git a/drivers/i2c/busses/i2c-sis630.c b/drivers/i2c/busses/i2c-sis630.c
index 70ca41e..68cff7a 100644
--- a/drivers/i2c/busses/i2c-sis630.c
+++ b/drivers/i2c/busses/i2c-sis630.c
@@ -389,7 +389,7 @@ static u32 sis630_func(struct i2c_adapter *adapter)
 		I2C_FUNC_SMBUS_BLOCK_DATA;
 }
 
-static int sis630_setup(struct pci_dev *sis630_dev)
+static int __devinit sis630_setup(struct pci_dev *sis630_dev)
 {
 	unsigned char b;
 	struct pci_dev *dummy = NULL;
diff --git a/drivers/i2c/busses/i2c-stub.c b/drivers/i2c/busses/i2c-stub.c
index 1b7b2af..0c770ea 100644
--- a/drivers/i2c/busses/i2c-stub.c
+++ b/drivers/i2c/busses/i2c-stub.c
@@ -35,6 +35,10 @@ module_param_array(chip_addr, ushort, NULL, S_IRUGO);
 MODULE_PARM_DESC(chip_addr,
 		 "Chip addresses (up to 10, between 0x03 and 0x77)");
 
+static unsigned long functionality = ~0UL;
+module_param(functionality, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(functionality, "Override functionality bitfield");
+
 struct stub_chip {
 	u8 pointer;
 	u16 words[256];		/* Byte operations use the LSB as per SMBus
@@ -48,7 +52,7 @@ static s32 stub_xfer(struct i2c_adapter * adap, u16 addr, unsigned short flags,
 	char read_write, u8 command, int size, union i2c_smbus_data * data)
 {
 	s32 ret;
-	int i;
+	int i, len;
 	struct stub_chip *chip = NULL;
 
 	/* Search for the right chip */
@@ -118,6 +122,29 @@ static s32 stub_xfer(struct i2c_adapter * adap, u16 addr, unsigned short flags,
 		ret = 0;
 		break;
 
+	case I2C_SMBUS_I2C_BLOCK_DATA:
+		len = data->block[0];
+		if (read_write == I2C_SMBUS_WRITE) {
+			for (i = 0; i < len; i++) {
+				chip->words[command + i] &= 0xff00;
+				chip->words[command + i] |= data->block[1 + i];
+			}
+			dev_dbg(&adap->dev, "i2c block data - addr 0x%02x, "
+					"wrote %d bytes at 0x%02x.\n",
+					addr, len, command);
+		} else {
+			for (i = 0; i < len; i++) {
+				data->block[1 + i] =
+					chip->words[command + i] & 0xff;
+			}
+			dev_dbg(&adap->dev, "i2c block data - addr 0x%02x, "
+					"read  %d bytes at 0x%02x.\n",
+					addr, len, command);
+		}
+
+		ret = 0;
+		break;
+
 	default:
 		dev_dbg(&adap->dev, "Unsupported I2C/SMBus command\n");
 		ret = -EOPNOTSUPP;
@@ -129,8 +156,9 @@ static s32 stub_xfer(struct i2c_adapter * adap, u16 addr, unsigned short flags,
 
 static u32 stub_func(struct i2c_adapter *adapter)
 {
-	return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE |
-		I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA;
+	return (I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE |
+		I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA |
+		I2C_FUNC_SMBUS_I2C_BLOCK) & functionality;
 }
 
 static const struct i2c_algorithm smbus_algorithm = {
diff --git a/drivers/i2c/busses/i2c-voodoo3.c b/drivers/i2c/busses/i2c-voodoo3.c
deleted file mode 100644
index 7663d57..0000000
--- a/drivers/i2c/busses/i2c-voodoo3.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
-    Copyright (c) 1998, 1999  Frodo Looijaard <frodol@dds.nl>,
-    Philip Edelbrock <phil@netroedge.com>,
-    Ralph Metzler <rjkm@thp.uni-koeln.de>, and
-    Mark D. Studebaker <mdsxyz123@yahoo.com>
-    
-    Based on code written by Ralph Metzler <rjkm@thp.uni-koeln.de> and
-    Simon Vogl
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
-/* This interfaces to the I2C bus of the Voodoo3 to gain access to
-    the BT869 and possibly other I2C devices. */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/i2c.h>
-#include <linux/i2c-algo-bit.h>
-#include <asm/io.h>
-
-/* the only registers we use */
-#define REG		0x78
-#define REG2 		0x70
-
-/* bit locations in the register */
-#define DDC_ENAB	0x00040000
-#define DDC_SCL_OUT	0x00080000
-#define DDC_SDA_OUT	0x00100000
-#define DDC_SCL_IN	0x00200000
-#define DDC_SDA_IN	0x00400000
-#define I2C_ENAB	0x00800000
-#define I2C_SCL_OUT	0x01000000
-#define I2C_SDA_OUT	0x02000000
-#define I2C_SCL_IN	0x04000000
-#define I2C_SDA_IN	0x08000000
-
-/* initialization states */
-#define INIT2		0x2
-#define INIT3		0x4
-
-/* delays */
-#define CYCLE_DELAY	10
-#define TIMEOUT		(HZ / 2)
-
-
-static void __iomem *ioaddr;
-
-/* The voo GPIO registers don't have individual masks for each bit
-   so we always have to read before writing. */
-
-static void bit_vooi2c_setscl(void *data, int val)
-{
-	unsigned int r;
-	r = readl(ioaddr + REG);
-	if (val)
-		r |= I2C_SCL_OUT;
-	else
-		r &= ~I2C_SCL_OUT;
-	writel(r, ioaddr + REG);
-	readl(ioaddr + REG);	/* flush posted write */
-}
-
-static void bit_vooi2c_setsda(void *data, int val)
-{
-	unsigned int r;
-	r = readl(ioaddr + REG);
-	if (val)
-		r |= I2C_SDA_OUT;
-	else
-		r &= ~I2C_SDA_OUT;
-	writel(r, ioaddr + REG);
-	readl(ioaddr + REG);	/* flush posted write */
-}
-
-/* The GPIO pins are open drain, so the pins always remain outputs.
-   We rely on the i2c-algo-bit routines to set the pins high before
-   reading the input from other chips. */
-
-static int bit_vooi2c_getscl(void *data)
-{
-	return (0 != (readl(ioaddr + REG) & I2C_SCL_IN));
-}
-
-static int bit_vooi2c_getsda(void *data)
-{
-	return (0 != (readl(ioaddr + REG) & I2C_SDA_IN));
-}
-
-static void bit_vooddc_setscl(void *data, int val)
-{
-	unsigned int r;
-	r = readl(ioaddr + REG);
-	if (val)
-		r |= DDC_SCL_OUT;
-	else
-		r &= ~DDC_SCL_OUT;
-	writel(r, ioaddr + REG);
-	readl(ioaddr + REG);	/* flush posted write */
-}
-
-static void bit_vooddc_setsda(void *data, int val)
-{
-	unsigned int r;
-	r = readl(ioaddr + REG);
-	if (val)
-		r |= DDC_SDA_OUT;
-	else
-		r &= ~DDC_SDA_OUT;
-	writel(r, ioaddr + REG);
-	readl(ioaddr + REG);	/* flush posted write */
-}
-
-static int bit_vooddc_getscl(void *data)
-{
-	return (0 != (readl(ioaddr + REG) & DDC_SCL_IN));
-}
-
-static int bit_vooddc_getsda(void *data)
-{
-	return (0 != (readl(ioaddr + REG) & DDC_SDA_IN));
-}
-
-static int config_v3(struct pci_dev *dev)
-{
-	unsigned long cadr;
-
-	/* map Voodoo3 memory */
-	cadr = dev->resource[0].start;
-	cadr &= PCI_BASE_ADDRESS_MEM_MASK;
-	ioaddr = ioremap_nocache(cadr, 0x1000);
-	if (ioaddr) {
-		writel(0x8160, ioaddr + REG2);
-		writel(0xcffc0020, ioaddr + REG);
-		dev_info(&dev->dev, "Using Banshee/Voodoo3 I2C device at %p\n", ioaddr);
-		return 0;
-	}
-	return -ENODEV;
-}
-
-static struct i2c_algo_bit_data voo_i2c_bit_data = {
-	.setsda		= bit_vooi2c_setsda,
-	.setscl		= bit_vooi2c_setscl,
-	.getsda		= bit_vooi2c_getsda,
-	.getscl		= bit_vooi2c_getscl,
-	.udelay		= CYCLE_DELAY,
-	.timeout	= TIMEOUT
-};
-
-static struct i2c_adapter voodoo3_i2c_adapter = {
-	.owner		= THIS_MODULE,
-	.name		= "I2C Voodoo3/Banshee adapter",
-	.algo_data	= &voo_i2c_bit_data,
-};
-
-static struct i2c_algo_bit_data voo_ddc_bit_data = {
-	.setsda		= bit_vooddc_setsda,
-	.setscl		= bit_vooddc_setscl,
-	.getsda		= bit_vooddc_getsda,
-	.getscl		= bit_vooddc_getscl,
-	.udelay		= CYCLE_DELAY,
-	.timeout	= TIMEOUT
-};
-
-static struct i2c_adapter voodoo3_ddc_adapter = {
-	.owner		= THIS_MODULE,
-	.class		= I2C_CLASS_DDC, 
-	.name		= "DDC Voodoo3/Banshee adapter",
-	.algo_data	= &voo_ddc_bit_data,
-};
-
-static struct pci_device_id voodoo3_ids[] __devinitdata = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_3DFX, PCI_DEVICE_ID_3DFX_VOODOO3) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_3DFX, PCI_DEVICE_ID_3DFX_BANSHEE) },
-	{ 0, }
-};
-
-MODULE_DEVICE_TABLE (pci, voodoo3_ids);
-
-static int __devinit voodoo3_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	int retval;
-
-	retval = config_v3(dev);
-	if (retval)
-		return retval;
-
-	/* set up the sysfs linkage to our parent device */
-	voodoo3_i2c_adapter.dev.parent = &dev->dev;
-	voodoo3_ddc_adapter.dev.parent = &dev->dev;
-
-	retval = i2c_bit_add_bus(&voodoo3_i2c_adapter);
-	if (retval)
-		return retval;
-	retval = i2c_bit_add_bus(&voodoo3_ddc_adapter);
-	if (retval)
-		i2c_del_adapter(&voodoo3_i2c_adapter);
-	return retval;
-}
-
-static void __devexit voodoo3_remove(struct pci_dev *dev)
-{
-	i2c_del_adapter(&voodoo3_i2c_adapter);
-	i2c_del_adapter(&voodoo3_ddc_adapter);
-	iounmap(ioaddr);
-}
-
-static struct pci_driver voodoo3_driver = {
-	.name		= "voodoo3_smbus",
-	.id_table	= voodoo3_ids,
-	.probe		= voodoo3_probe,
-	.remove		= __devexit_p(voodoo3_remove),
-};
-
-static int __init i2c_voodoo3_init(void)
-{
-	return pci_register_driver(&voodoo3_driver);
-}
-
-static void __exit i2c_voodoo3_exit(void)
-{
-	pci_unregister_driver(&voodoo3_driver);
-}
-
-
-MODULE_AUTHOR("Frodo Looijaard <frodol@dds.nl>, "
-		"Philip Edelbrock <phil@netroedge.com>, "
-		"Ralph Metzler <rjkm@thp.uni-koeln.de>, "
-		"and Mark D. Studebaker <mdsxyz123@yahoo.com>");
-MODULE_DESCRIPTION("Voodoo3 I2C/SMBus driver");
-MODULE_LICENSE("GPL");
-
-module_init(i2c_voodoo3_init);
-module_exit(i2c_voodoo3_exit);
diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig
index f9618f4..ae4539d 100644
--- a/drivers/i2c/chips/Kconfig
+++ b/drivers/i2c/chips/Kconfig
@@ -6,16 +6,6 @@
 
 menu "Miscellaneous I2C Chip support"
 
-config DS1682
-	tristate "Dallas DS1682 Total Elapsed Time Recorder with Alarm"
-	depends on EXPERIMENTAL
-	help
-	  If you say yes here you get support for Dallas Semiconductor
-	  DS1682 Total Elapsed Time Recorder.
-
-	  This driver can also be built as a module.  If so, the module
-	  will be called ds1682.
-
 config SENSORS_TSL2550
 	tristate "Taos TSL2550 ambient light sensor"
 	depends on EXPERIMENTAL
diff --git a/drivers/i2c/chips/Makefile b/drivers/i2c/chips/Makefile
index 749cf36..fe0af0f 100644
--- a/drivers/i2c/chips/Makefile
+++ b/drivers/i2c/chips/Makefile
@@ -10,7 +10,6 @@
 # * I/O expander drivers go to drivers/gpio
 #
 
-obj-$(CONFIG_DS1682)		+= ds1682.o
 obj-$(CONFIG_SENSORS_TSL2550)	+= tsl2550.o
 
 ifeq ($(CONFIG_I2C_DEBUG_CHIP),y)
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 2965043..4f34823 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -558,11 +558,9 @@ static void i2c_scan_static_board_info(struct i2c_adapter *adapter)
 	up_read(&__i2c_board_lock);
 }
 
-static int i2c_do_add_adapter(struct device_driver *d, void *data)
+static int i2c_do_add_adapter(struct i2c_driver *driver,
+			      struct i2c_adapter *adap)
 {
-	struct i2c_driver *driver = to_i2c_driver(d);
-	struct i2c_adapter *adap = data;
-
 	/* Detect supported devices on that bus, and instantiate them */
 	i2c_detect(adap, driver);
 
@@ -574,6 +572,11 @@ static int i2c_do_add_adapter(struct device_driver *d, void *data)
 	return 0;
 }
 
+static int __process_new_adapter(struct device_driver *d, void *data)
+{
+	return i2c_do_add_adapter(to_i2c_driver(d), data);
+}
+
 static int i2c_register_adapter(struct i2c_adapter *adap)
 {
 	int res = 0, dummy;
@@ -584,7 +587,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 		goto out_list;
 	}
 
-	mutex_init(&adap->bus_lock);
+	rt_mutex_init(&adap->bus_lock);
 
 	/* Set default timeout to 1 second if not already set */
 	if (adap->timeout == 0)
@@ -614,7 +617,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 	/* Notify drivers */
 	mutex_lock(&core_lock);
 	dummy = bus_for_each_drv(&i2c_bus_type, NULL, adap,
-				 i2c_do_add_adapter);
+				 __process_new_adapter);
 	mutex_unlock(&core_lock);
 
 	return 0;
@@ -715,10 +718,9 @@ retry:
 }
 EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter);
 
-static int i2c_do_del_adapter(struct device_driver *d, void *data)
+static int i2c_do_del_adapter(struct i2c_driver *driver,
+			      struct i2c_adapter *adapter)
 {
-	struct i2c_driver *driver = to_i2c_driver(d);
-	struct i2c_adapter *adapter = data;
 	struct i2c_client *client, *_n;
 	int res;
 
@@ -750,6 +752,11 @@ static int __unregister_client(struct device *dev, void *dummy)
 	return 0;
 }
 
+static int __process_removed_adapter(struct device_driver *d, void *data)
+{
+	return i2c_do_del_adapter(to_i2c_driver(d), data);
+}
+
 /**
  * i2c_del_adapter - unregister I2C adapter
  * @adap: the adapter being unregistered
@@ -777,7 +784,7 @@ int i2c_del_adapter(struct i2c_adapter *adap)
 	/* Tell drivers about this removal */
 	mutex_lock(&core_lock);
 	res = bus_for_each_drv(&i2c_bus_type, NULL, adap,
-			       i2c_do_del_adapter);
+			       __process_removed_adapter);
 	mutex_unlock(&core_lock);
 	if (res)
 		return res;
@@ -826,22 +833,11 @@ EXPORT_SYMBOL(i2c_del_adapter);
 
 /* ------------------------------------------------------------------------- */
 
-static int __attach_adapter(struct device *dev, void *data)
+static int __process_new_driver(struct device *dev, void *data)
 {
-	struct i2c_adapter *adapter;
-	struct i2c_driver *driver = data;
-
 	if (dev->type != &i2c_adapter_type)
 		return 0;
-	adapter = to_i2c_adapter(dev);
-
-	i2c_detect(adapter, driver);
-
-	/* Legacy drivers scan i2c busses directly */
-	if (driver->attach_adapter)
-		driver->attach_adapter(adapter);
-
-	return 0;
+	return i2c_do_add_adapter(data, to_i2c_adapter(dev));
 }
 
 /*
@@ -873,40 +869,18 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver)
 	INIT_LIST_HEAD(&driver->clients);
 	/* Walk the adapters that are already present */
 	mutex_lock(&core_lock);
-	bus_for_each_dev(&i2c_bus_type, NULL, driver, __attach_adapter);
+	bus_for_each_dev(&i2c_bus_type, NULL, driver, __process_new_driver);
 	mutex_unlock(&core_lock);
 
 	return 0;
 }
 EXPORT_SYMBOL(i2c_register_driver);
 
-static int __detach_adapter(struct device *dev, void *data)
+static int __process_removed_driver(struct device *dev, void *data)
 {
-	struct i2c_adapter *adapter;
-	struct i2c_driver *driver = data;
-	struct i2c_client *client, *_n;
-
 	if (dev->type != &i2c_adapter_type)
 		return 0;
-	adapter = to_i2c_adapter(dev);
-
-	/* Remove the devices we created ourselves as the result of hardware
-	 * probing (using a driver's detect method) */
-	list_for_each_entry_safe(client, _n, &driver->clients, detected) {
-		dev_dbg(&adapter->dev, "Removing %s at 0x%x\n",
-			client->name, client->addr);
-		list_del(&client->detected);
-		i2c_unregister_device(client);
-	}
-
-	if (driver->detach_adapter) {
-		if (driver->detach_adapter(adapter))
-			dev_err(&adapter->dev,
-				"detach_adapter failed for driver [%s]\n",
-				driver->driver.name);
-	}
-
-	return 0;
+	return i2c_do_del_adapter(data, to_i2c_adapter(dev));
 }
 
 /**
@@ -917,7 +891,7 @@ static int __detach_adapter(struct device *dev, void *data)
 void i2c_del_driver(struct i2c_driver *driver)
 {
 	mutex_lock(&core_lock);
-	bus_for_each_dev(&i2c_bus_type, NULL, driver, __detach_adapter);
+	bus_for_each_dev(&i2c_bus_type, NULL, driver, __process_removed_driver);
 	mutex_unlock(&core_lock);
 
 	driver_unregister(&driver->driver);
@@ -1092,12 +1066,12 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 #endif
 
 		if (in_atomic() || irqs_disabled()) {
-			ret = mutex_trylock(&adap->bus_lock);
+			ret = rt_mutex_trylock(&adap->bus_lock);
 			if (!ret)
 				/* I2C activity is ongoing. */
 				return -EAGAIN;
 		} else {
-			mutex_lock_nested(&adap->bus_lock, adap->level);
+			rt_mutex_lock(&adap->bus_lock);
 		}
 
 		/* Retry automatically on arbitration loss */
@@ -1109,7 +1083,7 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 			if (time_after(jiffies, orig_jiffies + adap->timeout))
 				break;
 		}
-		mutex_unlock(&adap->bus_lock);
+		rt_mutex_unlock(&adap->bus_lock);
 
 		return ret;
 	} else {
@@ -1180,7 +1154,7 @@ EXPORT_SYMBOL(i2c_master_recv);
  * ----------------------------------------------------
  */
 
-static int i2c_detect_address(struct i2c_client *temp_client, int kind,
+static int i2c_detect_address(struct i2c_client *temp_client,
 			      struct i2c_driver *driver)
 {
 	struct i2c_board_info info;
@@ -1199,22 +1173,18 @@ static int i2c_detect_address(struct i2c_client *temp_client, int kind,
 	if (i2c_check_addr(adapter, addr))
 		return 0;
 
-	/* Make sure there is something at this address, unless forced */
-	if (kind < 0) {
-		if (i2c_smbus_xfer(adapter, addr, 0, 0, 0,
-				   I2C_SMBUS_QUICK, NULL) < 0)
-			return 0;
+	/* Make sure there is something at this address */
+	if (i2c_smbus_xfer(adapter, addr, 0, 0, 0, I2C_SMBUS_QUICK, NULL) < 0)
+		return 0;
 
-		/* prevent 24RF08 corruption */
-		if ((addr & ~0x0f) == 0x50)
-			i2c_smbus_xfer(adapter, addr, 0, 0, 0,
-				       I2C_SMBUS_QUICK, NULL);
-	}
+	/* Prevent 24RF08 corruption */
+	if ((addr & ~0x0f) == 0x50)
+		i2c_smbus_xfer(adapter, addr, 0, 0, 0, I2C_SMBUS_QUICK, NULL);
 
 	/* Finally call the custom detection function */
 	memset(&info, 0, sizeof(struct i2c_board_info));
 	info.addr = addr;
-	err = driver->detect(temp_client, kind, &info);
+	err = driver->detect(temp_client, -1, &info);
 	if (err) {
 		/* -ENODEV is returned if the detection fails. We catch it
 		   here as this isn't an error. */
@@ -1259,40 +1229,13 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver)
 		return -ENOMEM;
 	temp_client->adapter = adapter;
 
-	/* Force entries are done first, and are not affected by ignore
-	   entries */
-	if (address_data->forces) {
-		const unsigned short * const *forces = address_data->forces;
-		int kind;
-
-		for (kind = 0; forces[kind]; kind++) {
-			for (i = 0; forces[kind][i] != I2C_CLIENT_END;
-			     i += 2) {
-				if (forces[kind][i] == adap_id
-				 || forces[kind][i] == ANY_I2C_BUS) {
-					dev_dbg(&adapter->dev, "found force "
-						"parameter for adapter %d, "
-						"addr 0x%02x, kind %d\n",
-						adap_id, forces[kind][i + 1],
-						kind);
-					temp_client->addr = forces[kind][i + 1];
-					err = i2c_detect_address(temp_client,
-						kind, driver);
-					if (err)
-						goto exit_free;
-				}
-			}
-		}
-	}
-
 	/* Stop here if the classes do not match */
 	if (!(adapter->class & driver->class))
 		goto exit_free;
 
 	/* Stop here if we can't use SMBUS_QUICK */
 	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_QUICK)) {
-		if (address_data->probe[0] == I2C_CLIENT_END
-		 && address_data->normal_i2c[0] == I2C_CLIENT_END)
+		if (address_data->normal_i2c[0] == I2C_CLIENT_END)
 			goto exit_free;
 
 		dev_warn(&adapter->dev, "SMBus Quick command not supported, "
@@ -1301,48 +1244,12 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver)
 		goto exit_free;
 	}
 
-	/* Probe entries are done second, and are not affected by ignore
-	   entries either */
-	for (i = 0; address_data->probe[i] != I2C_CLIENT_END; i += 2) {
-		if (address_data->probe[i] == adap_id
-		 || address_data->probe[i] == ANY_I2C_BUS) {
-			dev_dbg(&adapter->dev, "found probe parameter for "
-				"adapter %d, addr 0x%02x\n", adap_id,
-				address_data->probe[i + 1]);
-			temp_client->addr = address_data->probe[i + 1];
-			err = i2c_detect_address(temp_client, -1, driver);
-			if (err)
-				goto exit_free;
-		}
-	}
-
-	/* Normal entries are done last, unless shadowed by an ignore entry */
 	for (i = 0; address_data->normal_i2c[i] != I2C_CLIENT_END; i += 1) {
-		int j, ignore;
-
-		ignore = 0;
-		for (j = 0; address_data->ignore[j] != I2C_CLIENT_END;
-		     j += 2) {
-			if ((address_data->ignore[j] == adap_id ||
-			     address_data->ignore[j] == ANY_I2C_BUS)
-			 && address_data->ignore[j + 1]
-			    == address_data->normal_i2c[i]) {
-				dev_dbg(&adapter->dev, "found ignore "
-					"parameter for adapter %d, "
-					"addr 0x%02x\n", adap_id,
-					address_data->ignore[j + 1]);
-				ignore = 1;
-				break;
-			}
-		}
-		if (ignore)
-			continue;
-
 		dev_dbg(&adapter->dev, "found normal entry for adapter %d, "
 			"addr 0x%02x\n", adap_id,
 			address_data->normal_i2c[i]);
 		temp_client->addr = address_data->normal_i2c[i];
-		err = i2c_detect_address(temp_client, -1, driver);
+		err = i2c_detect_address(temp_client, driver);
 		if (err)
 			goto exit_free;
 	}
@@ -1913,7 +1820,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 	flags &= I2C_M_TEN | I2C_CLIENT_PEC;
 
 	if (adapter->algo->smbus_xfer) {
-		mutex_lock(&adapter->bus_lock);
+		rt_mutex_lock(&adapter->bus_lock);
 
 		/* Retry automatically on arbitration loss */
 		orig_jiffies = jiffies;
@@ -1927,7 +1834,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 				       orig_jiffies + adapter->timeout))
 				break;
 		}
-		mutex_unlock(&adapter->bus_lock);
+		rt_mutex_unlock(&adapter->bus_lock);
 	} else
 		res = i2c_smbus_xfer_emulated(adapter,addr,flags,read_write,
 					      command, protocol, data);
diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c
index 7e13d2d..f4110aa 100644
--- a/drivers/i2c/i2c-dev.c
+++ b/drivers/i2c/i2c-dev.c
@@ -34,7 +34,6 @@
 #include <linux/list.h>
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
-#include <linux/smp_lock.h>
 #include <linux/jiffies.h>
 #include <asm/uaccess.h>
 
@@ -445,20 +444,14 @@ static int i2cdev_open(struct inode *inode, struct file *file)
 	struct i2c_client *client;
 	struct i2c_adapter *adap;
 	struct i2c_dev *i2c_dev;
-	int ret = 0;
 
-	lock_kernel();
 	i2c_dev = i2c_dev_get_by_minor(minor);
-	if (!i2c_dev) {
-		ret = -ENODEV;
-		goto out;
-	}
+	if (!i2c_dev)
+		return -ENODEV;
 
 	adap = i2c_get_adapter(i2c_dev->adap->nr);
-	if (!adap) {
-		ret = -ENODEV;
-		goto out;
-	}
+	if (!adap)
+		return -ENODEV;
 
 	/* This creates an anonymous i2c_client, which may later be
 	 * pointed to some address using I2C_SLAVE or I2C_SLAVE_FORCE.
@@ -470,8 +463,7 @@ static int i2cdev_open(struct inode *inode, struct file *file)
 	client = kzalloc(sizeof(*client), GFP_KERNEL);
 	if (!client) {
 		i2c_put_adapter(adap);
-		ret = -ENOMEM;
-		goto out;
+		return -ENOMEM;
 	}
 	snprintf(client->name, I2C_NAME_SIZE, "i2c-dev %d", adap->nr);
 	client->driver = &i2cdev_driver;
@@ -479,9 +471,7 @@ static int i2cdev_open(struct inode *inode, struct file *file)
 	client->adapter = adap;
 	file->private_data = client;
 
-out:
-	unlock_kernel();
-	return ret;
+	return 0;
 }
 
 static int i2cdev_release(struct inode *inode, struct file *file)
diff --git a/drivers/ide/ide-pci-generic.c b/drivers/ide/ide-pci-generic.c
index 39d4e01..a743e68 100644
--- a/drivers/ide/ide-pci-generic.c
+++ b/drivers/ide/ide-pci-generic.c
@@ -162,9 +162,10 @@ static const struct pci_device_id generic_pci_tbl[] = {
 #ifdef CONFIG_BLK_DEV_IDE_SATA
 	{ PCI_VDEVICE(VIA,	PCI_DEVICE_ID_VIA_8237_SATA),		 5 },
 #endif
-	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO),		 4 },
 	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_1),	 4 },
 	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_2),	 4 },
+	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_3),	 4 },
+	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_5),	 4 },
 	{ PCI_VDEVICE(NETCELL,	PCI_DEVICE_ID_REVOLUTION),		 6 },
 	/*
 	 * Must come last.  If you add entries adjust
diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index 65c1429..d0dc1db 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -82,6 +82,7 @@
  *
  */
 
+#include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -434,7 +435,6 @@ static void initialize_dma_trm_ctx(struct dma_trm_ctx *d)
 /* Count the number of available iso contexts */
 static int get_nb_iso_ctx(struct ti_ohci *ohci, int reg)
 {
-	int i,ctx=0;
 	u32 tmp;
 
 	reg_write(ohci, reg, 0xffffffff);
@@ -443,11 +443,7 @@ static int get_nb_iso_ctx(struct ti_ohci *ohci, int reg)
 	DBGMSG("Iso contexts reg: %08x implemented: %08x", reg, tmp);
 
 	/* Count the number of contexts */
-	for (i=0; i<32; i++) {
-	    	if (tmp & 1) ctx++;
-		tmp >>= 1;
-	}
-	return ctx;
+	return hweight32(tmp);
 }
 
 /* Global initialization */
diff --git a/drivers/input/keyboard/omap-keypad.c b/drivers/input/keyboard/omap-keypad.c
index bba85ad..1a494d5 100644
--- a/drivers/input/keyboard/omap-keypad.c
+++ b/drivers/input/keyboard/omap-keypad.c
@@ -35,12 +35,12 @@
 #include <linux/mutex.h>
 #include <linux/errno.h>
 #include <mach/gpio.h>
-#include <mach/keypad.h>
-#include <mach/menelaus.h>
+#include <plat/keypad.h>
+#include <plat/menelaus.h>
 #include <asm/irq.h>
 #include <mach/hardware.h>
 #include <asm/io.h>
-#include <mach/mux.h>
+#include <plat/mux.h>
 
 #undef NEW_BOARD_LEARNING_MODE
 
diff --git a/drivers/leds/leds-ams-delta.c b/drivers/leds/leds-ams-delta.c
index 4460507..b982603 100644
--- a/drivers/leds/leds-ams-delta.c
+++ b/drivers/leds/leds-ams-delta.c
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/leds.h>
-#include <mach/board-ams-delta.h>
+#include <plat/board-ams-delta.h>
 
 /*
  * Our context
diff --git a/drivers/leds/leds-locomo.c b/drivers/leds/leds-locomo.c
index 5d91362..1f7c10f 100644
--- a/drivers/leds/leds-locomo.c
+++ b/drivers/leds/leds-locomo.c
@@ -44,7 +44,7 @@ static void locomoled_brightness_set1(struct led_classdev *led_cdev,
 
 static struct led_classdev locomo_led0 = {
 	.name			= "locomo:amber:charge",
-	.default_trigger	= "sharpsl-charge",
+	.default_trigger	= "main-battery-charging",
 	.brightness_set		= locomoled_brightness_set0,
 };
 
diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c
index cc9f275..7b4ef5b 100644
--- a/drivers/macintosh/mac_hid.c
+++ b/drivers/macintosh/mac_hid.c
@@ -27,54 +27,49 @@ static int mouse_last_keycode;
 /* file(s) in /proc/sys/dev/mac_hid */
 static ctl_table mac_hid_files[] = {
 	{
-		.ctl_name	= DEV_MAC_HID_MOUSE_BUTTON_EMULATION,
 		.procname	= "mouse_button_emulation",
 		.data		= &mouse_emulate_buttons,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE,
 		.procname	= "mouse_button2_keycode",
 		.data		= &mouse_button2_keycode,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE,
 		.procname	= "mouse_button3_keycode",
 		.data		= &mouse_button3_keycode,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 /* dir in /proc/sys/dev */
 static ctl_table mac_hid_dir[] = {
 	{
-		.ctl_name	= DEV_MAC_HID,
 		.procname	= "mac_hid",
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= mac_hid_files,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 /* /proc/sys/dev itself, in case that is not there yet */
 static ctl_table mac_hid_root_dir[] = {
 	{
-		.ctl_name	= CTL_DEV,
 		.procname	= "dev",
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= mac_hid_dir,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static struct ctl_table_header *mac_hid_sysctl_header;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b182f86..5f154ef 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -98,44 +98,40 @@ static struct ctl_table_header *raid_table_header;
 
 static ctl_table raid_table[] = {
 	{
-		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
 		.procname	= "speed_limit_min",
 		.data		= &sysctl_speed_limit_min,
 		.maxlen		= sizeof(int),
 		.mode		= S_IRUGO|S_IWUSR,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
 		.procname	= "speed_limit_max",
 		.data		= &sysctl_speed_limit_max,
 		.maxlen		= sizeof(int),
 		.mode		= S_IRUGO|S_IWUSR,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table raid_dir_table[] = {
 	{
-		.ctl_name	= DEV_RAID,
 		.procname	= "raid",
 		.maxlen		= 0,
 		.mode		= S_IRUGO|S_IXUGO,
 		.child		= raid_table,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table raid_root_table[] = {
 	{
-		.ctl_name	= CTL_DEV,
 		.procname	= "dev",
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= raid_dir_table,
 	},
-	{ .ctl_name = 0 }
+	{  }
 };
 
 static const struct block_device_operations md_fops;
diff --git a/drivers/media/radio/Kconfig b/drivers/media/radio/Kconfig
index a87a477..b134553 100644
--- a/drivers/media/radio/Kconfig
+++ b/drivers/media/radio/Kconfig
@@ -195,6 +195,24 @@ config RADIO_MAESTRO
 	  To compile this driver as a module, choose M here: the
 	  module will be called radio-maestro.
 
+config RADIO_MIROPCM20
+	tristate "miroSOUND PCM20 radio"
+	depends on ISA && VIDEO_V4L2
+	select SND_MIRO
+	---help---
+	  Choose Y here if you have this FM radio card. You also need to enable
+	  the ALSA sound system. This choice automatically selects the ALSA
+	  sound card driver "Miro miroSOUND PCM1pro/PCM12/PCM20radio" as this
+	  is required for the radio-miropcm20.
+
+	  In order to control your radio card, you will need to use programs
+	  that are compatible with the Video For Linux API.  Information on
+	  this API and pointers to "v4l" programs may be found at
+	  <file:Documentation/video4linux/API.html>.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called radio-miropcm20.
+
 config RADIO_SF16FMI
 	tristate "SF16FMI Radio"
 	depends on ISA && VIDEO_V4L2
diff --git a/drivers/media/radio/Makefile b/drivers/media/radio/Makefile
index 2a1be3b..8a63d54 100644
--- a/drivers/media/radio/Makefile
+++ b/drivers/media/radio/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_RADIO_TRUST) += radio-trust.o
 obj-$(CONFIG_I2C_SI4713) += si4713-i2c.o
 obj-$(CONFIG_RADIO_SI4713) += radio-si4713.o
 obj-$(CONFIG_RADIO_MAESTRO) += radio-maestro.o
+obj-$(CONFIG_RADIO_MIROPCM20) += radio-miropcm20.o
 obj-$(CONFIG_USB_DSBR) += dsbr100.o
 obj-$(CONFIG_RADIO_SI470X) += si470x/
 obj-$(CONFIG_USB_MR800) += radio-mr800.o
diff --git a/drivers/media/radio/radio-miropcm20.c b/drivers/media/radio/radio-miropcm20.c
new file mode 100644
index 0000000..4ff8854
--- /dev/null
+++ b/drivers/media/radio/radio-miropcm20.c
@@ -0,0 +1,270 @@
+/* Miro PCM20 radio driver for Linux radio support
+ * (c) 1998 Ruurd Reitsma <R.A.Reitsma@wbmt.tudelft.nl>
+ * Thanks to Norberto Pellici for the ACI device interface specification
+ * The API part is based on the radiotrack driver by M. Kirkwood
+ * This driver relies on the aci mixer provided by the snd-miro
+ * ALSA driver.
+ * Look there for further info...
+ */
+
+/* What ever you think about the ACI, version 0x07 is not very well!
+ * I can't get frequency, 'tuner status', 'tuner flags' or mute/mono
+ * conditions...                Robert
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/videodev2.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-ioctl.h>
+#include <sound/aci.h>
+
+static int radio_nr = -1;
+module_param(radio_nr, int, 0);
+MODULE_PARM_DESC(radio_nr, "Set radio device number (/dev/radioX).  Default: -1 (autodetect)");
+
+static int mono;
+module_param(mono, bool, 0);
+MODULE_PARM_DESC(mono, "Force tuner into mono mode.");
+
+struct pcm20 {
+	struct v4l2_device v4l2_dev;
+	struct video_device vdev;
+	unsigned long freq;
+	int muted;
+	struct snd_miro_aci *aci;
+};
+
+static struct pcm20 pcm20_card = {
+	.freq   = 87*16000,
+	.muted  = 1,
+};
+
+static int pcm20_mute(struct pcm20 *dev, unsigned char mute)
+{
+	dev->muted = mute;
+	return snd_aci_cmd(dev->aci, ACI_SET_TUNERMUTE, mute, -1);
+}
+
+static int pcm20_stereo(struct pcm20 *dev, unsigned char stereo)
+{
+	return snd_aci_cmd(dev->aci, ACI_SET_TUNERMONO, !stereo, -1);
+}
+
+static int pcm20_setfreq(struct pcm20 *dev, unsigned long freq)
+{
+	unsigned char freql;
+	unsigned char freqh;
+	struct snd_miro_aci *aci = dev->aci;
+
+	dev->freq = freq;
+
+	freq /= 160;
+	if (!(aci->aci_version == 0x07 || aci->aci_version >= 0xb0))
+		freq /= 10;  /* I don't know exactly which version
+			      * needs this hack */
+	freql = freq & 0xff;
+	freqh = freq >> 8;
+
+	pcm20_stereo(dev, !mono);
+	return snd_aci_cmd(aci, ACI_WRITE_TUNE, freql, freqh);
+}
+
+static const struct v4l2_file_operations pcm20_fops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= video_ioctl2,
+};
+
+static int vidioc_querycap(struct file *file, void *priv,
+				struct v4l2_capability *v)
+{
+	strlcpy(v->driver, "Miro PCM20", sizeof(v->driver));
+	strlcpy(v->card, "Miro PCM20", sizeof(v->card));
+	strlcpy(v->bus_info, "ISA", sizeof(v->bus_info));
+	v->version = 0x1;
+	v->capabilities = V4L2_CAP_TUNER | V4L2_CAP_RADIO;
+	return 0;
+}
+
+static int vidioc_g_tuner(struct file *file, void *priv,
+				struct v4l2_tuner *v)
+{
+	if (v->index)	/* Only 1 tuner */
+		return -EINVAL;
+	strlcpy(v->name, "FM", sizeof(v->name));
+	v->type = V4L2_TUNER_RADIO;
+	v->rangelow = 87*16000;
+	v->rangehigh = 108*16000;
+	v->signal = 0xffff;
+	v->rxsubchans = V4L2_TUNER_SUB_MONO | V4L2_TUNER_SUB_STEREO;
+	v->capability = V4L2_TUNER_CAP_LOW;
+	v->audmode = V4L2_TUNER_MODE_MONO;
+	return 0;
+}
+
+static int vidioc_s_tuner(struct file *file, void *priv,
+				struct v4l2_tuner *v)
+{
+	return v->index ? -EINVAL : 0;
+}
+
+static int vidioc_g_frequency(struct file *file, void *priv,
+				struct v4l2_frequency *f)
+{
+	struct pcm20 *dev = video_drvdata(file);
+
+	if (f->tuner != 0)
+		return -EINVAL;
+
+	f->type = V4L2_TUNER_RADIO;
+	f->frequency = dev->freq;
+	return 0;
+}
+
+
+static int vidioc_s_frequency(struct file *file, void *priv,
+				struct v4l2_frequency *f)
+{
+	struct pcm20 *dev = video_drvdata(file);
+
+	if (f->tuner != 0 || f->type != V4L2_TUNER_RADIO)
+		return -EINVAL;
+
+	dev->freq = f->frequency;
+	pcm20_setfreq(dev, f->frequency);
+	return 0;
+}
+
+static int vidioc_queryctrl(struct file *file, void *priv,
+				struct v4l2_queryctrl *qc)
+{
+	switch (qc->id) {
+	case V4L2_CID_AUDIO_MUTE:
+		return v4l2_ctrl_query_fill(qc, 0, 1, 1, 1);
+	}
+	return -EINVAL;
+}
+
+static int vidioc_g_ctrl(struct file *file, void *priv,
+				struct v4l2_control *ctrl)
+{
+	struct pcm20 *dev = video_drvdata(file);
+
+	switch (ctrl->id) {
+	case V4L2_CID_AUDIO_MUTE:
+		ctrl->value = dev->muted;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int vidioc_s_ctrl(struct file *file, void *priv,
+				struct v4l2_control *ctrl)
+{
+	struct pcm20 *dev = video_drvdata(file);
+
+	switch (ctrl->id) {
+	case V4L2_CID_AUDIO_MUTE:
+		pcm20_mute(dev, ctrl->value);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int vidioc_g_input(struct file *filp, void *priv, unsigned int *i)
+{
+	*i = 0;
+	return 0;
+}
+
+static int vidioc_s_input(struct file *filp, void *priv, unsigned int i)
+{
+	return i ? -EINVAL : 0;
+}
+
+static int vidioc_g_audio(struct file *file, void *priv,
+				struct v4l2_audio *a)
+{
+	a->index = 0;
+	strlcpy(a->name, "Radio", sizeof(a->name));
+	a->capability = V4L2_AUDCAP_STEREO;
+	return 0;
+}
+
+static int vidioc_s_audio(struct file *file, void *priv,
+				struct v4l2_audio *a)
+{
+	return a->index ? -EINVAL : 0;
+}
+
+static const struct v4l2_ioctl_ops pcm20_ioctl_ops = {
+	.vidioc_querycap    = vidioc_querycap,
+	.vidioc_g_tuner     = vidioc_g_tuner,
+	.vidioc_s_tuner     = vidioc_s_tuner,
+	.vidioc_g_frequency = vidioc_g_frequency,
+	.vidioc_s_frequency = vidioc_s_frequency,
+	.vidioc_queryctrl   = vidioc_queryctrl,
+	.vidioc_g_ctrl      = vidioc_g_ctrl,
+	.vidioc_s_ctrl      = vidioc_s_ctrl,
+	.vidioc_g_audio     = vidioc_g_audio,
+	.vidioc_s_audio     = vidioc_s_audio,
+	.vidioc_g_input     = vidioc_g_input,
+	.vidioc_s_input     = vidioc_s_input,
+};
+
+static int __init pcm20_init(void)
+{
+	struct pcm20 *dev = &pcm20_card;
+	struct v4l2_device *v4l2_dev = &dev->v4l2_dev;
+	int res;
+
+	dev->aci = snd_aci_get_aci();
+	if (dev->aci == NULL) {
+		v4l2_err(v4l2_dev,
+			 "you must load the snd-miro driver first!\n");
+		return -ENODEV;
+	}
+	strlcpy(v4l2_dev->name, "miropcm20", sizeof(v4l2_dev->name));
+
+
+	res = v4l2_device_register(NULL, v4l2_dev);
+	if (res < 0) {
+		v4l2_err(v4l2_dev, "could not register v4l2_device\n");
+		return -EINVAL;
+	}
+
+	strlcpy(dev->vdev.name, v4l2_dev->name, sizeof(dev->vdev.name));
+	dev->vdev.v4l2_dev = v4l2_dev;
+	dev->vdev.fops = &pcm20_fops;
+	dev->vdev.ioctl_ops = &pcm20_ioctl_ops;
+	dev->vdev.release = video_device_release_empty;
+	video_set_drvdata(&dev->vdev, dev);
+
+	if (video_register_device(&dev->vdev, VFL_TYPE_RADIO, radio_nr) < 0)
+		goto fail;
+
+	v4l2_info(v4l2_dev, "Mirosound PCM20 Radio tuner\n");
+	return 0;
+fail:
+	v4l2_device_unregister(v4l2_dev);
+	return -EINVAL;
+}
+
+MODULE_AUTHOR("Ruurd Reitsma, Krzysztof Helt");
+MODULE_DESCRIPTION("A driver for the Miro PCM20 radio card.");
+MODULE_LICENSE("GPL");
+
+static void __exit pcm20_cleanup(void)
+{
+	struct pcm20 *dev = &pcm20_card;
+
+	video_unregister_device(&dev->vdev);
+	v4l2_device_unregister(&dev->v4l2_dev);
+}
+
+module_init(pcm20_init);
+module_exit(pcm20_cleanup);
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 570be13..08f2d07 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -121,6 +121,12 @@ config TWL4030_POWER
 	  and load scripts controling which resources are switched off/on
 	  or reset when a sleep, wakeup or warm reset event occurs.
 
+config TWL4030_CODEC
+	bool
+	depends on TWL4030_CORE
+	select MFD_CORE
+	default n
+
 config MFD_TMIO
 	bool
 	default n
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index f3b277b..af0fc90 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_MENELAUS)		+= menelaus.o
 
 obj-$(CONFIG_TWL4030_CORE)	+= twl4030-core.o twl4030-irq.o
 obj-$(CONFIG_TWL4030_POWER)    += twl4030-power.o
+obj-$(CONFIG_TWL4030_CODEC)	+= twl4030-codec.o
 
 obj-$(CONFIG_MFD_MC13783)	+= mc13783-core.o
 
diff --git a/drivers/mfd/mcp-core.c b/drivers/mfd/mcp-core.c
index 57271cb..84815f9 100644
--- a/drivers/mfd/mcp-core.c
+++ b/drivers/mfd/mcp-core.c
@@ -17,11 +17,11 @@
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/mfd/mcp.h>
 
 #include <mach/dma.h>
 #include <asm/system.h>
 
-#include "mcp.h"
 
 #define to_mcp(d)		container_of(d, struct mcp, attached_device)
 #define to_mcp_driver(d)	container_of(d, struct mcp_driver, drv)
diff --git a/drivers/mfd/mcp-sa11x0.c b/drivers/mfd/mcp-sa11x0.c
index 62b32da..2584272 100644
--- a/drivers/mfd/mcp-sa11x0.c
+++ b/drivers/mfd/mcp-sa11x0.c
@@ -19,6 +19,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/mcp.h>
 
 #include <mach/dma.h>
 #include <mach/hardware.h>
@@ -28,7 +29,6 @@
 
 #include <mach/assabet.h>
 
-#include "mcp.h"
 
 struct mcp_sa11x0 {
 	u32	mccr0;
@@ -163,6 +163,7 @@ static int mcp_sa11x0_probe(struct platform_device *pdev)
 	mcp->dma_audio_wr	= DMA_Ser4MCP0Wr;
 	mcp->dma_telco_rd	= DMA_Ser4MCP1Rd;
 	mcp->dma_telco_wr	= DMA_Ser4MCP1Wr;
+	mcp->gpio_base		= data->gpio_base;
 
 	platform_set_drvdata(pdev, mcp);
 
diff --git a/drivers/mfd/mcp.h b/drivers/mfd/mcp.h
deleted file mode 100644
index c093a93..0000000
--- a/drivers/mfd/mcp.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- *  linux/drivers/mfd/mcp.h
- *
- *  Copyright (C) 2001 Russell King, All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- */
-#ifndef MCP_H
-#define MCP_H
-
-struct mcp_ops;
-
-struct mcp {
-	struct module	*owner;
-	struct mcp_ops	*ops;
-	spinlock_t	lock;
-	int		use_count;
-	unsigned int	sclk_rate;
-	unsigned int	rw_timeout;
-	dma_device_t	dma_audio_rd;
-	dma_device_t	dma_audio_wr;
-	dma_device_t	dma_telco_rd;
-	dma_device_t	dma_telco_wr;
-	struct device	attached_device;
-};
-
-struct mcp_ops {
-	void		(*set_telecom_divisor)(struct mcp *, unsigned int);
-	void		(*set_audio_divisor)(struct mcp *, unsigned int);
-	void		(*reg_write)(struct mcp *, unsigned int, unsigned int);
-	unsigned int	(*reg_read)(struct mcp *, unsigned int);
-	void		(*enable)(struct mcp *);
-	void		(*disable)(struct mcp *);
-};
-
-void mcp_set_telecom_divisor(struct mcp *, unsigned int);
-void mcp_set_audio_divisor(struct mcp *, unsigned int);
-void mcp_reg_write(struct mcp *, unsigned int, unsigned int);
-unsigned int mcp_reg_read(struct mcp *, unsigned int);
-void mcp_enable(struct mcp *);
-void mcp_disable(struct mcp *);
-#define mcp_get_sclk_rate(mcp)	((mcp)->sclk_rate)
-
-struct mcp *mcp_host_alloc(struct device *, size_t);
-int mcp_host_register(struct mcp *);
-void mcp_host_unregister(struct mcp *);
-
-struct mcp_driver {
-	struct device_driver drv;
-	int (*probe)(struct mcp *);
-	void (*remove)(struct mcp *);
-	int (*suspend)(struct mcp *, pm_message_t);
-	int (*resume)(struct mcp *);
-};
-
-int mcp_driver_register(struct mcp_driver *);
-void mcp_driver_unregister(struct mcp_driver *);
-
-#define mcp_get_drvdata(mcp)	dev_get_drvdata(&(mcp)->attached_device)
-#define mcp_set_drvdata(mcp,d)	dev_set_drvdata(&(mcp)->attached_device, d)
-
-#define mcp_priv(mcp)		((void *)((mcp)+1))
-
-#endif
diff --git a/drivers/mfd/menelaus.c b/drivers/mfd/menelaus.c
index 4b364ba..970afa1 100644
--- a/drivers/mfd/menelaus.c
+++ b/drivers/mfd/menelaus.c
@@ -44,7 +44,7 @@
 #include <asm/mach/irq.h>
 
 #include <mach/gpio.h>
-#include <mach/menelaus.h>
+#include <plat/menelaus.h>
 
 #define DRIVER_NAME			"menelaus"
 
diff --git a/drivers/mfd/twl4030-codec.c b/drivers/mfd/twl4030-codec.c
new file mode 100644
index 0000000..77b9149
--- /dev/null
+++ b/drivers/mfd/twl4030-codec.c
@@ -0,0 +1,276 @@
+/*
+ * MFD driver for twl4030 codec submodule
+ *
+ * Author:	Peter Ujfalusi <peter.ujfalusi@nokia.com>
+ *
+ * Copyright:   (C) 2009 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/platform_device.h>
+#include <linux/i2c/twl4030.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/twl4030-codec.h>
+
+#define TWL4030_CODEC_CELLS	2
+
+static struct platform_device *twl4030_codec_dev;
+
+struct twl4030_codec_resource {
+	int request_count;
+	u8 reg;
+	u8 mask;
+};
+
+struct twl4030_codec {
+	unsigned int audio_mclk;
+	struct mutex mutex;
+	struct twl4030_codec_resource resource[TWL4030_CODEC_RES_MAX];
+	struct mfd_cell cells[TWL4030_CODEC_CELLS];
+};
+
+/*
+ * Modify the resource, the function returns the content of the register
+ * after the modification.
+ */
+static int twl4030_codec_set_resource(enum twl4030_codec_res id, int enable)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	u8 val;
+
+	twl4030_i2c_read_u8(TWL4030_MODULE_AUDIO_VOICE, &val,
+			codec->resource[id].reg);
+
+	if (enable)
+		val |= codec->resource[id].mask;
+	else
+		val &= ~codec->resource[id].mask;
+
+	twl4030_i2c_write_u8(TWL4030_MODULE_AUDIO_VOICE,
+					val, codec->resource[id].reg);
+
+	return val;
+}
+
+static inline int twl4030_codec_get_resource(enum twl4030_codec_res id)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	u8 val;
+
+	twl4030_i2c_read_u8(TWL4030_MODULE_AUDIO_VOICE, &val,
+			codec->resource[id].reg);
+
+	return val;
+}
+
+/*
+ * Enable the resource.
+ * The function returns with error or the content of the register
+ */
+int twl4030_codec_enable_resource(enum twl4030_codec_res id)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	int val;
+
+	if (id >= TWL4030_CODEC_RES_MAX) {
+		dev_err(&twl4030_codec_dev->dev,
+				"Invalid resource ID (%u)\n", id);
+		return -EINVAL;
+	}
+
+	mutex_lock(&codec->mutex);
+	if (!codec->resource[id].request_count)
+		/* Resource was disabled, enable it */
+		val = twl4030_codec_set_resource(id, 1);
+	else
+		val = twl4030_codec_get_resource(id);
+
+	codec->resource[id].request_count++;
+	mutex_unlock(&codec->mutex);
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(twl4030_codec_enable_resource);
+
+/*
+ * Disable the resource.
+ * The function returns with error or the content of the register
+ */
+int twl4030_codec_disable_resource(unsigned id)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	int val;
+
+	if (id >= TWL4030_CODEC_RES_MAX) {
+		dev_err(&twl4030_codec_dev->dev,
+				"Invalid resource ID (%u)\n", id);
+		return -EINVAL;
+	}
+
+	mutex_lock(&codec->mutex);
+	if (!codec->resource[id].request_count) {
+		dev_err(&twl4030_codec_dev->dev,
+			"Resource has been disabled already (%u)\n", id);
+		mutex_unlock(&codec->mutex);
+		return -EPERM;
+	}
+	codec->resource[id].request_count--;
+
+	if (!codec->resource[id].request_count)
+		/* Resource can be disabled now */
+		val = twl4030_codec_set_resource(id, 0);
+	else
+		val = twl4030_codec_get_resource(id);
+
+	mutex_unlock(&codec->mutex);
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(twl4030_codec_disable_resource);
+
+unsigned int twl4030_codec_get_mclk(void)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+
+	return codec->audio_mclk;
+}
+EXPORT_SYMBOL_GPL(twl4030_codec_get_mclk);
+
+static int __devinit twl4030_codec_probe(struct platform_device *pdev)
+{
+	struct twl4030_codec *codec;
+	struct twl4030_codec_data *pdata = pdev->dev.platform_data;
+	struct mfd_cell *cell = NULL;
+	int ret, childs = 0;
+	u8 val;
+
+	if (!pdata) {
+		dev_err(&pdev->dev, "Platform data is missing\n");
+		return -EINVAL;
+	}
+
+	/* Configure APLL_INFREQ and disable APLL if enabled */
+	val = 0;
+	switch (pdata->audio_mclk) {
+	case 19200000:
+		val |= TWL4030_APLL_INFREQ_19200KHZ;
+		break;
+	case 26000000:
+		val |= TWL4030_APLL_INFREQ_26000KHZ;
+		break;
+	case 38400000:
+		val |= TWL4030_APLL_INFREQ_38400KHZ;
+		break;
+	default:
+		dev_err(&pdev->dev, "Invalid audio_mclk\n");
+		return -EINVAL;
+	}
+	twl4030_i2c_write_u8(TWL4030_MODULE_AUDIO_VOICE,
+					val, TWL4030_REG_APLL_CTL);
+
+	codec = kzalloc(sizeof(struct twl4030_codec), GFP_KERNEL);
+	if (!codec)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, codec);
+
+	twl4030_codec_dev = pdev;
+	mutex_init(&codec->mutex);
+	codec->audio_mclk = pdata->audio_mclk;
+
+	/* Codec power */
+	codec->resource[TWL4030_CODEC_RES_POWER].reg = TWL4030_REG_CODEC_MODE;
+	codec->resource[TWL4030_CODEC_RES_POWER].mask = TWL4030_CODECPDZ;
+
+	/* PLL */
+	codec->resource[TWL4030_CODEC_RES_APLL].reg = TWL4030_REG_APLL_CTL;
+	codec->resource[TWL4030_CODEC_RES_APLL].mask = TWL4030_APLL_EN;
+
+	if (pdata->audio) {
+		cell = &codec->cells[childs];
+		cell->name = "twl4030_codec_audio";
+		cell->platform_data = pdata->audio;
+		cell->data_size = sizeof(*pdata->audio);
+		childs++;
+	}
+	if (pdata->vibra) {
+		cell = &codec->cells[childs];
+		cell->name = "twl4030_codec_vibra";
+		cell->platform_data = pdata->vibra;
+		cell->data_size = sizeof(*pdata->vibra);
+		childs++;
+	}
+
+	if (childs)
+		ret = mfd_add_devices(&pdev->dev, pdev->id, codec->cells,
+				      childs, NULL, 0);
+	else {
+		dev_err(&pdev->dev, "No platform data found for childs\n");
+		ret = -ENODEV;
+	}
+
+	if (!ret)
+		return 0;
+
+	platform_set_drvdata(pdev, NULL);
+	kfree(codec);
+	twl4030_codec_dev = NULL;
+	return ret;
+}
+
+static int __devexit twl4030_codec_remove(struct platform_device *pdev)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(pdev);
+
+	mfd_remove_devices(&pdev->dev);
+	platform_set_drvdata(pdev, NULL);
+	kfree(codec);
+	twl4030_codec_dev = NULL;
+
+	return 0;
+}
+
+MODULE_ALIAS("platform:twl4030_codec");
+
+static struct platform_driver twl4030_codec_driver = {
+	.probe		= twl4030_codec_probe,
+	.remove		= __devexit_p(twl4030_codec_remove),
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= "twl4030_codec",
+	},
+};
+
+static int __devinit twl4030_codec_init(void)
+{
+	return platform_driver_register(&twl4030_codec_driver);
+}
+module_init(twl4030_codec_init);
+
+static void __devexit twl4030_codec_exit(void)
+{
+	platform_driver_unregister(&twl4030_codec_driver);
+}
+module_exit(twl4030_codec_exit);
+
+MODULE_AUTHOR("Peter Ujfalusi <peter.ujfalusi@nokia.com>");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c
index a1c47ee..40449cd 100644
--- a/drivers/mfd/twl4030-core.c
+++ b/drivers/mfd/twl4030-core.c
@@ -39,7 +39,7 @@
 #include <linux/i2c/twl4030.h>
 
 #if defined(CONFIG_ARCH_OMAP2) || defined(CONFIG_ARCH_OMAP3)
-#include <mach/cpu.h>
+#include <plat/cpu.h>
 #endif
 
 /*
@@ -114,6 +114,12 @@
 #define twl_has_watchdog()        false
 #endif
 
+#if defined(CONFIG_TWL4030_CODEC) || defined(CONFIG_TWL4030_CODEC_MODULE)
+#define twl_has_codec()	true
+#else
+#define twl_has_codec()	false
+#endif
+
 /* Triton Core internal information (BEGIN) */
 
 /* Last - for index max*/
@@ -601,6 +607,14 @@ add_children(struct twl4030_platform_data *pdata, unsigned long features)
 			return PTR_ERR(child);
 	}
 
+	if (twl_has_codec() && pdata->codec) {
+		child = add_child(1, "twl4030_codec",
+				pdata->codec, sizeof(*pdata->codec),
+				false, 0, 0);
+		if (IS_ERR(child))
+			return PTR_ERR(child);
+	}
+
 	if (twl_has_regulator()) {
 		/*
 		child = add_regulator(TWL4030_REG_VPLL1, pdata->vpll1);
@@ -763,7 +777,7 @@ static int twl4030_remove(struct i2c_client *client)
 }
 
 /* NOTE:  this driver only handles a single twl4030/tps659x0 chip */
-static int
+static int __init
 twl4030_probe(struct i2c_client *client, const struct i2c_device_id *id)
 {
 	int				status;
diff --git a/drivers/mfd/ucb1x00-assabet.c b/drivers/mfd/ucb1x00-assabet.c
index 86fed48..cea9da6 100644
--- a/drivers/mfd/ucb1x00-assabet.c
+++ b/drivers/mfd/ucb1x00-assabet.c
@@ -14,10 +14,10 @@
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
 #include <linux/device.h>
+#include <linux/mfd/ucb1x00.h>
 
 #include <mach/dma.h>
 
-#include "ucb1x00.h"
 
 #define UCB1X00_ATTR(name,input)\
 static ssize_t name##_show(struct device *dev, struct device_attribute *attr, \
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
index 60c3988..252b741 100644
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -25,12 +25,12 @@
 #include <linux/interrupt.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
+#include <linux/mfd/ucb1x00.h>
+#include <linux/gpio.h>
 
 #include <mach/dma.h>
 #include <mach/hardware.h>
 
-#include "ucb1x00.h"
-
 static DEFINE_MUTEX(ucb1x00_mutex);
 static LIST_HEAD(ucb1x00_drivers);
 static LIST_HEAD(ucb1x00_devices);
@@ -108,6 +108,60 @@ unsigned int ucb1x00_io_read(struct ucb1x00 *ucb)
 	return ucb1x00_reg_read(ucb, UCB_IO_DATA);
 }
 
+static void ucb1x00_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ucb->io_lock, flags);
+	if (value)
+		ucb->io_out |= 1 << offset;
+	else
+		ucb->io_out &= ~(1 << offset);
+
+	ucb1x00_reg_write(ucb, UCB_IO_DATA, ucb->io_out);
+	spin_unlock_irqrestore(&ucb->io_lock, flags);
+}
+
+static int ucb1x00_gpio_get(struct gpio_chip *chip, unsigned offset)
+{
+	struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+	return ucb1x00_reg_read(ucb, UCB_IO_DATA) & (1 << offset);
+}
+
+static int ucb1x00_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
+{
+	struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ucb->io_lock, flags);
+	ucb->io_dir &= ~(1 << offset);
+	ucb1x00_reg_write(ucb, UCB_IO_DIR, ucb->io_dir);
+	spin_unlock_irqrestore(&ucb->io_lock, flags);
+
+	return 0;
+}
+
+static int ucb1x00_gpio_direction_output(struct gpio_chip *chip, unsigned offset
+		, int value)
+{
+	struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ucb->io_lock, flags);
+	ucb->io_dir |= (1 << offset);
+	ucb1x00_reg_write(ucb, UCB_IO_DIR, ucb->io_dir);
+
+	if (value)
+		ucb->io_out |= 1 << offset;
+	else
+		ucb->io_out &= ~(1 << offset);
+	ucb1x00_reg_write(ucb, UCB_IO_DATA, ucb->io_out);
+	spin_unlock_irqrestore(&ucb->io_lock, flags);
+
+	return 0;
+}
+
 /*
  * UCB1300 data sheet says we must:
  *  1. enable ADC	=> 5us (including reference startup time)
@@ -476,6 +530,7 @@ static int ucb1x00_probe(struct mcp *mcp)
 	struct ucb1x00_driver *drv;
 	unsigned int id;
 	int ret = -ENODEV;
+	int temp;
 
 	mcp_enable(mcp);
 	id = mcp_reg_read(mcp, UCB_ID);
@@ -508,12 +563,27 @@ static int ucb1x00_probe(struct mcp *mcp)
 		goto err_free;
 	}
 
+	ucb->gpio.base = -1;
+	if (mcp->gpio_base != 0) {
+		ucb->gpio.label = dev_name(&ucb->dev);
+		ucb->gpio.base = mcp->gpio_base;
+		ucb->gpio.ngpio = 10;
+		ucb->gpio.set = ucb1x00_gpio_set;
+		ucb->gpio.get = ucb1x00_gpio_get;
+		ucb->gpio.direction_input = ucb1x00_gpio_direction_input;
+		ucb->gpio.direction_output = ucb1x00_gpio_direction_output;
+		ret = gpiochip_add(&ucb->gpio);
+		if (ret)
+			goto err_free;
+	} else
+		dev_info(&ucb->dev, "gpio_base not set so no gpiolib support");
+
 	ret = request_irq(ucb->irq, ucb1x00_irq, IRQF_TRIGGER_RISING,
 			  "UCB1x00", ucb);
 	if (ret) {
 		printk(KERN_ERR "ucb1x00: unable to grab irq%d: %d\n",
 			ucb->irq, ret);
-		goto err_free;
+		goto err_gpio;
 	}
 
 	mcp_set_drvdata(mcp, ucb);
@@ -522,6 +592,7 @@ static int ucb1x00_probe(struct mcp *mcp)
 	if (ret)
 		goto err_irq;
 
+
 	INIT_LIST_HEAD(&ucb->devs);
 	mutex_lock(&ucb1x00_mutex);
 	list_add(&ucb->node, &ucb1x00_devices);
@@ -529,10 +600,14 @@ static int ucb1x00_probe(struct mcp *mcp)
 		ucb1x00_add_dev(ucb, drv);
 	}
 	mutex_unlock(&ucb1x00_mutex);
+
 	goto out;
 
  err_irq:
 	free_irq(ucb->irq, ucb);
+ err_gpio:
+	if (ucb->gpio.base != -1)
+		temp = gpiochip_remove(&ucb->gpio);
  err_free:
 	kfree(ucb);
  err_disable:
@@ -545,6 +620,7 @@ static void ucb1x00_remove(struct mcp *mcp)
 {
 	struct ucb1x00 *ucb = mcp_get_drvdata(mcp);
 	struct list_head *l, *n;
+	int ret;
 
 	mutex_lock(&ucb1x00_mutex);
 	list_del(&ucb->node);
@@ -554,6 +630,12 @@ static void ucb1x00_remove(struct mcp *mcp)
 	}
 	mutex_unlock(&ucb1x00_mutex);
 
+	if (ucb->gpio.base != -1) {
+		ret = gpiochip_remove(&ucb->gpio);
+		if (ret)
+			dev_err(&ucb->dev, "Can't remove gpio chip: %d\n", ret);
+	}
+
 	free_irq(ucb->irq, ucb);
 	device_unregister(&ucb->dev);
 }
@@ -604,6 +686,7 @@ static int ucb1x00_resume(struct mcp *mcp)
 	struct ucb1x00 *ucb = mcp_get_drvdata(mcp);
 	struct ucb1x00_dev *dev;
 
+	ucb1x00_reg_write(ucb, UCB_IO_DIR, ucb->io_dir);
 	mutex_lock(&ucb1x00_mutex);
 	list_for_each_entry(dev, &ucb->devs, dev_node) {
 		if (dev->drv->resume)
diff --git a/drivers/mfd/ucb1x00-ts.c b/drivers/mfd/ucb1x00-ts.c
index 61b7d3e..000cb41 100644
--- a/drivers/mfd/ucb1x00-ts.c
+++ b/drivers/mfd/ucb1x00-ts.c
@@ -30,12 +30,12 @@
 #include <linux/freezer.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <linux/mfd/ucb1x00.h>
 
 #include <mach/dma.h>
 #include <mach/collie.h>
 #include <asm/mach-types.h>
 
-#include "ucb1x00.h"
 
 
 struct ucb1x00_ts {
diff --git a/drivers/mfd/ucb1x00.h b/drivers/mfd/ucb1x00.h
deleted file mode 100644
index a8ad8a0..0000000
--- a/drivers/mfd/ucb1x00.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- *  linux/drivers/mfd/ucb1x00.h
- *
- *  Copyright (C) 2001 Russell King, All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- */
-#ifndef UCB1200_H
-#define UCB1200_H
-
-#define UCB_IO_DATA	0x00
-#define UCB_IO_DIR	0x01
-
-#define UCB_IO_0		(1 << 0)
-#define UCB_IO_1		(1 << 1)
-#define UCB_IO_2		(1 << 2)
-#define UCB_IO_3		(1 << 3)
-#define UCB_IO_4		(1 << 4)
-#define UCB_IO_5		(1 << 5)
-#define UCB_IO_6		(1 << 6)
-#define UCB_IO_7		(1 << 7)
-#define UCB_IO_8		(1 << 8)
-#define UCB_IO_9		(1 << 9)
-
-#define UCB_IE_RIS	0x02
-#define UCB_IE_FAL	0x03
-#define UCB_IE_STATUS	0x04
-#define UCB_IE_CLEAR	0x04
-#define UCB_IE_ADC		(1 << 11)
-#define UCB_IE_TSPX		(1 << 12)
-#define UCB_IE_TSMX		(1 << 13)
-#define UCB_IE_TCLIP		(1 << 14)
-#define UCB_IE_ACLIP		(1 << 15)
-
-#define UCB_IRQ_TSPX		12
-
-#define UCB_TC_A	0x05
-#define UCB_TC_A_LOOP		(1 << 7)	/* UCB1200 */
-#define UCB_TC_A_AMPL		(1 << 7)	/* UCB1300 */
-
-#define UCB_TC_B	0x06
-#define UCB_TC_B_VOICE_ENA	(1 << 3)
-#define UCB_TC_B_CLIP		(1 << 4)
-#define UCB_TC_B_ATT		(1 << 6)
-#define UCB_TC_B_SIDE_ENA	(1 << 11)
-#define UCB_TC_B_MUTE		(1 << 13)
-#define UCB_TC_B_IN_ENA		(1 << 14)
-#define UCB_TC_B_OUT_ENA	(1 << 15)
-
-#define UCB_AC_A	0x07
-#define UCB_AC_B	0x08
-#define UCB_AC_B_LOOP		(1 << 8)
-#define UCB_AC_B_MUTE		(1 << 13)
-#define UCB_AC_B_IN_ENA		(1 << 14)
-#define UCB_AC_B_OUT_ENA	(1 << 15)
-
-#define UCB_TS_CR	0x09
-#define UCB_TS_CR_TSMX_POW	(1 << 0)
-#define UCB_TS_CR_TSPX_POW	(1 << 1)
-#define UCB_TS_CR_TSMY_POW	(1 << 2)
-#define UCB_TS_CR_TSPY_POW	(1 << 3)
-#define UCB_TS_CR_TSMX_GND	(1 << 4)
-#define UCB_TS_CR_TSPX_GND	(1 << 5)
-#define UCB_TS_CR_TSMY_GND	(1 << 6)
-#define UCB_TS_CR_TSPY_GND	(1 << 7)
-#define UCB_TS_CR_MODE_INT	(0 << 8)
-#define UCB_TS_CR_MODE_PRES	(1 << 8)
-#define UCB_TS_CR_MODE_POS	(2 << 8)
-#define UCB_TS_CR_BIAS_ENA	(1 << 11)
-#define UCB_TS_CR_TSPX_LOW	(1 << 12)
-#define UCB_TS_CR_TSMX_LOW	(1 << 13)
-
-#define UCB_ADC_CR	0x0a
-#define UCB_ADC_SYNC_ENA	(1 << 0)
-#define UCB_ADC_VREFBYP_CON	(1 << 1)
-#define UCB_ADC_INP_TSPX	(0 << 2)
-#define UCB_ADC_INP_TSMX	(1 << 2)
-#define UCB_ADC_INP_TSPY	(2 << 2)
-#define UCB_ADC_INP_TSMY	(3 << 2)
-#define UCB_ADC_INP_AD0		(4 << 2)
-#define UCB_ADC_INP_AD1		(5 << 2)
-#define UCB_ADC_INP_AD2		(6 << 2)
-#define UCB_ADC_INP_AD3		(7 << 2)
-#define UCB_ADC_EXT_REF		(1 << 5)
-#define UCB_ADC_START		(1 << 7)
-#define UCB_ADC_ENA		(1 << 15)
-
-#define UCB_ADC_DATA	0x0b
-#define UCB_ADC_DAT_VAL		(1 << 15)
-#define UCB_ADC_DAT(x)		(((x) & 0x7fe0) >> 5)
-
-#define UCB_ID		0x0c
-#define UCB_ID_1200		0x1004
-#define UCB_ID_1300		0x1005
-#define UCB_ID_TC35143          0x9712
-
-#define UCB_MODE	0x0d
-#define UCB_MODE_DYN_VFLAG_ENA	(1 << 12)
-#define UCB_MODE_AUD_OFF_CAN	(1 << 13)
-
-#include "mcp.h"
-
-struct ucb1x00_irq {
-	void *devid;
-	void (*fn)(int, void *);
-};
-
-struct ucb1x00 {
-	spinlock_t		lock;
-	struct mcp		*mcp;
-	unsigned int		irq;
-	struct semaphore	adc_sem;
-	spinlock_t		io_lock;
-	u16			id;
-	u16			io_dir;
-	u16			io_out;
-	u16			adc_cr;
-	u16			irq_fal_enbl;
-	u16			irq_ris_enbl;
-	struct ucb1x00_irq	irq_handler[16];
-	struct device		dev;
-	struct list_head	node;
-	struct list_head	devs;
-};
-
-struct ucb1x00_driver;
-
-struct ucb1x00_dev {
-	struct list_head	dev_node;
-	struct list_head	drv_node;
-	struct ucb1x00		*ucb;
-	struct ucb1x00_driver	*drv;
-	void			*priv;
-};
-
-struct ucb1x00_driver {
-	struct list_head	node;
-	struct list_head	devs;
-	int	(*add)(struct ucb1x00_dev *dev);
-	void	(*remove)(struct ucb1x00_dev *dev);
-	int	(*suspend)(struct ucb1x00_dev *dev, pm_message_t state);
-	int	(*resume)(struct ucb1x00_dev *dev);
-};
-
-#define classdev_to_ucb1x00(cd)	container_of(cd, struct ucb1x00, dev)
-
-int ucb1x00_register_driver(struct ucb1x00_driver *);
-void ucb1x00_unregister_driver(struct ucb1x00_driver *);
-
-/**
- *	ucb1x00_clkrate - return the UCB1x00 SIB clock rate
- *	@ucb: UCB1x00 structure describing chip
- *
- *	Return the SIB clock rate in Hz.
- */
-static inline unsigned int ucb1x00_clkrate(struct ucb1x00 *ucb)
-{
-	return mcp_get_sclk_rate(ucb->mcp);
-}
-
-/**
- *	ucb1x00_enable - enable the UCB1x00 SIB clock
- *	@ucb: UCB1x00 structure describing chip
- *
- *	Enable the SIB clock.  This can be called multiple times.
- */
-static inline void ucb1x00_enable(struct ucb1x00 *ucb)
-{
-	mcp_enable(ucb->mcp);
-}
-
-/**
- *	ucb1x00_disable - disable the UCB1x00 SIB clock
- *	@ucb: UCB1x00 structure describing chip
- *
- *	Disable the SIB clock.  The SIB clock will only be disabled
- *	when the number of ucb1x00_enable calls match the number of
- *	ucb1x00_disable calls.
- */
-static inline void ucb1x00_disable(struct ucb1x00 *ucb)
-{
-	mcp_disable(ucb->mcp);
-}
-
-/**
- *	ucb1x00_reg_write - write a UCB1x00 register
- *	@ucb: UCB1x00 structure describing chip
- *	@reg: UCB1x00 4-bit register index to write
- *	@val: UCB1x00 16-bit value to write
- *
- *	Write the UCB1x00 register @reg with value @val.  The SIB
- *	clock must be running for this function to return.
- */
-static inline void ucb1x00_reg_write(struct ucb1x00 *ucb, unsigned int reg, unsigned int val)
-{
-	mcp_reg_write(ucb->mcp, reg, val);
-}
-
-/**
- *	ucb1x00_reg_read - read a UCB1x00 register
- *	@ucb: UCB1x00 structure describing chip
- *	@reg: UCB1x00 4-bit register index to write
- *
- *	Read the UCB1x00 register @reg and return its value.  The SIB
- *	clock must be running for this function to return.
- */
-static inline unsigned int ucb1x00_reg_read(struct ucb1x00 *ucb, unsigned int reg)
-{
-	return mcp_reg_read(ucb->mcp, reg);
-}
-/**
- *	ucb1x00_set_audio_divisor - 
- *	@ucb: UCB1x00 structure describing chip
- *	@div: SIB clock divisor
- */
-static inline void ucb1x00_set_audio_divisor(struct ucb1x00 *ucb, unsigned int div)
-{
-	mcp_set_audio_divisor(ucb->mcp, div);
-}
-
-/**
- *	ucb1x00_set_telecom_divisor -
- *	@ucb: UCB1x00 structure describing chip
- *	@div: SIB clock divisor
- */
-static inline void ucb1x00_set_telecom_divisor(struct ucb1x00 *ucb, unsigned int div)
-{
-	mcp_set_telecom_divisor(ucb->mcp, div);
-}
-
-void ucb1x00_io_set_dir(struct ucb1x00 *ucb, unsigned int, unsigned int);
-void ucb1x00_io_write(struct ucb1x00 *ucb, unsigned int, unsigned int);
-unsigned int ucb1x00_io_read(struct ucb1x00 *ucb);
-
-#define UCB_NOSYNC	(0)
-#define UCB_SYNC	(1)
-
-unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync);
-void ucb1x00_adc_enable(struct ucb1x00 *ucb);
-void ucb1x00_adc_disable(struct ucb1x00 *ucb);
-
-/*
- * Which edges of the IRQ do you want to control today?
- */
-#define UCB_RISING	(1 << 0)
-#define UCB_FALLING	(1 << 1)
-
-int ucb1x00_hook_irq(struct ucb1x00 *ucb, unsigned int idx, void (*fn)(int, void *), void *devid);
-void ucb1x00_enable_irq(struct ucb1x00 *ucb, unsigned int idx, int edges);
-void ucb1x00_disable_irq(struct ucb1x00 *ucb, unsigned int idx, int edges);
-int ucb1x00_free_irq(struct ucb1x00 *ucb, unsigned int idx, void *devid);
-
-#endif
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index a2ea383..2c16ca6 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -246,6 +246,16 @@ config EP93XX_PWM
 	  To compile this driver as a module, choose M here: the module will
 	  be called ep93xx_pwm.
 
+config DS1682
+	tristate "Dallas DS1682 Total Elapsed Time Recorder with Alarm"
+	depends on I2C && EXPERIMENTAL
+	help
+	  If you say yes here you get support for Dallas Semiconductor
+	  DS1682 Total Elapsed Time Recorder.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called ds1682.
+
 source "drivers/misc/c2port/Kconfig"
 source "drivers/misc/eeprom/Kconfig"
 source "drivers/misc/cb710/Kconfig"
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index e311267..906a0ed 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SGI_GRU)		+= sgi-gru/
 obj-$(CONFIG_HP_ILO)		+= hpilo.o
 obj-$(CONFIG_ISL29003)		+= isl29003.o
 obj-$(CONFIG_EP93XX_PWM)	+= ep93xx_pwm.o
+obj-$(CONFIG_DS1682)		+= ds1682.o
 obj-$(CONFIG_C2PORT)		+= c2port/
 obj-$(CONFIG_IWMC3200TOP)      += iwmc3200top/
 obj-y				+= eeprom/
diff --git a/drivers/i2c/chips/ds1682.c b/drivers/misc/ds1682.c
index f3ee4a1..f3ee4a1 100644
--- a/drivers/i2c/chips/ds1682.c
+++ b/drivers/misc/ds1682.c
diff --git a/drivers/misc/ics932s401.c b/drivers/misc/ics932s401.c
index 6e43ab4..4bb7a3a 100644
--- a/drivers/misc/ics932s401.c
+++ b/drivers/misc/ics932s401.c
@@ -417,32 +417,25 @@ static int ics932s401_detect(struct i2c_client *client, int kind,
 			  struct i2c_board_info *info)
 {
 	struct i2c_adapter *adapter = client->adapter;
+	int vendor, device, revision;
 
 	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA))
 		return -ENODEV;
 
-	if (kind <= 0) {
-		int vendor, device, revision;
-
-		vendor = i2c_smbus_read_word_data(client,
-						  ICS932S401_REG_VENDOR_REV);
-		vendor >>= 8;
-		revision = vendor >> ICS932S401_REV_SHIFT;
-		vendor &= ICS932S401_VENDOR_MASK;
-		if (vendor != ICS932S401_VENDOR)
-			return -ENODEV;
-
-		device = i2c_smbus_read_word_data(client,
-						  ICS932S401_REG_DEVICE);
-		device >>= 8;
-		if (device != ICS932S401_DEVICE)
-			return -ENODEV;
-
-		if (revision != ICS932S401_REV)
-			dev_info(&adapter->dev, "Unknown revision %d\n",
-				 revision);
-	} else
-		dev_dbg(&adapter->dev, "detection forced\n");
+	vendor = i2c_smbus_read_word_data(client, ICS932S401_REG_VENDOR_REV);
+	vendor >>= 8;
+	revision = vendor >> ICS932S401_REV_SHIFT;
+	vendor &= ICS932S401_VENDOR_MASK;
+	if (vendor != ICS932S401_VENDOR)
+		return -ENODEV;
+
+	device = i2c_smbus_read_word_data(client, ICS932S401_REG_DEVICE);
+	device >>= 8;
+	if (device != ICS932S401_DEVICE)
+		return -ENODEV;
+
+	if (revision != ICS932S401_REV)
+		dev_info(&adapter->dev, "Unknown revision %d\n", revision);
 
 	strlcpy(info->type, "ics932s401", I2C_NAME_SIZE);
 
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index fd3688a..832ed4c 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -89,48 +89,40 @@ static int xpc_disengage_max_timelimit = 120;
 
 static ctl_table xpc_sys_xpc_hb_dir[] = {
 	{
-	 .ctl_name = CTL_UNNUMBERED,
 	 .procname = "hb_interval",
 	 .data = &xpc_hb_interval,
 	 .maxlen = sizeof(int),
 	 .mode = 0644,
-	 .proc_handler = &proc_dointvec_minmax,
-	 .strategy = &sysctl_intvec,
+	 .proc_handler = proc_dointvec_minmax,
 	 .extra1 = &xpc_hb_min_interval,
 	 .extra2 = &xpc_hb_max_interval},
 	{
-	 .ctl_name = CTL_UNNUMBERED,
 	 .procname = "hb_check_interval",
 	 .data = &xpc_hb_check_interval,
 	 .maxlen = sizeof(int),
 	 .mode = 0644,
-	 .proc_handler = &proc_dointvec_minmax,
-	 .strategy = &sysctl_intvec,
+	 .proc_handler = proc_dointvec_minmax,
 	 .extra1 = &xpc_hb_check_min_interval,
 	 .extra2 = &xpc_hb_check_max_interval},
 	{}
 };
 static ctl_table xpc_sys_xpc_dir[] = {
 	{
-	 .ctl_name = CTL_UNNUMBERED,
 	 .procname = "hb",
 	 .mode = 0555,
 	 .child = xpc_sys_xpc_hb_dir},
 	{
-	 .ctl_name = CTL_UNNUMBERED,
 	 .procname = "disengage_timelimit",
 	 .data = &xpc_disengage_timelimit,
 	 .maxlen = sizeof(int),
 	 .mode = 0644,
-	 .proc_handler = &proc_dointvec_minmax,
-	 .strategy = &sysctl_intvec,
+	 .proc_handler = proc_dointvec_minmax,
 	 .extra1 = &xpc_disengage_min_timelimit,
 	 .extra2 = &xpc_disengage_max_timelimit},
 	{}
 };
 static ctl_table xpc_sys_dir[] = {
 	{
-	 .ctl_name = CTL_UNNUMBERED,
 	 .procname = "xpc",
 	 .mode = 0555,
 	 .child = xpc_sys_xpc_dir},
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index c76677a..b5bbe59 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -106,7 +106,8 @@ xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name)
 	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
 
 #if defined CONFIG_X86_64
-	mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset);
+	mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset,
+			UV_AFFINITY_CPU);
 	if (mq->irq < 0) {
 		dev_err(xpc_part, "uv_setup_irq() returned error=%d\n",
 			-mq->irq);
@@ -136,7 +137,7 @@ static void
 xpc_release_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq)
 {
 #if defined CONFIG_X86_64
-	uv_teardown_irq(mq->irq, mq->mmr_blade, mq->mmr_offset);
+	uv_teardown_irq(mq->irq);
 
 #elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
 	int mmr_pnode;
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index 705a589..90d168a 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -56,7 +56,7 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired)
 				clk = 255;
 			host->cclk = host->mclk / (2 * (clk + 1));
 		}
-		if (host->hw_designer == 0x80)
+		if (host->hw_designer == AMBA_VENDOR_ST)
 			clk |= MCI_FCEN; /* Bug fix in ST IP block */
 		clk |= MCI_CLK_ENABLE;
 		/* This hasn't proven to be worthwhile */
diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c
index b8fd7af..5f970e2 100644
--- a/drivers/mmc/host/omap.c
+++ b/drivers/mmc/host/omap.c
@@ -30,12 +30,12 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 
-#include <mach/board.h>
-#include <mach/mmc.h>
+#include <plat/board.h>
+#include <plat/mmc.h>
 #include <mach/gpio.h>
-#include <mach/dma.h>
-#include <mach/mux.h>
-#include <mach/fpga.h>
+#include <plat/dma.h>
+#include <plat/mux.h>
+#include <plat/fpga.h>
 
 #define	OMAP_MMC_REG_CMD	0x00
 #define	OMAP_MMC_REG_ARGL	0x04
diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c
index 0aecaae..4b23225 100644
--- a/drivers/mmc/host/omap_hsmmc.c
+++ b/drivers/mmc/host/omap_hsmmc.c
@@ -30,11 +30,11 @@
 #include <linux/mmc/core.h>
 #include <linux/io.h>
 #include <linux/semaphore.h>
-#include <mach/dma.h>
+#include <plat/dma.h>
 #include <mach/hardware.h>
-#include <mach/board.h>
-#include <mach/mmc.h>
-#include <mach/cpu.h>
+#include <plat/board.h>
+#include <plat/mmc.h>
+#include <plat/cpu.h>
 
 /* OMAP HSMMC Host Controller Registers */
 #define OMAP_HSMMC_SYSCONFIG	0x0010
diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c
index 9fb480b..bb47ff4 100644
--- a/drivers/mmc/host/pxamci.c
+++ b/drivers/mmc/host/pxamci.c
@@ -43,6 +43,9 @@
 #define NR_SG	1
 #define CLKRT_OFF	(~0)
 
+#define mmc_has_26MHz()		(cpu_is_pxa300() || cpu_is_pxa310() \
+				|| cpu_is_pxa935())
+
 struct pxamci_host {
 	struct mmc_host		*mmc;
 	spinlock_t		lock;
@@ -457,7 +460,7 @@ static void pxamci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 			clk_enable(host->clk);
 
 		if (ios->clock == 26000000) {
-			/* to support 26MHz on pxa300/pxa310 */
+			/* to support 26MHz */
 			host->clkrt = 7;
 		} else {
 			/* to handle (19.5MHz, 26MHz) */
@@ -608,8 +611,7 @@ static int pxamci_probe(struct platform_device *pdev)
 	 * Calculate minimum clock rate, rounding up.
 	 */
 	mmc->f_min = (host->clkrate + 63) / 64;
-	mmc->f_max = (cpu_is_pxa300() || cpu_is_pxa310()) ? 26000000
-							  : host->clkrate;
+	mmc->f_max = (mmc_has_26MHz()) ? 26000000 : host->clkrate;
 
 	pxamci_init_ocr(host);
 
@@ -618,7 +620,7 @@ static int pxamci_probe(struct platform_device *pdev)
 	if (!cpu_is_pxa25x()) {
 		mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_SDIO_IRQ;
 		host->cmdat |= CMDAT_SDIO_INT_EN;
-		if (cpu_is_pxa300() || cpu_is_pxa310())
+		if (mmc_has_26MHz())
 			mmc->caps |= MMC_CAP_MMC_HIGHSPEED |
 				     MMC_CAP_SD_HIGHSPEED;
 	}
diff --git a/drivers/mtd/maps/omap_nor.c b/drivers/mtd/maps/omap_nor.c
index a244781..ead0b2f 100644
--- a/drivers/mtd/maps/omap_nor.c
+++ b/drivers/mtd/maps/omap_nor.c
@@ -45,7 +45,7 @@
 #include <asm/io.h>
 #include <mach/hardware.h>
 #include <asm/mach/flash.h>
-#include <mach/tc.h>
+#include <plat/tc.h>
 
 #ifdef CONFIG_MTD_PARTITIONS
 static const char *part_probes[] = { /* "RedBoot", */ "cmdlinepart", NULL };
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 8ca17a3..64e2b37 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -59,12 +59,14 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->readsect(dev, block, buf))
 				return -EIO;
+		rq_flush_dcache_pages(req);
 		return 0;
 
 	case WRITE:
 		if (!tr->writesect)
 			return -EIO;
 
+		rq_flush_dcache_pages(req);
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->writesect(dev, block, buf))
 				return -EIO;
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 2fda0b6..8f8e87b 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -358,7 +358,7 @@ endchoice
 
 config MTD_NAND_PXA3xx
 	tristate "Support for NAND flash devices on PXA3xx"
-	depends on MTD_NAND && PXA3xx
+	depends on MTD_NAND && (PXA3xx || ARCH_MMP)
 	help
 	  This enables the driver for the NAND flash device found on
 	  PXA3xx processors
diff --git a/drivers/mtd/nand/ams-delta.c b/drivers/mtd/nand/ams-delta.c
index 005b91f..2548e10 100644
--- a/drivers/mtd/nand/ams-delta.c
+++ b/drivers/mtd/nand/ams-delta.c
@@ -25,7 +25,7 @@
 #include <mach/hardware.h>
 #include <asm/sizes.h>
 #include <mach/gpio.h>
-#include <mach/board-ams-delta.h>
+#include <plat/board-ams-delta.h>
 
 /*
  * MTD structure for E3 (Delta)
diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c
index 090ab87..1bb799f 100644
--- a/drivers/mtd/nand/omap2.c
+++ b/drivers/mtd/nand/omap2.c
@@ -18,9 +18,9 @@
 #include <linux/mtd/partitions.h>
 #include <linux/io.h>
 
-#include <mach/dma.h>
-#include <mach/gpmc.h>
-#include <mach/nand.h>
+#include <plat/dma.h>
+#include <plat/gpmc.h>
+#include <plat/nand.h>
 
 #define GPMC_IRQ_STATUS		0x18
 #define GPMC_ECC_CONFIG		0x1F4
diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c
index 6ea520a..1a5a036 100644
--- a/drivers/mtd/nand/pxa3xx_nand.c
+++ b/drivers/mtd/nand/pxa3xx_nand.c
@@ -9,6 +9,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
@@ -22,7 +23,7 @@
 #include <linux/irq.h>
 
 #include <mach/dma.h>
-#include <mach/pxa3xx_nand.h>
+#include <plat/pxa3xx_nand.h>
 
 #define	CHIP_DELAY_TIMEOUT	(2 * HZ/10)
 
@@ -84,10 +85,6 @@
 #define NDCB0_CMD1_MASK		(0xff)
 #define NDCB0_ADDR_CYC_SHIFT	(16)
 
-/* dma-able I/O address for the NAND data and commands */
-#define NDCB0_DMA_ADDR		(0x43100048)
-#define NDDB_DMA_ADDR		(0x43100040)
-
 /* macros for registers read/write */
 #define nand_writel(info, off, val)	\
 	__raw_writel((val), (info)->mmio_base + (off))
@@ -123,6 +120,7 @@ struct pxa3xx_nand_info {
 
 	struct clk		*clk;
 	void __iomem		*mmio_base;
+	unsigned long		mmio_phys;
 
 	unsigned int 		buf_start;
 	unsigned int		buf_count;
@@ -228,13 +226,35 @@ static struct pxa3xx_nand_flash samsung512MbX16 = {
 	.chip_id	= 0x46ec,
 };
 
+static struct pxa3xx_nand_flash samsung2GbX8 = {
+	.timing		= &samsung512MbX16_timing,
+	.cmdset		= &smallpage_cmdset,
+	.page_per_block	= 64,
+	.page_size	= 2048,
+	.flash_width	= 8,
+	.dfc_width	= 8,
+	.num_blocks	= 2048,
+	.chip_id	= 0xdaec,
+};
+
+static struct pxa3xx_nand_flash samsung32GbX8 = {
+	.timing		= &samsung512MbX16_timing,
+	.cmdset		= &smallpage_cmdset,
+	.page_per_block	= 128,
+	.page_size	= 4096,
+	.flash_width	= 8,
+	.dfc_width	= 8,
+	.num_blocks	= 8192,
+	.chip_id	= 0xd7ec,
+};
+
 static struct pxa3xx_nand_timing micron_timing = {
 	.tCH	= 10,
 	.tCS	= 25,
 	.tWH	= 15,
 	.tWP	= 25,
 	.tRH	= 15,
-	.tRP	= 25,
+	.tRP	= 30,
 	.tR	= 25000,
 	.tWHR	= 60,
 	.tAR	= 10,
@@ -262,6 +282,28 @@ static struct pxa3xx_nand_flash micron1GbX16 = {
 	.chip_id	= 0xb12c,
 };
 
+static struct pxa3xx_nand_flash micron4GbX8 = {
+	.timing		= &micron_timing,
+	.cmdset		= &largepage_cmdset,
+	.page_per_block	= 64,
+	.page_size	= 2048,
+	.flash_width	= 8,
+	.dfc_width	= 8,
+	.num_blocks	= 4096,
+	.chip_id	= 0xdc2c,
+};
+
+static struct pxa3xx_nand_flash micron4GbX16 = {
+	.timing		= &micron_timing,
+	.cmdset		= &largepage_cmdset,
+	.page_per_block	= 64,
+	.page_size	= 2048,
+	.flash_width	= 16,
+	.dfc_width	= 16,
+	.num_blocks	= 4096,
+	.chip_id	= 0xcc2c,
+};
+
 static struct pxa3xx_nand_timing stm2GbX16_timing = {
 	.tCH = 10,
 	.tCS = 35,
@@ -287,8 +329,12 @@ static struct pxa3xx_nand_flash stm2GbX16 = {
 
 static struct pxa3xx_nand_flash *builtin_flash_types[] = {
 	&samsung512MbX16,
+	&samsung2GbX8,
+	&samsung32GbX8,
 	&micron1GbX8,
 	&micron1GbX16,
+	&micron4GbX8,
+	&micron4GbX16,
 	&stm2GbX16,
 };
 #endif /* CONFIG_MTD_NAND_PXA3xx_BUILTIN */
@@ -489,7 +535,7 @@ static int handle_data_pio(struct pxa3xx_nand_info *info)
 	switch (info->state) {
 	case STATE_PIO_WRITING:
 		__raw_writesl(info->mmio_base + NDDB, info->data_buff,
-				info->data_size << 2);
+				DIV_ROUND_UP(info->data_size, 4));
 
 		enable_int(info, NDSR_CS0_BBD | NDSR_CS0_CMDD);
 
@@ -501,7 +547,7 @@ static int handle_data_pio(struct pxa3xx_nand_info *info)
 		break;
 	case STATE_PIO_READING:
 		__raw_readsl(info->mmio_base + NDDB, info->data_buff,
-				info->data_size << 2);
+				DIV_ROUND_UP(info->data_size, 4));
 		break;
 	default:
 		printk(KERN_ERR "%s: invalid state %d\n", __func__,
@@ -523,11 +569,11 @@ static void start_data_dma(struct pxa3xx_nand_info *info, int dir_out)
 
 	if (dir_out) {
 		desc->dsadr = info->data_buff_phys;
-		desc->dtadr = NDDB_DMA_ADDR;
+		desc->dtadr = info->mmio_phys + NDDB;
 		desc->dcmd |= DCMD_INCSRCADDR | DCMD_FLOWTRG;
 	} else {
 		desc->dtadr = info->data_buff_phys;
-		desc->dsadr = NDDB_DMA_ADDR;
+		desc->dsadr = info->mmio_phys + NDDB;
 		desc->dcmd |= DCMD_INCTRGADDR | DCMD_FLOWSRC;
 	}
 
@@ -669,6 +715,7 @@ static void pxa3xx_nand_cmdfunc(struct mtd_info *mtd, unsigned command,
 		/* disable HW ECC to get all the OOB data */
 		info->buf_count = mtd->writesize + mtd->oobsize;
 		info->buf_start = mtd->writesize + column;
+		memset(info->data_buff, 0xFF, info->buf_count);
 
 		if (prepare_read_prog_cmd(info, cmdset->read1, column, page_addr))
 			break;
@@ -1239,13 +1286,17 @@ static int pxa3xx_nand_probe(struct platform_device *pdev)
 		ret = -ENODEV;
 		goto fail_free_res;
 	}
+	info->mmio_phys = r->start;
 
 	ret = pxa3xx_nand_init_buff(info);
 	if (ret)
 		goto fail_free_io;
 
-	ret = request_irq(IRQ_NAND, pxa3xx_nand_irq, IRQF_DISABLED,
-				pdev->name, info);
+	/* initialize all interrupts to be disabled */
+	disable_int(info, NDSR_MASK);
+
+	ret = request_irq(irq, pxa3xx_nand_irq, IRQF_DISABLED,
+			  pdev->name, info);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "failed to request IRQ\n");
 		goto fail_free_buf;
@@ -1271,7 +1322,7 @@ static int pxa3xx_nand_probe(struct platform_device *pdev)
 	return add_mtd_partitions(mtd, pdata->parts, pdata->nr_parts);
 
 fail_free_irq:
-	free_irq(IRQ_NAND, info);
+	free_irq(irq, info);
 fail_free_buf:
 	if (use_dma) {
 		pxa_free_dma(info->data_dma_ch);
@@ -1296,12 +1347,15 @@ static int pxa3xx_nand_remove(struct platform_device *pdev)
 	struct mtd_info *mtd = platform_get_drvdata(pdev);
 	struct pxa3xx_nand_info *info = mtd->priv;
 	struct resource *r;
+	int irq;
 
 	platform_set_drvdata(pdev, NULL);
 
 	del_mtd_device(mtd);
 	del_mtd_partitions(mtd);
-	free_irq(IRQ_NAND, info);
+	irq = platform_get_irq(pdev, 0);
+	if (irq >= 0)
+		free_irq(irq, info);
 	if (use_dma) {
 		pxa_free_dma(info->data_dma_ch);
 		dma_free_writecombine(&pdev->dev, info->data_buff_size,
diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c
index 0108ed4..86c4f6dc 100644
--- a/drivers/mtd/onenand/omap2.c
+++ b/drivers/mtd/onenand/omap2.c
@@ -36,13 +36,13 @@
 #include <linux/io.h>
 
 #include <asm/mach/flash.h>
-#include <mach/gpmc.h>
-#include <mach/onenand.h>
+#include <plat/gpmc.h>
+#include <plat/onenand.h>
 #include <mach/gpio.h>
 
-#include <mach/dma.h>
+#include <plat/dma.h>
 
-#include <mach/board.h>
+#include <plat/board.h>
 
 #define DRIVER_NAME "omap2-onenand"
 
diff --git a/drivers/of/base.c b/drivers/of/base.c
index ddf224d..e6627b2 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -9,7 +9,8 @@
  *
  *  Adapted for sparc and sparc64 by David S. Miller davem@davemloft.net
  *
- *  Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell.
+ *  Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell and
+ *  Grant Likely.
  *
  *      This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -82,6 +83,29 @@ struct property *of_find_property(const struct device_node *np,
 }
 EXPORT_SYMBOL(of_find_property);
 
+/**
+ * of_find_all_nodes - Get next node in global list
+ * @prev:	Previous node or NULL to start iteration
+ *		of_node_put() will be called on it
+ *
+ * Returns a node pointer with refcount incremented, use
+ * of_node_put() on it when done.
+ */
+struct device_node *of_find_all_nodes(struct device_node *prev)
+{
+	struct device_node *np;
+
+	read_lock(&devtree_lock);
+	np = prev ? prev->allnext : allnodes;
+	for (; np != NULL; np = np->allnext)
+		if (of_node_get(np))
+			break;
+	of_node_put(prev);
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_find_all_nodes);
+
 /*
  * Find a property with a given name for a given node
  * and return the value.
diff --git a/drivers/parport/parport_mfc3.c b/drivers/parport/parport_mfc3.c
index 6dec9ba..362db31 100644
--- a/drivers/parport/parport_mfc3.c
+++ b/drivers/parport/parport_mfc3.c
@@ -386,7 +386,7 @@ static void __exit parport_mfc3_exit(void)
 		if (!this_port[i])
 			continue;
 		parport_remove_port(this_port[i]);
-		if (!this_port[i]->irq != PARPORT_IRQ_NONE) {
+		if (this_port[i]->irq != PARPORT_IRQ_NONE) {
 			if (--use_cnt == 0) 
 				free_irq(IRQ_AMIGA_PORTS, &pp_mfc3_ops);
 		}
diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c
index 8eefe56..3f56bc0 100644
--- a/drivers/parport/procfs.c
+++ b/drivers/parport/procfs.c
@@ -233,10 +233,10 @@ static int do_hardware_modes (ctl_table *table, int write,
 	return copy_to_user(result, buffer, len) ? -EFAULT : 0;
 }
 
-#define PARPORT_PORT_DIR(CHILD) { .ctl_name = 0, .procname = NULL, .mode = 0555, .child = CHILD }
-#define PARPORT_PARPORT_DIR(CHILD) { .ctl_name = DEV_PARPORT, .procname = "parport", \
+#define PARPORT_PORT_DIR(CHILD) { .procname = NULL, .mode = 0555, .child = CHILD }
+#define PARPORT_PARPORT_DIR(CHILD) { .procname = "parport", \
                                      .mode = 0555, .child = CHILD }
-#define PARPORT_DEV_DIR(CHILD) { .ctl_name = CTL_DEV, .procname = "dev", .mode = 0555, .child = CHILD }
+#define PARPORT_DEV_DIR(CHILD) { .procname = "dev", .mode = 0555, .child = CHILD }
 #define PARPORT_DEVICES_ROOT_DIR  {  .procname = "devices", \
                                     .mode = 0555, .child = NULL }
 
@@ -270,7 +270,7 @@ static const struct parport_sysctl_table parport_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.proc_handler	= &proc_dointvec_minmax,
+			.proc_handler	= proc_dointvec_minmax,
 			.extra1		= (void*) &parport_min_spintime_value,
 			.extra2		= (void*) &parport_max_spintime_value
 		},
@@ -279,28 +279,28 @@ static const struct parport_sysctl_table parport_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_hardware_base_addr
+			.proc_handler	= do_hardware_base_addr
 		},
 		{
 			.procname	= "irq",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_hardware_irq
+			.proc_handler	= do_hardware_irq
 		},
 		{
 			.procname	= "dma",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_hardware_dma
+			.proc_handler	= do_hardware_dma
 		},
 		{
 			.procname	= "modes",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_hardware_modes
+			.proc_handler	= do_hardware_modes
 		},
 		PARPORT_DEVICES_ROOT_DIR,
 #ifdef CONFIG_PARPORT_1284
@@ -309,35 +309,35 @@ static const struct parport_sysctl_table parport_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 		{
 			.procname	= "autoprobe0",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	=  &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 		{
 			.procname	= "autoprobe1",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 		{
 			.procname	= "autoprobe2",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 		{
 			.procname	= "autoprobe3",
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_autoprobe
+			.proc_handler	= do_autoprobe
 		},
 #endif /* IEEE 1284 support */
 		{}
@@ -348,7 +348,7 @@ static const struct parport_sysctl_table parport_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= 0,
 			.mode		= 0444,
-			.proc_handler	= &do_active_device
+			.proc_handler	= do_active_device
 		},
 		{}
 	},
@@ -386,14 +386,13 @@ parport_device_sysctl_template = {
 			.data		= NULL,
 			.maxlen		= sizeof(unsigned long),
 			.mode		= 0644,
-			.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
+			.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
 			.extra1		= (void*) &parport_min_timeslice_value,
 			.extra2		= (void*) &parport_max_timeslice_value
 		},
 	},
 	{
 		{
-			.ctl_name	= 0,
 			.procname	= NULL,
 			.data		= NULL,
 			.maxlen		= 0,
@@ -438,7 +437,7 @@ parport_default_sysctl_table = {
 			.data		= &parport_default_timeslice,
 			.maxlen		= sizeof(parport_default_timeslice),
 			.mode		= 0644,
-			.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
+			.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
 			.extra1		= (void*) &parport_min_timeslice_value,
 			.extra2		= (void*) &parport_max_timeslice_value
 		},
@@ -447,7 +446,7 @@ parport_default_sysctl_table = {
 			.data		= &parport_default_spintime,
 			.maxlen		= sizeof(parport_default_spintime),
 			.mode		= 0644,
-			.proc_handler	= &proc_dointvec_minmax,
+			.proc_handler	= proc_dointvec_minmax,
 			.extra1		= (void*) &parport_min_spintime_value,
 			.extra2		= (void*) &parport_max_spintime_value
 		},
@@ -455,7 +454,6 @@ parport_default_sysctl_table = {
 	},
 	{
 		{
-			.ctl_name	= DEV_PARPORT_DEFAULT,
 			.procname	= "default",
 			.mode		= 0555,
 			.child		= parport_default_sysctl_table.vars
@@ -495,7 +493,6 @@ int parport_proc_register(struct parport *port)
 		t->vars[6 + i].extra2 = &port->probe_info[i];
 
 	t->port_dir[0].procname = port->name;
-	t->port_dir[0].ctl_name = 0;
 
 	t->port_dir[0].child = t->vars;
 	t->parport_dir[0].child = t->port_dir;
@@ -534,11 +531,9 @@ int parport_device_proc_register(struct pardevice *device)
 	t->dev_dir[0].child = t->parport_dir;
 	t->parport_dir[0].child = t->port_dir;
 	t->port_dir[0].procname = port->name;
-	t->port_dir[0].ctl_name = 0;
 	t->port_dir[0].child = t->devices_root_dir;
 	t->devices_root_dir[0].child = t->device_dir;
 
-	t->device_dir[0].ctl_name = 0;
 	t->device_dir[0].procname = device->name;
 	t->device_dir[0].child = t->vars;
 	t->vars[0].data = &device->timeslice;
diff --git a/drivers/pcmcia/Kconfig b/drivers/pcmcia/Kconfig
index f3ccbcc..cd5082d 100644
--- a/drivers/pcmcia/Kconfig
+++ b/drivers/pcmcia/Kconfig
@@ -179,7 +179,7 @@ config PCMCIA_BCM63XX
 	depends on BCM63XX && PCMCIA
 
 config PCMCIA_SOC_COMMON
-	bool
+	tristate
 
 config PCMCIA_SA1100
 	tristate "SA1100 support"
diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c
index 68570bc..663781d 100644
--- a/drivers/pcmcia/omap_cf.c
+++ b/drivers/pcmcia/omap_cf.c
@@ -23,8 +23,8 @@
 #include <asm/io.h>
 #include <asm/sizes.h>
 
-#include <mach/mux.h>
-#include <mach/tc.h>
+#include <plat/mux.h>
+#include <plat/tc.h>
 
 
 /* NOTE:  don't expect this to support many I/O cards.  The 16xx chips have
diff --git a/drivers/pcmcia/sa1100_generic.c b/drivers/pcmcia/sa1100_generic.c
index 11cc3ba..8db86b9 100644
--- a/drivers/pcmcia/sa1100_generic.c
+++ b/drivers/pcmcia/sa1100_generic.c
@@ -51,7 +51,7 @@ static int (*sa11x0_pcmcia_hw_init[])(struct device *dev) = {
 #ifdef CONFIG_SA1100_CERF
 	pcmcia_cerf_init,
 #endif
-#ifdef CONFIG_SA1100_H3600
+#if defined(CONFIG_SA1100_H3100) || defined(CONFIG_SA1100_H3600)
 	pcmcia_h3600_init,
 #endif
 #ifdef CONFIG_SA1100_SHANNON
diff --git a/drivers/pcmcia/sa1100_h3600.c b/drivers/pcmcia/sa1100_h3600.c
index 3a121ac69..56329ad 100644
--- a/drivers/pcmcia/sa1100_h3600.c
+++ b/drivers/pcmcia/sa1100_h3600.c
@@ -10,47 +10,139 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/gpio.h>
 
 #include <mach/hardware.h>
 #include <asm/irq.h>
 #include <asm/mach-types.h>
-#include <mach/h3600.h>
+#include <mach/h3xxx.h>
 
 #include "sa1100_generic.h"
 
 static struct pcmcia_irqs irqs[] = {
-	{ 0, IRQ_GPIO_H3600_PCMCIA_CD0, "PCMCIA CD0" },
-	{ 1, IRQ_GPIO_H3600_PCMCIA_CD1, "PCMCIA CD1" }
+	{ .sock = 0, .str = "PCMCIA CD0" }, /* .irq will be filled later */
+	{ .sock = 1, .str = "PCMCIA CD1" }
 };
 
 static int h3600_pcmcia_hw_init(struct soc_pcmcia_socket *skt)
 {
-	skt->socket.pci_irq = skt->nr ? IRQ_GPIO_H3600_PCMCIA_IRQ1
-				      : IRQ_GPIO_H3600_PCMCIA_IRQ0;
+	int err;
 
+	switch (skt->nr) {
+	case 0:
+		err = gpio_request(H3XXX_GPIO_PCMCIA_IRQ0, "PCMCIA IRQ0");
+		if (err)
+			goto err00;
+		err = gpio_direction_input(H3XXX_GPIO_PCMCIA_IRQ0);
+		if (err)
+			goto err01;
+		skt->socket.pci_irq = gpio_to_irq(H3XXX_GPIO_PCMCIA_IRQ0);
+
+		err = gpio_request(H3XXX_GPIO_PCMCIA_CD0, "PCMCIA CD0");
+		if (err)
+			goto err01;
+		err = gpio_direction_input(H3XXX_GPIO_PCMCIA_CD0);
+		if (err)
+			goto err02;
+		irqs[0].irq = gpio_to_irq(H3XXX_GPIO_PCMCIA_CD0);
+
+		err = gpio_request(H3XXX_EGPIO_OPT_NVRAM_ON, "OPT NVRAM ON");
+		if (err)
+			goto err02;
+		err = gpio_direction_output(H3XXX_EGPIO_OPT_NVRAM_ON, 0);
+		if (err)
+			goto err03;
+		err = gpio_request(H3XXX_EGPIO_OPT_ON, "OPT ON");
+		if (err)
+			goto err03;
+		err = gpio_direction_output(H3XXX_EGPIO_OPT_ON, 0);
+		if (err)
+			goto err04;
+		err = gpio_request(H3XXX_EGPIO_OPT_RESET, "OPT RESET");
+		if (err)
+			goto err04;
+		err = gpio_direction_output(H3XXX_EGPIO_OPT_RESET, 0);
+		if (err)
+			goto err05;
+		err = gpio_request(H3XXX_EGPIO_CARD_RESET, "PCMCIA CARD RESET");
+		if (err)
+			goto err05;
+		err = gpio_direction_output(H3XXX_EGPIO_CARD_RESET, 0);
+		if (err)
+			goto err06;
+		err = soc_pcmcia_request_irqs(skt, irqs, ARRAY_SIZE(irqs));
+		if (err)
+			goto err06;
+		break;
+	case 1:
+		err = gpio_request(H3XXX_GPIO_PCMCIA_IRQ1, "PCMCIA IRQ1");
+		if (err)
+			goto err10;
+		err = gpio_direction_input(H3XXX_GPIO_PCMCIA_IRQ1);
+		if (err)
+			goto err11;
+		skt->socket.pci_irq = gpio_to_irq(H3XXX_GPIO_PCMCIA_IRQ1);
+
+		err = gpio_request(H3XXX_GPIO_PCMCIA_CD1, "PCMCIA CD1");
+		if (err)
+			goto err11;
+		err = gpio_direction_input(H3XXX_GPIO_PCMCIA_CD1);
+		if (err)
+			goto err12;
+		irqs[1].irq = gpio_to_irq(H3XXX_GPIO_PCMCIA_CD1);
+
+		err = soc_pcmcia_request_irqs(skt, irqs, ARRAY_SIZE(irqs));
+		if (err)
+			goto err12;
+		break;
+	}
+	return 0;
 
-	return soc_pcmcia_request_irqs(skt, irqs, ARRAY_SIZE(irqs));
+err06:	gpio_free(H3XXX_EGPIO_CARD_RESET);
+err05:	gpio_free(H3XXX_EGPIO_OPT_RESET);
+err04:	gpio_free(H3XXX_EGPIO_OPT_ON);
+err03:	gpio_free(H3XXX_EGPIO_OPT_NVRAM_ON);
+err02:	gpio_free(H3XXX_GPIO_PCMCIA_CD0);
+err01:	gpio_free(H3XXX_GPIO_PCMCIA_IRQ0);
+err00:	return err;
+
+err12:	gpio_free(H3XXX_GPIO_PCMCIA_CD0);
+err11:	gpio_free(H3XXX_GPIO_PCMCIA_IRQ0);
+err10:	return err;
 }
 
 static void h3600_pcmcia_hw_shutdown(struct soc_pcmcia_socket *skt)
 {
 	soc_pcmcia_free_irqs(skt, irqs, ARRAY_SIZE(irqs));
   
-	/* Disable CF bus: */
-	assign_h3600_egpio(IPAQ_EGPIO_OPT_NVRAM_ON, 0);
-	assign_h3600_egpio(IPAQ_EGPIO_OPT_ON, 0);
-	assign_h3600_egpio(IPAQ_EGPIO_OPT_RESET, 1);
+	switch (skt->nr) {
+	case 0:
+		/* Disable CF bus: */
+		gpio_set_value(H3XXX_EGPIO_OPT_NVRAM_ON, 0);
+		gpio_set_value(H3XXX_EGPIO_OPT_ON, 0);
+		gpio_set_value(H3XXX_EGPIO_OPT_RESET, 1);
+
+		gpio_free(H3XXX_EGPIO_CARD_RESET);
+		gpio_free(H3XXX_EGPIO_OPT_RESET);
+		gpio_free(H3XXX_EGPIO_OPT_ON);
+		gpio_free(H3XXX_EGPIO_OPT_NVRAM_ON);
+		gpio_free(H3XXX_GPIO_PCMCIA_CD0);
+		gpio_free(H3XXX_GPIO_PCMCIA_IRQ0);
+		break;
+	case 1:
+		gpio_free(H3XXX_GPIO_PCMCIA_CD1);
+		gpio_free(H3XXX_GPIO_PCMCIA_IRQ1);
+		break;
+	}
 }
 
 static void
 h3600_pcmcia_socket_state(struct soc_pcmcia_socket *skt, struct pcmcia_state *state)
 {
-	unsigned long levels = GPLR;
-
 	switch (skt->nr) {
 	case 0:
-		state->detect = levels & GPIO_H3600_PCMCIA_CD0 ? 0 : 1;
-		state->ready = levels & GPIO_H3600_PCMCIA_IRQ0 ? 1 : 0;
+		state->detect = !gpio_get_value(H3XXX_GPIO_PCMCIA_CD0);
+		state->ready = !!gpio_get_value(H3XXX_GPIO_PCMCIA_IRQ0);
 		state->bvd1 = 0;
 		state->bvd2 = 0;
 		state->wrprot = 0; /* Not available on H3600. */
@@ -59,8 +151,8 @@ h3600_pcmcia_socket_state(struct soc_pcmcia_socket *skt, struct pcmcia_state *st
 		break;
 
 	case 1:
-		state->detect = levels & GPIO_H3600_PCMCIA_CD1 ? 0 : 1;
-		state->ready = levels & GPIO_H3600_PCMCIA_IRQ1 ? 1 : 0;
+		state->detect = !gpio_get_value(H3XXX_GPIO_PCMCIA_CD1);
+		state->ready = !!gpio_get_value(H3XXX_GPIO_PCMCIA_IRQ1);
 		state->bvd1 = 0;
 		state->bvd2 = 0;
 		state->wrprot = 0; /* Not available on H3600. */
@@ -79,7 +171,7 @@ h3600_pcmcia_configure_socket(struct soc_pcmcia_socket *skt, const socket_state_
 		return -1;
 	}
 
-	assign_h3600_egpio(IPAQ_EGPIO_CARD_RESET, !!(state->flags & SS_RESET));
+	gpio_set_value(H3XXX_EGPIO_CARD_RESET, !!(state->flags & SS_RESET));
 
 	/* Silently ignore Vpp, output enable, speaker enable. */
 
@@ -89,9 +181,9 @@ h3600_pcmcia_configure_socket(struct soc_pcmcia_socket *skt, const socket_state_
 static void h3600_pcmcia_socket_init(struct soc_pcmcia_socket *skt)
 {
 	/* Enable CF bus: */
-	assign_h3600_egpio(IPAQ_EGPIO_OPT_NVRAM_ON, 1);
-	assign_h3600_egpio(IPAQ_EGPIO_OPT_ON, 1);
-	assign_h3600_egpio(IPAQ_EGPIO_OPT_RESET, 0);
+	gpio_set_value(H3XXX_EGPIO_OPT_NVRAM_ON, 1);
+	gpio_set_value(H3XXX_EGPIO_OPT_ON, 1);
+	gpio_set_value(H3XXX_EGPIO_OPT_RESET, 0);
 
 	msleep(10);
 
@@ -109,10 +201,10 @@ static void h3600_pcmcia_socket_suspend(struct soc_pcmcia_socket *skt)
 	 * socket 0 then socket 1.
 	 */
 	if (skt->nr == 1) {
-		assign_h3600_egpio(IPAQ_EGPIO_OPT_ON, 0);
-		assign_h3600_egpio(IPAQ_EGPIO_OPT_NVRAM_ON, 0);
+		gpio_set_value(H3XXX_EGPIO_OPT_ON, 0);
+		gpio_set_value(H3XXX_EGPIO_OPT_NVRAM_ON, 0);
 		/* hmm, does this suck power? */
-		assign_h3600_egpio(IPAQ_EGPIO_OPT_RESET, 1);
+		gpio_set_value(H3XXX_EGPIO_OPT_RESET, 1);
 	}
 }
 
@@ -131,7 +223,7 @@ int __init pcmcia_h3600_init(struct device *dev)
 {
 	int ret = -ENODEV;
 
-	if (machine_is_h3600())
+	if (machine_is_h3600() || machine_is_h3100())
 		ret = sa11xx_drv_pcmcia_probe(dev, &h3600_pcmcia_ops, 0, 2);
 
 	return ret;
diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index cea6cef..1186749 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -77,6 +77,13 @@ config BATTERY_TOSA
 	  Say Y to enable support for the battery on the Sharp Zaurus
 	  SL-6000 (tosa) models.
 
+config BATTERY_COLLIE
+	tristate "Sharp SL-5500 (collie) battery"
+	depends on SA1100_COLLIE && MCP_UCB1200
+	help
+	  Say Y to enable support for the battery on the Sharp Zaurus
+	  SL-5500 (collie) models.
+
 config BATTERY_WM97XX
 	bool "WM97xx generic battery driver"
 	depends on TOUCHSCREEN_WM97XX=y
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index b96f29d..356cdfd 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_BATTERY_DS2782)	+= ds2782_battery.o
 obj-$(CONFIG_BATTERY_PMU)	+= pmu_battery.o
 obj-$(CONFIG_BATTERY_OLPC)	+= olpc_battery.o
 obj-$(CONFIG_BATTERY_TOSA)	+= tosa_battery.o
+obj-$(CONFIG_BATTERY_COLLIE)	+= collie_battery.o
 obj-$(CONFIG_BATTERY_WM97XX)	+= wm97xx_battery.o
 obj-$(CONFIG_BATTERY_BQ27x00)	+= bq27x00_battery.o
 obj-$(CONFIG_BATTERY_DA9030)	+= da9030_battery.o
diff --git a/drivers/power/collie_battery.c b/drivers/power/collie_battery.c
new file mode 100644
index 0000000..039f41a
--- /dev/null
+++ b/drivers/power/collie_battery.c
@@ -0,0 +1,418 @@
+/*
+ * Battery and Power Management code for the Sharp SL-5x00
+ *
+ * Copyright (C) 2009 Thomas Kunze
+ *
+ * based on tosa_battery.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/power_supply.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/gpio.h>
+#include <linux/mfd/ucb1x00.h>
+
+#include <asm/mach/sharpsl_param.h>
+#include <asm/mach-types.h>
+#include <mach/collie.h>
+
+static DEFINE_MUTEX(bat_lock); /* protects gpio pins */
+static struct work_struct bat_work;
+static struct ucb1x00 *ucb;
+
+struct collie_bat {
+	int status;
+	struct power_supply psy;
+	int full_chrg;
+
+	struct mutex work_lock; /* protects data */
+
+	bool (*is_present)(struct collie_bat *bat);
+	int gpio_full;
+	int gpio_charge_on;
+
+	int technology;
+
+	int gpio_bat;
+	int adc_bat;
+	int adc_bat_divider;
+	int bat_max;
+	int bat_min;
+
+	int gpio_temp;
+	int adc_temp;
+	int adc_temp_divider;
+};
+
+static struct collie_bat collie_bat_main;
+
+static unsigned long collie_read_bat(struct collie_bat *bat)
+{
+	unsigned long value = 0;
+
+	if (bat->gpio_bat < 0 || bat->adc_bat < 0)
+		return 0;
+	mutex_lock(&bat_lock);
+	gpio_set_value(bat->gpio_bat, 1);
+	msleep(5);
+	ucb1x00_adc_enable(ucb);
+	value = ucb1x00_adc_read(ucb, bat->adc_bat, UCB_SYNC);
+	ucb1x00_adc_disable(ucb);
+	gpio_set_value(bat->gpio_bat, 0);
+	mutex_unlock(&bat_lock);
+	value = value * 1000000 / bat->adc_bat_divider;
+
+	return value;
+}
+
+static unsigned long collie_read_temp(struct collie_bat *bat)
+{
+	unsigned long value = 0;
+	if (bat->gpio_temp < 0 || bat->adc_temp < 0)
+		return 0;
+
+	mutex_lock(&bat_lock);
+	gpio_set_value(bat->gpio_temp, 1);
+	msleep(5);
+	ucb1x00_adc_enable(ucb);
+	value = ucb1x00_adc_read(ucb, bat->adc_temp, UCB_SYNC);
+	ucb1x00_adc_disable(ucb);
+	gpio_set_value(bat->gpio_temp, 0);
+	mutex_unlock(&bat_lock);
+
+	value = value * 10000 / bat->adc_temp_divider;
+
+	return value;
+}
+
+static int collie_bat_get_property(struct power_supply *psy,
+			    enum power_supply_property psp,
+			    union power_supply_propval *val)
+{
+	int ret = 0;
+	struct collie_bat *bat = container_of(psy, struct collie_bat, psy);
+
+	if (bat->is_present && !bat->is_present(bat)
+			&& psp != POWER_SUPPLY_PROP_PRESENT) {
+		return -ENODEV;
+	}
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_STATUS:
+		val->intval = bat->status;
+		break;
+	case POWER_SUPPLY_PROP_TECHNOLOGY:
+		val->intval = bat->technology;
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+		val->intval = collie_read_bat(bat);
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_MAX:
+		if (bat->full_chrg == -1)
+			val->intval = bat->bat_max;
+		else
+			val->intval = bat->full_chrg;
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN:
+		val->intval = bat->bat_max;
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN:
+		val->intval = bat->bat_min;
+		break;
+	case POWER_SUPPLY_PROP_TEMP:
+		val->intval = collie_read_temp(bat);
+		break;
+	case POWER_SUPPLY_PROP_PRESENT:
+		val->intval = bat->is_present ? bat->is_present(bat) : 1;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static void collie_bat_external_power_changed(struct power_supply *psy)
+{
+	schedule_work(&bat_work);
+}
+
+static irqreturn_t collie_bat_gpio_isr(int irq, void *data)
+{
+	pr_info("collie_bat_gpio irq: %d\n", gpio_get_value(irq_to_gpio(irq)));
+	schedule_work(&bat_work);
+	return IRQ_HANDLED;
+}
+
+static void collie_bat_update(struct collie_bat *bat)
+{
+	int old;
+	struct power_supply *psy = &bat->psy;
+
+	mutex_lock(&bat->work_lock);
+
+	old = bat->status;
+
+	if (bat->is_present && !bat->is_present(bat)) {
+		printk(KERN_NOTICE "%s not present\n", psy->name);
+		bat->status = POWER_SUPPLY_STATUS_UNKNOWN;
+		bat->full_chrg = -1;
+	} else if (power_supply_am_i_supplied(psy)) {
+		if (bat->status == POWER_SUPPLY_STATUS_DISCHARGING) {
+			gpio_set_value(bat->gpio_charge_on, 1);
+			mdelay(15);
+		}
+
+		if (gpio_get_value(bat->gpio_full)) {
+			if (old == POWER_SUPPLY_STATUS_CHARGING ||
+					bat->full_chrg == -1)
+				bat->full_chrg = collie_read_bat(bat);
+
+			gpio_set_value(bat->gpio_charge_on, 0);
+			bat->status = POWER_SUPPLY_STATUS_FULL;
+		} else {
+			gpio_set_value(bat->gpio_charge_on, 1);
+			bat->status = POWER_SUPPLY_STATUS_CHARGING;
+		}
+	} else {
+		gpio_set_value(bat->gpio_charge_on, 0);
+		bat->status = POWER_SUPPLY_STATUS_DISCHARGING;
+	}
+
+	if (old != bat->status)
+		power_supply_changed(psy);
+
+	mutex_unlock(&bat->work_lock);
+}
+
+static void collie_bat_work(struct work_struct *work)
+{
+	collie_bat_update(&collie_bat_main);
+}
+
+
+static enum power_supply_property collie_bat_main_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
+	POWER_SUPPLY_PROP_VOLTAGE_MAX,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_TEMP,
+};
+
+static enum power_supply_property collie_bat_bu_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
+	POWER_SUPPLY_PROP_VOLTAGE_MAX,
+	POWER_SUPPLY_PROP_PRESENT,
+};
+
+static struct collie_bat collie_bat_main = {
+	.status = POWER_SUPPLY_STATUS_DISCHARGING,
+	.full_chrg = -1,
+	.psy = {
+		.name		= "main-battery",
+		.type		= POWER_SUPPLY_TYPE_BATTERY,
+		.properties	= collie_bat_main_props,
+		.num_properties	= ARRAY_SIZE(collie_bat_main_props),
+		.get_property	= collie_bat_get_property,
+		.external_power_changed = collie_bat_external_power_changed,
+		.use_for_apm	= 1,
+	},
+
+	.gpio_full = COLLIE_GPIO_CO,
+	.gpio_charge_on = COLLIE_GPIO_CHARGE_ON,
+
+	.technology = POWER_SUPPLY_TECHNOLOGY_LIPO,
+
+	.gpio_bat = COLLIE_GPIO_MBAT_ON,
+	.adc_bat = UCB_ADC_INP_AD1,
+	.adc_bat_divider = 155,
+	.bat_max = 4310000,
+	.bat_min = 1551 * 1000000 / 414,
+
+	.gpio_temp = COLLIE_GPIO_TMP_ON,
+	.adc_temp = UCB_ADC_INP_AD0,
+	.adc_temp_divider = 10000,
+};
+
+static struct collie_bat collie_bat_bu = {
+	.status = POWER_SUPPLY_STATUS_UNKNOWN,
+	.full_chrg = -1,
+
+	.psy = {
+		.name		= "backup-battery",
+		.type		= POWER_SUPPLY_TYPE_BATTERY,
+		.properties	= collie_bat_bu_props,
+		.num_properties	= ARRAY_SIZE(collie_bat_bu_props),
+		.get_property	= collie_bat_get_property,
+		.external_power_changed = collie_bat_external_power_changed,
+	},
+
+	.gpio_full = -1,
+	.gpio_charge_on = -1,
+
+	.technology = POWER_SUPPLY_TECHNOLOGY_LiMn,
+
+	.gpio_bat = COLLIE_GPIO_BBAT_ON,
+	.adc_bat = UCB_ADC_INP_AD1,
+	.adc_bat_divider = 155,
+	.bat_max = 3000000,
+	.bat_min = 1900000,
+
+	.gpio_temp = -1,
+	.adc_temp = -1,
+	.adc_temp_divider = -1,
+};
+
+static struct {
+	int gpio;
+	char *name;
+	bool output;
+	int value;
+} gpios[] = {
+	{ COLLIE_GPIO_CO,		"main battery full",	0, 0 },
+	{ COLLIE_GPIO_MAIN_BAT_LOW,	"main battery low",	0, 0 },
+	{ COLLIE_GPIO_CHARGE_ON,	"main charge on",	1, 0 },
+	{ COLLIE_GPIO_MBAT_ON,		"main battery",		1, 0 },
+	{ COLLIE_GPIO_TMP_ON,		"main battery temp",	1, 0 },
+	{ COLLIE_GPIO_BBAT_ON,		"backup battery",	1, 0 },
+};
+
+#ifdef CONFIG_PM
+static int collie_bat_suspend(struct ucb1x00_dev *dev, pm_message_t state)
+{
+	/* flush all pending status updates */
+	flush_scheduled_work();
+	return 0;
+}
+
+static int collie_bat_resume(struct ucb1x00_dev *dev)
+{
+	/* things may have changed while we were away */
+	schedule_work(&bat_work);
+	return 0;
+}
+#else
+#define collie_bat_suspend NULL
+#define collie_bat_resume NULL
+#endif
+
+static int __devinit collie_bat_probe(struct ucb1x00_dev *dev)
+{
+	int ret;
+	int i;
+
+	if (!machine_is_collie())
+		return -ENODEV;
+
+	ucb = dev->ucb;
+
+	for (i = 0; i < ARRAY_SIZE(gpios); i++) {
+		ret = gpio_request(gpios[i].gpio, gpios[i].name);
+		if (ret) {
+			i--;
+			goto err_gpio;
+		}
+
+		if (gpios[i].output)
+			ret = gpio_direction_output(gpios[i].gpio,
+					gpios[i].value);
+		else
+			ret = gpio_direction_input(gpios[i].gpio);
+
+		if (ret)
+			goto err_gpio;
+	}
+
+	mutex_init(&collie_bat_main.work_lock);
+
+	INIT_WORK(&bat_work, collie_bat_work);
+
+	ret = power_supply_register(&dev->ucb->dev, &collie_bat_main.psy);
+	if (ret)
+		goto err_psy_reg_main;
+	ret = power_supply_register(&dev->ucb->dev, &collie_bat_bu.psy);
+	if (ret)
+		goto err_psy_reg_bu;
+
+	ret = request_irq(gpio_to_irq(COLLIE_GPIO_CO),
+				collie_bat_gpio_isr,
+				IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
+				"main full", &collie_bat_main);
+	if (!ret) {
+		schedule_work(&bat_work);
+		return 0;
+	}
+	power_supply_unregister(&collie_bat_bu.psy);
+err_psy_reg_bu:
+	power_supply_unregister(&collie_bat_main.psy);
+err_psy_reg_main:
+
+	/* see comment in collie_bat_remove */
+	flush_scheduled_work();
+
+	i--;
+err_gpio:
+	for (; i >= 0; i--)
+		gpio_free(gpios[i].gpio);
+
+	return ret;
+}
+
+static void __devexit collie_bat_remove(struct ucb1x00_dev *dev)
+{
+	int i;
+
+	free_irq(gpio_to_irq(COLLIE_GPIO_CO), &collie_bat_main);
+
+	power_supply_unregister(&collie_bat_bu.psy);
+	power_supply_unregister(&collie_bat_main.psy);
+
+	/*
+	 * now flush all pending work.
+	 * we won't get any more schedules, since all
+	 * sources (isr and external_power_changed)
+	 * are unregistered now.
+	 */
+	flush_scheduled_work();
+
+	for (i = ARRAY_SIZE(gpios) - 1; i >= 0; i--)
+		gpio_free(gpios[i].gpio);
+}
+
+static struct ucb1x00_driver collie_bat_driver = {
+	.add		= collie_bat_probe,
+	.remove		= __devexit_p(collie_bat_remove),
+	.suspend	= collie_bat_suspend,
+	.resume		= collie_bat_resume,
+};
+
+static int __init collie_bat_init(void)
+{
+	return ucb1x00_register_driver(&collie_bat_driver);
+}
+
+static void __exit collie_bat_exit(void)
+{
+	ucb1x00_unregister_driver(&collie_bat_driver);
+}
+
+module_init(collie_bat_init);
+module_exit(collie_bat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Thomas Kunze");
+MODULE_DESCRIPTION("Collie battery driver");
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 3c20dae..f2e1004 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -509,6 +509,15 @@ config RTC_DRV_M48T59
 	  This driver can also be built as a module, if so, the module
 	  will be called "rtc-m48t59".
 
+config RTC_DRV_MSM6242
+	tristate "Oki MSM6242"
+	help
+	  If you say yes here you get support for the Oki MSM6242
+	  timekeeping chip. It is used in some Amiga models (e.g. A2000).
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-msm6242.
+
 config RTC_MXC
 	tristate "Freescale MXC Real Time Clock"
 	depends on ARCH_MXC
@@ -529,6 +538,16 @@ config RTC_DRV_BQ4802
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-bq4802.
 
+config RTC_DRV_RP5C01
+	tristate "Ricoh RP5C01"
+	help
+	  If you say yes here you get support for the Ricoh RP5C01
+	  timekeeping chip. It is used in some Amiga models (e.g. A3000
+	  and A4000).
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-rp5c01.
+
 config RTC_DRV_V3020
 	tristate "EM Microelectronic V3020"
 	help
@@ -780,7 +799,7 @@ config RTC_DRV_TX4939
 
 config RTC_DRV_MV
 	tristate "Marvell SoC RTC"
-	depends on ARCH_KIRKWOOD
+	depends on ARCH_KIRKWOOD || ARCH_DOVE
 	help
 	  If you say yes here you will get support for the in-chip RTC
 	  that can be found in some of Marvell's SoC devices, such as
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index aa3fbd5..af1ba7a 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_RTC_DRV_M48T86)	+= rtc-m48t86.o
 obj-$(CONFIG_RTC_MXC)		+= rtc-mxc.o
 obj-$(CONFIG_RTC_DRV_MAX6900)	+= rtc-max6900.o
 obj-$(CONFIG_RTC_DRV_MAX6902)	+= rtc-max6902.o
+obj-$(CONFIG_RTC_DRV_MSM6242)	+= rtc-msm6242.o
 obj-$(CONFIG_RTC_DRV_MV)	+= rtc-mv.o
 obj-$(CONFIG_RTC_DRV_OMAP)	+= rtc-omap.o
 obj-$(CONFIG_RTC_DRV_PCAP)	+= rtc-pcap.o
@@ -64,6 +65,7 @@ obj-$(CONFIG_RTC_DRV_PL031)	+= rtc-pl031.o
 obj-$(CONFIG_RTC_DRV_PS3)	+= rtc-ps3.o
 obj-$(CONFIG_RTC_DRV_PXA)	+= rtc-pxa.o
 obj-$(CONFIG_RTC_DRV_R9701)	+= rtc-r9701.o
+obj-$(CONFIG_RTC_DRV_RP5C01)	+= rtc-rp5c01.o
 obj-$(CONFIG_RTC_DRV_RS5C313)	+= rtc-rs5c313.o
 obj-$(CONFIG_RTC_DRV_RS5C348)	+= rtc-rs5c348.o
 obj-$(CONFIG_RTC_DRV_RS5C372)	+= rtc-rs5c372.o
diff --git a/drivers/rtc/rtc-msm6242.c b/drivers/rtc/rtc-msm6242.c
new file mode 100644
index 0000000..5f5968a
--- /dev/null
+++ b/drivers/rtc/rtc-msm6242.c
@@ -0,0 +1,269 @@
+/*
+ *  Oki MSM6242 RTC Driver
+ *
+ *  Copyright 2009 Geert Uytterhoeven
+ *
+ *  Based on the A2000 TOD code in arch/m68k/amiga/config.c
+ *  Copyright (C) 1993 Hamish Macdonald
+ */
+
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+
+enum {
+	MSM6242_SECOND1		= 0x0,	/* 1-second digit register */
+	MSM6242_SECOND10	= 0x1,	/* 10-second digit register */
+	MSM6242_MINUTE1		= 0x2,	/* 1-minute digit register */
+	MSM6242_MINUTE10	= 0x3,	/* 10-minute digit register */
+	MSM6242_HOUR1		= 0x4,	/* 1-hour digit register */
+	MSM6242_HOUR10		= 0x5,	/* PM/AM, 10-hour digit register */
+	MSM6242_DAY1		= 0x6,	/* 1-day digit register */
+	MSM6242_DAY10		= 0x7,	/* 10-day digit register */
+	MSM6242_MONTH1		= 0x8,	/* 1-month digit register */
+	MSM6242_MONTH10		= 0x9,	/* 10-month digit register */
+	MSM6242_YEAR1		= 0xa,	/* 1-year digit register */
+	MSM6242_YEAR10		= 0xb,	/* 10-year digit register */
+	MSM6242_WEEK		= 0xc,	/* Week register */
+	MSM6242_CD		= 0xd,	/* Control Register D */
+	MSM6242_CE		= 0xe,	/* Control Register E */
+	MSM6242_CF		= 0xf,	/* Control Register F */
+};
+
+#define MSM6242_HOUR10_AM	(0 << 2)
+#define MSM6242_HOUR10_PM	(1 << 2)
+#define MSM6242_HOUR10_HR_MASK	(3 << 0)
+
+#define MSM6242_WEEK_SUNDAY	0
+#define MSM6242_WEEK_MONDAY	1
+#define MSM6242_WEEK_TUESDAY	2
+#define MSM6242_WEEK_WEDNESDAY	3
+#define MSM6242_WEEK_THURSDAY	4
+#define MSM6242_WEEK_FRIDAY	5
+#define MSM6242_WEEK_SATURDAY	6
+
+#define MSM6242_CD_30_S_ADJ	(1 << 3)	/* 30-second adjustment */
+#define MSM6242_CD_IRQ_FLAG	(1 << 2)
+#define MSM6242_CD_BUSY		(1 << 1)
+#define MSM6242_CD_HOLD		(1 << 0)
+
+#define MSM6242_CE_T_MASK	(3 << 2)
+#define MSM6242_CE_T_64HZ	(0 << 2)	/* period 1/64 second */
+#define MSM6242_CE_T_1HZ	(1 << 2)	/* period 1 second */
+#define MSM6242_CE_T_1MINUTE	(2 << 2)	/* period 1 minute */
+#define MSM6242_CE_T_1HOUR	(3 << 2)	/* period 1 hour */
+
+#define MSM6242_CE_ITRPT_STND	(1 << 1)
+#define MSM6242_CE_MASK		(1 << 0)	/* STD.P output control */
+
+#define MSM6242_CF_TEST		(1 << 3)
+#define MSM6242_CF_12H		(0 << 2)
+#define MSM6242_CF_24H		(1 << 2)
+#define MSM6242_CF_STOP		(1 << 1)
+#define MSM6242_CF_REST		(1 << 0)	/* reset */
+
+
+struct msm6242_priv {
+	u32 __iomem *regs;
+	struct rtc_device *rtc;
+};
+
+static inline unsigned int msm6242_read(struct msm6242_priv *priv,
+				       unsigned int reg)
+{
+	return __raw_readl(&priv->regs[reg]) & 0xf;
+}
+
+static inline void msm6242_write(struct msm6242_priv *priv, unsigned int val,
+				unsigned int reg)
+{
+	return __raw_writel(val, &priv->regs[reg]);
+}
+
+static inline void msm6242_set(struct msm6242_priv *priv, unsigned int val,
+			       unsigned int reg)
+{
+	msm6242_write(priv, msm6242_read(priv, reg) | val, reg);
+}
+
+static inline void msm6242_clear(struct msm6242_priv *priv, unsigned int val,
+				 unsigned int reg)
+{
+	msm6242_write(priv, msm6242_read(priv, reg) & ~val, reg);
+}
+
+static void msm6242_lock(struct msm6242_priv *priv)
+{
+	int cnt = 5;
+
+	msm6242_set(priv, MSM6242_CD_HOLD, MSM6242_CD);
+
+	while ((msm6242_read(priv, MSM6242_CD) & MSM6242_CD_BUSY) && cnt) {
+		msm6242_clear(priv, MSM6242_CD_HOLD, MSM6242_CD);
+		udelay(70);
+		msm6242_set(priv, MSM6242_CD_HOLD, MSM6242_CD);
+		cnt--;
+	}
+
+	if (!cnt)
+		pr_warning("msm6242: timed out waiting for RTC (0x%x)\n",
+			   msm6242_read(priv, MSM6242_CD));
+}
+
+static void msm6242_unlock(struct msm6242_priv *priv)
+{
+	msm6242_clear(priv, MSM6242_CD_HOLD, MSM6242_CD);
+}
+
+static int msm6242_read_time(struct device *dev, struct rtc_time *tm)
+{
+	struct msm6242_priv *priv = dev_get_drvdata(dev);
+
+	msm6242_lock(priv);
+
+	tm->tm_sec  = msm6242_read(priv, MSM6242_SECOND10) * 10 +
+		      msm6242_read(priv, MSM6242_SECOND1);
+	tm->tm_min  = msm6242_read(priv, MSM6242_MINUTE10) * 10 +
+		      msm6242_read(priv, MSM6242_MINUTE1);
+	tm->tm_hour = (msm6242_read(priv, MSM6242_HOUR10 & 3)) * 10 +
+		      msm6242_read(priv, MSM6242_HOUR1);
+	tm->tm_mday = msm6242_read(priv, MSM6242_DAY10) * 10 +
+		      msm6242_read(priv, MSM6242_DAY1);
+	tm->tm_wday = msm6242_read(priv, MSM6242_WEEK);
+	tm->tm_mon  = msm6242_read(priv, MSM6242_MONTH10) * 10 +
+		      msm6242_read(priv, MSM6242_MONTH1) - 1;
+	tm->tm_year = msm6242_read(priv, MSM6242_YEAR10) * 10 +
+		      msm6242_read(priv, MSM6242_YEAR1);
+	if (tm->tm_year <= 69)
+		tm->tm_year += 100;
+
+	if (!(msm6242_read(priv, MSM6242_CF) & MSM6242_CF_24H)) {
+		unsigned int pm = msm6242_read(priv, MSM6242_HOUR10) &
+				  MSM6242_HOUR10_PM;
+		if (!pm && tm->tm_hour == 12)
+			tm->tm_hour = 0;
+		else if (pm && tm->tm_hour != 12)
+			tm->tm_hour += 12;
+	}
+
+	msm6242_unlock(priv);
+
+	return rtc_valid_tm(tm);
+}
+
+static int msm6242_set_time(struct device *dev, struct rtc_time *tm)
+{
+	struct msm6242_priv *priv = dev_get_drvdata(dev);
+
+	msm6242_lock(priv);
+
+	msm6242_write(priv, tm->tm_sec / 10, MSM6242_SECOND10);
+	msm6242_write(priv, tm->tm_sec % 10, MSM6242_SECOND1);
+	msm6242_write(priv, tm->tm_min / 10, MSM6242_MINUTE10);
+	msm6242_write(priv, tm->tm_min % 10, MSM6242_MINUTE1);
+	if (msm6242_read(priv, MSM6242_CF) & MSM6242_CF_24H)
+		msm6242_write(priv, tm->tm_hour / 10, MSM6242_HOUR10);
+	else if (tm->tm_hour >= 12)
+		msm6242_write(priv, MSM6242_HOUR10_PM + (tm->tm_hour - 12) / 10,
+			      MSM6242_HOUR10);
+	else
+		msm6242_write(priv, tm->tm_hour / 10, MSM6242_HOUR10);
+	msm6242_write(priv, tm->tm_hour % 10, MSM6242_HOUR1);
+	msm6242_write(priv, tm->tm_mday / 10, MSM6242_DAY10);
+	msm6242_write(priv, tm->tm_mday % 10, MSM6242_DAY1);
+	if (tm->tm_wday != -1)
+		msm6242_write(priv, tm->tm_wday, MSM6242_WEEK);
+	msm6242_write(priv, (tm->tm_mon + 1) / 10, MSM6242_MONTH10);
+	msm6242_write(priv, (tm->tm_mon + 1) % 10, MSM6242_MONTH1);
+	if (tm->tm_year >= 100)
+		tm->tm_year -= 100;
+	msm6242_write(priv, tm->tm_year / 10, MSM6242_YEAR10);
+	msm6242_write(priv, tm->tm_year % 10, MSM6242_YEAR1);
+
+	msm6242_unlock(priv);
+	return 0;
+}
+
+static const struct rtc_class_ops msm6242_rtc_ops = {
+	.read_time	= msm6242_read_time,
+	.set_time	= msm6242_set_time,
+};
+
+static int __init msm6242_rtc_probe(struct platform_device *dev)
+{
+	struct resource *res;
+	struct msm6242_priv *priv;
+	struct rtc_device *rtc;
+	int error;
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->regs = ioremap(res->start, resource_size(res));
+	if (!priv->regs) {
+		error = -ENOMEM;
+		goto out_free_priv;
+	}
+
+	rtc = rtc_device_register("rtc-msm6242", &dev->dev, &msm6242_rtc_ops,
+				  THIS_MODULE);
+	if (IS_ERR(rtc)) {
+		error = PTR_ERR(rtc);
+		goto out_unmap;
+	}
+
+	priv->rtc = rtc;
+	platform_set_drvdata(dev, priv);
+	return 0;
+
+out_unmap:
+	iounmap(priv->regs);
+out_free_priv:
+	kfree(priv);
+	return error;
+}
+
+static int __exit msm6242_rtc_remove(struct platform_device *dev)
+{
+	struct msm6242_priv *priv = platform_get_drvdata(dev);
+
+	rtc_device_unregister(priv->rtc);
+	iounmap(priv->regs);
+	kfree(priv);
+	return 0;
+}
+
+static struct platform_driver msm6242_rtc_driver = {
+	.driver	= {
+		.name	= "rtc-msm6242",
+		.owner	= THIS_MODULE,
+	},
+	.remove	= __exit_p(msm6242_rtc_remove),
+};
+
+static int __init msm6242_rtc_init(void)
+{
+	return platform_driver_probe(&msm6242_rtc_driver, msm6242_rtc_probe);
+}
+
+static void __exit msm6242_rtc_fini(void)
+{
+	platform_driver_unregister(&msm6242_rtc_driver);
+}
+
+module_init(msm6242_rtc_init);
+module_exit(msm6242_rtc_fini);
+
+MODULE_AUTHOR("Geert Uytterhoeven <geert@linux-m68k.org>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Oki MSM6242 RTC driver");
+MODULE_ALIAS("platform:rtc-msm6242");
diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c
new file mode 100644
index 0000000..e1313fe
--- /dev/null
+++ b/drivers/rtc/rtc-rp5c01.c
@@ -0,0 +1,222 @@
+/*
+ *  Ricoh RP5C01 RTC Driver
+ *
+ *  Copyright 2009 Geert Uytterhoeven
+ *
+ *  Based on the A3000 TOD code in arch/m68k/amiga/config.c
+ *  Copyright (C) 1993 Hamish Macdonald
+ */
+
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+
+enum {
+	RP5C01_1_SECOND		= 0x0,	/* MODE 00 */
+	RP5C01_10_SECOND	= 0x1,	/* MODE 00 */
+	RP5C01_1_MINUTE		= 0x2,	/* MODE 00 and MODE 01 */
+	RP5C01_10_MINUTE	= 0x3,	/* MODE 00 and MODE 01 */
+	RP5C01_1_HOUR		= 0x4,	/* MODE 00 and MODE 01 */
+	RP5C01_10_HOUR		= 0x5,	/* MODE 00 and MODE 01 */
+	RP5C01_DAY_OF_WEEK	= 0x6,	/* MODE 00 and MODE 01 */
+	RP5C01_1_DAY		= 0x7,	/* MODE 00 and MODE 01 */
+	RP5C01_10_DAY		= 0x8,	/* MODE 00 and MODE 01 */
+	RP5C01_1_MONTH		= 0x9,	/* MODE 00 */
+	RP5C01_10_MONTH		= 0xa,	/* MODE 00 */
+	RP5C01_1_YEAR		= 0xb,	/* MODE 00 */
+	RP5C01_10_YEAR		= 0xc,	/* MODE 00 */
+
+	RP5C01_12_24_SELECT	= 0xa,	/* MODE 01 */
+	RP5C01_LEAP_YEAR	= 0xb,	/* MODE 01 */
+
+	RP5C01_MODE		= 0xd,	/* all modes */
+	RP5C01_TEST		= 0xe,	/* all modes */
+	RP5C01_RESET		= 0xf,	/* all modes */
+};
+
+#define RP5C01_12_24_SELECT_12	(0 << 0)
+#define RP5C01_12_24_SELECT_24	(1 << 0)
+
+#define RP5C01_10_HOUR_AM	(0 << 1)
+#define RP5C01_10_HOUR_PM	(1 << 1)
+
+#define RP5C01_MODE_TIMER_EN	(1 << 3)	/* timer enable */
+#define RP5C01_MODE_ALARM_EN	(1 << 2)	/* alarm enable */
+
+#define RP5C01_MODE_MODE_MASK	(3 << 0)
+#define RP5C01_MODE_MODE00	(0 << 0)	/* time */
+#define RP5C01_MODE_MODE01	(1 << 0)	/* alarm, 12h/24h, leap year */
+#define RP5C01_MODE_RAM_BLOCK10	(2 << 0)	/* RAM 4 bits x 13 */
+#define RP5C01_MODE_RAM_BLOCK11	(3 << 0)	/* RAM 4 bits x 13 */
+
+#define RP5C01_RESET_1HZ_PULSE	(1 << 3)
+#define RP5C01_RESET_16HZ_PULSE	(1 << 2)
+#define RP5C01_RESET_SECOND	(1 << 1)	/* reset divider stages for */
+						/* seconds or smaller units */
+#define RP5C01_RESET_ALARM	(1 << 0)	/* reset all alarm registers */
+
+
+struct rp5c01_priv {
+	u32 __iomem *regs;
+	struct rtc_device *rtc;
+};
+
+static inline unsigned int rp5c01_read(struct rp5c01_priv *priv,
+				       unsigned int reg)
+{
+	return __raw_readl(&priv->regs[reg]) & 0xf;
+}
+
+static inline void rp5c01_write(struct rp5c01_priv *priv, unsigned int val,
+				unsigned int reg)
+{
+	return __raw_writel(val, &priv->regs[reg]);
+}
+
+static void rp5c01_lock(struct rp5c01_priv *priv)
+{
+	rp5c01_write(priv, RP5C01_MODE_MODE00, RP5C01_MODE);
+}
+
+static void rp5c01_unlock(struct rp5c01_priv *priv)
+{
+	rp5c01_write(priv, RP5C01_MODE_TIMER_EN | RP5C01_MODE_MODE01,
+		     RP5C01_MODE);
+}
+
+static int rp5c01_read_time(struct device *dev, struct rtc_time *tm)
+{
+	struct rp5c01_priv *priv = dev_get_drvdata(dev);
+
+	rp5c01_lock(priv);
+
+	tm->tm_sec  = rp5c01_read(priv, RP5C01_10_SECOND) * 10 +
+		      rp5c01_read(priv, RP5C01_1_SECOND);
+	tm->tm_min  = rp5c01_read(priv, RP5C01_10_MINUTE) * 10 +
+		      rp5c01_read(priv, RP5C01_1_MINUTE);
+	tm->tm_hour = rp5c01_read(priv, RP5C01_10_HOUR) * 10 +
+		      rp5c01_read(priv, RP5C01_1_HOUR);
+	tm->tm_mday = rp5c01_read(priv, RP5C01_10_DAY) * 10 +
+		      rp5c01_read(priv, RP5C01_1_DAY);
+	tm->tm_wday = rp5c01_read(priv, RP5C01_DAY_OF_WEEK);
+	tm->tm_mon  = rp5c01_read(priv, RP5C01_10_MONTH) * 10 +
+		      rp5c01_read(priv, RP5C01_1_MONTH) - 1;
+	tm->tm_year = rp5c01_read(priv, RP5C01_10_YEAR) * 10 +
+		      rp5c01_read(priv, RP5C01_1_YEAR);
+	if (tm->tm_year <= 69)
+		tm->tm_year += 100;
+
+	rp5c01_unlock(priv);
+
+	return rtc_valid_tm(tm);
+}
+
+static int rp5c01_set_time(struct device *dev, struct rtc_time *tm)
+{
+	struct rp5c01_priv *priv = dev_get_drvdata(dev);
+
+	rp5c01_lock(priv);
+
+	rp5c01_write(priv, tm->tm_sec / 10, RP5C01_10_SECOND);
+	rp5c01_write(priv, tm->tm_sec % 10, RP5C01_1_SECOND);
+	rp5c01_write(priv, tm->tm_min / 10, RP5C01_10_MINUTE);
+	rp5c01_write(priv, tm->tm_min % 10, RP5C01_1_MINUTE);
+	rp5c01_write(priv, tm->tm_hour / 10, RP5C01_10_HOUR);
+	rp5c01_write(priv, tm->tm_hour % 10, RP5C01_1_HOUR);
+	rp5c01_write(priv, tm->tm_mday / 10, RP5C01_10_DAY);
+	rp5c01_write(priv, tm->tm_mday % 10, RP5C01_1_DAY);
+	if (tm->tm_wday != -1)
+		rp5c01_write(priv, tm->tm_wday, RP5C01_DAY_OF_WEEK);
+	rp5c01_write(priv, (tm->tm_mon + 1) / 10, RP5C01_10_MONTH);
+	rp5c01_write(priv, (tm->tm_mon + 1) % 10, RP5C01_1_MONTH);
+	if (tm->tm_year >= 100)
+		tm->tm_year -= 100;
+	rp5c01_write(priv, tm->tm_year / 10, RP5C01_10_YEAR);
+	rp5c01_write(priv, tm->tm_year % 10, RP5C01_1_YEAR);
+
+	rp5c01_unlock(priv);
+	return 0;
+}
+
+static const struct rtc_class_ops rp5c01_rtc_ops = {
+	.read_time	= rp5c01_read_time,
+	.set_time	= rp5c01_set_time,
+};
+
+static int __init rp5c01_rtc_probe(struct platform_device *dev)
+{
+	struct resource *res;
+	struct rp5c01_priv *priv;
+	struct rtc_device *rtc;
+	int error;
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->regs = ioremap(res->start, resource_size(res));
+	if (!priv->regs) {
+		error = -ENOMEM;
+		goto out_free_priv;
+	}
+
+	rtc = rtc_device_register("rtc-rp5c01", &dev->dev, &rp5c01_rtc_ops,
+				  THIS_MODULE);
+	if (IS_ERR(rtc)) {
+		error = PTR_ERR(rtc);
+		goto out_unmap;
+	}
+
+	priv->rtc = rtc;
+	platform_set_drvdata(dev, priv);
+	return 0;
+
+out_unmap:
+	iounmap(priv->regs);
+out_free_priv:
+	kfree(priv);
+	return error;
+}
+
+static int __exit rp5c01_rtc_remove(struct platform_device *dev)
+{
+	struct rp5c01_priv *priv = platform_get_drvdata(dev);
+
+	rtc_device_unregister(priv->rtc);
+	iounmap(priv->regs);
+	kfree(priv);
+	return 0;
+}
+
+static struct platform_driver rp5c01_rtc_driver = {
+	.driver	= {
+		.name	= "rtc-rp5c01",
+		.owner	= THIS_MODULE,
+	},
+	.remove	= __exit_p(rp5c01_rtc_remove),
+};
+
+static int __init rp5c01_rtc_init(void)
+{
+	return platform_driver_probe(&rp5c01_rtc_driver, rp5c01_rtc_probe);
+}
+
+static void __exit rp5c01_rtc_fini(void)
+{
+	platform_driver_unregister(&rp5c01_rtc_driver);
+}
+
+module_init(rp5c01_rtc_init);
+module_exit(rp5c01_rtc_fini);
+
+MODULE_AUTHOR("Geert Uytterhoeven <geert@linux-m68k.org>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Ricoh RP5C01 RTC driver");
+MODULE_ALIAS("platform:rtc-rp5c01");
diff --git a/drivers/s390/char/sclp_async.c b/drivers/s390/char/sclp_async.c
index b44462a..740fe40 100644
--- a/drivers/s390/char/sclp_async.c
+++ b/drivers/s390/char/sclp_async.c
@@ -101,18 +101,17 @@ static struct ctl_table callhome_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_handler_callhome,
 	},
-	{ .ctl_name = 0 }
+	{}
 };
 
 static struct ctl_table kern_dir_table[] = {
 	{
-		.ctl_name	= CTL_KERN,
 		.procname	= "kernel",
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= callhome_table,
 	},
-	{ .ctl_name = 0 }
+	{}
 };
 
 /*
diff --git a/drivers/scsi/scsi_sysctl.c b/drivers/scsi/scsi_sysctl.c
index 63a30f5..2b6b93f 100644
--- a/drivers/scsi/scsi_sysctl.c
+++ b/drivers/scsi/scsi_sysctl.c
@@ -13,26 +13,23 @@
 
 
 static ctl_table scsi_table[] = {
-	{ .ctl_name	= DEV_SCSI_LOGGING_LEVEL,
-	  .procname	= "logging_level",
+	{ .procname	= "logging_level",
 	  .data		= &scsi_logging_level,
 	  .maxlen	= sizeof(scsi_logging_level),
 	  .mode		= 0644,
-	  .proc_handler	= &proc_dointvec },
+	  .proc_handler	= proc_dointvec },
 	{ }
 };
 
 static ctl_table scsi_dir_table[] = {
-	{ .ctl_name	= DEV_SCSI,
-	  .procname	= "scsi",
+	{ .procname	= "scsi",
 	  .mode		= 0555,
 	  .child	= scsi_table },
 	{ }
 };
 
 static ctl_table scsi_root_table[] = {
-	{ .ctl_name	= CTL_DEV,
-	  .procname	= "dev",
+	{ .procname	= "dev",
 	  .mode		= 0555,
 	  .child	= scsi_dir_table },
 	{ }
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index e522572..50943ff 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -1477,4 +1477,17 @@ config SERIAL_BCM63XX_CONSOLE
 	  If you have enabled the serial port on the bcm63xx CPU
 	  you can make it the console by answering Y to this option.
 
+config SERIAL_GRLIB_GAISLER_APBUART
+	tristate "GRLIB APBUART serial support"
+	depends on OF
+	---help---
+	Add support for the GRLIB APBUART serial port.
+
+config SERIAL_GRLIB_GAISLER_APBUART_CONSOLE
+	bool "Console on GRLIB APBUART serial port"
+	depends on SERIAL_GRLIB_GAISLER_APBUART=y
+	select SERIAL_CORE_CONSOLE
+	help
+	Support for running a console on the GRLIB APBUART
+
 endmenu
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index d21d5dd..5548fe7 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -81,3 +81,4 @@ obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o
 obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o
 obj-$(CONFIG_SERIAL_QE) += ucc_uart.o
 obj-$(CONFIG_SERIAL_TIMBERDALE)	+= timbuart.o
+obj-$(CONFIG_SERIAL_GRLIB_GAISLER_APBUART) += apbuart.o
diff --git a/drivers/serial/apbuart.c b/drivers/serial/apbuart.c
new file mode 100644
index 0000000..fe91319
--- /dev/null
+++ b/drivers/serial/apbuart.c
@@ -0,0 +1,710 @@
+/*
+ *  Driver for GRLIB serial ports (APBUART)
+ *
+ *  Based on linux/drivers/serial/amba.c
+ *
+ *  Copyright (C) 2000 Deep Blue Solutions Ltd.
+ *  Copyright (C) 2003 Konrad Eisele <eiselekd@web.de>
+ *  Copyright (C) 2006 Daniel Hellstrom <daniel@gaisler.com>, Aeroflex Gaisler AB
+ *  Copyright (C) 2008 Gilead Kutnick <kutnickg@zin-tech.com>
+ *  Copyright (C) 2009 Kristoffer Glembo <kristoffer@gaisler.com>, Aeroflex Gaisler AB
+ */
+
+#if defined(CONFIG_SERIAL_GRLIB_GAISLER_APBUART_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/module.h>
+#include <linux/tty.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/serial.h>
+#include <linux/console.h>
+#include <linux/sysrq.h>
+#include <linux/kthread.h>
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+#include <linux/serial_core.h>
+#include <asm/irq.h>
+
+#include "apbuart.h"
+
+#define SERIAL_APBUART_MAJOR	TTY_MAJOR
+#define SERIAL_APBUART_MINOR	64
+#define UART_DUMMY_RSR_RX	0x8000	/* for ignore all read */
+
+static void apbuart_tx_chars(struct uart_port *port);
+
+static void apbuart_stop_tx(struct uart_port *port)
+{
+	unsigned int cr;
+
+	cr = UART_GET_CTRL(port);
+	cr &= ~UART_CTRL_TI;
+	UART_PUT_CTRL(port, cr);
+}
+
+static void apbuart_start_tx(struct uart_port *port)
+{
+	unsigned int cr;
+
+	cr = UART_GET_CTRL(port);
+	cr |= UART_CTRL_TI;
+	UART_PUT_CTRL(port, cr);
+
+	if (UART_GET_STATUS(port) & UART_STATUS_THE)
+		apbuart_tx_chars(port);
+}
+
+static void apbuart_stop_rx(struct uart_port *port)
+{
+	unsigned int cr;
+
+	cr = UART_GET_CTRL(port);
+	cr &= ~(UART_CTRL_RI);
+	UART_PUT_CTRL(port, cr);
+}
+
+static void apbuart_enable_ms(struct uart_port *port)
+{
+	/* No modem status change interrupts for APBUART */
+}
+
+static void apbuart_rx_chars(struct uart_port *port)
+{
+	struct tty_struct *tty = port->state->port.tty;
+	unsigned int status, ch, rsr, flag;
+	unsigned int max_chars = port->fifosize;
+
+	status = UART_GET_STATUS(port);
+
+	while (UART_RX_DATA(status) && (max_chars--)) {
+
+		ch = UART_GET_CHAR(port);
+		flag = TTY_NORMAL;
+
+		port->icount.rx++;
+
+		rsr = UART_GET_STATUS(port) | UART_DUMMY_RSR_RX;
+		UART_PUT_STATUS(port, 0);
+		if (rsr & UART_STATUS_ERR) {
+
+			if (rsr & UART_STATUS_BR) {
+				rsr &= ~(UART_STATUS_FE | UART_STATUS_PE);
+				port->icount.brk++;
+				if (uart_handle_break(port))
+					goto ignore_char;
+			} else if (rsr & UART_STATUS_PE) {
+				port->icount.parity++;
+			} else if (rsr & UART_STATUS_FE) {
+				port->icount.frame++;
+			}
+			if (rsr & UART_STATUS_OE)
+				port->icount.overrun++;
+
+			rsr &= port->read_status_mask;
+
+			if (rsr & UART_STATUS_PE)
+				flag = TTY_PARITY;
+			else if (rsr & UART_STATUS_FE)
+				flag = TTY_FRAME;
+		}
+
+		if (uart_handle_sysrq_char(port, ch))
+			goto ignore_char;
+
+		uart_insert_char(port, rsr, UART_STATUS_OE, ch, flag);
+
+
+	      ignore_char:
+		status = UART_GET_STATUS(port);
+	}
+
+	tty_flip_buffer_push(tty);
+}
+
+static void apbuart_tx_chars(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->state->xmit;
+	int count;
+
+	if (port->x_char) {
+		UART_PUT_CHAR(port, port->x_char);
+		port->icount.tx++;
+		port->x_char = 0;
+		return;
+	}
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+		apbuart_stop_tx(port);
+		return;
+	}
+
+	/* amba: fill FIFO */
+	count = port->fifosize >> 1;
+	do {
+		UART_PUT_CHAR(port, xmit->buf[xmit->tail]);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+		if (uart_circ_empty(xmit))
+			break;
+	} while (--count > 0);
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	if (uart_circ_empty(xmit))
+		apbuart_stop_tx(port);
+}
+
+static irqreturn_t apbuart_int(int irq, void *dev_id)
+{
+	struct uart_port *port = dev_id;
+	unsigned int status;
+
+	spin_lock(&port->lock);
+
+	status = UART_GET_STATUS(port);
+	if (status & UART_STATUS_DR)
+		apbuart_rx_chars(port);
+	if (status & UART_STATUS_THE)
+		apbuart_tx_chars(port);
+
+	spin_unlock(&port->lock);
+
+	return IRQ_HANDLED;
+}
+
+static unsigned int apbuart_tx_empty(struct uart_port *port)
+{
+	unsigned int status = UART_GET_STATUS(port);
+	return status & UART_STATUS_THE ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int apbuart_get_mctrl(struct uart_port *port)
+{
+	/* The GRLIB APBUART handles flow control in hardware */
+	return TIOCM_CAR | TIOCM_DSR | TIOCM_CTS;
+}
+
+static void apbuart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	/* The GRLIB APBUART handles flow control in hardware */
+}
+
+static void apbuart_break_ctl(struct uart_port *port, int break_state)
+{
+	/* We don't support sending break */
+}
+
+static int apbuart_startup(struct uart_port *port)
+{
+	int retval;
+	unsigned int cr;
+
+	/* Allocate the IRQ */
+	retval = request_irq(port->irq, apbuart_int, 0, "apbuart", port);
+	if (retval)
+		return retval;
+
+	/* Finally, enable interrupts */
+	cr = UART_GET_CTRL(port);
+	UART_PUT_CTRL(port,
+		      cr | UART_CTRL_RE | UART_CTRL_TE |
+		      UART_CTRL_RI | UART_CTRL_TI);
+
+	return 0;
+}
+
+static void apbuart_shutdown(struct uart_port *port)
+{
+	unsigned int cr;
+
+	/* disable all interrupts, disable the port */
+	cr = UART_GET_CTRL(port);
+	UART_PUT_CTRL(port,
+		      cr & ~(UART_CTRL_RE | UART_CTRL_TE |
+			     UART_CTRL_RI | UART_CTRL_TI));
+
+	/* Free the interrupt */
+	free_irq(port->irq, port);
+}
+
+static void apbuart_set_termios(struct uart_port *port,
+				struct ktermios *termios, struct ktermios *old)
+{
+	unsigned int cr;
+	unsigned long flags;
+	unsigned int baud, quot;
+
+	/* Ask the core to calculate the divisor for us. */
+	baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16);
+	if (baud == 0)
+		panic("invalid baudrate %i\n", port->uartclk / 16);
+
+	/* uart_get_divisor calc a *16 uart freq, apbuart is *8 */
+	quot = (uart_get_divisor(port, baud)) * 2;
+	cr = UART_GET_CTRL(port);
+	cr &= ~(UART_CTRL_PE | UART_CTRL_PS);
+
+	if (termios->c_cflag & PARENB) {
+		cr |= UART_CTRL_PE;
+		if ((termios->c_cflag & PARODD))
+			cr |= UART_CTRL_PS;
+	}
+
+	/* Enable flow control. */
+	if (termios->c_cflag & CRTSCTS)
+		cr |= UART_CTRL_FL;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	/* Update the per-port timeout. */
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	port->read_status_mask = UART_STATUS_OE;
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |= UART_STATUS_FE | UART_STATUS_PE;
+
+	/* Characters to ignore */
+	port->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		port->ignore_status_mask |= UART_STATUS_FE | UART_STATUS_PE;
+
+	/* Ignore all characters if CREAD is not set. */
+	if ((termios->c_cflag & CREAD) == 0)
+		port->ignore_status_mask |= UART_DUMMY_RSR_RX;
+
+	/* Set baud rate */
+	quot -= 1;
+	UART_PUT_SCAL(port, quot);
+	UART_PUT_CTRL(port, cr);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static const char *apbuart_type(struct uart_port *port)
+{
+	return port->type == PORT_APBUART ? "GRLIB/APBUART" : NULL;
+}
+
+static void apbuart_release_port(struct uart_port *port)
+{
+	release_mem_region(port->mapbase, 0x100);
+}
+
+static int apbuart_request_port(struct uart_port *port)
+{
+	return request_mem_region(port->mapbase, 0x100, "grlib-apbuart")
+	    != NULL ? 0 : -EBUSY;
+	return 0;
+}
+
+/* Configure/autoconfigure the port */
+static void apbuart_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE) {
+		port->type = PORT_APBUART;
+		apbuart_request_port(port);
+	}
+}
+
+/* Verify the new serial_struct (for TIOCSSERIAL) */
+static int apbuart_verify_port(struct uart_port *port,
+			       struct serial_struct *ser)
+{
+	int ret = 0;
+	if (ser->type != PORT_UNKNOWN && ser->type != PORT_APBUART)
+		ret = -EINVAL;
+	if (ser->irq < 0 || ser->irq >= NR_IRQS)
+		ret = -EINVAL;
+	if (ser->baud_base < 9600)
+		ret = -EINVAL;
+	return ret;
+}
+
+static struct uart_ops grlib_apbuart_ops = {
+	.tx_empty = apbuart_tx_empty,
+	.set_mctrl = apbuart_set_mctrl,
+	.get_mctrl = apbuart_get_mctrl,
+	.stop_tx = apbuart_stop_tx,
+	.start_tx = apbuart_start_tx,
+	.stop_rx = apbuart_stop_rx,
+	.enable_ms = apbuart_enable_ms,
+	.break_ctl = apbuart_break_ctl,
+	.startup = apbuart_startup,
+	.shutdown = apbuart_shutdown,
+	.set_termios = apbuart_set_termios,
+	.type = apbuart_type,
+	.release_port = apbuart_release_port,
+	.request_port = apbuart_request_port,
+	.config_port = apbuart_config_port,
+	.verify_port = apbuart_verify_port,
+};
+
+static struct uart_port grlib_apbuart_ports[UART_NR];
+static struct device_node *grlib_apbuart_nodes[UART_NR];
+
+static int apbuart_scan_fifo_size(struct uart_port *port, int portnumber)
+{
+	int ctrl, loop = 0;
+	int status;
+	int fifosize;
+	unsigned long flags;
+
+	ctrl = UART_GET_CTRL(port);
+
+	/*
+	 * Enable the transceiver and wait for it to be ready to send data.
+	 * Clear interrupts so that this process will not be externally
+	 * interrupted in the middle (which can cause the transceiver to
+	 * drain prematurely).
+	 */
+
+	local_irq_save(flags);
+
+	UART_PUT_CTRL(port, ctrl | UART_CTRL_TE);
+
+	while (!UART_TX_READY(UART_GET_STATUS(port)))
+		loop++;
+
+	/*
+	 * Disable the transceiver so data isn't actually sent during the
+	 * actual test.
+	 */
+
+	UART_PUT_CTRL(port, ctrl & ~(UART_CTRL_TE));
+
+	fifosize = 1;
+	UART_PUT_CHAR(port, 0);
+
+	/*
+	 * So long as transmitting a character increments the tranceivier FIFO
+	 * length the FIFO must be at least that big. These bytes will
+	 * automatically drain off of the FIFO.
+	 */
+
+	status = UART_GET_STATUS(port);
+	while (((status >> 20) & 0x3F) == fifosize) {
+		fifosize++;
+		UART_PUT_CHAR(port, 0);
+		status = UART_GET_STATUS(port);
+	}
+
+	fifosize--;
+
+	UART_PUT_CTRL(port, ctrl);
+	local_irq_restore(flags);
+
+	if (fifosize == 0)
+		fifosize = 1;
+
+	return fifosize;
+}
+
+static void apbuart_flush_fifo(struct uart_port *port)
+{
+	int i;
+
+	for (i = 0; i < port->fifosize; i++)
+		UART_GET_CHAR(port);
+}
+
+
+/* ======================================================================== */
+/* Console driver, if enabled                                               */
+/* ======================================================================== */
+
+#ifdef CONFIG_SERIAL_GRLIB_GAISLER_APBUART_CONSOLE
+
+static void apbuart_console_putchar(struct uart_port *port, int ch)
+{
+	unsigned int status;
+	do {
+		status = UART_GET_STATUS(port);
+	} while (!UART_TX_READY(status));
+	UART_PUT_CHAR(port, ch);
+}
+
+static void
+apbuart_console_write(struct console *co, const char *s, unsigned int count)
+{
+	struct uart_port *port = &grlib_apbuart_ports[co->index];
+	unsigned int status, old_cr, new_cr;
+
+	/* First save the CR then disable the interrupts */
+	old_cr = UART_GET_CTRL(port);
+	new_cr = old_cr & ~(UART_CTRL_RI | UART_CTRL_TI);
+	UART_PUT_CTRL(port, new_cr);
+
+	uart_console_write(port, s, count, apbuart_console_putchar);
+
+	/*
+	 *      Finally, wait for transmitter to become empty
+	 *      and restore the TCR
+	 */
+	do {
+		status = UART_GET_STATUS(port);
+	} while (!UART_TX_READY(status));
+	UART_PUT_CTRL(port, old_cr);
+}
+
+static void __init
+apbuart_console_get_options(struct uart_port *port, int *baud,
+			    int *parity, int *bits)
+{
+	if (UART_GET_CTRL(port) & (UART_CTRL_RE | UART_CTRL_TE)) {
+
+		unsigned int quot, status;
+		status = UART_GET_STATUS(port);
+
+		*parity = 'n';
+		if (status & UART_CTRL_PE) {
+			if ((status & UART_CTRL_PS) == 0)
+				*parity = 'e';
+			else
+				*parity = 'o';
+		}
+
+		*bits = 8;
+		quot = UART_GET_SCAL(port) / 8;
+		*baud = port->uartclk / (16 * (quot + 1));
+	}
+}
+
+static int __init apbuart_console_setup(struct console *co, char *options)
+{
+	struct uart_port *port;
+	int baud = 38400;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	pr_debug("apbuart_console_setup co=%p, co->index=%i, options=%s\n",
+		 co, co->index, options);
+
+	/*
+	 * Check whether an invalid uart number has been specified, and
+	 * if so, search for the first available port that does have
+	 * console support.
+	 */
+	if (co->index >= grlib_apbuart_port_nr)
+		co->index = 0;
+
+	port = &grlib_apbuart_ports[co->index];
+
+	spin_lock_init(&port->lock);
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+	else
+		apbuart_console_get_options(port, &baud, &parity, &bits);
+
+	return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver grlib_apbuart_driver;
+
+static struct console grlib_apbuart_console = {
+	.name = "ttyS",
+	.write = apbuart_console_write,
+	.device = uart_console_device,
+	.setup = apbuart_console_setup,
+	.flags = CON_PRINTBUFFER,
+	.index = -1,
+	.data = &grlib_apbuart_driver,
+};
+
+
+static void grlib_apbuart_configure(void);
+
+static int __init apbuart_console_init(void)
+{
+	grlib_apbuart_configure();
+	register_console(&grlib_apbuart_console);
+	return 0;
+}
+
+console_initcall(apbuart_console_init);
+
+#define APBUART_CONSOLE	(&grlib_apbuart_console)
+#else
+#define APBUART_CONSOLE	NULL
+#endif
+
+static struct uart_driver grlib_apbuart_driver = {
+	.owner = THIS_MODULE,
+	.driver_name = "serial",
+	.dev_name = "ttyS",
+	.major = SERIAL_APBUART_MAJOR,
+	.minor = SERIAL_APBUART_MINOR,
+	.nr = UART_NR,
+	.cons = APBUART_CONSOLE,
+};
+
+
+/* ======================================================================== */
+/* OF Platform Driver                                                       */
+/* ======================================================================== */
+
+static int __devinit apbuart_probe(struct of_device *op,
+				   const struct of_device_id *match)
+{
+	int i = -1;
+	struct uart_port *port = NULL;
+
+	i = 0;
+	for (i = 0; i < grlib_apbuart_port_nr; i++) {
+		if (op->node == grlib_apbuart_nodes[i])
+			break;
+	}
+
+	port = &grlib_apbuart_ports[i];
+	port->dev = &op->dev;
+
+	uart_add_one_port(&grlib_apbuart_driver, (struct uart_port *) port);
+
+	apbuart_flush_fifo((struct uart_port *) port);
+
+	printk(KERN_INFO "grlib-apbuart at 0x%llx, irq %d\n",
+	       (unsigned long long) port->mapbase, port->irq);
+	return 0;
+
+}
+
+static struct of_device_id __initdata apbuart_match[] = {
+	{
+	 .name = "GAISLER_APBUART",
+	 },
+	{},
+};
+
+static struct of_platform_driver grlib_apbuart_of_driver = {
+	.match_table = apbuart_match,
+	.probe = apbuart_probe,
+	.driver = {
+		   .owner = THIS_MODULE,
+		   .name = "grlib-apbuart",
+		   },
+};
+
+
+static void grlib_apbuart_configure(void)
+{
+	static int enum_done;
+	struct device_node *np, *rp;
+	struct uart_port *port = NULL;
+	const u32 *prop;
+	int freq_khz;
+	int v = 0, d = 0;
+	unsigned int addr;
+	int irq, line;
+	struct amba_prom_registers *regs;
+
+	if (enum_done)
+		return;
+
+	/* Get bus frequency */
+	rp = of_find_node_by_path("/");
+	rp = of_get_next_child(rp, NULL);
+	prop = of_get_property(rp, "clock-frequency", NULL);
+	freq_khz = *prop;
+
+	line = 0;
+	for_each_matching_node(np, apbuart_match) {
+
+		int *vendor = (int *) of_get_property(np, "vendor", NULL);
+		int *device = (int *) of_get_property(np, "device", NULL);
+		int *irqs = (int *) of_get_property(np, "interrupts", NULL);
+		regs = (struct amba_prom_registers *)
+		    of_get_property(np, "reg", NULL);
+
+		if (vendor)
+			v = *vendor;
+		if (device)
+			d = *device;
+
+		if (!irqs || !regs)
+			return;
+
+		grlib_apbuart_nodes[line] = np;
+
+		addr = regs->phys_addr;
+		irq = *irqs;
+
+		port = &grlib_apbuart_ports[line];
+
+		port->mapbase = addr;
+		port->membase = ioremap(addr, sizeof(struct grlib_apbuart_regs_map));
+		port->irq = irq;
+		port->iotype = UPIO_MEM;
+		port->ops = &grlib_apbuart_ops;
+		port->flags = UPF_BOOT_AUTOCONF;
+		port->line = line;
+		port->uartclk = freq_khz * 1000;
+		port->fifosize = apbuart_scan_fifo_size((struct uart_port *) port, line);
+		line++;
+
+		/* We support maximum UART_NR uarts ... */
+		if (line == UART_NR)
+			break;
+
+	}
+
+	enum_done = 1;
+
+	grlib_apbuart_driver.nr = grlib_apbuart_port_nr = line;
+}
+
+static int __init grlib_apbuart_init(void)
+{
+	int ret;
+
+	/* Find all APBUARTS in device the tree and initialize their ports */
+	grlib_apbuart_configure();
+
+	printk(KERN_INFO "Serial: GRLIB APBUART driver\n");
+
+	ret = uart_register_driver(&grlib_apbuart_driver);
+
+	if (ret) {
+		printk(KERN_ERR "%s: uart_register_driver failed (%i)\n",
+		       __FILE__, ret);
+		return ret;
+	}
+
+	ret = of_register_platform_driver(&grlib_apbuart_of_driver);
+	if (ret) {
+		printk(KERN_ERR
+		       "%s: of_register_platform_driver failed (%i)\n",
+		       __FILE__, ret);
+		uart_unregister_driver(&grlib_apbuart_driver);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit grlib_apbuart_exit(void)
+{
+	int i;
+
+	for (i = 0; i < grlib_apbuart_port_nr; i++)
+		uart_remove_one_port(&grlib_apbuart_driver,
+				     &grlib_apbuart_ports[i]);
+
+	uart_unregister_driver(&grlib_apbuart_driver);
+	of_unregister_platform_driver(&grlib_apbuart_of_driver);
+}
+
+module_init(grlib_apbuart_init);
+module_exit(grlib_apbuart_exit);
+
+MODULE_AUTHOR("Aeroflex Gaisler AB");
+MODULE_DESCRIPTION("GRLIB APBUART serial driver");
+MODULE_VERSION("2.1");
+MODULE_LICENSE("GPL");
diff --git a/drivers/serial/apbuart.h b/drivers/serial/apbuart.h
new file mode 100644
index 0000000..5faf87c
--- /dev/null
+++ b/drivers/serial/apbuart.h
@@ -0,0 +1,64 @@
+#ifndef __GRLIB_APBUART_H__
+#define __GRLIB_APBUART_H__
+
+#include <asm/io.h>
+
+#define UART_NR		8
+static int grlib_apbuart_port_nr;
+
+struct grlib_apbuart_regs_map {
+	u32 data;
+	u32 status;
+	u32 ctrl;
+	u32 scaler;
+};
+
+struct amba_prom_registers {
+	unsigned int phys_addr;
+	unsigned int reg_size;
+};
+
+/*
+ *  The following defines the bits in the APBUART Status Registers.
+ */
+#define UART_STATUS_DR   0x00000001	/* Data Ready */
+#define UART_STATUS_TSE  0x00000002	/* TX Send Register Empty */
+#define UART_STATUS_THE  0x00000004	/* TX Hold Register Empty */
+#define UART_STATUS_BR   0x00000008	/* Break Error */
+#define UART_STATUS_OE   0x00000010	/* RX Overrun Error */
+#define UART_STATUS_PE   0x00000020	/* RX Parity Error */
+#define UART_STATUS_FE   0x00000040	/* RX Framing Error */
+#define UART_STATUS_ERR  0x00000078	/* Error Mask */
+
+/*
+ *  The following defines the bits in the APBUART Ctrl Registers.
+ */
+#define UART_CTRL_RE     0x00000001	/* Receiver enable */
+#define UART_CTRL_TE     0x00000002	/* Transmitter enable */
+#define UART_CTRL_RI     0x00000004	/* Receiver interrupt enable */
+#define UART_CTRL_TI     0x00000008	/* Transmitter irq */
+#define UART_CTRL_PS     0x00000010	/* Parity select */
+#define UART_CTRL_PE     0x00000020	/* Parity enable */
+#define UART_CTRL_FL     0x00000040	/* Flow control enable */
+#define UART_CTRL_LB     0x00000080	/* Loopback enable */
+
+#define APBBASE(port) ((struct grlib_apbuart_regs_map *)((port)->membase))
+
+#define APBBASE_DATA_P(port)	(&(APBBASE(port)->data))
+#define APBBASE_STATUS_P(port)	(&(APBBASE(port)->status))
+#define APBBASE_CTRL_P(port)	(&(APBBASE(port)->ctrl))
+#define APBBASE_SCALAR_P(port)	(&(APBBASE(port)->scaler))
+
+#define UART_GET_CHAR(port)	(__raw_readl(APBBASE_DATA_P(port)))
+#define UART_PUT_CHAR(port, v)	(__raw_writel(v, APBBASE_DATA_P(port)))
+#define UART_GET_STATUS(port)	(__raw_readl(APBBASE_STATUS_P(port)))
+#define UART_PUT_STATUS(port, v)(__raw_writel(v, APBBASE_STATUS_P(port)))
+#define UART_GET_CTRL(port)	(__raw_readl(APBBASE_CTRL_P(port)))
+#define UART_PUT_CTRL(port, v)	(__raw_writel(v, APBBASE_CTRL_P(port)))
+#define UART_GET_SCAL(port)	(__raw_readl(APBBASE_SCALAR_P(port)))
+#define UART_PUT_SCAL(port, v)	(__raw_writel(v, APBBASE_SCALAR_P(port)))
+
+#define UART_RX_DATA(s)		(((s) & UART_STATUS_DR) != 0)
+#define UART_TX_READY(s)	(((s) & UART_STATUS_THE) != 0)
+
+#endif /* __GRLIB_APBUART_H__ */
diff --git a/drivers/serial/s3c2410.c b/drivers/serial/s3c2410.c
index c99f082..73f089d 100644
--- a/drivers/serial/s3c2410.c
+++ b/drivers/serial/s3c2410.c
@@ -2,7 +2,7 @@
  *
  * Driver for Samsung S3C2410 SoC onboard UARTs.
  *
- * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics
+ * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics
  *	http://armlinux.simtec.co.uk/
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/drivers/serial/s3c2412.c b/drivers/serial/s3c2412.c
index 6e057d8..ce75e28 100644
--- a/drivers/serial/s3c2412.c
+++ b/drivers/serial/s3c2412.c
@@ -2,7 +2,7 @@
  *
  * Driver for Samsung S3C2412 and S3C2413 SoC onboard UARTs.
  *
- * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics
+ * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics
  *	http://armlinux.simtec.co.uk/
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/drivers/serial/s3c2440.c b/drivers/serial/s3c2440.c
index 69ff5d3..094cc39 100644
--- a/drivers/serial/s3c2440.c
+++ b/drivers/serial/s3c2440.c
@@ -2,7 +2,7 @@
  *
  * Driver for Samsung S3C2440 and S3C2442 SoC onboard UARTs.
  *
- * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics
+ * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics
  *	http://armlinux.simtec.co.uk/
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/drivers/serial/s3c24a0.c b/drivers/serial/s3c24a0.c
index 26c49e1..fad6083 100644
--- a/drivers/serial/s3c24a0.c
+++ b/drivers/serial/s3c24a0.c
@@ -6,7 +6,7 @@
  *
  * Author: Sandeep Patil <sandeep.patil@azingo.com>
  *
- * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics
+ * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics
  *	http://armlinux.simtec.co.uk/
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/drivers/serial/samsung.c b/drivers/serial/samsung.c
index 1523e8d..52e3df1 100644
--- a/drivers/serial/samsung.c
+++ b/drivers/serial/samsung.c
@@ -2,7 +2,7 @@
  *
  * Driver core for Samsung SoC onboard UARTs.
  *
- * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics
+ * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics
  *	http://armlinux.simtec.co.uk/
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/drivers/serial/samsung.h b/drivers/serial/samsung.h
index d3fe315..1fb2234 100644
--- a/drivers/serial/samsung.h
+++ b/drivers/serial/samsung.h
@@ -2,7 +2,7 @@
  *
  * Driver for Samsung SoC onboard UARTs.
  *
- * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics
+ * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics
  *	http://armlinux.simtec.co.uk/
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/drivers/spi/omap2_mcspi.c b/drivers/spi/omap2_mcspi.c
index ba1a872..bf5f95a 100644
--- a/drivers/spi/omap2_mcspi.c
+++ b/drivers/spi/omap2_mcspi.c
@@ -35,8 +35,8 @@
 
 #include <linux/spi/spi.h>
 
-#include <mach/dma.h>
-#include <mach/clock.h>
+#include <plat/dma.h>
+#include <plat/clock.h>
 
 
 #define OMAP2_MCSPI_MAX_FREQ		48000000
diff --git a/drivers/spi/omap_uwire.c b/drivers/spi/omap_uwire.c
index e75ba9b..6c3a855 100644
--- a/drivers/spi/omap_uwire.c
+++ b/drivers/spi/omap_uwire.c
@@ -51,8 +51,8 @@
 #include <asm/io.h>
 #include <asm/mach-types.h>
 
-#include <mach/mux.h>
-#include <mach/omap730.h>	/* OMAP730_IO_CONF registers */
+#include <plat/mux.h>
+#include <plat/omap7xx.h>	/* OMAP7XX_IO_CONF registers */
 
 
 /* FIXME address is now a platform device resource,
@@ -504,7 +504,7 @@ static int __init uwire_probe(struct platform_device *pdev)
 	}
 	clk_enable(uwire->ck);
 
-	if (cpu_is_omap730())
+	if (cpu_is_omap7xx())
 		uwire_idx_shift = 1;
 	else
 		uwire_idx_shift = 2;
@@ -573,8 +573,8 @@ static int __init omap_uwire_init(void)
 	}
 	if (machine_is_omap_perseus2()) {
 		/* configure pins: MPU_UW_nSCS1, MPU_UW_SDO, MPU_UW_SCLK */
-		int val = omap_readl(OMAP730_IO_CONF_9) & ~0x00EEE000;
-		omap_writel(val | 0x00AAA000, OMAP730_IO_CONF_9);
+		int val = omap_readl(OMAP7XX_IO_CONF_9) & ~0x00EEE000;
+		omap_writel(val | 0x00AAA000, OMAP7XX_IO_CONF_9);
 	}
 
 	return platform_driver_probe(&uwire_driver, uwire_probe);
diff --git a/drivers/staging/arlan/arlan-proc.c b/drivers/staging/arlan/arlan-proc.c
index a8b6896..b22983e 100644
--- a/drivers/staging/arlan/arlan-proc.c
+++ b/drivers/staging/arlan/arlan-proc.c
@@ -816,84 +816,83 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write,
 
 
 /* Place files in /proc/sys/dev/arlan */
-#define CTBLN(num,card,nam) \
-        { .ctl_name = num,\
-          .procname = #nam,\
+#define CTBLN(card,nam) \
+        { .procname = #nam,\
           .data = &(arlan_conf[card].nam),\
-          .maxlen = sizeof(int), .mode = 0600, .proc_handler = &proc_dointvec}
+          .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec}
 #ifdef ARLAN_DEBUGGING
 
 #define ARLAN_PROC_DEBUG_ENTRIES \
-        { .ctl_name = 48, .procname = "entry_exit_debug",\
+        { .procname = "entry_exit_debug",\
           .data = &arlan_entry_and_exit_debug,\
-          .maxlen = sizeof(int), .mode = 0600, .proc_handler = &proc_dointvec},\
-	{ .ctl_name = 49, .procname = "debug", .data = &arlan_debug,\
-          .maxlen = sizeof(int), .mode = 0600, .proc_handler = &proc_dointvec},
+          .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec},\
+	{ .procname = "debug", .data = &arlan_debug,\
+          .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec},
 #else 
 #define ARLAN_PROC_DEBUG_ENTRIES
 #endif
 
 #define ARLAN_SYSCTL_TABLE_TOTAL(cardNo)\
-	CTBLN(1,cardNo,spreadingCode),\
-	CTBLN(2,cardNo, channelNumber),\
-	CTBLN(3,cardNo, scramblingDisable),\
-	CTBLN(4,cardNo, txAttenuation),\
-	CTBLN(5,cardNo, systemId), \
-	CTBLN(6,cardNo, maxDatagramSize),\
-	CTBLN(7,cardNo, maxFrameSize),\
-	CTBLN(8,cardNo, maxRetries),\
-	CTBLN(9,cardNo, receiveMode),\
-	CTBLN(10,cardNo, priority),\
-	CTBLN(11,cardNo, rootOrRepeater),\
-	CTBLN(12,cardNo, SID),\
-	CTBLN(13,cardNo, registrationMode),\
-	CTBLN(14,cardNo, registrationFill),\
-	CTBLN(15,cardNo, localTalkAddress),\
-	CTBLN(16,cardNo, codeFormat),\
-	CTBLN(17,cardNo, numChannels),\
-	CTBLN(18,cardNo, channel1),\
-	CTBLN(19,cardNo, channel2),\
-	CTBLN(20,cardNo, channel3),\
-	CTBLN(21,cardNo, channel4),\
-	CTBLN(22,cardNo, txClear),\
-	CTBLN(23,cardNo, txRetries),\
-	CTBLN(24,cardNo, txRouting),\
-	CTBLN(25,cardNo, txScrambled),\
-	CTBLN(26,cardNo, rxParameter),\
-	CTBLN(27,cardNo, txTimeoutMs),\
-	CTBLN(28,cardNo, waitCardTimeout),\
-	CTBLN(29,cardNo, channelSet), \
-	{.ctl_name = 30, .procname = "name",\
+	CTBLN(cardNo,spreadingCode),\
+	CTBLN(cardNo, channelNumber),\
+	CTBLN(cardNo, scramblingDisable),\
+	CTBLN(cardNo, txAttenuation),\
+	CTBLN(cardNo, systemId), \
+	CTBLN(cardNo, maxDatagramSize),\
+	CTBLN(cardNo, maxFrameSize),\
+	CTBLN(cardNo, maxRetries),\
+	CTBLN(cardNo, receiveMode),\
+	CTBLN(cardNo, priority),\
+	CTBLN(cardNo, rootOrRepeater),\
+	CTBLN(cardNo, SID),\
+	CTBLN(cardNo, registrationMode),\
+	CTBLN(cardNo, registrationFill),\
+	CTBLN(cardNo, localTalkAddress),\
+	CTBLN(cardNo, codeFormat),\
+	CTBLN(cardNo, numChannels),\
+	CTBLN(cardNo, channel1),\
+	CTBLN(cardNo, channel2),\
+	CTBLN(cardNo, channel3),\
+	CTBLN(cardNo, channel4),\
+	CTBLN(cardNo, txClear),\
+	CTBLN(cardNo, txRetries),\
+	CTBLN(cardNo, txRouting),\
+	CTBLN(cardNo, txScrambled),\
+	CTBLN(cardNo, rxParameter),\
+	CTBLN(cardNo, txTimeoutMs),\
+	CTBLN(cardNo, waitCardTimeout),\
+	CTBLN(cardNo, channelSet), \
+	{ .procname = "name",\
 	 .data = arlan_conf[cardNo].siteName,\
-	 .maxlen = 16, .mode = 0600, .proc_handler = &proc_dostring},\
-	CTBLN(31,cardNo,waitTime),\
-	CTBLN(32,cardNo,lParameter),\
-	CTBLN(33,cardNo,_15),\
-	CTBLN(34,cardNo,headerSize),\
-	CTBLN(36,cardNo,tx_delay_ms),\
-	CTBLN(37,cardNo,retries),\
-	CTBLN(38,cardNo,ReTransmitPacketMaxSize),\
-	CTBLN(39,cardNo,waitReTransmitPacketMaxSize),\
-	CTBLN(40,cardNo,fastReTransCount),\
-	CTBLN(41,cardNo,driverRetransmissions),\
-	CTBLN(42,cardNo,txAckTimeoutMs),\
-	CTBLN(43,cardNo,registrationInterrupts),\
-	CTBLN(44,cardNo,hardwareType),\
-	CTBLN(45,cardNo,radioType),\
-	CTBLN(46,cardNo,writeEEPROM),\
-	CTBLN(47,cardNo,writeRadioType),\
+	 .maxlen = 16, .mode = 0600, .proc_handler = proc_dostring},\
+	CTBLN(cardNo,waitTime),\
+	CTBLN(cardNo,lParameter),\
+	CTBLN(cardNo,_15),\
+	CTBLN(cardNo,headerSize),\
+	CTBLN(cardNo,tx_delay_ms),\
+	CTBLN(cardNo,retries),\
+	CTBLN(cardNo,ReTransmitPacketMaxSize),\
+	CTBLN(cardNo,waitReTransmitPacketMaxSize),\
+	CTBLN(cardNo,fastReTransCount),\
+	CTBLN(cardNo,driverRetransmissions),\
+	CTBLN(cardNo,txAckTimeoutMs),\
+	CTBLN(cardNo,registrationInterrupts),\
+	CTBLN(cardNo,hardwareType),\
+	CTBLN(cardNo,radioType),\
+	CTBLN(cardNo,writeEEPROM),\
+	CTBLN(cardNo,writeRadioType),\
 	ARLAN_PROC_DEBUG_ENTRIES\
-	CTBLN(50,cardNo,in_speed),\
-	CTBLN(51,cardNo,out_speed),\
-	CTBLN(52,cardNo,in_speed10),\
-	CTBLN(53,cardNo,out_speed10),\
-	CTBLN(54,cardNo,in_speed_max),\
-	CTBLN(55,cardNo,out_speed_max),\
-	CTBLN(56,cardNo,measure_rate),\
-	CTBLN(57,cardNo,pre_Command_Wait),\
-	CTBLN(58,cardNo,rx_tweak1),\
-	CTBLN(59,cardNo,rx_tweak2),\
-	CTBLN(60,cardNo,tx_queue_len),\
+	CTBLN(cardNo,in_speed),\
+	CTBLN(cardNo,out_speed),\
+	CTBLN(cardNo,in_speed10),\
+	CTBLN(cardNo,out_speed10),\
+	CTBLN(cardNo,in_speed_max),\
+	CTBLN(cardNo,out_speed_max),\
+	CTBLN(cardNo,measure_rate),\
+	CTBLN(cardNo,pre_Command_Wait),\
+	CTBLN(cardNo,rx_tweak1),\
+	CTBLN(cardNo,rx_tweak2),\
+	CTBLN(cardNo,tx_queue_len),\
 
 
 
@@ -903,63 +902,56 @@ static ctl_table arlan_conf_table0[] =
 
 #ifdef ARLAN_PROC_SHM_DUMP
 	{
-		.ctl_name	= 150,
 		.procname	= "arlan0-txRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_infotxRing,
+		.proc_handler	= arlan_sysctl_infotxRing,
 	},
 	{
-		.ctl_name	= 151,
 		.procname	= "arlan0-rxRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_inforxRing,
+		.proc_handler	= arlan_sysctl_inforxRing,
 	},
 	{
-		.ctl_name	= 152,
 		.procname	= "arlan0-18",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info18,
+		.proc_handler	= arlan_sysctl_info18,
 	},
 	{
-		.ctl_name	= 153,
 		.procname	= "arlan0-ring",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info161719,
+		.proc_handler	= arlan_sysctl_info161719,
 	},
 	{
-		.ctl_name	= 154,
 		.procname	= "arlan0-shm-cpy",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info,
+		.proc_handler	= arlan_sysctl_info,
 	},
 #endif
 	{
-		.ctl_name	= 155,
 		.procname	= "config0",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_configure
+		.proc_handler	= arlan_configure
 	},
 	{
-		.ctl_name	= 156,
 		.procname	= "reset0",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_reset,
+		.proc_handler	= arlan_sysctl_reset,
 	},
-	{ .ctl_name = 0 }
+	{  }
 };
 
 static ctl_table arlan_conf_table1[] =
@@ -969,63 +961,56 @@ static ctl_table arlan_conf_table1[] =
 
 #ifdef ARLAN_PROC_SHM_DUMP
 	{
-		.ctl_name	= 150,
 		.procname	= "arlan1-txRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_infotxRing,
+		.proc_handler	= arlan_sysctl_infotxRing,
 	},
 	{
-		.ctl_name	= 151,
 		.procname	= "arlan1-rxRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_inforxRing,
+		.proc_handler	= arlan_sysctl_inforxRing,
 	},
 	{
-		.ctl_name	= 152,
 		.procname	= "arlan1-18",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info18,
+		.proc_handler	= arlan_sysctl_info18,
 	},
 	{
-		.ctl_name	= 153,
 		.procname	= "arlan1-ring",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info161719,
+		.proc_handler	= arlan_sysctl_info161719,
 	},
 	{
-		.ctl_name	= 154,
 		.procname	= "arlan1-shm-cpy",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info,
+		.proc_handler	= arlan_sysctl_info,
 	},
 #endif
 	{
-		.ctl_name	= 155,
 		.procname	= "config1",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_configure,
+		.proc_handler	= arlan_configure,
 	},
 	{
-		.ctl_name	= 156,
 		.procname	= "reset1",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_reset,
+		.proc_handler	= arlan_sysctl_reset,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table arlan_conf_table2[] =
@@ -1035,63 +1020,56 @@ static ctl_table arlan_conf_table2[] =
 
 #ifdef ARLAN_PROC_SHM_DUMP
 	{
-		.ctl_name	= 150,
 		.procname	= "arlan2-txRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_infotxRing,
+		.proc_handler	= arlan_sysctl_infotxRing,
 	},
 	{
-		.ctl_name	= 151,
 		.procname	= "arlan2-rxRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_inforxRing,
+		.proc_handler	= arlan_sysctl_inforxRing,
 	},
 	{
-		.ctl_name	= 152,
 		.procname	= "arlan2-18",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info18,
+		.proc_handler	= arlan_sysctl_info18,
 	},
 	{
-		.ctl_name	= 153,
 		.procname	= "arlan2-ring",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info161719,
+		.proc_handler	= arlan_sysctl_info161719,
 	},
 	{
-		.ctl_name	= 154,
 		.procname	= "arlan2-shm-cpy",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info,
+		.proc_handler	= arlan_sysctl_info,
 	},
 #endif
 	{
-		.ctl_name	= 155,
 		.procname	= "config2",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_configure,
+		.proc_handler	= arlan_configure,
 	},
 	{
-		.ctl_name	= 156,
 		.procname	= "reset2",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_reset,
+		.proc_handler	= arlan_sysctl_reset,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table arlan_conf_table3[] =
@@ -1101,63 +1079,56 @@ static ctl_table arlan_conf_table3[] =
 
 #ifdef ARLAN_PROC_SHM_DUMP
 	{
-		.ctl_name	= 150,
 		.procname	= "arlan3-txRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_infotxRing,
+		.proc_handler	= arlan_sysctl_infotxRing,
 	},
 	{
-		.ctl_name	= 151,
 		.procname	= "arlan3-rxRing",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_inforxRing,
+		.proc_handler	= arlan_sysctl_inforxRing,
 	},
 	{
-		.ctl_name	= 152,
 		.procname	= "arlan3-18",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info18,
+		.proc_handler	= arlan_sysctl_info18,
 	},
 	{
-		.ctl_name	= 153,
 		.procname	= "arlan3-ring",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info161719,
+		.proc_handler	= arlan_sysctl_info161719,
 	},
 	{
-		.ctl_name	= 154,
 		.procname	= "arlan3-shm-cpy",
 		.data		= &arlan_drive_info,
 		.maxlen		= ARLAN_STR_SIZE,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_info,
+		.proc_handler	= arlan_sysctl_info,
 	},
 #endif
 	{
-		.ctl_name	= 155,
 		.procname	= "config3",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_configure,
+		.proc_handler	= arlan_configure,
 	},
 	{
-		.ctl_name	= 156,
 		.procname	= "reset3",
 		.data		= &conf_reset_result,
 		.maxlen		= 100,
 		.mode		= 0400,
-		.proc_handler	= &arlan_sysctl_reset,
+		.proc_handler	= arlan_sysctl_reset,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 
@@ -1165,41 +1136,37 @@ static ctl_table arlan_conf_table3[] =
 static ctl_table arlan_table[] =
 {
 	{
-		.ctl_name	= 0,
 		.procname	= "arlan0",
 		.maxlen		= 0,
 		.mode		= 0600,
 		.child		= arlan_conf_table0,
 	},
 	{
-		.ctl_name	= 0,
 		.procname	= "arlan1",
 		.maxlen		= 0,
 		.mode		= 0600,
 		.child		= arlan_conf_table1,
 	},
 	{
-		.ctl_name	= 0,
 		.procname	= "arlan2",
 		.maxlen		= 0,
 		.mode		= 0600,
 		.child		= arlan_conf_table2,
 	},
 	{
-		.ctl_name	= 0,
 		.procname	= "arlan3",
 		.maxlen		= 0,
 		.mode		= 0600,
 		.child		= arlan_conf_table3,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 #else
 
-static ctl_table arlan_table[MAX_ARLANS + 1] =
+static ctl_table arlan_table[] =
 {
-	{ .ctl_name = 0 }
+	{ }
 };
 #endif
 
@@ -1209,22 +1176,14 @@ static ctl_table arlan_table[MAX_ARLANS + 1] =
 static ctl_table arlan_root_table[] =
 {
 	{
-		.ctl_name	= CTL_ARLAN,
 		.procname	= "arlan",
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= arlan_table,
 	},
-	{ .ctl_name = 0 }
+	{  }
 };
 
-/* Make sure that /proc/sys/dev is there */
-//static ctl_table arlan_device_root_table[] =
-//{
-//	{CTL_DEV, "dev", NULL, 0, 0555, arlan_root_table},
-//	{0}
-//};
-
 
 static struct ctl_table_header *arlan_device_sysctl_header;
 
@@ -1234,8 +1193,6 @@ int __init init_arlan_proc(void)
 	int i = 0;
 	if (arlan_device_sysctl_header)
 		return 0;
-	for (i = 0; i < MAX_ARLANS && arlan_device[i]; i++)
-		arlan_table[i].ctl_name = i + 1;
 	arlan_device_sysctl_header = register_sysctl_table(arlan_root_table);
 	if (!arlan_device_sysctl_header)
 		return -1;
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index c94de31..f69b778 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -143,7 +143,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c
 	struct inode *inode = mapping->host;
 	struct pohmelfs_inode *pi = POHMELFS_I(inode);
 	struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb);
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int err = 0;
 	int done = 0;
 	int nr_pages;
@@ -152,11 +151,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c
 	int scanned = 0;
 	int range_whole = 0;
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
 		end = -1;
@@ -248,10 +242,6 @@ retry:
 
 			if (wbc->nr_to_write <= 0)
 				done = 1;
-			if (wbc->nonblocking && bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			}
 
 			continue;
 out_continue:
diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c
index a2db0e1..f81e4f0 100644
--- a/drivers/usb/gadget/omap_udc.c
+++ b/drivers/usb/gadget/omap_udc.c
@@ -52,9 +52,9 @@
 #include <asm/unaligned.h>
 #include <asm/mach-types.h>
 
-#include <mach/dma.h>
-#include <mach/usb.h>
-#include <mach/control.h>
+#include <plat/dma.h>
+#include <plat/usb.h>
+#include <plat/control.h>
 
 #include "omap_udc.h"
 
@@ -2098,6 +2098,7 @@ static inline int machine_without_vbus_sense(void)
 		|| machine_is_omap_h4()
 #endif
 		|| machine_is_sx1()
+		|| cpu_is_omap7xx() /* No known omap7xx boards with vbus sense */
 		);
 }
 
@@ -2838,6 +2839,16 @@ static int __init omap_udc_probe(struct platform_device *pdev)
 		udelay(100);
 	}
 
+	if (cpu_is_omap7xx()) {
+		dc_clk = clk_get(&pdev->dev, "usb_dc_ck");
+		hhc_clk = clk_get(&pdev->dev, "l3_ocpi_ck");
+		BUG_ON(IS_ERR(dc_clk) || IS_ERR(hhc_clk));
+		/* can't use omap_udc_enable_clock yet */
+		clk_enable(dc_clk);
+		clk_enable(hhc_clk);
+		udelay(100);
+	}
+
 	INFO("OMAP UDC rev %d.%d%s\n",
 		omap_readw(UDC_REV) >> 4, omap_readw(UDC_REV) & 0xf,
 		config->otg ? ", Mini-AB" : "");
@@ -2970,7 +2981,7 @@ known:
 		goto cleanup3;
 	}
 #endif
-	if (cpu_is_omap16xx()) {
+	if (cpu_is_omap16xx() || cpu_is_omap7xx()) {
 		udc->dc_clk = dc_clk;
 		udc->hhc_clk = hhc_clk;
 		clk_disable(hhc_clk);
@@ -3008,7 +3019,7 @@ cleanup0:
 	if (xceiv)
 		otg_put_transceiver(xceiv);
 
-	if (cpu_is_omap16xx() || cpu_is_omap24xx()) {
+	if (cpu_is_omap16xx() || cpu_is_omap24xx() || cpu_is_omap7xx()) {
 		clk_disable(hhc_clk);
 		clk_disable(dc_clk);
 		clk_put(hhc_clk);
@@ -3115,6 +3126,10 @@ static struct platform_driver udc_driver = {
 
 static int __init udc_init(void)
 {
+	/* Disable DMA for omap7xx -- it doesn't work right. */
+	if (cpu_is_omap7xx())
+		use_dma = 0;
+
 	INFO("%s, version: " DRIVER_VERSION
 #ifdef	USE_ISO
 		" (iso)"
diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c
index 83cbecd..5645f70 100644
--- a/drivers/usb/host/ohci-omap.c
+++ b/drivers/usb/host/ohci-omap.c
@@ -24,10 +24,10 @@
 #include <asm/io.h>
 #include <asm/mach-types.h>
 
-#include <mach/mux.h>
+#include <plat/mux.h>
 #include <mach/irqs.h>
-#include <mach/fpga.h>
-#include <mach/usb.h>
+#include <plat/fpga.h>
+#include <plat/usb.h>
 
 
 /* OMAP-1510 OHCI has its own MMU for DMA */
diff --git a/drivers/usb/musb/omap2430.c b/drivers/usb/musb/omap2430.c
index 3487520..6761d20 100644
--- a/drivers/usb/musb/omap2430.c
+++ b/drivers/usb/musb/omap2430.c
@@ -35,7 +35,7 @@
 
 #include <asm/mach-types.h>
 #include <mach/hardware.h>
-#include <mach/mux.h>
+#include <plat/mux.h>
 
 #include "musb_core.h"
 #include "omap2430.h"
diff --git a/drivers/usb/musb/omap2430.h b/drivers/usb/musb/omap2430.h
index dc76707..fbede77 100644
--- a/drivers/usb/musb/omap2430.h
+++ b/drivers/usb/musb/omap2430.h
@@ -12,7 +12,7 @@
 
 #if defined(CONFIG_ARCH_OMAP2430) || defined(CONFIG_ARCH_OMAP3430)
 #include <mach/hardware.h>
-#include <mach/usb.h>
+#include <plat/usb.h>
 
 /*
  * OMAP2430-specific definitions
diff --git a/drivers/usb/musb/tusb6010_omap.c b/drivers/usb/musb/tusb6010_omap.c
index 7e073a0..e13c770 100644
--- a/drivers/usb/musb/tusb6010_omap.c
+++ b/drivers/usb/musb/tusb6010_omap.c
@@ -15,8 +15,8 @@
 #include <linux/usb.h>
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
-#include <mach/dma.h>
-#include <mach/mux.h>
+#include <plat/dma.h>
+#include <plat/mux.h>
 
 #include "musb_core.h"
 
diff --git a/drivers/usb/otg/isp1301_omap.c b/drivers/usb/otg/isp1301_omap.c
index 77a5f41..d54460a 100644
--- a/drivers/usb/otg/isp1301_omap.c
+++ b/drivers/usb/otg/isp1301_omap.c
@@ -36,8 +36,8 @@
 #include <asm/irq.h>
 #include <asm/mach-types.h>
 
-#include <mach/usb.h>
-#include <mach/mux.h>
+#include <plat/usb.h>
+#include <plat/mux.h>
 
 
 #ifndef	DEBUG
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 188e1ba..6b89eb5 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -5,6 +5,9 @@
 menu "Graphics support"
 	depends on HAS_IOMEM
 
+config HAVE_FB_ATMEL
+	bool
+
 source "drivers/char/agp/Kconfig"
 
 source "drivers/gpu/vga/Kconfig"
@@ -937,7 +940,7 @@ config FB_S1D13XXX
 
 config FB_ATMEL
 	tristate "AT91/AT32 LCD Controller support"
-	depends on FB && (ARCH_AT91SAM9261 || ARCH_AT91SAM9G10 || ARCH_AT91SAM9263 || ARCH_AT91SAM9RL || ARCH_AT91SAM9G45 || ARCH_AT91CAP9 || AVR32)
+	depends on FB && HAVE_FB_ATMEL
 	select FB_CFB_FILLRECT
 	select FB_CFB_COPYAREA
 	select FB_CFB_IMAGEBLIT
diff --git a/drivers/video/atafb.c b/drivers/video/atafb.c
index 37624f7..b7687c5 100644
--- a/drivers/video/atafb.c
+++ b/drivers/video/atafb.c
@@ -2242,6 +2242,9 @@ static int ext_setcolreg(unsigned int regno, unsigned int red,
 	if (!external_vgaiobase)
 		return 1;
 
+	if (regno > 255)
+		return 1;
+
 	switch (external_card_type) {
 	case IS_VGA:
 		OUTB(0x3c8, regno);
diff --git a/drivers/video/backlight/da903x_bl.c b/drivers/video/backlight/da903x_bl.c
index 701a108..7fcb0eb 100644
--- a/drivers/video/backlight/da903x_bl.c
+++ b/drivers/video/backlight/da903x_bl.c
@@ -25,6 +25,7 @@
 
 #define DA9034_WLED_CONTROL1	0x3C
 #define DA9034_WLED_CONTROL2	0x3D
+#define DA9034_WLED_ISET(x)	((x) & 0x1f)
 
 #define DA9034_WLED_BOOST_EN	(1 << 5)
 
@@ -101,6 +102,7 @@ static struct backlight_ops da903x_backlight_ops = {
 
 static int da903x_backlight_probe(struct platform_device *pdev)
 {
+	struct da9034_backlight_pdata *pdata = pdev->dev.platform_data;
 	struct da903x_backlight_data *data;
 	struct backlight_device *bl;
 	int max_brightness;
@@ -127,6 +129,11 @@ static int da903x_backlight_probe(struct platform_device *pdev)
 	data->da903x_dev = pdev->dev.parent;
 	data->current_brightness = 0;
 
+	/* adjust the WLED output current */
+	if (pdata)
+		da903x_write(data->da903x_dev, DA9034_WLED_CONTROL2,
+				DA9034_WLED_ISET(pdata->output_current));
+
 	bl = backlight_device_register(pdev->name, data->da903x_dev,
 			data, &da903x_backlight_ops);
 	if (IS_ERR(bl)) {
diff --git a/drivers/video/backlight/omap1_bl.c b/drivers/video/backlight/omap1_bl.c
index cbad67e..8693e5f 100644
--- a/drivers/video/backlight/omap1_bl.c
+++ b/drivers/video/backlight/omap1_bl.c
@@ -26,8 +26,8 @@
 #include <linux/backlight.h>
 
 #include <mach/hardware.h>
-#include <mach/board.h>
-#include <mach/mux.h>
+#include <plat/board.h>
+#include <plat/mux.h>
 
 #define OMAPBL_MAX_INTENSITY		0xff
 
diff --git a/drivers/video/backlight/tdo24m.c b/drivers/video/backlight/tdo24m.c
index bbfb502..4a3d46e 100644
--- a/drivers/video/backlight/tdo24m.c
+++ b/drivers/video/backlight/tdo24m.c
@@ -367,6 +367,7 @@ static int __devinit tdo24m_probe(struct spi_device *spi)
 
 	spi_message_init(m);
 
+	x->cs_change = 1;
 	x->tx_buf = &lcd->buf[0];
 	spi_message_add_tail(x, m);
 
diff --git a/drivers/video/omap/Makefile b/drivers/video/omap/Makefile
index b63b198..49226a1 100644
--- a/drivers/video/omap/Makefile
+++ b/drivers/video/omap/Makefile
@@ -35,6 +35,7 @@ objs-y$(CONFIG_MACH_OMAP3EVM) += lcd_omap3evm.o
 objs-y$(CONFIG_MACH_OMAP3_BEAGLE) += lcd_omap3beagle.o
 objs-y$(CONFIG_FB_OMAP_LCD_MIPID) += lcd_mipid.o
 objs-y$(CONFIG_MACH_OVERO) += lcd_overo.o
+objs-y$(CONFIG_MACH_HERALD) += lcd_htcherald.o
 
 omapfb-objs := $(objs-yy)
 
diff --git a/drivers/video/omap/blizzard.c b/drivers/video/omap/blizzard.c
index 70dadf9..f5d75f2 100644
--- a/drivers/video/omap/blizzard.c
+++ b/drivers/video/omap/blizzard.c
@@ -26,9 +26,9 @@
 #include <linux/delay.h>
 #include <linux/clk.h>
 
-#include <mach/dma.h>
-#include <mach/omapfb.h>
-#include <mach/blizzard.h>
+#include <plat/dma.h>
+#include <plat/omapfb.h>
+#include <plat/blizzard.h>
 
 #include "dispc.h"
 
diff --git a/drivers/video/omap/dispc.c b/drivers/video/omap/dispc.c
index f16e421..7c833db 100644
--- a/drivers/video/omap/dispc.c
+++ b/drivers/video/omap/dispc.c
@@ -25,9 +25,9 @@
 #include <linux/clk.h>
 #include <linux/io.h>
 
-#include <mach/sram.h>
-#include <mach/omapfb.h>
-#include <mach/board.h>
+#include <plat/sram.h>
+#include <plat/omapfb.h>
+#include <plat/board.h>
 
 #include "dispc.h"
 
@@ -204,6 +204,7 @@ static u32 inline dispc_read_reg(int idx)
 /* Select RFBI or bypass mode */
 static void enable_rfbi_mode(int enable)
 {
+	void __iomem *rfbi_control;
 	u32 l;
 
 	l = dispc_read_reg(DISPC_CONTROL);
@@ -216,9 +217,15 @@ static void enable_rfbi_mode(int enable)
 	dispc_write_reg(DISPC_CONTROL, l);
 
 	/* Set bypass mode in RFBI module */
-	l = __raw_readl(OMAP2_IO_ADDRESS(RFBI_CONTROL));
+	rfbi_control = ioremap(RFBI_CONTROL, SZ_1K);
+	if (!rfbi_control) {
+		pr_err("Unable to ioremap rfbi_control\n");
+		return;
+	}
+	l = __raw_readl(rfbi_control);
 	l |= enable ? 0 : (1 << 1);
-	__raw_writel(l, OMAP2_IO_ADDRESS(RFBI_CONTROL));
+	__raw_writel(l, rfbi_control);
+	iounmap(rfbi_control);
 }
 
 static void set_lcd_data_lines(int data_lines)
@@ -1367,6 +1374,7 @@ static int omap_dispc_init(struct omapfb_device *fbdev, int ext_mode,
 	int r;
 	u32 l;
 	struct lcd_panel *panel = fbdev->panel;
+	void __iomem *ram_fw_base;
 	int tmo = 10000;
 	int skip_init = 0;
 	int i;
@@ -1441,7 +1449,13 @@ static int omap_dispc_init(struct omapfb_device *fbdev, int ext_mode,
 	}
 
 	/* L3 firewall setting: enable access to OCM RAM */
-	__raw_writel(0x402000b0, OMAP2_IO_ADDRESS(0x680050a0));
+	ram_fw_base = ioremap(0x68005000, SZ_1K);
+	if (!ram_fw_base) {
+		dev_err(dispc.fbdev->dev, "Cannot ioremap to enable OCM RAM\n");
+		goto fail1;
+	}
+	__raw_writel(0x402000b0, ram_fw_base + 0xa0);
+	iounmap(ram_fw_base);
 
 	if ((r = alloc_palette_ram()) < 0)
 		goto fail2;
diff --git a/drivers/video/omap/hwa742.c b/drivers/video/omap/hwa742.c
index ca51583..17a975e 100644
--- a/drivers/video/omap/hwa742.c
+++ b/drivers/video/omap/hwa742.c
@@ -26,9 +26,9 @@
 #include <linux/delay.h>
 #include <linux/clk.h>
 
-#include <mach/dma.h>
-#include <mach/omapfb.h>
-#include <mach/hwa742.h>
+#include <plat/dma.h>
+#include <plat/omapfb.h>
+#include <plat/hwa742.h>
 
 #define HWA742_REV_CODE_REG       0x0
 #define HWA742_CONFIG_REG         0x2
diff --git a/drivers/video/omap/lcd_2430sdp.c b/drivers/video/omap/lcd_2430sdp.c
index 393712b..fea7fee 100644
--- a/drivers/video/omap/lcd_2430sdp.c
+++ b/drivers/video/omap/lcd_2430sdp.c
@@ -27,8 +27,8 @@
 #include <linux/gpio.h>
 #include <linux/i2c/twl4030.h>
 
-#include <mach/mux.h>
-#include <mach/omapfb.h>
+#include <plat/mux.h>
+#include <plat/omapfb.h>
 #include <asm/mach-types.h>
 
 #define SDP2430_LCD_PANEL_BACKLIGHT_GPIO	91
diff --git a/drivers/video/omap/lcd_ams_delta.c b/drivers/video/omap/lcd_ams_delta.c
index 1f74399..3d52772 100644
--- a/drivers/video/omap/lcd_ams_delta.c
+++ b/drivers/video/omap/lcd_ams_delta.c
@@ -25,9 +25,9 @@
 #include <linux/io.h>
 #include <linux/delay.h>
 
-#include <mach/board-ams-delta.h>
+#include <plat/board-ams-delta.h>
 #include <mach/hardware.h>
-#include <mach/omapfb.h>
+#include <plat/omapfb.h>
 
 #define AMS_DELTA_DEFAULT_CONTRAST	112
 
diff --git a/drivers/video/omap/lcd_apollon.c b/drivers/video/omap/lcd_apollon.c
index 626ae3a..4c5cefc 100644
--- a/drivers/video/omap/lcd_apollon.c
+++ b/drivers/video/omap/lcd_apollon.c
@@ -25,8 +25,8 @@
 #include <linux/platform_device.h>
 
 #include <mach/gpio.h>
-#include <mach/mux.h>
-#include <mach/omapfb.h>
+#include <plat/mux.h>
+#include <plat/omapfb.h>
 
 /* #define USE_35INCH_LCD 1 */
 
diff --git a/drivers/video/omap/lcd_h3.c b/drivers/video/omap/lcd_h3.c
index 417ae5e..240b4fb 100644
--- a/drivers/video/omap/lcd_h3.c
+++ b/drivers/video/omap/lcd_h3.c
@@ -24,7 +24,7 @@
 #include <linux/i2c/tps65010.h>
 
 #include <mach/gpio.h>
-#include <mach/omapfb.h>
+#include <plat/omapfb.h>
 
 #define MODULE_NAME	"omapfb-lcd_h3"
 
diff --git a/drivers/video/omap/lcd_h4.c b/drivers/video/omap/lcd_h4.c
index 0c398bd..720625d 100644
--- a/drivers/video/omap/lcd_h4.c
+++ b/drivers/video/omap/lcd_h4.c
@@ -22,7 +22,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 
-#include <mach/omapfb.h>
+#include <plat/omapfb.h>
 
 static int h4_panel_init(struct lcd_panel *panel, struct omapfb_device *fbdev)
 {
diff --git a/drivers/video/omap/lcd_htcherald.c b/drivers/video/omap/lcd_htcherald.c
new file mode 100644
index 0000000..2e0c81e
--- /dev/null
+++ b/drivers/video/omap/lcd_htcherald.c
@@ -0,0 +1,130 @@
+/*
+ * File: drivers/video/omap/lcd-htcherald.c
+ *
+ * LCD panel support for the HTC Herald
+ *
+ * Copyright (C) 2009 Cory Maccarrone <darkstar6262@gmail.com>
+ * Copyright (C) 2009 Wing Linux
+ *
+ * Based on the lcd_htcwizard.c file from the linwizard project:
+ * Copyright (C) linwizard.sourceforge.net
+ * Author: Angelo Arrifano <miknix@gmail.com>
+ * Based on lcd_h4 by Imre Deak <imre.deak@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#include <plat/omapfb.h>
+
+static int htcherald_panel_init(struct lcd_panel *panel,
+					struct omapfb_device *fbdev)
+{
+	return 0;
+}
+
+static void htcherald_panel_cleanup(struct lcd_panel *panel)
+{
+}
+
+static int htcherald_panel_enable(struct lcd_panel *panel)
+{
+	return 0;
+}
+
+static void htcherald_panel_disable(struct lcd_panel *panel)
+{
+}
+
+static unsigned long htcherald_panel_get_caps(struct lcd_panel *panel)
+{
+	return 0;
+}
+
+/* Found on WIZ200 (miknix) and some HERA110 models (darkstar62) */
+struct lcd_panel htcherald_panel_1 = {
+	.name		= "lcd_herald",
+	.config		= OMAP_LCDC_PANEL_TFT |
+			  OMAP_LCDC_INV_HSYNC |
+			  OMAP_LCDC_INV_VSYNC |
+			  OMAP_LCDC_INV_PIX_CLOCK,
+	.bpp		= 16,
+	.data_lines	= 16,
+	.x_res		= 240,
+	.y_res		= 320,
+	.pixel_clock	= 6093,
+	.pcd		= 0, /* 15 */
+	.hsw		= 10,
+	.hfp		= 10,
+	.hbp		= 20,
+	.vsw		= 3,
+	.vfp		= 2,
+	.vbp		= 2,
+
+	.init		= htcherald_panel_init,
+	.cleanup	= htcherald_panel_cleanup,
+	.enable		= htcherald_panel_enable,
+	.disable	= htcherald_panel_disable,
+	.get_caps	= htcherald_panel_get_caps,
+};
+
+static int htcherald_panel_probe(struct platform_device *pdev)
+{
+	omapfb_register_panel(&htcherald_panel_1);
+	return 0;
+}
+
+static int htcherald_panel_remove(struct platform_device *pdev)
+{
+	return 0;
+}
+
+static int htcherald_panel_suspend(struct platform_device *pdev,
+						pm_message_t mesg)
+{
+	return 0;
+}
+
+static int htcherald_panel_resume(struct platform_device *pdev)
+{
+	return 0;
+}
+
+struct platform_driver htcherald_panel_driver = {
+	.probe		= htcherald_panel_probe,
+	.remove		= htcherald_panel_remove,
+	.suspend	= htcherald_panel_suspend,
+	.resume		= htcherald_panel_resume,
+	.driver		= {
+		.name	= "lcd_htcherald",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int htcherald_panel_drv_init(void)
+{
+	return platform_driver_register(&htcherald_panel_driver);
+}
+
+static void htcherald_panel_drv_cleanup(void)
+{
+	platform_driver_unregister(&htcherald_panel_driver);
+}
+
+module_init(htcherald_panel_drv_init);
+module_exit(htcherald_panel_drv_cleanup);
+
diff --git a/drivers/video/omap/lcd_inn1510.c b/drivers/video/omap/lcd_inn1510.c
index cdbd8bb..aafe9b4 100644
--- a/drivers/video/omap/lcd_inn1510.c
+++ b/drivers/video/omap/lcd_inn1510.c
@@ -23,8 +23,8 @@
 #include <linux/platform_device.h>
 #include <linux/io.h>
 
-#include <mach/fpga.h>
-#include <mach/omapfb.h>
+#include <plat/fpga.h>
+#include <plat/omapfb.h>
 
 static int innovator1510_panel_init(struct lcd_panel *panel,
 				    struct omapfb_device *fbdev)
diff --git a/drivers/video/omap/lcd_inn1610.c b/drivers/video/omap/lcd_inn1610.c
index 268f7f8..0de3382 100644
--- a/drivers/video/omap/lcd_inn1610.c
+++ b/drivers/video/omap/lcd_inn1610.c
@@ -23,7 +23,7 @@
 #include <linux/platform_device.h>
 
 #include <mach/gpio.h>
-#include <mach/omapfb.h>
+#include <plat/omapfb.h>
 
 #define MODULE_NAME	"omapfb-lcd_h3"
 
diff --git a/drivers/video/omap/lcd_ldp.c b/drivers/video/omap/lcd_ldp.c
index dbfe897..6a260df 100644
--- a/drivers/video/omap/lcd_ldp.c
+++ b/drivers/video/omap/lcd_ldp.c
@@ -27,8 +27,8 @@
 #include <linux/i2c/twl4030.h>
 
 #include <mach/gpio.h>
-#include <mach/mux.h>
-#include <mach/omapfb.h>
+#include <plat/mux.h>
+#include <plat/omapfb.h>
 #include <asm/mach-types.h>
 
 #define LCD_PANEL_BACKLIGHT_GPIO	(15 + OMAP_MAX_GPIO_LINES)
diff --git a/drivers/video/omap/lcd_mipid.c b/drivers/video/omap/lcd_mipid.c
index 918ee89..2162eb0 100644
--- a/drivers/video/omap/lcd_mipid.c
+++ b/drivers/video/omap/lcd_mipid.c
@@ -23,8 +23,8 @@
 #include <linux/workqueue.h>
 #include <linux/spi/spi.h>
 
-#include <mach/omapfb.h>
-#include <mach/lcd_mipid.h>
+#include <plat/omapfb.h>
+#include <plat/lcd_mipid.h>
 
 #define MIPID_MODULE_NAME		"lcd_mipid"
 
diff --git a/drivers/video/omap/lcd_omap2evm.c b/drivers/video/omap/lcd_omap2evm.c
index 7a2bbe2..e1a38ab 100644
--- a/drivers/video/omap/lcd_omap2evm.c
+++ b/drivers/video/omap/lcd_omap2evm.c
@@ -26,8 +26,8 @@
 #include <linux/gpio.h>
 #include <linux/i2c/twl4030.h>
 
-#include <mach/mux.h>
-#include <mach/omapfb.h>
+#include <plat/mux.h>
+#include <plat/omapfb.h>
 #include <asm/mach-types.h>
 
 #define LCD_PANEL_ENABLE_GPIO	154
diff --git a/drivers/video/omap/lcd_omap3beagle.c b/drivers/video/omap/lcd_omap3beagle.c
index 4011910..ccec084 100644
--- a/drivers/video/omap/lcd_omap3beagle.c
+++ b/drivers/video/omap/lcd_omap3beagle.c
@@ -25,8 +25,8 @@
 #include <linux/gpio.h>
 #include <linux/i2c/twl4030.h>
 
-#include <mach/mux.h>
-#include <mach/omapfb.h>
+#include <plat/mux.h>
+#include <plat/omapfb.h>
 #include <asm/mach-types.h>
 
 #define LCD_PANEL_ENABLE_GPIO       170
diff --git a/drivers/video/omap/lcd_omap3evm.c b/drivers/video/omap/lcd_omap3evm.c
index b6a4c2c..556eb31 100644
--- a/drivers/video/omap/lcd_omap3evm.c
+++ b/drivers/video/omap/lcd_omap3evm.c
@@ -25,8 +25,8 @@
 #include <linux/gpio.h>
 #include <linux/i2c/twl4030.h>
 
-#include <mach/mux.h>
-#include <mach/omapfb.h>
+#include <plat/mux.h>
+#include <plat/omapfb.h>
 #include <asm/mach-types.h>
 
 #define LCD_PANEL_ENABLE_GPIO       153
diff --git a/drivers/video/omap/lcd_osk.c b/drivers/video/omap/lcd_osk.c
index b3fa88b..bb21d7d 100644
--- a/drivers/video/omap/lcd_osk.c
+++ b/drivers/video/omap/lcd_osk.c
@@ -24,8 +24,8 @@
 #include <linux/platform_device.h>
 
 #include <mach/gpio.h>
-#include <mach/mux.h>
-#include <mach/omapfb.h>
+#include <plat/mux.h>
+#include <plat/omapfb.h>
 
 static int osk_panel_init(struct lcd_panel *panel, struct omapfb_device *fbdev)
 {
diff --git a/drivers/video/omap/lcd_overo.c b/drivers/video/omap/lcd_overo.c
index 2bc5c92..b0f86e5 100644
--- a/drivers/video/omap/lcd_overo.c
+++ b/drivers/video/omap/lcd_overo.c
@@ -24,8 +24,8 @@
 #include <linux/i2c/twl4030.h>
 
 #include <mach/gpio.h>
-#include <mach/mux.h>
-#include <mach/omapfb.h>
+#include <plat/mux.h>
+#include <plat/omapfb.h>
 #include <asm/mach-types.h>
 
 #define LCD_ENABLE       144
diff --git a/drivers/video/omap/lcd_palmte.c b/drivers/video/omap/lcd_palmte.c
index 4bf3c79..d302896 100644
--- a/drivers/video/omap/lcd_palmte.c
+++ b/drivers/video/omap/lcd_palmte.c
@@ -23,8 +23,8 @@
 #include <linux/platform_device.h>
 #include <linux/io.h>
 
-#include <mach/fpga.h>
-#include <mach/omapfb.h>
+#include <plat/fpga.h>
+#include <plat/omapfb.h>
 
 static int palmte_panel_init(struct lcd_panel *panel,
 				struct omapfb_device *fbdev)
diff --git a/drivers/video/omap/lcd_palmtt.c b/drivers/video/omap/lcd_palmtt.c
index 48ea1f9..557424f 100644
--- a/drivers/video/omap/lcd_palmtt.c
+++ b/drivers/video/omap/lcd_palmtt.c
@@ -30,7 +30,7 @@ GPIO13 - screen blanking
 #include <linux/io.h>
 
 #include <mach/gpio.h>
-#include <mach/omapfb.h>
+#include <plat/omapfb.h>
 
 static int palmtt_panel_init(struct lcd_panel *panel,
 	struct omapfb_device *fbdev)
diff --git a/drivers/video/omap/lcd_palmz71.c b/drivers/video/omap/lcd_palmz71.c
index 0697d29..5f4b5b2 100644
--- a/drivers/video/omap/lcd_palmz71.c
+++ b/drivers/video/omap/lcd_palmz71.c
@@ -24,7 +24,7 @@
 #include <linux/platform_device.h>
 #include <linux/io.h>
 
-#include <mach/omapfb.h>
+#include <plat/omapfb.h>
 
 static int palmz71_panel_init(struct lcd_panel *panel,
 			      struct omapfb_device *fbdev)
diff --git a/drivers/video/omap/lcdc.c b/drivers/video/omap/lcdc.c
index ab39492..5f32caf 100644
--- a/drivers/video/omap/lcdc.c
+++ b/drivers/video/omap/lcdc.c
@@ -29,8 +29,8 @@
 #include <linux/vmalloc.h>
 #include <linux/clk.h>
 
-#include <mach/dma.h>
-#include <mach/omapfb.h>
+#include <plat/dma.h>
+#include <plat/omapfb.h>
 
 #include <asm/mach-types.h>
 
diff --git a/drivers/video/omap/omapfb_main.c b/drivers/video/omap/omapfb_main.c
index 0d0c8c8..f900a43 100644
--- a/drivers/video/omap/omapfb_main.c
+++ b/drivers/video/omap/omapfb_main.c
@@ -28,8 +28,8 @@
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 
-#include <mach/dma.h>
-#include <mach/omapfb.h>
+#include <plat/dma.h>
+#include <plat/omapfb.h>
 
 #include "lcdc.h"
 #include "dispc.h"
diff --git a/drivers/video/omap/rfbi.c b/drivers/video/omap/rfbi.c
index ee01e84..c90fa39 100644
--- a/drivers/video/omap/rfbi.c
+++ b/drivers/video/omap/rfbi.c
@@ -27,7 +27,7 @@
 #include <linux/clk.h>
 #include <linux/io.h>
 
-#include <mach/omapfb.h>
+#include <plat/omapfb.h>
 
 #include "dispc.h"
 
diff --git a/drivers/video/omap/sossi.c b/drivers/video/omap/sossi.c
index a769462..79dc84f 100644
--- a/drivers/video/omap/sossi.c
+++ b/drivers/video/omap/sossi.c
@@ -24,8 +24,8 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 
-#include <mach/dma.h>
-#include <mach/omapfb.h>
+#include <plat/dma.h>
+#include <plat/omapfb.h>
 
 #include "lcdc.h"
 
diff --git a/drivers/video/pxa168fb.c b/drivers/video/pxa168fb.c
index 84d8327..75285d3 100644
--- a/drivers/video/pxa168fb.c
+++ b/drivers/video/pxa168fb.c
@@ -687,6 +687,7 @@ static int __init pxa168fb_probe(struct platform_device *pdev)
 	}
 
 	info->fix.smem_start = (unsigned long)fbi->fb_start_dma;
+	set_graphics_start(info, 0, 0);
 
 	/*
 	 * Set video mode according to platform data.
diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c
index 1820c4a..f58a3aa 100644
--- a/drivers/video/pxafb.c
+++ b/drivers/video/pxafb.c
@@ -80,7 +80,8 @@
 static int pxafb_activate_var(struct fb_var_screeninfo *var,
 				struct pxafb_info *);
 static void set_ctrlr_state(struct pxafb_info *fbi, u_int state);
-static void setup_base_frame(struct pxafb_info *fbi, int branch);
+static void setup_base_frame(struct pxafb_info *fbi,
+                             struct fb_var_screeninfo *var, int branch);
 static int setup_frame_dma(struct pxafb_info *fbi, int dma, int pal,
 			   unsigned long offset, size_t size);
 
@@ -397,6 +398,7 @@ static void pxafb_setmode(struct fb_var_screeninfo *var,
 	var->lower_margin	= mode->lower_margin;
 	var->sync		= mode->sync;
 	var->grayscale		= mode->cmap_greyscale;
+	var->transp.length	= mode->transparency;
 
 	/* set the initial RGBA bitfields */
 	pxafb_set_pixfmt(var, mode->depth);
@@ -531,12 +533,22 @@ static int pxafb_pan_display(struct fb_var_screeninfo *var,
 			     struct fb_info *info)
 {
 	struct pxafb_info *fbi = (struct pxafb_info *)info;
+	struct fb_var_screeninfo newvar;
 	int dma = DMA_MAX + DMA_BASE;
 
 	if (fbi->state != C_ENABLE)
 		return 0;
 
-	setup_base_frame(fbi, 1);
+	/* Only take .xoffset, .yoffset and .vmode & FB_VMODE_YWRAP from what
+	 * was passed in and copy the rest from the old screeninfo.
+	 */
+	memcpy(&newvar, &fbi->fb.var, sizeof(newvar));
+	newvar.xoffset = var->xoffset;
+	newvar.yoffset = var->yoffset;
+	newvar.vmode &= ~FB_VMODE_YWRAP;
+	newvar.vmode |= var->vmode & FB_VMODE_YWRAP;
+
+	setup_base_frame(fbi, &newvar, 1);
 
 	if (fbi->lccr0 & LCCR0_SDS)
 		lcd_writel(fbi, FBR1, fbi->fdadr[dma + 1] | 0x1);
@@ -1052,9 +1064,10 @@ static int setup_frame_dma(struct pxafb_info *fbi, int dma, int pal,
 	return 0;
 }
 
-static void setup_base_frame(struct pxafb_info *fbi, int branch)
+static void setup_base_frame(struct pxafb_info *fbi,
+                             struct fb_var_screeninfo *var,
+                             int branch)
 {
-	struct fb_var_screeninfo *var = &fbi->fb.var;
 	struct fb_fix_screeninfo *fix = &fbi->fb.fix;
 	int nbytes, dma, pal, bpp = var->bits_per_pixel;
 	unsigned long offset;
@@ -1332,7 +1345,7 @@ static int pxafb_activate_var(struct fb_var_screeninfo *var,
 #endif
 		setup_parallel_timing(fbi, var);
 
-	setup_base_frame(fbi, 0);
+	setup_base_frame(fbi, var, 0);
 
 	fbi->reg_lccr0 = fbi->lccr0 |
 		(LCCR0_LDM | LCCR0_SFM | LCCR0_IUM | LCCR0_EFM |
diff --git a/drivers/watchdog/omap_wdt.c b/drivers/watchdog/omap_wdt.c
index 3ed571a..429ea99 100644
--- a/drivers/watchdog/omap_wdt.c
+++ b/drivers/watchdog/omap_wdt.c
@@ -43,7 +43,7 @@
 #include <linux/io.h>
 #include <linux/uaccess.h>
 #include <mach/hardware.h>
-#include <mach/prcm.h>
+#include <plat/prcm.h>
 
 #include "omap_wdt.h"
 
diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c
index d3c824d..c14ae86 100644
--- a/drivers/watchdog/riowd.c
+++ b/drivers/watchdog/riowd.c
@@ -10,7 +10,6 @@
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/miscdevice.h>
-#include <linux/smp_lock.h>
 #include <linux/watchdog.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
@@ -75,7 +74,6 @@ static void riowd_writereg(struct riowd *p, u8 val, int index)
 
 static int riowd_open(struct inode *inode, struct file *filp)
 {
-	cycle_kernel_lock();
 	nonseekable_open(inode, filp);
 	return 0;
 }
@@ -194,6 +192,8 @@ static int __devinit riowd_probe(struct of_device *op,
 		printk(KERN_ERR PFX "Cannot map registers.\n");
 		goto out_free;
 	}
+	/* Make miscdev useable right away */
+	riowd_device = p;
 
 	err = misc_register(&riowd_miscdev);
 	if (err) {
@@ -205,10 +205,10 @@ static int __devinit riowd_probe(struct of_device *op,
 	       "regs at %p\n", riowd_timeout, p->regs);
 
 	dev_set_drvdata(&op->dev, p);
-	riowd_device = p;
 	return 0;
 
 out_iounmap:
+	riowd_device = NULL;
 	of_iounmap(&op->resource[0], p->regs, 2);
 
 out_free:
diff --git a/drivers/zorro/zorro-driver.c b/drivers/zorro/zorro-driver.c
index e6c4390..53180a3 100644
--- a/drivers/zorro/zorro-driver.c
+++ b/drivers/zorro/zorro-driver.c
@@ -156,5 +156,4 @@ postcore_initcall(zorro_driver_init);
 EXPORT_SYMBOL(zorro_match_device);
 EXPORT_SYMBOL(zorro_register_driver);
 EXPORT_SYMBOL(zorro_unregister_driver);
-EXPORT_SYMBOL(zorro_dev_driver);
 EXPORT_SYMBOL(zorro_bus_type);