diff options
Diffstat (limited to 'drivers')
202 files changed, 23921 insertions, 2028 deletions
diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index f2df6e2..676f08b 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -3,7 +3,7 @@ # menuconfig ATA - tristate "Serial ATA (prod) and Parallel ATA (experimental) drivers" + tristate "Serial ATA and Parallel ATA drivers" depends on HAS_IOMEM depends on BLOCK depends on !(M32R || M68K) || BROKEN @@ -374,8 +374,8 @@ config PATA_HPT366 If unsure, say N. config PATA_HPT37X - tristate "HPT 370/370A/371/372/374/302 PATA support (Experimental)" - depends on PCI && EXPERIMENTAL + tristate "HPT 370/370A/371/372/374/302 PATA support" + depends on PCI help This option enables support for the majority of the later HPT PATA controllers via the new ATA layer. @@ -383,8 +383,8 @@ config PATA_HPT37X If unsure, say N. config PATA_HPT3X2N - tristate "HPT 372N/302N PATA support (Experimental)" - depends on PCI && EXPERIMENTAL + tristate "HPT 372N/302N PATA support" + depends on PCI help This option enables support for the N variant HPT PATA controllers via the new ATA layer @@ -401,7 +401,7 @@ config PATA_HPT3X3 If unsure, say N. config PATA_HPT3X3_DMA - bool "HPT 343/363 DMA support (Experimental)" + bool "HPT 343/363 DMA support" depends on PATA_HPT3X3 help This option enables DMA support for the HPT343/363 @@ -510,8 +510,8 @@ config PATA_NETCELL If unsure, say N. config PATA_NINJA32 - tristate "Ninja32/Delkin Cardbus ATA support (Experimental)" - depends on PCI && EXPERIMENTAL + tristate "Ninja32/Delkin Cardbus ATA support" + depends on PCI help This option enables support for the Ninja32, Delkin and possibly other brands of Cardbus ATA adapter @@ -573,6 +573,14 @@ config PATA_PCMCIA If unsure, say N. +config PATA_PDC2027X + tristate "Promise PATA 2027x support" + depends on PCI + help + This option enables support for Promise PATA pdc20268 to pdc20277 host adapters. + + If unsure, say N. + config PATA_PDC_OLD tristate "Older Promise PATA controller support" depends on PCI @@ -643,14 +651,6 @@ config PATA_SERVERWORKS If unsure, say N. -config PATA_PDC2027X - tristate "Promise PATA 2027x support" - depends on PCI - help - This option enables support for Promise PATA pdc20268 to pdc20277 host adapters. - - If unsure, say N. - config PATA_SIL680 tristate "CMD / Silicon Image 680 PATA support" depends on PCI @@ -667,6 +667,15 @@ config PATA_SIS If unsure, say N. +config PATA_TOSHIBA + tristate "Toshiba Piccolo support (Experimental)" + depends on PCI && EXPERIMENTAL + help + Support for the Toshiba Piccolo controllers. Currently only the + primary channel is supported by this driver. + + If unsure, say N. + config PATA_VIA tristate "VIA PATA support" depends on PCI diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile index 01e126f..d909435 100644 --- a/drivers/ata/Makefile +++ b/drivers/ata/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_PATA_RZ1000) += pata_rz1000.o obj-$(CONFIG_PATA_SC1200) += pata_sc1200.o obj-$(CONFIG_PATA_SERVERWORKS) += pata_serverworks.o obj-$(CONFIG_PATA_SIL680) += pata_sil680.o +obj-$(CONFIG_PATA_TOSHIBA) += pata_piccolo.o obj-$(CONFIG_PATA_VIA) += pata_via.o obj-$(CONFIG_PATA_WINBOND) += pata_sl82c105.o obj-$(CONFIG_PATA_WINBOND_VLB) += pata_winbond.o diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index a3241a1..b8bea10 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -113,6 +113,7 @@ enum { board_ahci_mcp65 = 6, board_ahci_nopmp = 7, board_ahci_yesncq = 8, + board_ahci_nosntf = 9, /* global controller registers */ HOST_CAP = 0x00, /* host capabilities */ @@ -235,6 +236,7 @@ enum { AHCI_HFLAG_NO_SUSPEND = (1 << 10), /* don't suspend */ AHCI_HFLAG_SRST_TOUT_IS_OFFLINE = (1 << 11), /* treat SRST timeout as link offline */ + AHCI_HFLAG_NO_SNTF = (1 << 12), /* no sntf */ /* ap->flags bits */ @@ -508,7 +510,7 @@ static const struct ata_port_info ahci_port_info[] = { .udma_mask = ATA_UDMA6, .port_ops = &ahci_ops, }, - /* board_ahci_yesncq */ + [board_ahci_yesncq] = { AHCI_HFLAGS (AHCI_HFLAG_YES_NCQ), .flags = AHCI_FLAG_COMMON, @@ -516,6 +518,14 @@ static const struct ata_port_info ahci_port_info[] = { .udma_mask = ATA_UDMA6, .port_ops = &ahci_ops, }, + [board_ahci_nosntf] = + { + AHCI_HFLAGS (AHCI_HFLAG_NO_SNTF), + .flags = AHCI_FLAG_COMMON, + .pio_mask = ATA_PIO4, + .udma_mask = ATA_UDMA6, + .port_ops = &ahci_ops, + }, }; static const struct pci_device_id ahci_pci_tbl[] = { @@ -531,7 +541,7 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x2683), board_ahci }, /* ESB2 */ { PCI_VDEVICE(INTEL, 0x27c6), board_ahci }, /* ICH7-M DH */ { PCI_VDEVICE(INTEL, 0x2821), board_ahci }, /* ICH8 */ - { PCI_VDEVICE(INTEL, 0x2822), board_ahci }, /* ICH8 */ + { PCI_VDEVICE(INTEL, 0x2822), board_ahci_nosntf }, /* ICH8 */ { PCI_VDEVICE(INTEL, 0x2824), board_ahci }, /* ICH8 */ { PCI_VDEVICE(INTEL, 0x2829), board_ahci }, /* ICH8M */ { PCI_VDEVICE(INTEL, 0x282a), board_ahci }, /* ICH8M */ @@ -849,6 +859,12 @@ static void ahci_save_initial_config(struct pci_dev *pdev, cap &= ~HOST_CAP_PMP; } + if ((cap & HOST_CAP_SNTF) && (hpriv->flags & AHCI_HFLAG_NO_SNTF)) { + dev_printk(KERN_INFO, &pdev->dev, + "controller can't do SNTF, turning off CAP_SNTF\n"); + cap &= ~HOST_CAP_SNTF; + } + if (pdev->vendor == PCI_VENDOR_ID_JMICRON && pdev->device == 0x2361 && port_map != 1) { dev_printk(KERN_INFO, &pdev->dev, @@ -2988,6 +3004,14 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (pdev->vendor == PCI_VENDOR_ID_MARVELL && !marvell_enable) return -ENODEV; + /* Promise's PDC42819 is a SAS/SATA controller that has an AHCI mode. + * At the moment, we can only use the AHCI mode. Let the users know + * that for SAS drives they're out of luck. + */ + if (pdev->vendor == PCI_VENDOR_ID_PROMISE) + dev_printk(KERN_INFO, &pdev->dev, "PDC42819 " + "can only drive SATA devices with this driver\n"); + /* acquire resources */ rc = pcim_enable_device(pdev); if (rc) diff --git a/drivers/ata/ata_generic.c b/drivers/ata/ata_generic.c index ecfd22b..12e26c3 100644 --- a/drivers/ata/ata_generic.c +++ b/drivers/ata/ata_generic.c @@ -168,9 +168,12 @@ static struct pci_device_id ata_generic[] = { { PCI_DEVICE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C561), }, { PCI_DEVICE(PCI_VENDOR_ID_OPTI, PCI_DEVICE_ID_OPTI_82C558), }, { PCI_DEVICE(PCI_VENDOR_ID_CENATEK,PCI_DEVICE_ID_CENATEK_IDE), }, - { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO), }, +#if !defined(CONFIG_PATA_TOSHIBA) && !defined(CONFIG_PATA_TOSHIBA_MODULE) { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_1), }, { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_2), }, + { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_3), }, + { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_5), }, +#endif /* Must come last. If you add entries adjust this table appropriately */ { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_STORAGE_IDE << 8, 0xFFFFFF00UL, 1}, { 0, }, diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index 9ac4e37..0c6155f 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -869,10 +869,10 @@ static void do_pata_set_dmamode(struct ata_port *ap, struct ata_device *adev, in (timings[pio][1] << 8); } - if (ap->udma_mask) { + if (ap->udma_mask) udma_enable &= ~(1 << devid); - pci_write_config_word(dev, master_port, master_data); - } + + pci_write_config_word(dev, master_port, master_data); } /* Don't scribble on 0x48 if the controller does not support UDMA */ if (ap->udma_mask) diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index b0882cd..1245838 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -807,12 +807,11 @@ static int ata_acpi_exec_tfs(struct ata_device *dev, int *nr_executed) * EH context. * * RETURNS: - * 0 on success, -errno on failure. + * 0 on success, -ENOENT if _SDD doesn't exist, -errno on failure. */ static int ata_acpi_push_id(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; - int err; acpi_status status; struct acpi_object_list input; union acpi_object in_params[1]; @@ -835,12 +834,16 @@ static int ata_acpi_push_id(struct ata_device *dev) status = acpi_evaluate_object(dev->acpi_handle, "_SDD", &input, NULL); swap_buf_le16(dev->id, ATA_ID_WORDS); - err = ACPI_FAILURE(status) ? -EIO : 0; - if (err < 0) + if (status == AE_NOT_FOUND) + return -ENOENT; + + if (ACPI_FAILURE(status)) { ata_dev_printk(dev, KERN_WARNING, "ACPI _SDD failed (AE 0x%x)\n", status); + return -EIO; + } - return err; + return 0; } /** @@ -971,7 +974,7 @@ int ata_acpi_on_devcfg(struct ata_device *dev) /* do _SDD if SATA */ if (acpi_sata) { rc = ata_acpi_push_id(dev); - if (rc) + if (rc && rc != -ENOENT) goto acpi_err; } diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index dc72690..22ff51b 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -6616,6 +6616,13 @@ static int __init ata_init(void) { ata_parse_force_param(); + /* + * FIXME: In UP case, there is only one workqueue thread and if you + * have more than one PIO device, latency is bloody awful, with + * occasional multi-second "hiccups" as one PIO device waits for + * another. It's an ugly wart that users DO occasionally complain + * about; luckily most users have at most one PIO polled device. + */ ata_wq = create_workqueue("ata"); if (!ata_wq) goto free_force_tbl; diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index bba2ae5..0ea97c9 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -110,6 +110,13 @@ static const unsigned long ata_eh_identify_timeouts[] = { ULONG_MAX, }; +static const unsigned long ata_eh_flush_timeouts[] = { + 15000, /* be generous with flush */ + 15000, /* ditto */ + 30000, /* and even more generous */ + ULONG_MAX, +}; + static const unsigned long ata_eh_other_timeouts[] = { 5000, /* same rationale as identify timeout */ 10000, /* ditto */ @@ -147,6 +154,8 @@ ata_eh_cmd_timeout_table[ATA_EH_CMD_TIMEOUT_TABLE_SIZE] = { .timeouts = ata_eh_other_timeouts, }, { .commands = CMDS(ATA_CMD_INIT_DEV_PARAMS), .timeouts = ata_eh_other_timeouts, }, + { .commands = CMDS(ATA_CMD_FLUSH, ATA_CMD_FLUSH_EXT), + .timeouts = ata_eh_flush_timeouts }, }; #undef CMDS @@ -3112,6 +3121,82 @@ static int atapi_eh_clear_ua(struct ata_device *dev) return 0; } +/** + * ata_eh_maybe_retry_flush - Retry FLUSH if necessary + * @dev: ATA device which may need FLUSH retry + * + * If @dev failed FLUSH, it needs to be reported upper layer + * immediately as it means that @dev failed to remap and already + * lost at least a sector and further FLUSH retrials won't make + * any difference to the lost sector. However, if FLUSH failed + * for other reasons, for example transmission error, FLUSH needs + * to be retried. + * + * This function determines whether FLUSH failure retry is + * necessary and performs it if so. + * + * RETURNS: + * 0 if EH can continue, -errno if EH needs to be repeated. + */ +static int ata_eh_maybe_retry_flush(struct ata_device *dev) +{ + struct ata_link *link = dev->link; + struct ata_port *ap = link->ap; + struct ata_queued_cmd *qc; + struct ata_taskfile tf; + unsigned int err_mask; + int rc = 0; + + /* did flush fail for this device? */ + if (!ata_tag_valid(link->active_tag)) + return 0; + + qc = __ata_qc_from_tag(ap, link->active_tag); + if (qc->dev != dev || (qc->tf.command != ATA_CMD_FLUSH_EXT && + qc->tf.command != ATA_CMD_FLUSH)) + return 0; + + /* if the device failed it, it should be reported to upper layers */ + if (qc->err_mask & AC_ERR_DEV) + return 0; + + /* flush failed for some other reason, give it another shot */ + ata_tf_init(dev, &tf); + + tf.command = qc->tf.command; + tf.flags |= ATA_TFLAG_DEVICE; + tf.protocol = ATA_PROT_NODATA; + + ata_dev_printk(dev, KERN_WARNING, "retrying FLUSH 0x%x Emask 0x%x\n", + tf.command, qc->err_mask); + + err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); + if (!err_mask) { + /* + * FLUSH is complete but there's no way to + * successfully complete a failed command from EH. + * Making sure retry is allowed at least once and + * retrying it should do the trick - whatever was in + * the cache is already on the platter and this won't + * cause infinite loop. + */ + qc->scsicmd->allowed = max(qc->scsicmd->allowed, 1); + } else { + ata_dev_printk(dev, KERN_WARNING, "FLUSH failed Emask 0x%x\n", + err_mask); + rc = -EIO; + + /* if device failed it, report it to upper layers */ + if (err_mask & AC_ERR_DEV) { + qc->err_mask |= AC_ERR_DEV; + qc->result_tf = tf; + if (!(ap->pflags & ATA_PFLAG_FROZEN)) + rc = 0; + } + } + return rc; +} + static int ata_link_nr_enabled(struct ata_link *link) { struct ata_device *dev; @@ -3455,6 +3540,15 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset, } } + /* retry flush if necessary */ + ata_for_each_dev(dev, link, ALL) { + if (dev->class != ATA_DEV_ATA) + continue; + rc = ata_eh_maybe_retry_flush(dev); + if (rc) + goto dev_fail; + } + /* configure link power saving */ if (ehc->i.action & ATA_EH_LPM) ata_for_each_dev(dev, link, ALL) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index b4ee28d..62e6b9e 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -47,6 +47,7 @@ #include <linux/hdreg.h> #include <linux/uaccess.h> #include <linux/suspend.h> +#include <asm/unaligned.h> #include "libata.h" @@ -154,8 +155,7 @@ static ssize_t ata_scsi_lpm_put(struct device *dev, */ for (i = 1; i < ARRAY_SIZE(link_pm_policy); i++) { const int len = strlen(link_pm_policy[i].name); - if (strncmp(link_pm_policy[i].name, buf, len) == 0 && - buf[len] == '\n') { + if (strncmp(link_pm_policy[i].name, buf, len) == 0) { policy = link_pm_policy[i].value; break; } @@ -1964,6 +1964,7 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf) 0x80, /* page 0x80, unit serial no page */ 0x83, /* page 0x83, device ident page */ 0x89, /* page 0x89, ata info page */ + 0xb0, /* page 0xb0, block limits page */ 0xb1, /* page 0xb1, block device characteristics page */ }; @@ -2085,6 +2086,43 @@ static unsigned int ata_scsiop_inq_89(struct ata_scsi_args *args, u8 *rbuf) return 0; } +static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf) +{ + u32 min_io_sectors; + + rbuf[1] = 0xb0; + rbuf[3] = 0x3c; /* required VPD size with unmap support */ + + /* + * Optimal transfer length granularity. + * + * This is always one physical block, but for disks with a smaller + * logical than physical sector size we need to figure out what the + * latter is. + */ + if (ata_id_has_large_logical_sectors(args->id)) + min_io_sectors = ata_id_logical_per_physical_sectors(args->id); + else + min_io_sectors = 1; + put_unaligned_be16(min_io_sectors, &rbuf[6]); + + /* + * Optimal unmap granularity. + * + * The ATA spec doesn't even know about a granularity or alignment + * for the TRIM command. We can leave away most of the unmap related + * VPD page entries, but we have specifify a granularity to signal + * that we support some form of unmap - in thise case via WRITE SAME + * with the unmap bit set. + */ + if (ata_id_has_trim(args->id)) { + put_unaligned_be32(65535 * 512 / 8, &rbuf[20]); + put_unaligned_be32(1, &rbuf[28]); + } + + return 0; +} + static unsigned int ata_scsiop_inq_b1(struct ata_scsi_args *args, u8 *rbuf) { int form_factor = ata_id_form_factor(args->id); @@ -2374,6 +2412,13 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf) rbuf[13] = log_per_phys; rbuf[14] = (lowest_aligned >> 8) & 0x3f; rbuf[15] = lowest_aligned; + + if (ata_id_has_trim(args->id)) { + rbuf[14] |= 0x80; /* TPE */ + + if (ata_id_has_zero_after_trim(args->id)) + rbuf[14] |= 0x40; /* TPRZ */ + } } return 0; @@ -2896,6 +2941,58 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc) return 1; } +static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) +{ + struct ata_taskfile *tf = &qc->tf; + struct scsi_cmnd *scmd = qc->scsicmd; + struct ata_device *dev = qc->dev; + const u8 *cdb = scmd->cmnd; + u64 block; + u32 n_block; + u32 size; + void *buf; + + /* we may not issue DMA commands if no DMA mode is set */ + if (unlikely(!dev->dma_mode)) + goto invalid_fld; + + if (unlikely(scmd->cmd_len < 16)) + goto invalid_fld; + scsi_16_lba_len(cdb, &block, &n_block); + + /* for now we only support WRITE SAME with the unmap bit set */ + if (unlikely(!(cdb[1] & 0x8))) + goto invalid_fld; + + /* + * WRITE SAME always has a sector sized buffer as payload, this + * should never be a multiple entry S/G list. + */ + if (!scsi_sg_count(scmd)) + goto invalid_fld; + + buf = page_address(sg_page(scsi_sglist(scmd))); + size = ata_set_lba_range_entries(buf, 512, block, n_block); + + tf->protocol = ATA_PROT_DMA; + tf->hob_feature = 0; + tf->feature = ATA_DSM_TRIM; + tf->hob_nsect = (size / 512) >> 8; + tf->nsect = size / 512; + tf->command = ATA_CMD_DSM; + tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48 | + ATA_TFLAG_WRITE; + + ata_qc_set_pc_nbytes(qc); + + return 0; + + invalid_fld: + ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x00); + /* "Invalid field in cdb" */ + return 1; +} + /** * ata_get_xlat_func - check if SCSI to ATA translation is possible * @dev: ATA device @@ -2920,6 +3017,9 @@ static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd) case WRITE_16: return ata_scsi_rw_xlat; + case 0x93 /*WRITE_SAME_16*/: + return ata_scsi_write_same_xlat; + case SYNCHRONIZE_CACHE: if (ata_try_flush_cache(dev)) return ata_scsi_flush_xlat; @@ -3109,6 +3209,9 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd, case 0x89: ata_scsi_rbuf_fill(&args, ata_scsiop_inq_89); break; + case 0xb0: + ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b0); + break; case 0xb1: ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b1); break; diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index bbbb1fa..51eb1e2 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -2384,7 +2384,7 @@ void ata_sff_post_internal_cmd(struct ata_queued_cmd *qc) ap->hsm_task_state = HSM_ST_IDLE; if (ap->ioaddr.bmdma_addr) - ata_bmdma_stop(qc); + ap->ops->bmdma_stop(qc); spin_unlock_irqrestore(ap->lock, flags); } diff --git a/drivers/ata/pata_ali.c b/drivers/ata/pata_ali.c index 1432dc9..9434114 100644 --- a/drivers/ata/pata_ali.c +++ b/drivers/ata/pata_ali.c @@ -453,7 +453,9 @@ static void ali_init_chipset(struct pci_dev *pdev) /* Clear CD-ROM DMA write bit */ tmp &= 0x7F; /* Cable and UDMA */ - pci_write_config_byte(pdev, 0x4B, tmp | 0x09); + if (pdev->revision >= 0xc2) + tmp |= 0x01; + pci_write_config_byte(pdev, 0x4B, tmp | 0x08); /* * CD_ROM DMA on (0x53 bit 0). Enable this even if we want * to use PIO. 0x53 bit 1 (rev 20 only) - enable FIFO control diff --git a/drivers/ata/pata_cmd64x.c b/drivers/ata/pata_cmd64x.c index f98dffe..dadfc35 100644 --- a/drivers/ata/pata_cmd64x.c +++ b/drivers/ata/pata_cmd64x.c @@ -31,7 +31,7 @@ #include <linux/libata.h> #define DRV_NAME "pata_cmd64x" -#define DRV_VERSION "0.2.5" +#define DRV_VERSION "0.3.1" /* * CMD64x specific registers definition. @@ -254,17 +254,109 @@ static void cmd648_bmdma_stop(struct ata_queued_cmd *qc) } /** - * cmd646r1_dma_stop - DMA stop callback + * cmd64x_bmdma_stop - DMA stop callback * @qc: Command in progress * - * Stub for now while investigating the r1 quirk in the old driver. + * Track the completion of live DMA commands and clear the + * host->private_data DMA tracking flag as we do. */ -static void cmd646r1_bmdma_stop(struct ata_queued_cmd *qc) +static void cmd64x_bmdma_stop(struct ata_queued_cmd *qc) { + struct ata_port *ap = qc->ap; ata_bmdma_stop(qc); + WARN_ON(ap->host->private_data != ap); + ap->host->private_data = NULL; } +/** + * cmd64x_qc_defer - Defer logic for chip limits + * @qc: queued command + * + * Decide whether we can issue the command. Called under the host lock. + */ + +static int cmd64x_qc_defer(struct ata_queued_cmd *qc) +{ + struct ata_host *host = qc->ap->host; + struct ata_port *alt = host->ports[1 ^ qc->ap->port_no]; + int rc; + int dma = 0; + + /* Apply the ATA rules first */ + rc = ata_std_qc_defer(qc); + if (rc) + return rc; + + if (qc->tf.protocol == ATAPI_PROT_DMA || + qc->tf.protocol == ATA_PROT_DMA) + dma = 1; + + /* If the other port is not live then issue the command */ + if (alt == NULL || !alt->qc_active) { + if (dma) + host->private_data = qc->ap; + return 0; + } + /* If there is a live DMA command then wait */ + if (host->private_data != NULL) + return ATA_DEFER_PORT; + if (dma) + /* Cannot overlap our DMA command */ + return ATA_DEFER_PORT; + return 0; +} + +/** + * cmd64x_interrupt - ATA host interrupt handler + * @irq: irq line (unused) + * @dev_instance: pointer to our ata_host information structure + * + * Our interrupt handler for PCI IDE devices. Calls + * ata_sff_host_intr() for each port that is flagging an IRQ. We cannot + * use the defaults as we need to avoid touching status/altstatus during + * a DMA. + * + * LOCKING: + * Obtains host lock during operation. + * + * RETURNS: + * IRQ_NONE or IRQ_HANDLED. + */ +irqreturn_t cmd64x_interrupt(int irq, void *dev_instance) +{ + struct ata_host *host = dev_instance; + struct pci_dev *pdev = to_pci_dev(host->dev); + unsigned int i; + unsigned int handled = 0; + unsigned long flags; + static const u8 irq_reg[2] = { CFR, ARTTIM23 }; + static const u8 irq_mask[2] = { 1 << 2, 1 << 4 }; + + /* TODO: make _irqsave conditional on x86 PCI IDE legacy mode */ + spin_lock_irqsave(&host->lock, flags); + + for (i = 0; i < host->n_ports; i++) { + struct ata_port *ap; + u8 reg; + + pci_read_config_byte(pdev, irq_reg[i], ®); + ap = host->ports[i]; + if (ap && (reg & irq_mask[i]) && + !(ap->flags & ATA_FLAG_DISABLED)) { + struct ata_queued_cmd *qc; + + qc = ata_qc_from_tag(ap, ap->link.active_tag); + if (qc && (!(qc->tf.flags & ATA_TFLAG_POLLING)) && + (qc->flags & ATA_QCFLAG_ACTIVE)) + handled |= ata_sff_host_intr(ap, qc); + } + } + + spin_unlock_irqrestore(&host->lock, flags); + + return IRQ_RETVAL(handled); +} static struct scsi_host_template cmd64x_sht = { ATA_BMDMA_SHT(DRV_NAME), }; @@ -273,6 +365,8 @@ static const struct ata_port_operations cmd64x_base_ops = { .inherits = &ata_bmdma_port_ops, .set_piomode = cmd64x_set_piomode, .set_dmamode = cmd64x_set_dmamode, + .bmdma_stop = cmd64x_bmdma_stop, + .qc_defer = cmd64x_qc_defer, }; static struct ata_port_operations cmd64x_port_ops = { @@ -282,7 +376,6 @@ static struct ata_port_operations cmd64x_port_ops = { static struct ata_port_operations cmd646r1_port_ops = { .inherits = &cmd64x_base_ops, - .bmdma_stop = cmd646r1_bmdma_stop, .cable_detect = ata_cable_40wire, }; @@ -290,12 +383,11 @@ static struct ata_port_operations cmd648_port_ops = { .inherits = &cmd64x_base_ops, .bmdma_stop = cmd648_bmdma_stop, .cable_detect = cmd648_cable_detect, + .qc_defer = ata_std_qc_defer }; static int cmd64x_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { - u32 class_rev; - static const struct ata_port_info cmd_info[6] = { { /* CMD 643 - no UDMA */ .flags = ATA_FLAG_SLAVE_POSS, @@ -340,40 +432,43 @@ static int cmd64x_init_one(struct pci_dev *pdev, const struct pci_device_id *id) const struct ata_port_info *ppi[] = { &cmd_info[id->driver_data], NULL }; u8 mrdmode; int rc; + struct ata_host *host; rc = pcim_enable_device(pdev); if (rc) return rc; - pci_read_config_dword(pdev, PCI_CLASS_REVISION, &class_rev); - class_rev &= 0xFF; - if (id->driver_data == 0) /* 643 */ ata_pci_bmdma_clear_simplex(pdev); if (pdev->device == PCI_DEVICE_ID_CMD_646) { /* Does UDMA work ? */ - if (class_rev > 4) + if (pdev->revision > 4) ppi[0] = &cmd_info[2]; /* Early rev with other problems ? */ - else if (class_rev == 1) + else if (pdev->revision == 1) ppi[0] = &cmd_info[3]; } + pci_write_config_byte(pdev, PCI_LATENCY_TIMER, 64); pci_read_config_byte(pdev, MRDMODE, &mrdmode); mrdmode &= ~ 0x30; /* IRQ set up */ mrdmode |= 0x02; /* Memory read line enable */ pci_write_config_byte(pdev, MRDMODE, mrdmode); - /* Force PIO 0 here.. */ - /* PPC specific fixup copied from old driver */ #ifdef CONFIG_PPC pci_write_config_byte(pdev, UDIDETCR0, 0xF0); #endif + rc = ata_pci_sff_prepare_host(pdev, ppi, &host); + if (rc) + return rc; + /* We use this pointer to track the AP which has DMA running */ + host->private_data = NULL; - return ata_pci_sff_init_one(pdev, ppi, &cmd64x_sht, NULL); + pci_set_master(pdev); + return ata_pci_sff_activate_host(host, cmd64x_interrupt, &cmd64x_sht); } #ifdef CONFIG_PM diff --git a/drivers/ata/pata_cs5520.c b/drivers/ata/pata_cs5520.c index 0df83cf..95ebdac 100644 --- a/drivers/ata/pata_cs5520.c +++ b/drivers/ata/pata_cs5520.c @@ -90,48 +90,12 @@ static void cs5520_set_timings(struct ata_port *ap, struct ata_device *adev, int } /** - * cs5520_enable_dma - turn on DMA bits - * - * Turn on the DMA bits for this disk. Needed because the BIOS probably - * has not done the work for us. Belongs in the core SATA code. - */ - -static void cs5520_enable_dma(struct ata_port *ap, struct ata_device *adev) -{ - /* Set the DMA enable/disable flag */ - u8 reg = ioread8(ap->ioaddr.bmdma_addr + 0x02); - reg |= 1<<(adev->devno + 5); - iowrite8(reg, ap->ioaddr.bmdma_addr + 0x02); -} - -/** - * cs5520_set_dmamode - program DMA timings - * @ap: ATA port - * @adev: ATA device - * - * Program the DMA mode timings for the controller according to the pio - * clocking table. Note that this device sets the DMA timings to PIO - * mode values. This may seem bizarre but the 5520 architecture talks - * PIO mode to the disk and DMA mode to the controller so the underlying - * transfers are PIO timed. - */ - -static void cs5520_set_dmamode(struct ata_port *ap, struct ata_device *adev) -{ - static const int dma_xlate[3] = { XFER_PIO_0, XFER_PIO_3, XFER_PIO_4 }; - cs5520_set_timings(ap, adev, dma_xlate[adev->dma_mode]); - cs5520_enable_dma(ap, adev); -} - -/** * cs5520_set_piomode - program PIO timings * @ap: ATA port * @adev: ATA device * * Program the PIO mode timings for the controller according to the pio - * clocking table. We know pio_mode will equal dma_mode because of the - * CS5520 architecture. At least once we turned DMA on and wrote a - * mode setter. + * clocking table. */ static void cs5520_set_piomode(struct ata_port *ap, struct ata_device *adev) @@ -149,7 +113,6 @@ static struct ata_port_operations cs5520_port_ops = { .qc_prep = ata_sff_dumb_qc_prep, .cable_detect = ata_cable_40wire, .set_piomode = cs5520_set_piomode, - .set_dmamode = cs5520_set_dmamode, }; static int __devinit cs5520_init_one(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/ata/pata_cs5536.c b/drivers/ata/pata_cs5536.c index 6da4cb4..ffee397 100644 --- a/drivers/ata/pata_cs5536.c +++ b/drivers/ata/pata_cs5536.c @@ -224,7 +224,7 @@ static struct scsi_host_template cs5536_sht = { }; static struct ata_port_operations cs5536_port_ops = { - .inherits = &ata_bmdma_port_ops, + .inherits = &ata_bmdma32_port_ops, .cable_detect = cs5536_cable_detect, .set_piomode = cs5536_set_piomode, .set_dmamode = cs5536_set_dmamode, diff --git a/drivers/ata/pata_efar.c b/drivers/ata/pata_efar.c index 2a6412f..b2e71e6 100644 --- a/drivers/ata/pata_efar.c +++ b/drivers/ata/pata_efar.c @@ -2,6 +2,7 @@ * pata_efar.c - EFAR PIIX clone controller driver * * (C) 2005 Red Hat + * (C) 2009 Bartlomiej Zolnierkiewicz * * Some parts based on ata_piix.c by Jeff Garzik and others. * @@ -118,12 +119,12 @@ static void efar_set_piomode (struct ata_port *ap, struct ata_device *adev) int shift = 4 * ap->port_no; u8 slave_data; - idetm_data &= 0xCC0F; + idetm_data &= 0xFF0F; idetm_data |= (control << 4); /* Slave timing in separate register */ pci_read_config_byte(dev, 0x44, &slave_data); - slave_data &= 0x0F << shift; + slave_data &= ap->port_no ? 0x0F : 0xF0; slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << shift; pci_write_config_byte(dev, 0x44, slave_data); } @@ -200,7 +201,7 @@ static void efar_set_dmamode (struct ata_port *ap, struct ata_device *adev) master_data &= 0xFF4F; /* Mask out IORDY|TIME1|DMAONLY */ master_data |= control << 4; pci_read_config_byte(dev, 0x44, &slave_data); - slave_data &= (0x0F + 0xE1 * ap->port_no); + slave_data &= ap->port_no ? 0x0F : 0xF0; /* Load the matching timing */ slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << (ap->port_no ? 4 : 0); pci_write_config_byte(dev, 0x44, slave_data); @@ -251,7 +252,7 @@ static int efar_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) static const struct ata_port_info info = { .flags = ATA_FLAG_SLAVE_POSS, .pio_mask = ATA_PIO4, - .mwdma_mask = ATA_MWDMA2, + .mwdma_mask = ATA_MWDMA12_ONLY, .udma_mask = ATA_UDMA4, .port_ops = &efar_ops, }; diff --git a/drivers/ata/pata_hpt366.c b/drivers/ata/pata_hpt366.c index d7f2da1..0bd48e8 100644 --- a/drivers/ata/pata_hpt366.c +++ b/drivers/ata/pata_hpt366.c @@ -27,7 +27,7 @@ #include <linux/libata.h> #define DRV_NAME "pata_hpt366" -#define DRV_VERSION "0.6.2" +#define DRV_VERSION "0.6.7" struct hpt_clock { u8 xfer_mode; @@ -36,24 +36,22 @@ struct hpt_clock { /* key for bus clock timings * bit - * 0:3 data_high_time. inactive time of DIOW_/DIOR_ for PIO and MW - * DMA. cycles = value + 1 - * 4:8 data_low_time. active time of DIOW_/DIOR_ for PIO and MW - * DMA. cycles = value + 1 - * 9:12 cmd_high_time. inactive time of DIOW_/DIOR_ during task file + * 0:3 data_high_time. Inactive time of DIOW_/DIOR_ for PIO and MW DMA. + * cycles = value + 1 + * 4:7 data_low_time. Active time of DIOW_/DIOR_ for PIO and MW DMA. + * cycles = value + 1 + * 8:11 cmd_high_time. Inactive time of DIOW_/DIOR_ during task file * register access. - * 13:17 cmd_low_time. active time of DIOW_/DIOR_ during task file + * 12:15 cmd_low_time. Active time of DIOW_/DIOR_ during task file * register access. - * 18:21 udma_cycle_time. clock freq and clock cycles for UDMA xfer. - * during task file register access. - * 22:24 pre_high_time. time to initialize 1st cycle for PIO and MW DMA - * xfer. - * 25:27 cmd_pre_high_time. time to initialize 1st PIO cycle for task + * 16:18 udma_cycle_time. Clock cycles for UDMA xfer? + * 19:21 pre_high_time. Time to initialize 1st cycle for PIO and MW DMA xfer. + * 22:24 cmd_pre_high_time. Time to initialize 1st PIO cycle for task file * register access. - * 28 UDMA enable - * 29 DMA enable - * 30 PIO_MST enable. if set, the chip is in bus master mode during - * PIO. + * 28 UDMA enable. + * 29 DMA enable. + * 30 PIO_MST enable. If set, the chip is in bus master mode during + * PIO xfer. * 31 FIFO enable. */ @@ -344,7 +342,6 @@ static int hpt36x_init_one(struct pci_dev *dev, const struct pci_device_id *id) const struct ata_port_info *ppi[] = { &info_hpt366, NULL }; void *hpriv = NULL; - u32 class_rev; u32 reg1; int rc; @@ -352,13 +349,10 @@ static int hpt36x_init_one(struct pci_dev *dev, const struct pci_device_id *id) if (rc) return rc; - pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); - class_rev &= 0xFF; - /* May be a later chip in disguise. Check */ /* Newer chips are not in the HPT36x driver. Ignore them */ - if (class_rev > 2) - return -ENODEV; + if (dev->revision > 2) + return -ENODEV; hpt36x_init_chipset(dev); diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c index d0a7df2..4224cfc 100644 --- a/drivers/ata/pata_hpt37x.c +++ b/drivers/ata/pata_hpt37x.c @@ -24,7 +24,7 @@ #include <linux/libata.h> #define DRV_NAME "pata_hpt37x" -#define DRV_VERSION "0.6.12" +#define DRV_VERSION "0.6.14" struct hpt_clock { u8 xfer_speed; @@ -303,72 +303,79 @@ static unsigned long hpt370a_filter(struct ata_device *adev, unsigned long mask) } /** - * hpt37x_pre_reset - reset the hpt37x bus - * @link: ATA link to reset - * @deadline: deadline jiffies for the operation + * hpt37x_cable_detect - Detect the cable type + * @ap: ATA port to detect on * - * Perform the initial reset handling for the 370/372 and 374 func 0 + * Return the cable type attached to this port */ -static int hpt37x_pre_reset(struct ata_link *link, unsigned long deadline) +static int hpt37x_cable_detect(struct ata_port *ap) { - u8 scr2, ata66; - struct ata_port *ap = link->ap; struct pci_dev *pdev = to_pci_dev(ap->host->dev); - static const struct pci_bits hpt37x_enable_bits[] = { - { 0x50, 1, 0x04, 0x04 }, - { 0x54, 1, 0x04, 0x04 } - }; - if (!pci_test_config_bits(pdev, &hpt37x_enable_bits[ap->port_no])) - return -ENOENT; + u8 scr2, ata66; pci_read_config_byte(pdev, 0x5B, &scr2); pci_write_config_byte(pdev, 0x5B, scr2 & ~0x01); + + udelay(10); /* debounce */ + /* Cable register now active */ pci_read_config_byte(pdev, 0x5A, &ata66); /* Restore state */ pci_write_config_byte(pdev, 0x5B, scr2); if (ata66 & (2 >> ap->port_no)) - ap->cbl = ATA_CBL_PATA40; + return ATA_CBL_PATA40; else - ap->cbl = ATA_CBL_PATA80; - - /* Reset the state machine */ - pci_write_config_byte(pdev, 0x50 + 4 * ap->port_no, 0x37); - udelay(100); - - return ata_sff_prereset(link, deadline); + return ATA_CBL_PATA80; } -static int hpt374_fn1_pre_reset(struct ata_link *link, unsigned long deadline) +/** + * hpt374_fn1_cable_detect - Detect the cable type + * @ap: ATA port to detect on + * + * Return the cable type attached to this port + */ + +static int hpt374_fn1_cable_detect(struct ata_port *ap) { - static const struct pci_bits hpt37x_enable_bits[] = { - { 0x50, 1, 0x04, 0x04 }, - { 0x54, 1, 0x04, 0x04 } - }; - u16 mcr3; - u8 ata66; - struct ata_port *ap = link->ap; struct pci_dev *pdev = to_pci_dev(ap->host->dev); unsigned int mcrbase = 0x50 + 4 * ap->port_no; - - if (!pci_test_config_bits(pdev, &hpt37x_enable_bits[ap->port_no])) - return -ENOENT; + u16 mcr3; + u8 ata66; /* Do the extra channel work */ pci_read_config_word(pdev, mcrbase + 2, &mcr3); - /* Set bit 15 of 0x52 to enable TCBLID as input - */ + /* Set bit 15 of 0x52 to enable TCBLID as input */ pci_write_config_word(pdev, mcrbase + 2, mcr3 | 0x8000); pci_read_config_byte(pdev, 0x5A, &ata66); /* Reset TCBLID/FCBLID to output */ pci_write_config_word(pdev, mcrbase + 2, mcr3); if (ata66 & (2 >> ap->port_no)) - ap->cbl = ATA_CBL_PATA40; + return ATA_CBL_PATA40; else - ap->cbl = ATA_CBL_PATA80; + return ATA_CBL_PATA80; +} + +/** + * hpt37x_pre_reset - reset the hpt37x bus + * @link: ATA link to reset + * @deadline: deadline jiffies for the operation + * + * Perform the initial reset handling for the HPT37x. + */ + +static int hpt37x_pre_reset(struct ata_link *link, unsigned long deadline) +{ + struct ata_port *ap = link->ap; + struct pci_dev *pdev = to_pci_dev(ap->host->dev); + static const struct pci_bits hpt37x_enable_bits[] = { + { 0x50, 1, 0x04, 0x04 }, + { 0x54, 1, 0x04, 0x04 } + }; + if (!pci_test_config_bits(pdev, &hpt37x_enable_bits[ap->port_no])) + return -ENOENT; /* Reset the state machine */ pci_write_config_byte(pdev, 0x50 + 4 * ap->port_no, 0x37); @@ -404,9 +411,8 @@ static void hpt370_set_piomode(struct ata_port *ap, struct ata_device *adev) pci_read_config_dword(pdev, addr1, ®); mode = hpt37x_find_mode(ap, adev->pio_mode); - mode &= ~0x8000000; /* No FIFO in PIO */ - mode &= ~0x30070000; /* Leave config bits alone */ - reg &= 0x30070000; /* Strip timing bits */ + mode &= 0xCFC3FFFF; /* Leave DMA bits alone */ + reg &= ~0xCFC3FFFF; /* Strip timing bits */ pci_write_config_dword(pdev, addr1, reg | mode); } @@ -423,8 +429,7 @@ static void hpt370_set_dmamode(struct ata_port *ap, struct ata_device *adev) { struct pci_dev *pdev = to_pci_dev(ap->host->dev); u32 addr1, addr2; - u32 reg; - u32 mode; + u32 reg, mode, mask; u8 fast; addr1 = 0x40 + 4 * (adev->devno + 2 * ap->port_no); @@ -436,11 +441,12 @@ static void hpt370_set_dmamode(struct ata_port *ap, struct ata_device *adev) fast |= 0x01; pci_write_config_byte(pdev, addr2, fast); + mask = adev->dma_mode < XFER_UDMA_0 ? 0x31C001FF : 0x303C0000; + pci_read_config_dword(pdev, addr1, ®); mode = hpt37x_find_mode(ap, adev->dma_mode); - mode |= 0x8000000; /* FIFO in MWDMA or UDMA */ - mode &= ~0xC0000000; /* Leave config bits alone */ - reg &= 0xC0000000; /* Strip timing bits */ + mode &= mask; + reg &= ~mask; pci_write_config_dword(pdev, addr1, reg | mode); } @@ -508,9 +514,8 @@ static void hpt372_set_piomode(struct ata_port *ap, struct ata_device *adev) mode = hpt37x_find_mode(ap, adev->pio_mode); printk("Find mode for %d reports %X\n", adev->pio_mode, mode); - mode &= ~0x80000000; /* No FIFO in PIO */ - mode &= ~0x30070000; /* Leave config bits alone */ - reg &= 0x30070000; /* Strip timing bits */ + mode &= 0xCFC3FFFF; /* Leave DMA bits alone */ + reg &= ~0xCFC3FFFF; /* Strip timing bits */ pci_write_config_dword(pdev, addr1, reg | mode); } @@ -527,8 +532,7 @@ static void hpt372_set_dmamode(struct ata_port *ap, struct ata_device *adev) { struct pci_dev *pdev = to_pci_dev(ap->host->dev); u32 addr1, addr2; - u32 reg; - u32 mode; + u32 reg, mode, mask; u8 fast; addr1 = 0x40 + 4 * (adev->devno + 2 * ap->port_no); @@ -539,12 +543,13 @@ static void hpt372_set_dmamode(struct ata_port *ap, struct ata_device *adev) fast &= ~0x07; pci_write_config_byte(pdev, addr2, fast); + mask = adev->dma_mode < XFER_UDMA_0 ? 0x31C001FF : 0x303C0000; + pci_read_config_dword(pdev, addr1, ®); mode = hpt37x_find_mode(ap, adev->dma_mode); printk("Find mode for DMA %d reports %X\n", adev->dma_mode, mode); - mode &= ~0xC0000000; /* Leave config bits alone */ - mode |= 0x80000000; /* FIFO in MWDMA or UDMA */ - reg &= 0xC0000000; /* Strip timing bits */ + mode &= mask; + reg &= ~mask; pci_write_config_dword(pdev, addr1, reg | mode); } @@ -584,6 +589,7 @@ static struct ata_port_operations hpt370_port_ops = { .bmdma_stop = hpt370_bmdma_stop, .mode_filter = hpt370_filter, + .cable_detect = hpt37x_cable_detect, .set_piomode = hpt370_set_piomode, .set_dmamode = hpt370_set_dmamode, .prereset = hpt37x_pre_reset, @@ -608,6 +614,7 @@ static struct ata_port_operations hpt372_port_ops = { .bmdma_stop = hpt37x_bmdma_stop, + .cable_detect = hpt37x_cable_detect, .set_piomode = hpt372_set_piomode, .set_dmamode = hpt372_set_dmamode, .prereset = hpt37x_pre_reset, @@ -620,7 +627,8 @@ static struct ata_port_operations hpt372_port_ops = { static struct ata_port_operations hpt374_fn1_port_ops = { .inherits = &hpt372_port_ops, - .prereset = hpt374_fn1_pre_reset, + .cable_detect = hpt374_fn1_cable_detect, + .prereset = hpt37x_pre_reset, }; /** @@ -791,9 +799,8 @@ static int hpt37x_init_one(struct pci_dev *dev, const struct pci_device_id *id) static const int MHz[4] = { 33, 40, 50, 66 }; void *private_data = NULL; const struct ata_port_info *ppi[] = { NULL, NULL }; - + u8 rev = dev->revision; u8 irqmask; - u32 class_rev; u8 mcr1; u32 freq; int prefer_dpll = 1; @@ -808,19 +815,16 @@ static int hpt37x_init_one(struct pci_dev *dev, const struct pci_device_id *id) if (rc) return rc; - pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); - class_rev &= 0xFF; - if (dev->device == PCI_DEVICE_ID_TTI_HPT366) { /* May be a later chip in disguise. Check */ /* Older chips are in the HPT366 driver. Ignore them */ - if (class_rev < 3) + if (rev < 3) return -ENODEV; /* N series chips have their own driver. Ignore */ - if (class_rev == 6) + if (rev == 6) return -ENODEV; - switch(class_rev) { + switch(rev) { case 3: ppi[0] = &info_hpt370; chip_table = &hpt370; @@ -836,28 +840,29 @@ static int hpt37x_init_one(struct pci_dev *dev, const struct pci_device_id *id) chip_table = &hpt372; break; default: - printk(KERN_ERR "pata_hpt37x: Unknown HPT366 subtype please report (%d).\n", class_rev); + printk(KERN_ERR "pata_hpt37x: Unknown HPT366 " + "subtype, please report (%d).\n", rev); return -ENODEV; } } else { switch(dev->device) { case PCI_DEVICE_ID_TTI_HPT372: /* 372N if rev >= 2*/ - if (class_rev >= 2) + if (rev >= 2) return -ENODEV; ppi[0] = &info_hpt372; chip_table = &hpt372a; break; case PCI_DEVICE_ID_TTI_HPT302: /* 302N if rev > 1 */ - if (class_rev > 1) + if (rev > 1) return -ENODEV; ppi[0] = &info_hpt372; /* Check this */ chip_table = &hpt302; break; case PCI_DEVICE_ID_TTI_HPT371: - if (class_rev > 1) + if (rev > 1) return -ENODEV; ppi[0] = &info_hpt372; chip_table = &hpt371; diff --git a/drivers/ata/pata_hpt3x2n.c b/drivers/ata/pata_hpt3x2n.c index 3d59fe0..9a09a1b 100644 --- a/drivers/ata/pata_hpt3x2n.c +++ b/drivers/ata/pata_hpt3x2n.c @@ -25,7 +25,7 @@ #include <linux/libata.h> #define DRV_NAME "pata_hpt3x2n" -#define DRV_VERSION "0.3.4" +#define DRV_VERSION "0.3.7" enum { HPT_PCI_FAST = (1 << 31), @@ -80,14 +80,13 @@ static struct hpt_clock hpt3x2n_clocks[] = { { XFER_MW_DMA_2, 0x2c829c62 }, { XFER_MW_DMA_1, 0x2c829c66 }, - { XFER_MW_DMA_0, 0x2c829d2c }, + { XFER_MW_DMA_0, 0x2c829d2e }, { XFER_PIO_4, 0x0c829c62 }, { XFER_PIO_3, 0x0c829c84 }, { XFER_PIO_2, 0x0c829ca6 }, { XFER_PIO_1, 0x0d029d26 }, { XFER_PIO_0, 0x0d029d5e }, - { 0, 0x0d029d5e } }; /** @@ -128,12 +127,15 @@ static int hpt3x2n_cable_detect(struct ata_port *ap) pci_read_config_byte(pdev, 0x5B, &scr2); pci_write_config_byte(pdev, 0x5B, scr2 & ~0x01); + + udelay(10); /* debounce */ + /* Cable register now active */ pci_read_config_byte(pdev, 0x5A, &ata66); /* Restore state */ pci_write_config_byte(pdev, 0x5B, scr2); - if (ata66 & (1 << ap->port_no)) + if (ata66 & (2 >> ap->port_no)) return ATA_CBL_PATA40; else return ATA_CBL_PATA80; @@ -185,9 +187,8 @@ static void hpt3x2n_set_piomode(struct ata_port *ap, struct ata_device *adev) pci_read_config_dword(pdev, addr1, ®); mode = hpt3x2n_find_mode(ap, adev->pio_mode); - mode &= ~0x8000000; /* No FIFO in PIO */ - mode &= ~0x30070000; /* Leave config bits alone */ - reg &= 0x30070000; /* Strip timing bits */ + mode &= 0xCFC3FFFF; /* Leave DMA bits alone */ + reg &= ~0xCFC3FFFF; /* Strip timing bits */ pci_write_config_dword(pdev, addr1, reg | mode); } @@ -204,8 +205,7 @@ static void hpt3x2n_set_dmamode(struct ata_port *ap, struct ata_device *adev) { struct pci_dev *pdev = to_pci_dev(ap->host->dev); u32 addr1, addr2; - u32 reg; - u32 mode; + u32 reg, mode, mask; u8 fast; addr1 = 0x40 + 4 * (adev->devno + 2 * ap->port_no); @@ -216,11 +216,12 @@ static void hpt3x2n_set_dmamode(struct ata_port *ap, struct ata_device *adev) fast &= ~0x07; pci_write_config_byte(pdev, addr2, fast); + mask = adev->dma_mode < XFER_UDMA_0 ? 0x31C001FF : 0x303C0000; + pci_read_config_dword(pdev, addr1, ®); mode = hpt3x2n_find_mode(ap, adev->dma_mode); - mode |= 0x8000000; /* FIFO in MWDMA or UDMA */ - mode &= ~0xC0000000; /* Leave config bits alone */ - reg &= 0xC0000000; /* Strip timing bits */ + mode &= mask; + reg &= ~mask; pci_write_config_dword(pdev, addr1, reg | mode); } @@ -447,10 +448,8 @@ static int hpt3x2n_init_one(struct pci_dev *dev, const struct pci_device_id *id) .port_ops = &hpt3x2n_port_ops }; const struct ata_port_info *ppi[] = { &info, NULL }; - + u8 rev = dev->revision; u8 irqmask; - u32 class_rev; - unsigned int pci_mhz; unsigned int f_low, f_high; int adjust; @@ -462,26 +461,23 @@ static int hpt3x2n_init_one(struct pci_dev *dev, const struct pci_device_id *id) if (rc) return rc; - pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); - class_rev &= 0xFF; - switch(dev->device) { case PCI_DEVICE_ID_TTI_HPT366: - if (class_rev < 6) + if (rev < 6) return -ENODEV; break; case PCI_DEVICE_ID_TTI_HPT371: - if (class_rev < 2) + if (rev < 2) return -ENODEV; /* 371N if rev > 1 */ break; case PCI_DEVICE_ID_TTI_HPT372: /* 372N if rev >= 2*/ - if (class_rev < 2) + if (rev < 2) return -ENODEV; break; case PCI_DEVICE_ID_TTI_HPT302: - if (class_rev < 2) + if (rev < 2) return -ENODEV; break; case PCI_DEVICE_ID_TTI_HPT372N: diff --git a/drivers/ata/pata_hpt3x3.c b/drivers/ata/pata_hpt3x3.c index 7e31025..c86c716 100644 --- a/drivers/ata/pata_hpt3x3.c +++ b/drivers/ata/pata_hpt3x3.c @@ -255,8 +255,17 @@ static int hpt3x3_init_one(struct pci_dev *pdev, const struct pci_device_id *id) #ifdef CONFIG_PM static int hpt3x3_reinit_one(struct pci_dev *dev) { + struct ata_host *host = dev_get_drvdata(&dev->dev); + int rc; + + rc = ata_pci_device_do_resume(dev); + if (rc) + return rc; + hpt3x3_init_chipset(dev); - return ata_pci_device_resume(dev); + + ata_host_resume(host); + return 0; } #endif diff --git a/drivers/ata/pata_it8213.c b/drivers/ata/pata_it8213.c index f156da8..8f3325a 100644 --- a/drivers/ata/pata_it8213.c +++ b/drivers/ata/pata_it8213.c @@ -22,7 +22,7 @@ #define DRV_VERSION "0.0.3" /** - * it8213_pre_reset - check for 40/80 pin + * it8213_pre_reset - probe begin * @link: link * @deadline: deadline jiffies for the operation * @@ -92,18 +92,17 @@ static void it8213_set_piomode (struct ata_port *ap, struct ata_device *adev) { 2, 1 }, { 2, 3 }, }; - if (pio > 2) - control |= 1; /* TIME1 enable */ + if (pio > 1) + control |= 1; /* TIME */ if (ata_pio_need_iordy(adev)) /* PIO 3/4 require IORDY */ - control |= 2; /* IORDY enable */ + control |= 2; /* IE */ /* Bit 2 is set for ATAPI on the IT8213 - reverse of ICH/PIIX */ if (adev->class != ATA_DEV_ATA) - control |= 4; + control |= 4; /* PPE */ pci_read_config_word(dev, idetm_port, &idetm_data); - /* Enable PPE, IE and TIME as appropriate */ - + /* Set PPE, IE, and TIME as appropriate */ if (adev->devno == 0) { idetm_data &= 0xCCF0; idetm_data |= control; @@ -112,17 +111,17 @@ static void it8213_set_piomode (struct ata_port *ap, struct ata_device *adev) } else { u8 slave_data; - idetm_data &= 0xCC0F; + idetm_data &= 0xFF0F; idetm_data |= (control << 4); /* Slave timing in separate register */ pci_read_config_byte(dev, 0x44, &slave_data); slave_data &= 0xF0; - slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << 4; + slave_data |= (timings[pio][0] << 2) | timings[pio][1]; pci_write_config_byte(dev, 0x44, slave_data); } - idetm_data |= 0x4000; /* Ensure SITRE is enabled */ + idetm_data |= 0x4000; /* Ensure SITRE is set */ pci_write_config_word(dev, idetm_port, idetm_data); } @@ -173,10 +172,10 @@ static void it8213_set_dmamode (struct ata_port *ap, struct ata_device *adev) udma_enable |= (1 << devid); - /* Load the UDMA mode number */ + /* Load the UDMA cycle time */ pci_read_config_word(dev, 0x4A, &udma_timing); udma_timing &= ~(3 << (4 * devid)); - udma_timing |= (udma & 3) << (4 * devid); + udma_timing |= u_speed << (4 * devid); pci_write_config_word(dev, 0x4A, udma_timing); /* Load the clock selection */ @@ -211,7 +210,7 @@ static void it8213_set_dmamode (struct ata_port *ap, struct ata_device *adev) master_data &= 0xFF4F; /* Mask out IORDY|TIME1|DMAONLY */ master_data |= control << 4; pci_read_config_byte(dev, 0x44, &slave_data); - slave_data &= (0x0F + 0xE1 * ap->port_no); + slave_data &= 0xF0; /* Load the matching timing */ slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << (ap->port_no ? 4 : 0); pci_write_config_byte(dev, 0x44, slave_data); @@ -263,7 +262,7 @@ static int it8213_init_one (struct pci_dev *pdev, const struct pci_device_id *en static const struct ata_port_info info = { .flags = ATA_FLAG_SLAVE_POSS, .pio_mask = ATA_PIO4, - .mwdma_mask = ATA_MWDMA2, + .mwdma_mask = ATA_MWDMA12_ONLY, .udma_mask = ATA_UDMA4, /* FIXME: want UDMA 100? */ .port_ops = &it8213_ops, }; diff --git a/drivers/ata/pata_it821x.c b/drivers/ata/pata_it821x.c index 188bc2f..edc5c1f 100644 --- a/drivers/ata/pata_it821x.c +++ b/drivers/ata/pata_it821x.c @@ -955,7 +955,7 @@ static int it821x_reinit_one(struct pci_dev *pdev) static const struct pci_device_id it821x[] = { { PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8211), }, { PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8212), }, - { PCI_VDEVICE(RDC, 0x1010), }, + { PCI_VDEVICE(RDC, PCI_DEVICE_ID_RDC_D1010), }, { }, }; diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c index 6932e56..9df1ff7 100644 --- a/drivers/ata/pata_legacy.c +++ b/drivers/ata/pata_legacy.c @@ -25,6 +25,13 @@ * http://www.ryston.cz/petr/vlb/pdc20230b.html * http://www.ryston.cz/petr/vlb/pdc20230c.html * http://www.ryston.cz/petr/vlb/pdc20630.html + * QDI65x0: + * http://www.ryston.cz/petr/vlb/qd6500.html + * http://www.ryston.cz/petr/vlb/qd6580.html + * + * QDI65x0 probe code based on drivers/ide/legacy/qd65xx.c + * Rewritten from the work of Colten Edwards <pje120@cs.usask.ca> by + * Samuel Thibault <samuel.thibault@ens-lyon.org> * * Unsupported but docs exist: * Appian/Adaptec AIC25VL01/Cirrus Logic PD7220 @@ -35,7 +42,7 @@ * the MPIIX where the tuning is PCI side but the IDE is "ISA side". * * Specific support is included for the ht6560a/ht6560b/opti82c611a/ - * opti82c465mv/promise 20230c/20630/winbond83759A + * opti82c465mv/promise 20230c/20630/qdi65x0/winbond83759A * * Use the autospeed and pio_mask options with: * Appian ADI/2 aka CLPD7220 or AIC25VL01. @@ -672,7 +679,7 @@ static void qdi6580dp_set_piomode(struct ata_port *ap, struct ata_device *adev) outb(timing, ld_qdi->timing + 2 * ap->port_no); /* Clear the FIFO */ if (adev->class != ATA_DEV_ATA) - outb(0x5F, ld_qdi->timing + 3); + outb(0x5F, (ld_qdi->timing & 0xFFF0) + 3); } /** @@ -707,7 +714,7 @@ static void qdi6580_set_piomode(struct ata_port *ap, struct ata_device *adev) outb(timing, ld_qdi->timing + 2 * adev->devno); /* Clear the FIFO */ if (adev->class != ATA_DEV_ATA) - outb(0x5F, ld_qdi->timing + 3); + outb(0x5F, (ld_qdi->timing & 0xFFF0) + 3); } /** @@ -787,6 +794,7 @@ static struct ata_port_operations qdi6580_port_ops = { static struct ata_port_operations qdi6580dp_port_ops = { .inherits = &legacy_base_port_ops, .set_piomode = qdi6580dp_set_piomode, + .qc_issue = qdi_qc_issue, .sff_data_xfer = vlb32_data_xfer, }; diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c index 2096fb7..950da39 100644 --- a/drivers/ata/pata_marvell.c +++ b/drivers/ata/pata_marvell.c @@ -58,7 +58,7 @@ static int marvell_pata_active(struct pci_dev *pdev) } /** - * marvell_pre_reset - check for 40/80 pin + * marvell_pre_reset - probe begin * @link: link * @deadline: deadline jiffies for the operation * diff --git a/drivers/ata/pata_ns87415.c b/drivers/ata/pata_ns87415.c index 773b159..061aa1c 100644 --- a/drivers/ata/pata_ns87415.c +++ b/drivers/ata/pata_ns87415.c @@ -325,6 +325,13 @@ static struct scsi_host_template ns87415_sht = { ATA_BMDMA_SHT(DRV_NAME), }; +static void ns87415_fixup(struct pci_dev *pdev) +{ + /* Select 512 byte sectors */ + pci_write_config_byte(pdev, 0x55, 0xEE); + /* Select PIO0 8bit clocking */ + pci_write_config_byte(pdev, 0x54, 0xB7); +} /** * ns87415_init_one - Register 87415 ATA PCI device with kernel services @@ -371,10 +378,8 @@ static int ns87415_init_one (struct pci_dev *pdev, const struct pci_device_id *e if (rc) return rc; - /* Select 512 byte sectors */ - pci_write_config_byte(pdev, 0x55, 0xEE); - /* Select PIO0 8bit clocking */ - pci_write_config_byte(pdev, 0x54, 0xB7); + ns87415_fixup(pdev); + return ata_pci_sff_init_one(pdev, ppi, &ns87415_sht, NULL); } @@ -384,6 +389,23 @@ static const struct pci_device_id ns87415_pci_tbl[] = { { } /* terminate list */ }; +#ifdef CONFIG_PM +static int ns87415_reinit_one(struct pci_dev *pdev) +{ + struct ata_host *host = dev_get_drvdata(&pdev->dev); + int rc; + + rc = ata_pci_device_do_resume(pdev); + if (rc) + return rc; + + ns87415_fixup(pdev); + + ata_host_resume(host); + return 0; +} +#endif + static struct pci_driver ns87415_pci_driver = { .name = DRV_NAME, .id_table = ns87415_pci_tbl, @@ -391,7 +413,7 @@ static struct pci_driver ns87415_pci_driver = { .remove = ata_pci_remove_one, #ifdef CONFIG_PM .suspend = ata_pci_device_suspend, - .resume = ata_pci_device_resume, + .resume = ns87415_reinit_one, #endif }; diff --git a/drivers/ata/pata_oldpiix.c b/drivers/ata/pata_oldpiix.c index 84ac503..9a8687d 100644 --- a/drivers/ata/pata_oldpiix.c +++ b/drivers/ata/pata_oldpiix.c @@ -239,7 +239,7 @@ static int oldpiix_init_one (struct pci_dev *pdev, const struct pci_device_id *e static const struct ata_port_info info = { .flags = ATA_FLAG_SLAVE_POSS, .pio_mask = ATA_PIO4, - .mwdma_mask = ATA_MWDMA2, + .mwdma_mask = ATA_MWDMA12_ONLY, .port_ops = &oldpiix_pata_ops, }; const struct ata_port_info *ppi[] = { &info, NULL }; diff --git a/drivers/ata/pata_piccolo.c b/drivers/ata/pata_piccolo.c new file mode 100644 index 0000000..bfe0180 --- /dev/null +++ b/drivers/ata/pata_piccolo.c @@ -0,0 +1,140 @@ +/* + * pata_piccolo.c - Toshiba Piccolo PATA/SATA controller driver. + * + * This is basically an update to ata_generic.c to add Toshiba Piccolo support + * then split out to keep ata_generic "clean". + * + * Copyright 2005 Red Hat Inc, all rights reserved. + * + * Elements from ide/pci/generic.c + * Copyright (C) 2001-2002 Andre Hedrick <andre@linux-ide.org> + * Portions (C) Copyright 2002 Red Hat Inc <alan@redhat.com> + * + * May be copied or modified under the terms of the GNU General Public License + * + * The timing data tables/programming info are courtesy of the NetBSD driver + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <scsi/scsi_host.h> +#include <linux/libata.h> + +#define DRV_NAME "pata_piccolo" +#define DRV_VERSION "0.0.1" + + + +static void tosh_set_piomode(struct ata_port *ap, struct ata_device *adev) +{ + static const u16 pio[6] = { /* For reg 0x50 low word & E088 */ + 0x0566, 0x0433, 0x0311, 0x0201, 0x0200, 0x0100 + }; + struct pci_dev *pdev = to_pci_dev(ap->host->dev); + u16 conf; + pci_read_config_word(pdev, 0x50, &conf); + conf &= 0xE088; + conf |= pio[adev->pio_mode - XFER_PIO_0]; + pci_write_config_word(pdev, 0x50, conf); +} + +static void tosh_set_dmamode(struct ata_port *ap, struct ata_device *adev) +{ + struct pci_dev *pdev = to_pci_dev(ap->host->dev); + u32 conf; + pci_read_config_dword(pdev, 0x5C, &conf); + conf &= 0x78FFE088; /* Keep the other bits */ + if (adev->dma_mode >= XFER_UDMA_0) { + int udma = adev->dma_mode - XFER_UDMA_0; + conf |= 0x80000000; + conf |= (udma + 2) << 28; + conf |= (2 - udma) * 0x111; /* spread into three nibbles */ + } else { + static const u32 mwdma[4] = { + 0x0655, 0x0200, 0x0200, 0x0100 + }; + conf |= mwdma[adev->dma_mode - XFER_MW_DMA_0]; + } + pci_write_config_dword(pdev, 0x5C, conf); +} + + +static struct scsi_host_template tosh_sht = { + ATA_BMDMA_SHT(DRV_NAME), +}; + +static struct ata_port_operations tosh_port_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = ata_cable_unknown, + .set_piomode = tosh_set_piomode, + .set_dmamode = tosh_set_dmamode +}; + +/** + * ata_tosh_init - attach generic IDE + * @dev: PCI device found + * @id: match entry + * + * Called each time a matching IDE interface is found. We check if the + * interface is one we wish to claim and if so we perform any chip + * specific hacks then let the ATA layer do the heavy lifting. + */ + +static int ata_tosh_init_one(struct pci_dev *dev, const struct pci_device_id *id) +{ + static const struct ata_port_info info = { + .flags = ATA_FLAG_SLAVE_POSS, + .pio_mask = ATA_PIO5, + .mwdma_mask = ATA_MWDMA2, + .udma_mask = ATA_UDMA2, + .port_ops = &tosh_port_ops + }; + const struct ata_port_info *ppi[] = { &info, &ata_dummy_port_info }; + /* Just one port for the moment */ + return ata_pci_sff_init_one(dev, ppi, &tosh_sht, NULL); +} + +static struct pci_device_id ata_tosh[] = { + { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_1), }, + { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_2), }, + { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_3), }, + { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_5), }, + { 0, }, +}; + +static struct pci_driver ata_tosh_pci_driver = { + .name = DRV_NAME, + .id_table = ata_tosh, + .probe = ata_tosh_init_one, + .remove = ata_pci_remove_one, +#ifdef CONFIG_PM + .suspend = ata_pci_device_suspend, + .resume = ata_pci_device_resume, +#endif +}; + +static int __init ata_tosh_init(void) +{ + return pci_register_driver(&ata_tosh_pci_driver); +} + + +static void __exit ata_tosh_exit(void) +{ + pci_unregister_driver(&ata_tosh_pci_driver); +} + + +MODULE_AUTHOR("Alan Cox"); +MODULE_DESCRIPTION("Low level driver for Toshiba Piccolo ATA"); +MODULE_LICENSE("GPL"); +MODULE_DEVICE_TABLE(pci, ata_tosh); +MODULE_VERSION(DRV_VERSION); + +module_init(ata_tosh_init); +module_exit(ata_tosh_exit); + diff --git a/drivers/ata/pata_radisys.c b/drivers/ata/pata_radisys.c index 4401b33..4fd25e7 100644 --- a/drivers/ata/pata_radisys.c +++ b/drivers/ata/pata_radisys.c @@ -139,9 +139,9 @@ static void radisys_set_dmamode (struct ata_port *ap, struct ata_device *adev) pci_read_config_byte(dev, 0x4A, &udma_mode); if (adev->xfer_mode == XFER_UDMA_2) - udma_mode &= ~ (1 << adev->devno); + udma_mode &= ~(2 << (adev->devno * 4)); else /* UDMA 4 */ - udma_mode |= (1 << adev->devno); + udma_mode |= (2 << (adev->devno * 4)); pci_write_config_byte(dev, 0x4A, udma_mode); diff --git a/drivers/ata/pata_rdc.c b/drivers/ata/pata_rdc.c index c843a1e..237a24d 100644 --- a/drivers/ata/pata_rdc.c +++ b/drivers/ata/pata_rdc.c @@ -284,7 +284,7 @@ static struct ata_port_info rdc_port_info = { .flags = ATA_FLAG_SLAVE_POSS, .pio_mask = ATA_PIO4, - .mwdma_mask = ATA_MWDMA2, + .mwdma_mask = ATA_MWDMA12_ONLY, .udma_mask = ATA_UDMA5, .port_ops = &rdc_pata_ops, }; diff --git a/drivers/ata/pata_rz1000.c b/drivers/ata/pata_rz1000.c index a5e4dfe..2932998 100644 --- a/drivers/ata/pata_rz1000.c +++ b/drivers/ata/pata_rz1000.c @@ -105,11 +105,20 @@ static int rz1000_init_one (struct pci_dev *pdev, const struct pci_device_id *en #ifdef CONFIG_PM static int rz1000_reinit_one(struct pci_dev *pdev) { + struct ata_host *host = dev_get_drvdata(&pdev->dev); + int rc; + + rc = ata_pci_device_do_resume(pdev); + if (rc) + return rc; + /* If this fails on resume (which is a "cant happen" case), we must stop as any progress risks data loss */ if (rz1000_fifo_disable(pdev)) panic("rz1000 fifo"); - return ata_pci_device_resume(pdev); + + ata_host_resume(host); + return 0; } #endif diff --git a/drivers/ata/pata_sil680.c b/drivers/ata/pata_sil680.c index 4cb649d..a2ace48 100644 --- a/drivers/ata/pata_sil680.c +++ b/drivers/ata/pata_sil680.c @@ -212,13 +212,11 @@ static struct ata_port_operations sil680_port_ops = { static u8 sil680_init_chip(struct pci_dev *pdev, int *try_mmio) { - u32 class_rev = 0; u8 tmpbyte = 0; - pci_read_config_dword(pdev, PCI_CLASS_REVISION, &class_rev); - class_rev &= 0xff; /* FIXME: double check */ - pci_write_config_byte(pdev, PCI_CACHE_LINE_SIZE, (class_rev) ? 1 : 255); + pci_write_config_byte(pdev, PCI_CACHE_LINE_SIZE, + pdev->revision ? 1 : 255); pci_write_config_byte(pdev, 0x80, 0x00); pci_write_config_byte(pdev, 0x84, 0x00); diff --git a/drivers/ata/pata_sis.c b/drivers/ata/pata_sis.c index 488e77b..5c30d56 100644 --- a/drivers/ata/pata_sis.c +++ b/drivers/ata/pata_sis.c @@ -2,7 +2,7 @@ * pata_sis.c - SiS ATA driver * * (C) 2005 Red Hat - * (C) 2007 Bartlomiej Zolnierkiewicz + * (C) 2007,2009 Bartlomiej Zolnierkiewicz * * Based upon linux/drivers/ide/pci/sis5513.c * Copyright (C) 1999-2000 Andre Hedrick <andre@linux-ide.org> @@ -829,6 +829,23 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) return ata_pci_sff_init_one(pdev, ppi, &sis_sht, chipset); } +#ifdef CONFIG_PM +static int sis_reinit_one(struct pci_dev *pdev) +{ + struct ata_host *host = dev_get_drvdata(&pdev->dev); + int rc; + + rc = ata_pci_device_do_resume(pdev); + if (rc) + return rc; + + sis_fixup(pdev, host->private_data); + + ata_host_resume(host); + return 0; +} +#endif + static const struct pci_device_id sis_pci_tbl[] = { { PCI_VDEVICE(SI, 0x5513), }, /* SiS 5513 */ { PCI_VDEVICE(SI, 0x5518), }, /* SiS 5518 */ @@ -844,7 +861,7 @@ static struct pci_driver sis_pci_driver = { .remove = ata_pci_remove_one, #ifdef CONFIG_PM .suspend = ata_pci_device_suspend, - .resume = ata_pci_device_resume, + .resume = sis_reinit_one, #endif }; diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c index 88984b8..0d97890 100644 --- a/drivers/ata/pata_via.c +++ b/drivers/ata/pata_via.c @@ -303,14 +303,21 @@ static void via_do_set_mode(struct ata_port *ap, struct ata_device *adev, int mo } /* Set UDMA unless device is not UDMA capable */ - if (udma_type && t.udma) { - u8 cable80_status; + if (udma_type) { + u8 udma_etc; - /* Get 80-wire cable detection bit */ - pci_read_config_byte(pdev, 0x50 + offset, &cable80_status); - cable80_status &= 0x10; + pci_read_config_byte(pdev, 0x50 + offset, &udma_etc); - pci_write_config_byte(pdev, 0x50 + offset, ut | cable80_status); + /* clear transfer mode bit */ + udma_etc &= ~0x20; + + if (t.udma) { + /* preserve 80-wire cable detection bit */ + udma_etc &= 0x10; + udma_etc |= ut; + } + + pci_write_config_byte(pdev, 0x50 + offset, udma_etc); } } @@ -337,6 +344,32 @@ static void via_set_dmamode(struct ata_port *ap, struct ata_device *adev) } /** + * via_mode_filter - filter buggy device/mode pairs + * @dev: ATA device + * @mask: Mode bitmask + * + * We need to apply some minimal filtering for old controllers and at least + * one breed of Transcend SSD. Return the updated mask. + */ + +static unsigned long via_mode_filter(struct ata_device *dev, unsigned long mask) +{ + struct ata_host *host = dev->link->ap->host; + const struct via_isa_bridge *config = host->private_data; + unsigned char model_num[ATA_ID_PROD_LEN + 1]; + + if (config->id == PCI_DEVICE_ID_VIA_82C586_0) { + ata_id_c_string(dev->id, model_num, ATA_ID_PROD, sizeof(model_num)); + if (strcmp(model_num, "TS64GSSD25-M") == 0) { + ata_dev_printk(dev, KERN_WARNING, + "disabling UDMA mode due to reported lockups with this device.\n"); + mask &= ~ ATA_MASK_UDMA; + } + } + return ata_bmdma_mode_filter(dev, mask); +} + +/** * via_tf_load - send taskfile registers to host controller * @ap: Port to which output is sent * @tf: ATA taskfile register set @@ -427,6 +460,7 @@ static struct ata_port_operations via_port_ops = { .prereset = via_pre_reset, .sff_tf_load = via_tf_load, .port_start = via_port_start, + .mode_filter = via_mode_filter, }; static struct ata_port_operations via_port_ops_noirq = { @@ -526,7 +560,7 @@ static int via_init_one(struct pci_dev *pdev, const struct pci_device_id *id) .port_ops = &via_port_ops }; const struct ata_port_info *ppi[] = { NULL, NULL }; - struct pci_dev *isa = NULL; + struct pci_dev *isa; const struct via_isa_bridge *config; static int printed_version; u8 enable; @@ -551,15 +585,13 @@ static int via_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if ((isa = pci_get_device(PCI_VENDOR_ID_VIA + !!(config->flags & VIA_BAD_ID), config->id, NULL))) { + u8 rev = isa->revision; + pci_dev_put(isa); - if (isa->revision >= config->rev_min && - isa->revision <= config->rev_max) + if (rev >= config->rev_min && rev <= config->rev_max) break; - pci_dev_put(isa); } - pci_dev_put(isa); - if (!(config->flags & VIA_NO_ENABLES)) { /* 0x40 low bits indicate enabled channels */ pci_read_config_byte(pdev, 0x40 , &enable); diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c index 172b57e..8a5d35b 100644 --- a/drivers/ata/sata_fsl.c +++ b/drivers/ata/sata_fsl.c @@ -34,7 +34,7 @@ enum { SATA_FSL_HOST_FLAGS = (ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY | ATA_FLAG_MMIO | ATA_FLAG_PIO_DMA | - ATA_FLAG_PMP | ATA_FLAG_NCQ), + ATA_FLAG_PMP | ATA_FLAG_NCQ | ATA_FLAG_AN), SATA_FSL_MAX_CMDS = SATA_FSL_QUEUE_DEPTH, SATA_FSL_CMD_HDR_SIZE = 16, /* 4 DWORDS */ @@ -132,7 +132,7 @@ enum { INT_ON_SINGL_DEVICE_ERR = (1 << 1), INT_ON_CMD_COMPLETE = 1, - INT_ON_ERROR = INT_ON_FATAL_ERR | + INT_ON_ERROR = INT_ON_FATAL_ERR | INT_ON_SNOTIFY_UPDATE | INT_ON_PHYRDY_CHG | INT_ON_SINGL_DEVICE_ERR, /* @@ -153,7 +153,7 @@ enum { IE_ON_CMD_COMPLETE = 1, DEFAULT_PORT_IRQ_ENABLE_MASK = IE_ON_FATAL_ERR | IE_ON_PHYRDY_CHG | - IE_ON_SIGNATURE_UPDATE | + IE_ON_SIGNATURE_UPDATE | IE_ON_SNOTIFY_UPDATE | IE_ON_SINGL_DEVICE_ERR | IE_ON_CMD_COMPLETE, EXT_INDIRECT_SEG_PRD_FLAG = (1 << 31), @@ -992,9 +992,8 @@ static void sata_fsl_error_intr(struct ata_port *ap) */ sata_fsl_scr_read(&ap->link, SCR_ERROR, &SError); - if (unlikely(SError & 0xFFFF0000)) { + if (unlikely(SError & 0xFFFF0000)) sata_fsl_scr_write(&ap->link, SCR_ERROR, SError); - } DPRINTK("error_intr,hStat=0x%x,CE=0x%x,DE =0x%x,SErr=0x%x\n", hstatus, cereg, ioread32(hcr_base + DE), SError); @@ -1007,6 +1006,10 @@ static void sata_fsl_error_intr(struct ata_port *ap) freeze = 1; } + /* Handle SDB FIS receive & notify update */ + if (hstatus & INT_ON_SNOTIFY_UPDATE) + sata_async_notification(ap); + /* Handle PHYRDY change notification */ if (hstatus & INT_ON_PHYRDY_CHG) { DPRINTK("SATA FSL: PHYRDY change indication\n"); @@ -1070,9 +1073,9 @@ static void sata_fsl_error_intr(struct ata_port *ap) } /* record error info */ - if (qc) { + if (qc) qc->err_mask |= err_mask; - } else + else ehi->err_mask |= err_mask; ehi->action |= action; @@ -1103,7 +1106,6 @@ static void sata_fsl_host_intr(struct ata_port *ap) if (unlikely(SError & 0xFFFF0000)) { DPRINTK("serror @host_intr : 0x%x\n", SError); sata_fsl_error_intr(ap); - } if (unlikely(hstatus & INT_ON_ERROR)) { diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 6f5093b..a8a7be0 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -2217,7 +2217,7 @@ static unsigned int mv_qc_issue_fis(struct ata_queued_cmd *qc) int err = 0; ata_tf_to_fis(&qc->tf, link->pmp, 1, (void *)fis); - err = mv_send_fis(ap, fis, sizeof(fis) / sizeof(fis[0])); + err = mv_send_fis(ap, fis, ARRAY_SIZE(fis)); if (err) return err; diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index e6946fc..1370df6 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -417,6 +417,10 @@ static struct ata_port_operations sil24_ops = { #endif }; +static int sata_sil24_msi; /* Disable MSI */ +module_param_named(msi, sata_sil24_msi, bool, S_IRUGO); +MODULE_PARM_DESC(msi, "Enable MSI (Default: false)"); + /* * Use bits 30-31 of port_flags to encode available port numbers. * Current maxium is 4. @@ -1340,6 +1344,11 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) sil24_init_controller(host); + if (sata_sil24_msi && !pci_enable_msi(pdev)) { + dev_printk(KERN_INFO, &pdev->dev, "Using MSI\n"); + pci_intx(pdev, 0); + } + pci_set_master(pdev); return ata_host_activate(host, pdev->irq, sil24_interrupt, IRQF_SHARED, &sil24_sht); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 846d89e..5a01ece 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -185,6 +185,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) } dev->power.runtime_status = RPM_SUSPENDING; + dev->power.deferred_resume = false; if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) { spin_unlock_irq(&dev->power.lock); @@ -200,7 +201,6 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) if (retval) { dev->power.runtime_status = RPM_ACTIVE; pm_runtime_cancel_pending(dev); - dev->power.deferred_resume = false; if (retval == -EAGAIN || retval == -EBUSY) { notify = true; @@ -217,7 +217,6 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq) wake_up_all(&dev->power.wait_queue); if (dev->power.deferred_resume) { - dev->power.deferred_resume = false; __pm_runtime_resume(dev, false); retval = -EAGAIN; goto out; @@ -626,6 +625,8 @@ int pm_schedule_suspend(struct device *dev, unsigned int delay) goto out; dev->power.timer_expires = jiffies + msecs_to_jiffies(delay); + if (!dev->power.timer_expires) + dev->power.timer_expires = 1; mod_timer(&dev->power.suspend_timer, dev->power.timer_expires); out: @@ -659,13 +660,17 @@ static int __pm_request_resume(struct device *dev) pm_runtime_deactivate_timer(dev); + if (dev->power.runtime_status == RPM_SUSPENDING) { + dev->power.deferred_resume = true; + return retval; + } if (dev->power.request_pending) { /* If non-resume request is pending, we can overtake it. */ dev->power.request = retval ? RPM_REQ_NONE : RPM_REQ_RESUME; return retval; - } else if (retval) { - return retval; } + if (retval) + return retval; dev->power.request = RPM_REQ_RESUME; dev->power.request_pending = true; @@ -777,7 +782,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) } if (parent) { - spin_lock(&parent->power.lock); + spin_lock_nested(&parent->power.lock, SINGLE_DEPTH_NESTING); /* * It is invalid to put an active child under a parent that is @@ -786,12 +791,10 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) */ if (!parent->power.disable_depth && !parent->power.ignore_children - && parent->power.runtime_status != RPM_ACTIVE) { + && parent->power.runtime_status != RPM_ACTIVE) error = -EBUSY; - } else { - if (dev->power.runtime_status == RPM_SUSPENDED) - atomic_inc(&parent->power.child_count); - } + else if (dev->power.runtime_status == RPM_SUSPENDED) + atomic_inc(&parent->power.child_count); spin_unlock(&parent->power.lock); diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 1d886e0..77bfce5 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP instead, which can be configured to be on-disk compatible with the cryptoloop device. +source "drivers/block/drbd/Kconfig" + config BLK_DEV_NBD tristate "Network block device support" depends on NET diff --git a/drivers/block/Makefile b/drivers/block/Makefile index cdaa3f8..aff5ac9 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o +obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ swim_mod-objs := swim.o swim_asm.o diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 92b1263..873e594 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -179,19 +179,17 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl); static int deregister_disk(ctlr_info_t *h, int drv_index, int clear_all, int via_ioctl); -static void cciss_read_capacity(int ctlr, int logvol, int withirq, +static void cciss_read_capacity(int ctlr, int logvol, sector_t *total_size, unsigned int *block_size); -static void cciss_read_capacity_16(int ctlr, int logvol, int withirq, +static void cciss_read_capacity_16(int ctlr, int logvol, sector_t *total_size, unsigned int *block_size); static void cciss_geometry_inquiry(int ctlr, int logvol, - int withirq, sector_t total_size, + sector_t total_size, unsigned int block_size, InquiryData_struct *inq_buff, drive_info_struct *drv); static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *, __u32); static void start_io(ctlr_info_t *h); -static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, - __u8 page_code, unsigned char *scsi3addr, int cmd_type); static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, __u8 page_code, unsigned char scsi3addr[], int cmd_type); @@ -424,12 +422,9 @@ cciss_proc_write(struct file *file, const char __user *buf, if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) { struct seq_file *seq = file->private_data; ctlr_info_t *h = seq->private; - int rc; - rc = cciss_engage_scsi(h->ctlr); - if (rc != 0) - err = -rc; - else + err = cciss_engage_scsi(h->ctlr); + if (err == 0) err = length; } else #endif /* CONFIG_CISS_SCSI_TAPE */ @@ -1657,9 +1652,11 @@ static void cciss_softirq_done(struct request *rq) { CommandList_struct *cmd = rq->completion_data; ctlr_info_t *h = hba[cmd->ctlr]; + SGDescriptor_struct *curr_sg = cmd->SG; unsigned long flags; u64bit temp64; int i, ddir; + int sg_index = 0; if (cmd->Request.Type.Direction == XFER_READ) ddir = PCI_DMA_FROMDEVICE; @@ -1669,9 +1666,22 @@ static void cciss_softirq_done(struct request *rq) /* command did not need to be retried */ /* unmap the DMA mapping for all the scatter gather elements */ for (i = 0; i < cmd->Header.SGList; i++) { - temp64.val32.lower = cmd->SG[i].Addr.lower; - temp64.val32.upper = cmd->SG[i].Addr.upper; - pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); + if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) { + temp64.val32.lower = cmd->SG[i].Addr.lower; + temp64.val32.upper = cmd->SG[i].Addr.upper; + pci_dma_sync_single_for_cpu(h->pdev, temp64.val, + cmd->SG[i].Len, ddir); + pci_unmap_single(h->pdev, temp64.val, + cmd->SG[i].Len, ddir); + /* Point to the next block */ + curr_sg = h->cmd_sg_list[cmd->cmdindex]->sgchain; + sg_index = 0; + } + temp64.val32.lower = curr_sg[sg_index].Addr.lower; + temp64.val32.upper = curr_sg[sg_index].Addr.upper; + pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len, + ddir); + ++sg_index; } #ifdef CCISS_DEBUG @@ -1701,7 +1711,7 @@ static inline void log_unit_to_scsi3addr(ctlr_info_t *h, * via the inquiry page 0. Model, vendor, and rev are set to empty strings if * they cannot be read. */ -static void cciss_get_device_descr(int ctlr, int logvol, int withirq, +static void cciss_get_device_descr(int ctlr, int logvol, char *vendor, char *model, char *rev) { int rc; @@ -1717,14 +1727,8 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq, return; log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) - rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, - sizeof(InquiryData_struct), 0, - scsi3addr, TYPE_CMD); - else - rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf, - sizeof(InquiryData_struct), 0, - scsi3addr, TYPE_CMD); + rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0, + scsi3addr, TYPE_CMD); if (rc == IO_OK) { memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN); vendor[VENDOR_LEN] = '\0'; @@ -1743,7 +1747,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq, * number cannot be had, for whatever reason, 16 bytes of 0xff * are returned instead. */ -static void cciss_get_serial_no(int ctlr, int logvol, int withirq, +static void cciss_get_serial_no(int ctlr, int logvol, unsigned char *serial_no, int buflen) { #define PAGE_83_INQ_BYTES 64 @@ -1759,12 +1763,8 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq, return; memset(serial_no, 0, buflen); log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) - rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, - PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); - else - rc = sendcmd(CISS_INQUIRY, ctlr, buf, - PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); + rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, + PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); if (rc == IO_OK) memcpy(serial_no, &buf[8], buflen); kfree(buf); @@ -1793,10 +1793,10 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); /* This is a hardware imposed limit. */ - blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES); + blk_queue_max_hw_segments(disk->queue, h->maxsgentries); /* This is a limit in the driver and could be eliminated. */ - blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES); + blk_queue_max_phys_segments(disk->queue, h->maxsgentries); blk_queue_max_sectors(disk->queue, h->cciss_max_sectors); @@ -1852,18 +1852,16 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time, /* testing to see if 16-byte CDBs are already being used */ if (h->cciss_read == CCISS_READ_16) { - cciss_read_capacity_16(h->ctlr, drv_index, 1, + cciss_read_capacity_16(h->ctlr, drv_index, &total_size, &block_size); } else { - cciss_read_capacity(ctlr, drv_index, 1, - &total_size, &block_size); - + cciss_read_capacity(ctlr, drv_index, &total_size, &block_size); /* if read_capacity returns all F's this volume is >2TB */ /* in size so we switch to 16-byte CDB's for all */ /* read/write ops */ if (total_size == 0xFFFFFFFFULL) { - cciss_read_capacity_16(ctlr, drv_index, 1, + cciss_read_capacity_16(ctlr, drv_index, &total_size, &block_size); h->cciss_read = CCISS_READ_16; h->cciss_write = CCISS_WRITE_16; @@ -1873,14 +1871,14 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time, } } - cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size, + cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size, inq_buff, drvinfo); drvinfo->block_size = block_size; drvinfo->nr_blocks = total_size + 1; - cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor, + cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor, drvinfo->model, drvinfo->rev); - cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no, + cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no, sizeof(drvinfo->serial_no)); /* Save the lunid in case we deregister the disk, below. */ memcpy(drvinfo->LunID, h->drv[drv_index]->LunID, @@ -2531,6 +2529,8 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c) case 0: return IO_OK; /* no sense */ case 1: return IO_OK; /* recovered error */ default: + if (check_for_unit_attention(h, c)) + return IO_NEEDS_RETRY; printk(KERN_WARNING "cciss%d: cmd 0x%02x " "check condition, sense key = 0x%02x\n", h->ctlr, c->Request.CDB[0], @@ -2672,7 +2672,7 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, } static void cciss_geometry_inquiry(int ctlr, int logvol, - int withirq, sector_t total_size, + sector_t total_size, unsigned int block_size, InquiryData_struct *inq_buff, drive_info_struct *drv) @@ -2683,14 +2683,8 @@ static void cciss_geometry_inquiry(int ctlr, int logvol, memset(inq_buff, 0, sizeof(InquiryData_struct)); log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) - return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, - inq_buff, sizeof(*inq_buff), - 0xC1, scsi3addr, TYPE_CMD); - else - return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff, - sizeof(*inq_buff), 0xC1, scsi3addr, - TYPE_CMD); + return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff, + sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD); if (return_code == IO_OK) { if (inq_buff->data_byte[8] == 0xFF) { printk(KERN_WARNING @@ -2723,7 +2717,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol, } static void -cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, +cciss_read_capacity(int ctlr, int logvol, sector_t *total_size, unsigned int *block_size) { ReadCapdata_struct *buf; @@ -2737,14 +2731,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, } log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) - return_code = sendcmd_withirq(CCISS_READ_CAPACITY, - ctlr, buf, sizeof(ReadCapdata_struct), - 0, scsi3addr, TYPE_CMD); - else - return_code = sendcmd(CCISS_READ_CAPACITY, - ctlr, buf, sizeof(ReadCapdata_struct), - 0, scsi3addr, TYPE_CMD); + return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf, + sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD); if (return_code == IO_OK) { *total_size = be32_to_cpu(*(__be32 *) buf->total_size); *block_size = be32_to_cpu(*(__be32 *) buf->block_size); @@ -2756,8 +2744,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, kfree(buf); } -static void -cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, unsigned int *block_size) +static void cciss_read_capacity_16(int ctlr, int logvol, + sector_t *total_size, unsigned int *block_size) { ReadCapdata_struct_16 *buf; int return_code; @@ -2770,16 +2758,9 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, } log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) { - return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, - ctlr, buf, sizeof(ReadCapdata_struct_16), - 0, scsi3addr, TYPE_CMD); - } - else { - return_code = sendcmd(CCISS_READ_CAPACITY_16, - ctlr, buf, sizeof(ReadCapdata_struct_16), - 0, scsi3addr, TYPE_CMD); - } + return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, + ctlr, buf, sizeof(ReadCapdata_struct_16), + 0, scsi3addr, TYPE_CMD); if (return_code == IO_OK) { *total_size = be64_to_cpu(*(__be64 *) buf->total_size); *block_size = be32_to_cpu(*(__be32 *) buf->block_size); @@ -2820,13 +2801,13 @@ static int cciss_revalidate(struct gendisk *disk) return 1; } if (h->cciss_read == CCISS_READ_10) { - cciss_read_capacity(h->ctlr, logvol, 1, + cciss_read_capacity(h->ctlr, logvol, &total_size, &block_size); } else { - cciss_read_capacity_16(h->ctlr, logvol, 1, + cciss_read_capacity_16(h->ctlr, logvol, &total_size, &block_size); } - cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, + cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size, inq_buff, drv); blk_queue_logical_block_size(drv->queue, drv->block_size); @@ -2837,167 +2818,6 @@ static int cciss_revalidate(struct gendisk *disk) } /* - * Wait polling for a command to complete. - * The memory mapped FIFO is polled for the completion. - * Used only at init time, interrupts from the HBA are disabled. - */ -static unsigned long pollcomplete(int ctlr) -{ - unsigned long done; - int i; - - /* Wait (up to 20 seconds) for a command to complete */ - - for (i = 20 * HZ; i > 0; i--) { - done = hba[ctlr]->access.command_completed(hba[ctlr]); - if (done == FIFO_EMPTY) - schedule_timeout_uninterruptible(1); - else - return done; - } - /* Invalid address to tell caller we ran out of time */ - return 1; -} - -/* Send command c to controller h and poll for it to complete. - * Turns interrupts off on the board. Used at driver init time - * and during SCSI error recovery. - */ -static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c) -{ - int i; - unsigned long complete; - int status = IO_ERROR; - u64bit buff_dma_handle; - -resend_cmd1: - - /* Disable interrupt on the board. */ - h->access.set_intr_mask(h, CCISS_INTR_OFF); - - /* Make sure there is room in the command FIFO */ - /* Actually it should be completely empty at this time */ - /* unless we are in here doing error handling for the scsi */ - /* tape side of the driver. */ - for (i = 200000; i > 0; i--) { - /* if fifo isn't full go */ - if (!(h->access.fifo_full(h))) - break; - udelay(10); - printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full," - " waiting!\n", h->ctlr); - } - h->access.submit_command(h, c); /* Send the cmd */ - do { - complete = pollcomplete(h->ctlr); - -#ifdef CCISS_DEBUG - printk(KERN_DEBUG "cciss: command completed\n"); -#endif /* CCISS_DEBUG */ - - if (complete == 1) { - printk(KERN_WARNING - "cciss cciss%d: SendCmd Timeout out, " - "No command list address returned!\n", h->ctlr); - status = IO_ERROR; - break; - } - - /* Make sure it's the command we're expecting. */ - if ((complete & ~CISS_ERROR_BIT) != c->busaddr) { - printk(KERN_WARNING "cciss%d: Unexpected command " - "completion.\n", h->ctlr); - continue; - } - - /* It is our command. If no error, we're done. */ - if (!(complete & CISS_ERROR_BIT)) { - status = IO_OK; - break; - } - - /* There is an error... */ - - /* if data overrun or underun on Report command ignore it */ - if (((c->Request.CDB[0] == CISS_REPORT_LOG) || - (c->Request.CDB[0] == CISS_REPORT_PHYS) || - (c->Request.CDB[0] == CISS_INQUIRY)) && - ((c->err_info->CommandStatus == CMD_DATA_OVERRUN) || - (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) { - complete = c->busaddr; - status = IO_OK; - break; - } - - if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) { - printk(KERN_WARNING "cciss%d: unsolicited abort %p\n", - h->ctlr, c); - if (c->retry_count < MAX_CMD_RETRIES) { - printk(KERN_WARNING "cciss%d: retrying %p\n", - h->ctlr, c); - c->retry_count++; - /* erase the old error information */ - memset(c->err_info, 0, sizeof(c->err_info)); - goto resend_cmd1; - } - printk(KERN_WARNING "cciss%d: retried %p too many " - "times\n", h->ctlr, c); - status = IO_ERROR; - break; - } - - if (c->err_info->CommandStatus == CMD_UNABORTABLE) { - printk(KERN_WARNING "cciss%d: command could not be " - "aborted.\n", h->ctlr); - status = IO_ERROR; - break; - } - - if (c->err_info->CommandStatus == CMD_TARGET_STATUS) { - status = check_target_status(h, c); - break; - } - - printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr); - printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n", - c->Request.CDB[0], c->err_info->CommandStatus); - status = IO_ERROR; - break; - - } while (1); - - /* unlock the data buffer from DMA */ - buff_dma_handle.val32.lower = c->SG[0].Addr.lower; - buff_dma_handle.val32.upper = c->SG[0].Addr.upper; - pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val, - c->SG[0].Len, PCI_DMA_BIDIRECTIONAL); - return status; -} - -/* - * Send a command to the controller, and wait for it to complete. - * Used at init time, and during SCSI error recovery. - */ -static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, - __u8 page_code, unsigned char *scsi3addr, int cmd_type) -{ - CommandList_struct *c; - int status; - - c = cmd_alloc(hba[ctlr], 1); - if (!c) { - printk(KERN_WARNING "cciss: unable to get memory"); - return IO_ERROR; - } - status = fill_cmd(c, cmd, ctlr, buff, size, page_code, - scsi3addr, cmd_type); - if (status == IO_OK) - status = sendcmd_core(hba[ctlr], c); - cmd_free(hba[ctlr], c, 1); - return status; -} - -/* * Map (physical) PCI mem into (virtual) kernel space */ static void __iomem *remap_pci_mem(ulong base, ulong size) @@ -3255,9 +3075,13 @@ static void do_cciss_request(struct request_queue *q) int seg; struct request *creq; u64bit temp64; - struct scatterlist tmp_sg[MAXSGENTRIES]; + struct scatterlist *tmp_sg; + SGDescriptor_struct *curr_sg; drive_info_struct *drv; int i, dir; + int nseg = 0; + int sg_index = 0; + int chained = 0; /* We call start_io here in case there is a command waiting on the * queue that has not been sent. @@ -3270,13 +3094,14 @@ static void do_cciss_request(struct request_queue *q) if (!creq) goto startio; - BUG_ON(creq->nr_phys_segments > MAXSGENTRIES); + BUG_ON(creq->nr_phys_segments > h->maxsgentries); if ((c = cmd_alloc(h, 1)) == NULL) goto full; blk_start_request(creq); + tmp_sg = h->scatter_list[c->cmdindex]; spin_unlock_irq(q->queue_lock); c->cmd_type = CMD_RWREQ; @@ -3305,7 +3130,7 @@ static void do_cciss_request(struct request_queue *q) (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq)); #endif /* CCISS_DEBUG */ - sg_init_table(tmp_sg, MAXSGENTRIES); + sg_init_table(tmp_sg, h->maxsgentries); seg = blk_rq_map_sg(q, creq, tmp_sg); /* get the DMA records for the setup */ @@ -3314,25 +3139,70 @@ static void do_cciss_request(struct request_queue *q) else dir = PCI_DMA_TODEVICE; + curr_sg = c->SG; + sg_index = 0; + chained = 0; + for (i = 0; i < seg; i++) { - c->SG[i].Len = tmp_sg[i].length; + if (((sg_index+1) == (h->max_cmd_sgentries)) && + !chained && ((seg - i) > 1)) { + nseg = seg - i; + curr_sg[sg_index].Len = (nseg) * + sizeof(SGDescriptor_struct); + curr_sg[sg_index].Ext = CCISS_SG_CHAIN; + + /* Point to next chain block. */ + curr_sg = h->cmd_sg_list[c->cmdindex]->sgchain; + sg_index = 0; + chained = 1; + } + curr_sg[sg_index].Len = tmp_sg[i].length; temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]), - tmp_sg[i].offset, - tmp_sg[i].length, dir); - c->SG[i].Addr.lower = temp64.val32.lower; - c->SG[i].Addr.upper = temp64.val32.upper; - c->SG[i].Ext = 0; // we are not chaining + tmp_sg[i].offset, + tmp_sg[i].length, dir); + curr_sg[sg_index].Addr.lower = temp64.val32.lower; + curr_sg[sg_index].Addr.upper = temp64.val32.upper; + curr_sg[sg_index].Ext = 0; /* we are not chaining */ + + ++sg_index; + } + + if (chained) { + int len; + curr_sg = c->SG; + sg_index = h->max_cmd_sgentries - 1; + len = curr_sg[sg_index].Len; + /* Setup pointer to next chain block. + * Fill out last element in current chain + * block with address of next chain block. + */ + temp64.val = pci_map_single(h->pdev, + h->cmd_sg_list[c->cmdindex]->sgchain, + len, dir); + + h->cmd_sg_list[c->cmdindex]->sg_chain_dma = temp64.val; + curr_sg[sg_index].Addr.lower = temp64.val32.lower; + curr_sg[sg_index].Addr.upper = temp64.val32.upper; + + pci_dma_sync_single_for_device(h->pdev, + h->cmd_sg_list[c->cmdindex]->sg_chain_dma, + len, dir); } + /* track how many SG entries we are using */ if (seg > h->maxSG) h->maxSG = seg; #ifdef CCISS_DEBUG - printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n", - blk_rq_sectors(creq), seg); + printk(KERN_DEBUG "cciss: Submitting %ld sectors in %d segments " + "chained[%d]\n", + blk_rq_sectors(creq), seg, chained); #endif /* CCISS_DEBUG */ - c->Header.SGList = c->Header.SGTotal = seg; + c->Header.SGList = c->Header.SGTotal = seg + chained; + if (seg > h->max_cmd_sgentries) + c->Header.SGList = h->max_cmd_sgentries; + if (likely(blk_fs_request(creq))) { if(h->cciss_read == CCISS_READ_10) { c->Request.CDB[1] = 0; @@ -3513,28 +3383,33 @@ static int add_to_scan_list(struct ctlr_info *h) * @h: Pointer to the controller. * * Removes the controller from the rescan queue if present. Blocks if - * the controller is currently conducting a rescan. + * the controller is currently conducting a rescan. The controller + * can be in one of three states: + * 1. Doesn't need a scan + * 2. On the scan list, but not scanning yet (we remove it) + * 3. Busy scanning (and not on the list). In this case we want to wait for + * the scan to complete to make sure the scanning thread for this + * controller is completely idle. **/ static void remove_from_scan_list(struct ctlr_info *h) { struct ctlr_info *test_h, *tmp_h; - int scanning = 0; mutex_lock(&scan_mutex); list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) { - if (test_h == h) { + if (test_h == h) { /* state 2. */ list_del(&h->scan_list); complete_all(&h->scan_wait); mutex_unlock(&scan_mutex); return; } } - if (&h->busy_scanning) - scanning = 0; - mutex_unlock(&scan_mutex); - - if (scanning) + if (h->busy_scanning) { /* state 3. */ + mutex_unlock(&scan_mutex); wait_for_completion(&h->scan_wait); + } else { /* state 1, nothing to do. */ + mutex_unlock(&scan_mutex); + } } /** @@ -3573,13 +3448,11 @@ static int scan_thread(void *data) h->busy_scanning = 1; mutex_unlock(&scan_mutex); - if (h) { - rebuild_lun_table(h, 0, 0); - complete_all(&h->scan_wait); - mutex_lock(&scan_mutex); - h->busy_scanning = 0; - mutex_unlock(&scan_mutex); - } + rebuild_lun_table(h, 0, 0); + complete_all(&h->scan_wait); + mutex_lock(&scan_mutex); + h->busy_scanning = 0; + mutex_unlock(&scan_mutex); } } @@ -3605,8 +3478,22 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c) case REPORT_LUNS_CHANGED: printk(KERN_WARNING "cciss%d: report LUN data " "changed\n", h->ctlr); - add_to_scan_list(h); - wake_up_process(cciss_scan_thread); + /* + * Here, we could call add_to_scan_list and wake up the scan thread, + * except that it's quite likely that we will get more than one + * REPORT_LUNS_CHANGED condition in quick succession, which means + * that those which occur after the first one will likely happen + * *during* the scan_thread's rescan. And the rescan code is not + * robust enough to restart in the middle, undoing what it has already + * done, and it's not clear that it's even possible to do this, since + * part of what it does is notify the block layer, which starts + * doing it's own i/o to read partition tables and so on, and the + * driver doesn't have visibility to know what might need undoing. + * In any event, if possible, it is horribly complicated to get right + * so we just don't do it for now. + * + * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012. + */ return 1; break; case POWER_OR_RESET: @@ -3888,6 +3775,23 @@ static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev) * leave a little room for ioctl calls. */ c->max_commands = readl(&(c->cfgtable->CmdsOutMax)); + c->maxsgentries = readl(&(c->cfgtable->MaxSGElements)); + + /* + * Limit native command to 32 s/g elements to save dma'able memory. + * Howvever spec says if 0, use 31 + */ + + c->max_cmd_sgentries = 31; + if (c->maxsgentries > 512) { + c->max_cmd_sgentries = 32; + c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1; + c->maxsgentries -= 1; /* account for chain pointer */ + } else { + c->maxsgentries = 31; /* Default to traditional value */ + c->chainsize = 0; /* traditional */ + } + c->product_name = products[prod_index].product_name; c->access = *(products[prod_index].access); c->nr_cmds = c->max_commands - 4; @@ -4214,6 +4118,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, { int i; int j = 0; + int k = 0; int rc; int dac, return_code; InquiryData_struct *inq_buff; @@ -4317,6 +4222,53 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, printk(KERN_ERR "cciss: out of memory"); goto clean4; } + + /* Need space for temp scatter list */ + hba[i]->scatter_list = kmalloc(hba[i]->max_commands * + sizeof(struct scatterlist *), + GFP_KERNEL); + for (k = 0; k < hba[i]->nr_cmds; k++) { + hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) * + hba[i]->maxsgentries, + GFP_KERNEL); + if (hba[i]->scatter_list[k] == NULL) { + printk(KERN_ERR "cciss%d: could not allocate " + "s/g lists\n", i); + goto clean4; + } + } + hba[i]->cmd_sg_list = kmalloc(sizeof(struct Cmd_sg_list *) * + hba[i]->nr_cmds, + GFP_KERNEL); + if (!hba[i]->cmd_sg_list) { + printk(KERN_ERR "cciss%d: Cannot get memory for " + "s/g chaining.\n", i); + goto clean4; + } + /* Build up chain blocks for each command */ + if (hba[i]->chainsize > 0) { + for (j = 0; j < hba[i]->nr_cmds; j++) { + hba[i]->cmd_sg_list[j] = + kmalloc(sizeof(struct Cmd_sg_list), + GFP_KERNEL); + if (!hba[i]->cmd_sg_list[j]) { + printk(KERN_ERR "cciss%d: Cannot get memory " + "for chain block.\n", i); + goto clean4; + } + /* Need a block of chainsized s/g elements. */ + hba[i]->cmd_sg_list[j]->sgchain = + kmalloc((hba[i]->chainsize * + sizeof(SGDescriptor_struct)), + GFP_KERNEL); + if (!hba[i]->cmd_sg_list[j]->sgchain) { + printk(KERN_ERR "cciss%d: Cannot get memory " + "for s/g chains\n", i); + goto clean4; + } + } + } + spin_lock_init(&hba[i]->lock); /* Initialize the pdev driver private data. @@ -4362,7 +4314,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, cciss_procinit(i); - hba[i]->cciss_max_sectors = 2048; + hba[i]->cciss_max_sectors = 8192; rebuild_lun_table(hba[i], 1, 0); hba[i]->busy_initializing = 0; @@ -4370,6 +4322,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, clean4: kfree(hba[i]->cmd_pool_bits); + /* Free up sg elements */ + for (k = 0; k < hba[i]->nr_cmds; k++) + kfree(hba[i]->scatter_list[k]); + kfree(hba[i]->scatter_list); + /* Only free up extra s/g lists if controller supports them */ + if (hba[i]->chainsize > 0) { + for (j = 0; j < hba[i]->nr_cmds; j++) { + if (hba[i]->cmd_sg_list[j]) { + kfree(hba[i]->cmd_sg_list[j]->sgchain); + kfree(hba[i]->cmd_sg_list[j]); + } + } + kfree(hba[i]->cmd_sg_list); + } if (hba[i]->cmd_pool) pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(CommandList_struct), @@ -4400,30 +4366,28 @@ clean_no_release_regions: static void cciss_shutdown(struct pci_dev *pdev) { - ctlr_info_t *tmp_ptr; - int i; - char flush_buf[4]; + ctlr_info_t *h; + char *flush_buf; int return_code; - tmp_ptr = pci_get_drvdata(pdev); - if (tmp_ptr == NULL) - return; - i = tmp_ptr->ctlr; - if (hba[i] == NULL) + h = pci_get_drvdata(pdev); + flush_buf = kzalloc(4, GFP_KERNEL); + if (!flush_buf) { + printk(KERN_WARNING + "cciss:%d cache not flushed, out of memory.\n", + h->ctlr); return; - - /* Turn board interrupts off and send the flush cache command */ - /* sendcmd will turn off interrupt, and send the flush... - * To write all data in the battery backed cache to disks */ - memset(flush_buf, 0, 4); - return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0, - CTLR_LUNID, TYPE_CMD); - if (return_code == IO_OK) { - printk(KERN_INFO "Completed flushing cache on controller %d\n", i); - } else { - printk(KERN_WARNING "Error flushing cache on controller %d\n", i); } - free_irq(hba[i]->intr[2], hba[i]); + /* write all data in the battery backed cache to disk */ + memset(flush_buf, 0, 4); + return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf, + 4, 0, CTLR_LUNID, TYPE_CMD); + kfree(flush_buf); + if (return_code != IO_OK) + printk(KERN_WARNING "cciss%d: Error flushing cache\n", + h->ctlr); + h->access.set_intr_mask(h, CCISS_INTR_OFF); + free_irq(h->intr[2], h); } static void __devexit cciss_remove_one(struct pci_dev *pdev) @@ -4485,6 +4449,20 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev) pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct), hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle); kfree(hba[i]->cmd_pool_bits); + /* Free up sg elements */ + for (j = 0; j < hba[i]->nr_cmds; j++) + kfree(hba[i]->scatter_list[j]); + kfree(hba[i]->scatter_list); + /* Only free up extra s/g lists if controller supports them */ + if (hba[i]->chainsize > 0) { + for (j = 0; j < hba[i]->nr_cmds; j++) { + if (hba[i]->cmd_sg_list[j]) { + kfree(hba[i]->cmd_sg_list[j]->sgchain); + kfree(hba[i]->cmd_sg_list[j]); + } + } + kfree(hba[i]->cmd_sg_list); + } /* * Deliberately omit pci_disable_device(): it does something nasty to * Smart Array controllers that pci_enable_device does not undo diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 31524cf..1d95db2 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -55,7 +55,13 @@ typedef struct _drive_info_struct char device_initialized; /* indicates whether dev is initialized */ } drive_info_struct; -struct ctlr_info +struct Cmd_sg_list { + SGDescriptor_struct *sgchain; + dma_addr_t sg_chain_dma; + int chain_block_size; +}; + +struct ctlr_info { int ctlr; char devname[8]; @@ -75,6 +81,16 @@ struct ctlr_info int num_luns; int highest_lun; int usage_count; /* number of opens all all minor devices */ + /* Need space for temp sg list + * number of scatter/gathers supported + * number of scatter/gathers in chained block + */ + struct scatterlist **scatter_list; + int maxsgentries; + int chainsize; + int max_cmd_sgentries; + struct Cmd_sg_list **cmd_sg_list; + # define DOORBELL_INT 0 # define PERF_MODE_INT 1 # define SIMPLE_MODE_INT 2 diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h index dbaed1e..b50a9b2 100644 --- a/drivers/block/cciss_cmd.h +++ b/drivers/block/cciss_cmd.h @@ -7,7 +7,8 @@ //general boundary defintions #define SENSEINFOBYTES 32//note that this value may vary between host implementations -#define MAXSGENTRIES 31 +#define MAXSGENTRIES 32 +#define CCISS_SG_CHAIN 0x80000000 #define MAXREPLYQS 256 //Command Status value @@ -319,6 +320,10 @@ typedef struct _CfgTable_struct { BYTE ServerName[16]; DWORD HeartBeat; DWORD SCSI_Prefetch; + DWORD MaxSGElements; + DWORD MaxLogicalUnits; + DWORD MaxPhysicalDrives; + DWORD MaxPhysicalDrivesPerLogicalUnit; } CfgTable_struct; #pragma pack() #endif // CCISS_CMD_H diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 3315268..5d0e46d 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -755,7 +755,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag) cp, ei->ScsiStatus); #endif - cmd->result |= (ei->ScsiStatus < 1); + cmd->result |= (ei->ScsiStatus << 1); } else { /* scsi status is zero??? How??? */ @@ -1547,7 +1547,7 @@ cciss_engage_scsi(int ctlr) if (sa->registered) { printk("cciss%d: SCSI subsystem already engaged.\n", ctlr); spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); - return ENXIO; + return -ENXIO; } sa->registered = 1; spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig new file mode 100644 index 0000000..f4acd04 --- /dev/null +++ b/drivers/block/drbd/Kconfig @@ -0,0 +1,71 @@ +# +# DRBD device driver configuration +# + +comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected" + depends on !PROC_FS || !INET || !CONNECTOR + +config BLK_DEV_DRBD + tristate "DRBD Distributed Replicated Block Device support" + depends on PROC_FS && INET && CONNECTOR + select LRU_CACHE + default n + help + + NOTE: In order to authenticate connections you have to select + CRYPTO_HMAC and a hash function as well. + + DRBD is a shared-nothing, synchronously replicated block device. It + is designed to serve as a building block for high availability + clusters and in this context, is a "drop-in" replacement for shared + storage. Simplistically, you could see it as a network RAID 1. + + Each minor device has a role, which can be 'primary' or 'secondary'. + On the node with the primary device the application is supposed to + run and to access the device (/dev/drbdX). Every write is sent to + the local 'lower level block device' and, across the network, to the + node with the device in 'secondary' state. The secondary device + simply writes the data to its lower level block device. + + DRBD can also be used in dual-Primary mode (device writable on both + nodes), which means it can exhibit shared disk semantics in a + shared-nothing cluster. Needless to say, on top of dual-Primary + DRBD utilizing a cluster file system is necessary to maintain for + cache coherency. + + For automatic failover you need a cluster manager (e.g. heartbeat). + See also: http://www.drbd.org/, http://www.linux-ha.org + + If unsure, say N. + +config DRBD_FAULT_INJECTION + bool "DRBD fault injection" + depends on BLK_DEV_DRBD + help + + Say Y here if you want to simulate IO errors, in order to test DRBD's + behavior. + + The actual simulation of IO errors is done by writing 3 values to + /sys/module/drbd/parameters/ + + enable_faults: bitmask of... + 1 meta data write + 2 read + 4 resync data write + 8 read + 16 data write + 32 data read + 64 read ahead + 128 kmalloc of bitmap + 256 allocation of EE (epoch_entries) + + fault_devs: bitmask of minor numbers + fault_rate: frequency in percent + + Example: Simulate data write errors on /dev/drbd0 with a probability of 5%. + echo 16 > /sys/module/drbd/parameters/enable_faults + echo 1 > /sys/module/drbd/parameters/fault_devs + echo 5 > /sys/module/drbd/parameters/fault_rate + + If unsure, say N. diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile new file mode 100644 index 0000000..0d3f337 --- /dev/null +++ b/drivers/block/drbd/Makefile @@ -0,0 +1,5 @@ +drbd-y := drbd_bitmap.o drbd_proc.o +drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o +drbd-y += drbd_main.o drbd_strings.o drbd_nl.o + +obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c new file mode 100644 index 0000000..17956ff --- /dev/null +++ b/drivers/block/drbd/drbd_actlog.c @@ -0,0 +1,1424 @@ +/* + drbd_actlog.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/slab.h> +#include <linux/drbd.h> +#include "drbd_int.h" +#include "drbd_wrappers.h" + +/* We maintain a trivial check sum in our on disk activity log. + * With that we can ensure correct operation even when the storage + * device might do a partial (last) sector write while loosing power. + */ +struct __packed al_transaction { + u32 magic; + u32 tr_number; + struct __packed { + u32 pos; + u32 extent; } updates[1 + AL_EXTENTS_PT]; + u32 xor_sum; +}; + +struct update_odbm_work { + struct drbd_work w; + unsigned int enr; +}; + +struct update_al_work { + struct drbd_work w; + struct lc_element *al_ext; + struct completion event; + unsigned int enr; + /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ + unsigned int old_enr; +}; + +struct drbd_atodb_wait { + atomic_t count; + struct completion io_done; + struct drbd_conf *mdev; + int error; +}; + + +int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); + +static int _drbd_md_sync_page_io(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev, + struct page *page, sector_t sector, + int rw, int size) +{ + struct bio *bio; + struct drbd_md_io md_io; + int ok; + + md_io.mdev = mdev; + init_completion(&md_io.event); + md_io.error = 0; + + if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) + rw |= (1 << BIO_RW_BARRIER); + rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO)); + + retry: + bio = bio_alloc(GFP_NOIO, 1); + bio->bi_bdev = bdev->md_bdev; + bio->bi_sector = sector; + ok = (bio_add_page(bio, page, size, 0) == size); + if (!ok) + goto out; + bio->bi_private = &md_io; + bio->bi_end_io = drbd_md_io_complete; + bio->bi_rw = rw; + + if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) + bio_endio(bio, -EIO); + else + submit_bio(rw, bio); + wait_for_completion(&md_io.event); + ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; + + /* check for unsupported barrier op. + * would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 */ + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) { + /* Try again with no barrier */ + dev_warn(DEV, "Barriers not supported on meta data device - disabling\n"); + set_bit(MD_NO_BARRIER, &mdev->flags); + rw &= ~(1 << BIO_RW_BARRIER); + bio_put(bio); + goto retry; + } + out: + bio_put(bio); + return ok; +} + +int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + sector_t sector, int rw) +{ + int logical_block_size, mask, ok; + int offset = 0; + struct page *iop = mdev->md_io_page; + + D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); + + BUG_ON(!bdev->md_bdev); + + logical_block_size = bdev_logical_block_size(bdev->md_bdev); + if (logical_block_size == 0) + logical_block_size = MD_SECTOR_SIZE; + + /* in case logical_block_size != 512 [ s390 only? ] */ + if (logical_block_size != MD_SECTOR_SIZE) { + mask = (logical_block_size / MD_SECTOR_SIZE) - 1; + D_ASSERT(mask == 1 || mask == 3 || mask == 7); + D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); + offset = sector & mask; + sector = sector & ~mask; + iop = mdev->md_io_tmpp; + + if (rw & WRITE) { + /* these are GFP_KERNEL pages, pre-allocated + * on device initialization */ + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, + READ, logical_block_size); + + if (unlikely(!ok)) { + dev_err(DEV, "drbd_md_sync_page_io(,%llus," + "READ [logical_block_size!=512]) failed!\n", + (unsigned long long)sector); + return 0; + } + + memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); + } + } + + if (sector < drbd_md_first_sector(bdev) || + sector > drbd_md_last_sector(bdev)) + dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + + ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); + if (unlikely(!ok)) { + dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + return 0; + } + + if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); + } + + return ok; +} + +static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) +{ + struct lc_element *al_ext; + struct lc_element *tmp; + unsigned long al_flags = 0; + + spin_lock_irq(&mdev->al_lock); + tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { + spin_unlock_irq(&mdev->al_lock); + return NULL; + } + } + al_ext = lc_get(mdev->act_log, enr); + al_flags = mdev->act_log->flags; + spin_unlock_irq(&mdev->al_lock); + + /* + if (!al_ext) { + if (al_flags & LC_STARVING) + dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); + if (al_flags & LC_DIRTY) + dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); + } + */ + + return al_ext; +} + +void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); + struct lc_element *al_ext; + struct update_al_work al_work; + + D_ASSERT(atomic_read(&mdev->local_cnt) > 0); + + wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); + + if (al_ext->lc_number != enr) { + /* drbd_al_write_transaction(mdev,al_ext,enr); + * recurses into generic_make_request(), which + * disallows recursion, bios being serialized on the + * current->bio_tail list now. + * we have to delegate updates to the activity log + * to the worker thread. */ + init_completion(&al_work.event); + al_work.al_ext = al_ext; + al_work.enr = enr; + al_work.old_enr = al_ext->lc_number; + al_work.w.cb = w_al_write_transaction; + drbd_queue_work_front(&mdev->data.work, &al_work.w); + wait_for_completion(&al_work.event); + + mdev->al_writ_cnt++; + + spin_lock_irq(&mdev->al_lock); + lc_changed(mdev->act_log, al_ext); + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); + } +} + +void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); + struct lc_element *extent; + unsigned long flags; + + spin_lock_irqsave(&mdev->al_lock, flags); + + extent = lc_find(mdev->act_log, enr); + + if (!extent) { + spin_unlock_irqrestore(&mdev->al_lock, flags); + dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); + return; + } + + if (lc_put(mdev->act_log, extent) == 0) + wake_up(&mdev->al_wait); + + spin_unlock_irqrestore(&mdev->al_lock, flags); +} + +int +w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct update_al_work *aw = container_of(w, struct update_al_work, w); + struct lc_element *updated = aw->al_ext; + const unsigned int new_enr = aw->enr; + const unsigned int evicted = aw->old_enr; + struct al_transaction *buffer; + sector_t sector; + int i, n, mx; + unsigned int extent_nr; + u32 xor_sum = 0; + + if (!get_ldev(mdev)) { + dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); + complete(&((struct update_al_work *)w)->event); + return 1; + } + /* do we have to do a bitmap write, first? + * TODO reduce maximum latency: + * submit both bios, then wait for both, + * instead of doing two synchronous sector writes. */ + if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) + drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); + + mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */ + buffer = (struct al_transaction *)page_address(mdev->md_io_page); + + buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); + buffer->tr_number = cpu_to_be32(mdev->al_tr_number); + + n = lc_index_of(mdev->act_log, updated); + + buffer->updates[0].pos = cpu_to_be32(n); + buffer->updates[0].extent = cpu_to_be32(new_enr); + + xor_sum ^= new_enr; + + mx = min_t(int, AL_EXTENTS_PT, + mdev->act_log->nr_elements - mdev->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = mdev->al_tr_cycle + i; + extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; + buffer->updates[i+1].pos = cpu_to_be32(idx); + buffer->updates[i+1].extent = cpu_to_be32(extent_nr); + xor_sum ^= extent_nr; + } + for (; i < AL_EXTENTS_PT; i++) { + buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); + buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); + xor_sum ^= LC_FREE; + } + mdev->al_tr_cycle += AL_EXTENTS_PT; + if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) + mdev->al_tr_cycle = 0; + + buffer->xor_sum = cpu_to_be32(xor_sum); + + sector = mdev->ldev->md.md_offset + + mdev->ldev->md.al_offset + mdev->al_tr_pos; + + if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) + drbd_chk_io_error(mdev, 1, TRUE); + + if (++mdev->al_tr_pos > + div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) + mdev->al_tr_pos = 0; + + D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); + mdev->al_tr_number++; + + mutex_unlock(&mdev->md_io_mutex); + + complete(&((struct update_al_work *)w)->event); + put_ldev(mdev); + + return 1; +} + +/** + * drbd_al_read_tr() - Read a single transaction from the on disk activity log + * @mdev: DRBD device. + * @bdev: Block device to read form. + * @b: pointer to an al_transaction. + * @index: On disk slot of the transaction to read. + * + * Returns -1 on IO error, 0 on checksum error and 1 upon success. + */ +static int drbd_al_read_tr(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev, + struct al_transaction *b, + int index) +{ + sector_t sector; + int rv, i; + u32 xor_sum = 0; + + sector = bdev->md.md_offset + bdev->md.al_offset + index; + + /* Dont process error normally, + * as this is done before disk is attached! */ + if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) + return -1; + + rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); + + for (i = 0; i < AL_EXTENTS_PT + 1; i++) + xor_sum ^= be32_to_cpu(b->updates[i].extent); + rv &= (xor_sum == be32_to_cpu(b->xor_sum)); + + return rv; +} + +/** + * drbd_al_read_log() - Restores the activity log from its on disk representation. + * @mdev: DRBD device. + * @bdev: Block device to read form. + * + * Returns 1 on success, returns 0 when reading the log failed due to IO errors. + */ +int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +{ + struct al_transaction *buffer; + int i; + int rv; + int mx; + int active_extents = 0; + int transactions = 0; + int found_valid = 0; + int from = 0; + int to = 0; + u32 from_tnr = 0; + u32 to_tnr = 0; + u32 cnr; + + mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); + + /* lock out all other meta data io for now, + * and make sure the page is mapped. + */ + mutex_lock(&mdev->md_io_mutex); + buffer = page_address(mdev->md_io_page); + + /* Find the valid transaction in the log */ + for (i = 0; i <= mx; i++) { + rv = drbd_al_read_tr(mdev, bdev, buffer, i); + if (rv == 0) + continue; + if (rv == -1) { + mutex_unlock(&mdev->md_io_mutex); + return 0; + } + cnr = be32_to_cpu(buffer->tr_number); + + if (++found_valid == 1) { + from = i; + to = i; + from_tnr = cnr; + to_tnr = cnr; + continue; + } + if ((int)cnr - (int)from_tnr < 0) { + D_ASSERT(from_tnr - cnr + i - from == mx+1); + from = i; + from_tnr = cnr; + } + if ((int)cnr - (int)to_tnr > 0) { + D_ASSERT(cnr - to_tnr == i - to); + to = i; + to_tnr = cnr; + } + } + + if (!found_valid) { + dev_warn(DEV, "No usable activity log found.\n"); + mutex_unlock(&mdev->md_io_mutex); + return 1; + } + + /* Read the valid transactions. + * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ + i = from; + while (1) { + int j, pos; + unsigned int extent_nr; + unsigned int trn; + + rv = drbd_al_read_tr(mdev, bdev, buffer, i); + ERR_IF(rv == 0) goto cancel; + if (rv == -1) { + mutex_unlock(&mdev->md_io_mutex); + return 0; + } + + trn = be32_to_cpu(buffer->tr_number); + + spin_lock_irq(&mdev->al_lock); + + /* This loop runs backwards because in the cyclic + elements there might be an old version of the + updated element (in slot 0). So the element in slot 0 + can overwrite old versions. */ + for (j = AL_EXTENTS_PT; j >= 0; j--) { + pos = be32_to_cpu(buffer->updates[j].pos); + extent_nr = be32_to_cpu(buffer->updates[j].extent); + + if (extent_nr == LC_FREE) + continue; + + lc_set(mdev->act_log, extent_nr, pos); + active_extents++; + } + spin_unlock_irq(&mdev->al_lock); + + transactions++; + +cancel: + if (i == to) + break; + i++; + if (i > mx) + i = 0; + } + + mdev->al_tr_number = to_tnr+1; + mdev->al_tr_pos = to; + if (++mdev->al_tr_pos > + div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) + mdev->al_tr_pos = 0; + + /* ok, we are done with it */ + mutex_unlock(&mdev->md_io_mutex); + + dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", + transactions, active_extents); + + return 1; +} + +static void atodb_endio(struct bio *bio, int error) +{ + struct drbd_atodb_wait *wc = bio->bi_private; + struct drbd_conf *mdev = wc->mdev; + struct page *page; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + if (!error && !uptodate) + error = -EIO; + + drbd_chk_io_error(mdev, error, TRUE); + if (error && wc->error == 0) + wc->error = error; + + if (atomic_dec_and_test(&wc->count)) + complete(&wc->io_done); + + page = bio->bi_io_vec[0].bv_page; + put_page(page); + bio_put(bio); + mdev->bm_writ_cnt++; + put_ldev(mdev); +} + +#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) +/* activity log to on disk bitmap -- prepare bio unless that sector + * is already covered by previously prepared bios */ +static int atodb_prepare_unless_covered(struct drbd_conf *mdev, + struct bio **bios, + unsigned int enr, + struct drbd_atodb_wait *wc) __must_hold(local) +{ + struct bio *bio; + struct page *page; + sector_t on_disk_sector = enr + mdev->ldev->md.md_offset + + mdev->ldev->md.bm_offset; + unsigned int page_offset = PAGE_SIZE; + int offset; + int i = 0; + int err = -ENOMEM; + + /* Check if that enr is already covered by an already created bio. + * Caution, bios[] is not NULL terminated, + * but only initialized to all NULL. + * For completely scattered activity log, + * the last invocation iterates over all bios, + * and finds the last NULL entry. + */ + while ((bio = bios[i])) { + if (bio->bi_sector == on_disk_sector) + return 0; + i++; + } + /* bios[i] == NULL, the next not yet used slot */ + + /* GFP_KERNEL, we are not in the write-out path */ + bio = bio_alloc(GFP_KERNEL, 1); + if (bio == NULL) + return -ENOMEM; + + if (i > 0) { + const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec; + page_offset = prev_bv->bv_offset + prev_bv->bv_len; + page = prev_bv->bv_page; + } + if (page_offset == PAGE_SIZE) { + page = alloc_page(__GFP_HIGHMEM); + if (page == NULL) + goto out_bio_put; + page_offset = 0; + } else { + get_page(page); + } + + offset = S2W(enr); + drbd_bm_get_lel(mdev, offset, + min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset), + kmap(page) + page_offset); + kunmap(page); + + bio->bi_private = wc; + bio->bi_end_io = atodb_endio; + bio->bi_bdev = mdev->ldev->md_bdev; + bio->bi_sector = on_disk_sector; + + if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE) + goto out_put_page; + + atomic_inc(&wc->count); + /* we already know that we may do this... + * get_ldev_if_state(mdev,D_ATTACHING); + * just get the extra reference, so that the local_cnt reflects + * the number of pending IO requests DRBD at its backing device. + */ + atomic_inc(&mdev->local_cnt); + + bios[i] = bio; + + return 0; + +out_put_page: + err = -EINVAL; + put_page(page); +out_bio_put: + bio_put(bio); + return err; +} + +/** + * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents + * @mdev: DRBD device. + * + * Called when we detach (unconfigure) local storage, + * or when we go from R_PRIMARY to R_SECONDARY role. + */ +void drbd_al_to_on_disk_bm(struct drbd_conf *mdev) +{ + int i, nr_elements; + unsigned int enr; + struct bio **bios; + struct drbd_atodb_wait wc; + + ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING)) + return; /* sorry, I don't have any act_log etc... */ + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + nr_elements = mdev->act_log->nr_elements; + + /* GFP_KERNEL, we are not in anyone's write-out path */ + bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL); + if (!bios) + goto submit_one_by_one; + + atomic_set(&wc.count, 0); + init_completion(&wc.io_done); + wc.mdev = mdev; + wc.error = 0; + + for (i = 0; i < nr_elements; i++) { + enr = lc_element_by_index(mdev->act_log, i)->lc_number; + if (enr == LC_FREE) + continue; + /* next statement also does atomic_inc wc.count and local_cnt */ + if (atodb_prepare_unless_covered(mdev, bios, + enr/AL_EXT_PER_BM_SECT, + &wc)) + goto free_bios_submit_one_by_one; + } + + /* unnecessary optimization? */ + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + /* all prepared, submit them */ + for (i = 0; i < nr_elements; i++) { + if (bios[i] == NULL) + break; + if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) { + bios[i]->bi_rw = WRITE; + bio_endio(bios[i], -EIO); + } else { + submit_bio(WRITE, bios[i]); + } + } + + drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); + + /* always (try to) flush bitmap to stable storage */ + drbd_md_flush(mdev); + + /* In case we did not submit a single IO do not wait for + * them to complete. ( Because we would wait forever here. ) + * + * In case we had IOs and they are already complete, there + * is not point in waiting anyways. + * Therefore this if () ... */ + if (atomic_read(&wc.count)) + wait_for_completion(&wc.io_done); + + put_ldev(mdev); + + kfree(bios); + return; + + free_bios_submit_one_by_one: + /* free everything by calling the endio callback directly. */ + for (i = 0; i < nr_elements && bios[i]; i++) + bio_endio(bios[i], 0); + + kfree(bios); + + submit_one_by_one: + dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n"); + + for (i = 0; i < mdev->act_log->nr_elements; i++) { + enr = lc_element_by_index(mdev->act_log, i)->lc_number; + if (enr == LC_FREE) + continue; + /* Really slow: if we have al-extents 16..19 active, + * sector 4 will be written four times! Synchronous! */ + drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + put_ldev(mdev); +} + +/** + * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents + * @mdev: DRBD device. + */ +void drbd_al_apply_to_bm(struct drbd_conf *mdev) +{ + unsigned int enr; + unsigned long add = 0; + char ppb[10]; + int i; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + for (i = 0; i < mdev->act_log->nr_elements; i++) { + enr = lc_element_by_index(mdev->act_log, i)->lc_number; + if (enr == LC_FREE) + continue; + add += drbd_bm_ALe_set_all(mdev, enr); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", + ppsize(ppb, Bit2KB(add))); +} + +static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) +{ + int rv; + + spin_lock_irq(&mdev->al_lock); + rv = (al_ext->refcnt == 0); + if (likely(rv)) + lc_del(mdev->act_log, al_ext); + spin_unlock_irq(&mdev->al_lock); + + return rv; +} + +/** + * drbd_al_shrink() - Removes all active extents form the activity log + * @mdev: DRBD device. + * + * Removes all active extents form the activity log, waiting until + * the reference count of each entry dropped to 0 first, of course. + * + * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() + */ +void drbd_al_shrink(struct drbd_conf *mdev) +{ + struct lc_element *al_ext; + int i; + + D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); + + for (i = 0; i < mdev->act_log->nr_elements; i++) { + al_ext = lc_element_by_index(mdev->act_log, i); + if (al_ext->lc_number == LC_FREE) + continue; + wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext)); + } + + wake_up(&mdev->al_wait); +} + +static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); + + if (!get_ldev(mdev)) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); + kfree(udw); + return 1; + } + + drbd_bm_write_sect(mdev, udw->enr); + put_ldev(mdev); + + kfree(udw); + + if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) { + switch (mdev->state.conn) { + case C_SYNC_SOURCE: case C_SYNC_TARGET: + case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: + drbd_resync_finished(mdev); + default: + /* nothing to do */ + break; + } + } + drbd_bcast_sync_progress(mdev); + + return 1; +} + + +/* ATTENTION. The AL's extents are 4MB each, while the extents in the + * resync LRU-cache are 16MB each. + * The caller of this function has to hold an get_ldev() reference. + * + * TODO will be obsoleted once we have a caching lru of the on disk bitmap + */ +static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, + int count, int success) +{ + struct lc_element *e; + struct update_odbm_work *udw; + + unsigned int enr; + + D_ASSERT(atomic_read(&mdev->local_cnt)); + + /* I simply assume that a sector/size pair never crosses + * a 16 MB extent border. (Currently this is true...) */ + enr = BM_SECT_TO_EXT(sector); + + e = lc_get(mdev->resync, enr); + if (e) { + struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); + if (ext->lce.lc_number == enr) { + if (success) + ext->rs_left -= count; + else + ext->rs_failed += count; + if (ext->rs_left < ext->rs_failed) { + dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " + "rs_failed=%d count=%d\n", + (unsigned long long)sector, + ext->lce.lc_number, ext->rs_left, + ext->rs_failed, count); + dump_stack(); + + lc_put(mdev->resync, &ext->lce); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return; + } + } else { + /* Normally this element should be in the cache, + * since drbd_rs_begin_io() pulled it already in. + * + * But maybe an application write finished, and we set + * something outside the resync lru_cache in sync. + */ + int rs_left = drbd_bm_e_weight(mdev, enr); + if (ext->flags != 0) { + dev_warn(DEV, "changing resync lce: %d[%u;%02lx]" + " -> %d[%u;00]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags, enr, rs_left); + ext->flags = 0; + } + if (ext->rs_failed) { + dev_warn(DEV, "Kicking resync_lru element enr=%u " + "out with rs_failed=%d\n", + ext->lce.lc_number, ext->rs_failed); + set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); + } + ext->rs_left = rs_left; + ext->rs_failed = success ? 0 : count; + lc_changed(mdev->resync, &ext->lce); + } + lc_put(mdev->resync, &ext->lce); + /* no race, we are within the al_lock! */ + + if (ext->rs_left == ext->rs_failed) { + ext->rs_failed = 0; + + udw = kmalloc(sizeof(*udw), GFP_ATOMIC); + if (udw) { + udw->enr = ext->lce.lc_number; + udw->w.cb = w_update_odbm; + drbd_queue_work_front(&mdev->data.work, &udw->w); + } else { + dev_warn(DEV, "Could not kmalloc an udw\n"); + set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); + } + } + } else { + dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n", + mdev->resync_locked, + mdev->resync->nr_elements, + mdev->resync->flags); + } +} + +/* clear the bit corresponding to the piece of storage in question: + * size byte of data starting from sector. Only clear a bits of the affected + * one ore more _aligned_ BM_BLOCK_SIZE blocks. + * + * called by worker on C_SYNC_TARGET and receiver on SyncSource. + * + */ +void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, + const char *file, const unsigned int line) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr, ebnr, lbnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + int wake_up = 0; + unsigned long flags; + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", + (unsigned long long)sector, size); + return; + } + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size >> 9) - 1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we clear it (in sync). + * round up start sector, round down end sector. we make sure we only + * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) + return; + if (unlikely(esector == (nr_sectors-1))) + ebnr = lbnr; + else + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + + if (sbnr > ebnr) + return; + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. + */ + spin_lock_irqsave(&mdev->al_lock, flags); + count = drbd_bm_clear_bits(mdev, sbnr, ebnr); + if (count) { + /* we need the lock for drbd_try_clear_on_disk_bm */ + if (jiffies - mdev->rs_mark_time > HZ*10) { + /* should be rolling marks, + * but we estimate only anyways. */ + if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && + mdev->state.conn != C_PAUSED_SYNC_T && + mdev->state.conn != C_PAUSED_SYNC_S) { + mdev->rs_mark_time = jiffies; + mdev->rs_mark_left = drbd_bm_total_weight(mdev); + } + } + if (get_ldev(mdev)) { + drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); + put_ldev(mdev); + } + /* just wake_up unconditional now, various lc_chaged(), + * lc_put() in drbd_try_clear_on_disk_bm(). */ + wake_up = 1; + } + spin_unlock_irqrestore(&mdev->al_lock, flags); + if (wake_up) + wake_up(&mdev->al_wait); +} + +/* + * this is intended to set one request worth of data out of sync. + * affects at least 1 bit, + * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. + * + * called by tl_clear and drbd_send_dblock (==drbd_make_request). + * so this can be _any_ process. + */ +void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, + const char *file, const unsigned int line) +{ + unsigned long sbnr, ebnr, lbnr, flags; + sector_t esector, nr_sectors; + unsigned int enr, count; + struct lc_element *e; + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + dev_err(DEV, "sector: %llus, size: %d\n", + (unsigned long long)sector, size); + return; + } + + if (!get_ldev(mdev)) + return; /* no disk, no metadata, no bitmap to set bits in */ + + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size >> 9) - 1; + + ERR_IF(sector >= nr_sectors) + goto out; + ERR_IF(esector >= nr_sectors) + esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we set it out of sync, + * we do not need to round anything here */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + /* ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. */ + spin_lock_irqsave(&mdev->al_lock, flags); + count = drbd_bm_set_bits(mdev, sbnr, ebnr); + + enr = BM_SECT_TO_EXT(sector); + e = lc_find(mdev->resync, enr); + if (e) + lc_entry(e, struct bm_extent, lce)->rs_left += count; + spin_unlock_irqrestore(&mdev->al_lock, flags); + +out: + put_ldev(mdev); +} + +static +struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int wakeup = 0; + unsigned long rs_flags; + + spin_lock_irq(&mdev->al_lock); + if (mdev->resync_locked > mdev->resync->nr_elements/2) { + spin_unlock_irq(&mdev->al_lock); + return NULL; + } + e = lc_get(mdev->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); + bm_ext->rs_failed = 0; + lc_changed(mdev->resync, &bm_ext->lce); + wakeup = 1; + } + if (bm_ext->lce.refcnt == 1) + mdev->resync_locked++; + set_bit(BME_NO_WRITES, &bm_ext->flags); + } + rs_flags = mdev->resync->flags; + spin_unlock_irq(&mdev->al_lock); + if (wakeup) + wake_up(&mdev->al_wait); + + if (!bm_ext) { + if (rs_flags & LC_STARVING) + dev_warn(DEV, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_DIRTY); + } + + return bm_ext; +} + +static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) +{ + struct lc_element *al_ext; + int rv = 0; + + spin_lock_irq(&mdev->al_lock); + if (unlikely(enr == mdev->act_log->new_number)) + rv = 1; + else { + al_ext = lc_find(mdev->act_log, enr); + if (al_ext) { + if (al_ext->refcnt) + rv = 1; + } + } + spin_unlock_irq(&mdev->al_lock); + + /* + if (unlikely(rv)) { + dev_info(DEV, "Delaying sync read until app's write is done\n"); + } + */ + return rv; +} + +/** + * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED + * @mdev: DRBD device. + * @sector: The sector number. + * + * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted. + */ +int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent *bm_ext; + int i, sig; + + sig = wait_event_interruptible(mdev->al_wait, + (bm_ext = _bme_get(mdev, enr))); + if (sig) + return 0; + + if (test_bit(BME_LOCKED, &bm_ext->flags)) + return 1; + + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + sig = wait_event_interruptible(mdev->al_wait, + !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); + if (sig) { + spin_lock_irq(&mdev->al_lock); + if (lc_put(mdev->resync, &bm_ext->lce) == 0) { + clear_bit(BME_NO_WRITES, &bm_ext->flags); + mdev->resync_locked--; + wake_up(&mdev->al_wait); + } + spin_unlock_irq(&mdev->al_lock); + return 0; + } + } + + set_bit(BME_LOCKED, &bm_ext->flags); + + return 1; +} + +/** + * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep + * @mdev: DRBD device. + * @sector: The sector number. + * + * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then + * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN + * if there is still application IO going on in this area. + */ +int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + + spin_lock_irq(&mdev->al_lock); + if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) { + /* in case you have very heavy scattered io, it may + * stall the syncer undefined if we give up the ref count + * when we try again and requeue. + * + * if we don't give up the refcount, but the next time + * we are scheduled this extent has been "synced" by new + * application writes, we'd miss the lc_put on the + * extent we keep the refcount on. + * so we remembered which extent we had to try again, and + * if the next requested one is something else, we do + * the lc_put here... + * we also have to wake_up + */ + e = lc_find(mdev->resync, mdev->resync_wenr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + mdev->resync_wenr = LC_FREE; + if (lc_put(mdev->resync, &bm_ext->lce) == 0) + mdev->resync_locked--; + wake_up(&mdev->al_wait); + } else { + dev_alert(DEV, "LOGIC BUG\n"); + } + } + /* TRY. */ + e = lc_try_get(mdev->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (test_bit(BME_LOCKED, &bm_ext->flags)) + goto proceed; + if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { + mdev->resync_locked++; + } else { + /* we did set the BME_NO_WRITES, + * but then could not set BME_LOCKED, + * so we tried again. + * drop the extra reference. */ + bm_ext->lce.refcnt--; + D_ASSERT(bm_ext->lce.refcnt > 0); + } + goto check_al; + } else { + /* do we rather want to try later? */ + if (mdev->resync_locked > mdev->resync->nr_elements-3) + goto try_again; + /* Do or do not. There is no try. -- Yoda */ + e = lc_get(mdev->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + const unsigned long rs_flags = mdev->resync->flags; + if (rs_flags & LC_STARVING) + dev_warn(DEV, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_DIRTY); + goto try_again; + } + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); + bm_ext->rs_failed = 0; + lc_changed(mdev->resync, &bm_ext->lce); + wake_up(&mdev->al_wait); + D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); + } + set_bit(BME_NO_WRITES, &bm_ext->flags); + D_ASSERT(bm_ext->lce.refcnt == 1); + mdev->resync_locked++; + goto check_al; + } +check_al: + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + if (unlikely(al_enr+i == mdev->act_log->new_number)) + goto try_again; + if (lc_is_used(mdev->act_log, al_enr+i)) + goto try_again; + } + set_bit(BME_LOCKED, &bm_ext->flags); +proceed: + mdev->resync_wenr = LC_FREE; + spin_unlock_irq(&mdev->al_lock); + return 0; + +try_again: + if (bm_ext) + mdev->resync_wenr = enr; + spin_unlock_irq(&mdev->al_lock); + return -EAGAIN; +} + +void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct lc_element *e; + struct bm_extent *bm_ext; + unsigned long flags; + + spin_lock_irqsave(&mdev->al_lock, flags); + e = lc_find(mdev->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + spin_unlock_irqrestore(&mdev->al_lock, flags); + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n"); + return; + } + + if (bm_ext->lce.refcnt == 0) { + spin_unlock_irqrestore(&mdev->al_lock, flags); + dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, " + "but refcnt is 0!?\n", + (unsigned long long)sector, enr); + return; + } + + if (lc_put(mdev->resync, &bm_ext->lce) == 0) { + clear_bit(BME_LOCKED, &bm_ext->flags); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + mdev->resync_locked--; + wake_up(&mdev->al_wait); + } + + spin_unlock_irqrestore(&mdev->al_lock, flags); +} + +/** + * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) + * @mdev: DRBD device. + */ +void drbd_rs_cancel_all(struct drbd_conf *mdev) +{ + spin_lock_irq(&mdev->al_lock); + + if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */ + lc_reset(mdev->resync); + put_ldev(mdev); + } + mdev->resync_locked = 0; + mdev->resync_wenr = LC_FREE; + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); +} + +/** + * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU + * @mdev: DRBD device. + * + * Returns 0 upon success, -EAGAIN if at least one reference count was + * not zero. + */ +int drbd_rs_del_all(struct drbd_conf *mdev) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + + spin_lock_irq(&mdev->al_lock); + + if (get_ldev_if_state(mdev, D_FAILED)) { + /* ok, ->resync is there. */ + for (i = 0; i < mdev->resync->nr_elements; i++) { + e = lc_element_by_index(mdev->resync, i); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext->lce.lc_number == LC_FREE) + continue; + if (bm_ext->lce.lc_number == mdev->resync_wenr) { + dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently" + " got 'synced' by application io\n", + mdev->resync_wenr); + D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + mdev->resync_wenr = LC_FREE; + lc_put(mdev->resync, &bm_ext->lce); + } + if (bm_ext->lce.refcnt != 0) { + dev_info(DEV, "Retrying drbd_rs_del_all() later. " + "refcnt=%d\n", bm_ext->lce.refcnt); + put_ldev(mdev); + spin_unlock_irq(&mdev->al_lock); + return -EAGAIN; + } + D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags)); + lc_del(mdev->resync, &bm_ext->lce); + } + D_ASSERT(mdev->resync->used == 0); + put_ldev(mdev); + } + spin_unlock_irq(&mdev->al_lock); + + return 0; +} + +/** + * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks + * @mdev: DRBD device. + * @sector: The sector number. + * @size: Size of failed IO operation, in byte. + */ +void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr, ebnr, lbnr; + unsigned long count; + sector_t esector, nr_sectors; + int wake_up = 0; + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", + (unsigned long long)sector, size); + return; + } + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size >> 9) - 1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* + * round up start sector, round down end sector. we make sure we only + * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) + return; + if (unlikely(esector == (nr_sectors-1))) + ebnr = lbnr; + else + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + + if (sbnr > ebnr) + return; + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. + */ + spin_lock_irq(&mdev->al_lock); + count = drbd_bm_count_bits(mdev, sbnr, ebnr); + if (count) { + mdev->rs_failed += count; + + if (get_ldev(mdev)) { + drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); + put_ldev(mdev); + } + + /* just wake_up unconditional now, various lc_chaged(), + * lc_put() in drbd_try_clear_on_disk_bm(). */ + wake_up = 1; + } + spin_unlock_irq(&mdev->al_lock); + if (wake_up) + wake_up(&mdev->al_wait); +} diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c new file mode 100644 index 0000000..b61057e --- /dev/null +++ b/drivers/block/drbd/drbd_bitmap.c @@ -0,0 +1,1327 @@ +/* + drbd_bitmap.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/bitops.h> +#include <linux/vmalloc.h> +#include <linux/string.h> +#include <linux/drbd.h> +#include <asm/kmap_types.h> +#include "drbd_int.h" + +/* OPAQUE outside this file! + * interface defined in drbd_int.h + + * convention: + * function name drbd_bm_... => used elsewhere, "public". + * function name bm_... => internal to implementation, "private". + + * Note that since find_first_bit returns int, at the current granularity of + * the bitmap (4KB per byte), this implementation "only" supports up to + * 1<<(32+12) == 16 TB... + */ + +/* + * NOTE + * Access to the *bm_pages is protected by bm_lock. + * It is safe to read the other members within the lock. + * + * drbd_bm_set_bits is called from bio_endio callbacks, + * We may be called with irq already disabled, + * so we need spin_lock_irqsave(). + * And we need the kmap_atomic. + */ +struct drbd_bitmap { + struct page **bm_pages; + spinlock_t bm_lock; + /* WARNING unsigned long bm_*: + * 32bit number of bit offset is just enough for 512 MB bitmap. + * it will blow up if we make the bitmap bigger... + * not that it makes much sense to have a bitmap that large, + * rather change the granularity to 16k or 64k or something. + * (that implies other problems, however...) + */ + unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ + unsigned long bm_bits; + size_t bm_words; + size_t bm_number_of_pages; + sector_t bm_dev_capacity; + struct semaphore bm_change; /* serializes resize operations */ + + atomic_t bm_async_io; + wait_queue_head_t bm_io_wait; + + unsigned long bm_flags; + + /* debugging aid, in case we are still racy somewhere */ + char *bm_why; + struct task_struct *bm_task; +}; + +/* definition of bits in bm_flags */ +#define BM_LOCKED 0 +#define BM_MD_IO_ERROR 1 +#define BM_P_VMALLOCED 2 + +static int bm_is_locked(struct drbd_bitmap *b) +{ + return test_bit(BM_LOCKED, &b->bm_flags); +} + +#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) +static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) +{ + struct drbd_bitmap *b = mdev->bitmap; + if (!__ratelimit(&drbd_ratelimit_state)) + return; + dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", + current == mdev->receiver.task ? "receiver" : + current == mdev->asender.task ? "asender" : + current == mdev->worker.task ? "worker" : current->comm, + func, b->bm_why ?: "?", + b->bm_task == mdev->receiver.task ? "receiver" : + b->bm_task == mdev->asender.task ? "asender" : + b->bm_task == mdev->worker.task ? "worker" : "?"); +} + +void drbd_bm_lock(struct drbd_conf *mdev, char *why) +{ + struct drbd_bitmap *b = mdev->bitmap; + int trylock_failed; + + if (!b) { + dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n"); + return; + } + + trylock_failed = down_trylock(&b->bm_change); + + if (trylock_failed) { + dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", + current == mdev->receiver.task ? "receiver" : + current == mdev->asender.task ? "asender" : + current == mdev->worker.task ? "worker" : current->comm, + why, b->bm_why ?: "?", + b->bm_task == mdev->receiver.task ? "receiver" : + b->bm_task == mdev->asender.task ? "asender" : + b->bm_task == mdev->worker.task ? "worker" : "?"); + down(&b->bm_change); + } + if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) + dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); + + b->bm_why = why; + b->bm_task = current; +} + +void drbd_bm_unlock(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + if (!b) { + dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n"); + return; + } + + if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags)) + dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n"); + + b->bm_why = NULL; + b->bm_task = NULL; + up(&b->bm_change); +} + +/* word offset to long pointer */ +static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km) +{ + struct page *page; + unsigned long page_nr; + + /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ + page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); + BUG_ON(page_nr >= b->bm_number_of_pages); + page = b->bm_pages[page_nr]; + + return (unsigned long *) kmap_atomic(page, km); +} + +static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset) +{ + return __bm_map_paddr(b, offset, KM_IRQ1); +} + +static void __bm_unmap(unsigned long *p_addr, const enum km_type km) +{ + kunmap_atomic(p_addr, km); +}; + +static void bm_unmap(unsigned long *p_addr) +{ + return __bm_unmap(p_addr, KM_IRQ1); +} + +/* long word offset of _bitmap_ sector */ +#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) +/* word offset from start of bitmap to word number _in_page_ + * modulo longs per page +#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) + hm, well, Philipp thinks gcc might not optimze the % into & (... - 1) + so do it explicitly: + */ +#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) + +/* Long words per page */ +#define LWPP (PAGE_SIZE/sizeof(long)) + +/* + * actually most functions herein should take a struct drbd_bitmap*, not a + * struct drbd_conf*, but for the debug macros I like to have the mdev around + * to be able to report device specific. + */ + +static void bm_free_pages(struct page **pages, unsigned long number) +{ + unsigned long i; + if (!pages) + return; + + for (i = 0; i < number; i++) { + if (!pages[i]) { + printk(KERN_ALERT "drbd: bm_free_pages tried to free " + "a NULL pointer; i=%lu n=%lu\n", + i, number); + continue; + } + __free_page(pages[i]); + pages[i] = NULL; + } +} + +static void bm_vk_free(void *ptr, int v) +{ + if (v) + vfree(ptr); + else + kfree(ptr); +} + +/* + * "have" and "want" are NUMBER OF PAGES. + */ +static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) +{ + struct page **old_pages = b->bm_pages; + struct page **new_pages, *page; + unsigned int i, bytes, vmalloced = 0; + unsigned long have = b->bm_number_of_pages; + + BUG_ON(have == 0 && old_pages != NULL); + BUG_ON(have != 0 && old_pages == NULL); + + if (have == want) + return old_pages; + + /* Trying kmalloc first, falling back to vmalloc. + * GFP_KERNEL is ok, as this is done when a lower level disk is + * "attached" to the drbd. Context is receiver thread or cqueue + * thread. As we have no disk yet, we are not in the IO path, + * not even the IO path of the peer. */ + bytes = sizeof(struct page *)*want; + new_pages = kmalloc(bytes, GFP_KERNEL); + if (!new_pages) { + new_pages = vmalloc(bytes); + if (!new_pages) + return NULL; + vmalloced = 1; + } + + memset(new_pages, 0, bytes); + if (want >= have) { + for (i = 0; i < have; i++) + new_pages[i] = old_pages[i]; + for (; i < want; i++) { + page = alloc_page(GFP_HIGHUSER); + if (!page) { + bm_free_pages(new_pages + have, i - have); + bm_vk_free(new_pages, vmalloced); + return NULL; + } + new_pages[i] = page; + } + } else { + for (i = 0; i < want; i++) + new_pages[i] = old_pages[i]; + /* NOT HERE, we are outside the spinlock! + bm_free_pages(old_pages + want, have - want); + */ + } + + if (vmalloced) + set_bit(BM_P_VMALLOCED, &b->bm_flags); + else + clear_bit(BM_P_VMALLOCED, &b->bm_flags); + + return new_pages; +} + +/* + * called on driver init only. TODO call when a device is created. + * allocates the drbd_bitmap, and stores it in mdev->bitmap. + */ +int drbd_bm_init(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + WARN_ON(b != NULL); + b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL); + if (!b) + return -ENOMEM; + spin_lock_init(&b->bm_lock); + init_MUTEX(&b->bm_change); + init_waitqueue_head(&b->bm_io_wait); + + mdev->bitmap = b; + + return 0; +} + +sector_t drbd_bm_capacity(struct drbd_conf *mdev) +{ + ERR_IF(!mdev->bitmap) return 0; + return mdev->bitmap->bm_dev_capacity; +} + +/* called on driver unload. TODO: call when a device is destroyed. + */ +void drbd_bm_cleanup(struct drbd_conf *mdev) +{ + ERR_IF (!mdev->bitmap) return; + bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); + bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags)); + kfree(mdev->bitmap); + mdev->bitmap = NULL; +} + +/* + * since (b->bm_bits % BITS_PER_LONG) != 0, + * this masks out the remaining bits. + * Returns the number of bits cleared. + */ +static int bm_clear_surplus(struct drbd_bitmap *b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; + size_t w = b->bm_bits >> LN2_BPL; + int cleared = 0; + unsigned long *p_addr, *bm; + + p_addr = bm_map_paddr(b, w); + bm = p_addr + MLPP(w); + if (w < b->bm_words) { + cleared = hweight_long(*bm & ~mask); + *bm &= mask; + w++; bm++; + } + + if (w < b->bm_words) { + cleared += hweight_long(*bm); + *bm = 0; + } + bm_unmap(p_addr); + return cleared; +} + +static void bm_set_surplus(struct drbd_bitmap *b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; + size_t w = b->bm_bits >> LN2_BPL; + unsigned long *p_addr, *bm; + + p_addr = bm_map_paddr(b, w); + bm = p_addr + MLPP(w); + if (w < b->bm_words) { + *bm |= ~mask; + bm++; w++; + } + + if (w < b->bm_words) { + *bm = ~(0UL); + } + bm_unmap(p_addr); +} + +static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian) +{ + unsigned long *p_addr, *bm, offset = 0; + unsigned long bits = 0; + unsigned long i, do_now; + + while (offset < b->bm_words) { + i = do_now = min_t(size_t, b->bm_words-offset, LWPP); + p_addr = __bm_map_paddr(b, offset, KM_USER0); + bm = p_addr + MLPP(offset); + while (i--) { +#ifndef __LITTLE_ENDIAN + if (swap_endian) + *bm = lel_to_cpu(*bm); +#endif + bits += hweight_long(*bm++); + } + __bm_unmap(p_addr, KM_USER0); + offset += do_now; + cond_resched(); + } + + return bits; +} + +static unsigned long bm_count_bits(struct drbd_bitmap *b) +{ + return __bm_count_bits(b, 0); +} + +static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b) +{ + return __bm_count_bits(b, 1); +} + +/* offset and len in long words.*/ +static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) +{ + unsigned long *p_addr, *bm; + size_t do_now, end; + +#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) + + end = offset + len; + + if (end > b->bm_words) { + printk(KERN_ALERT "drbd: bm_memset end > bm_words\n"); + return; + } + + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; + p_addr = bm_map_paddr(b, offset); + bm = p_addr + MLPP(offset); + if (bm+do_now > p_addr + LWPP) { + printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", + p_addr, bm, (int)do_now); + break; /* breaks to after catch_oob_access_end() only! */ + } + memset(bm, c, do_now * sizeof(long)); + bm_unmap(p_addr); + offset += do_now; + } +} + +/* + * make sure the bitmap has enough room for the attached storage, + * if necessary, resize. + * called whenever we may have changed the device size. + * returns -ENOMEM if we could not allocate enough memory, 0 on success. + * In case this is actually a resize, we copy the old bitmap into the new one. + * Otherwise, the bitmap is initialized to all bits set. + */ +int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long bits, words, owords, obits, *p_addr, *bm; + unsigned long want, have, onpages; /* number of pages */ + struct page **npages, **opages = NULL; + int err = 0, growing; + int opages_vmalloced; + + ERR_IF(!b) return -ENOMEM; + + drbd_bm_lock(mdev, "resize"); + + dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n", + (unsigned long long)capacity); + + if (capacity == b->bm_dev_capacity) + goto out; + + opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags); + + if (capacity == 0) { + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + onpages = b->bm_number_of_pages; + owords = b->bm_words; + b->bm_pages = NULL; + b->bm_number_of_pages = + b->bm_set = + b->bm_bits = + b->bm_words = + b->bm_dev_capacity = 0; + spin_unlock_irq(&b->bm_lock); + bm_free_pages(opages, onpages); + bm_vk_free(opages, opages_vmalloced); + goto out; + } + bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); + + /* if we would use + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; + a 32bit host could present the wrong number of words + to a 64bit host. + */ + words = ALIGN(bits, 64) >> LN2_BPL; + + if (get_ldev(mdev)) { + D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12)); + put_ldev(mdev); + } + + /* one extra long to catch off by one errors */ + want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; + have = b->bm_number_of_pages; + if (want == have) { + D_ASSERT(b->bm_pages != NULL); + npages = b->bm_pages; + } else { + if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC)) + npages = NULL; + else + npages = bm_realloc_pages(b, want); + } + + if (!npages) { + err = -ENOMEM; + goto out; + } + + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + owords = b->bm_words; + obits = b->bm_bits; + + growing = bits > obits; + if (opages) + bm_set_surplus(b); + + b->bm_pages = npages; + b->bm_number_of_pages = want; + b->bm_bits = bits; + b->bm_words = words; + b->bm_dev_capacity = capacity; + + if (growing) { + bm_memset(b, owords, 0xff, words-owords); + b->bm_set += bits - obits; + } + + if (want < have) { + /* implicit: (opages != NULL) && (opages != npages) */ + bm_free_pages(opages + want, have - want); + } + + p_addr = bm_map_paddr(b, words); + bm = p_addr + MLPP(words); + *bm = DRBD_MAGIC; + bm_unmap(p_addr); + + (void)bm_clear_surplus(b); + + spin_unlock_irq(&b->bm_lock); + if (opages != npages) + bm_vk_free(opages, opages_vmalloced); + if (!growing) + b->bm_set = bm_count_bits(b); + dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words); + + out: + drbd_bm_unlock(mdev); + return err; +} + +/* inherently racy: + * if not protected by other means, return value may be out of date when + * leaving this function... + * we still need to lock it, since it is important that this returns + * bm_set == 0 precisely. + * + * maybe bm_set should be atomic_t ? + */ +static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long s; + unsigned long flags; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + s = b->bm_set; + spin_unlock_irqrestore(&b->bm_lock, flags); + + return s; +} + +unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) +{ + unsigned long s; + /* if I don't have a disk, I don't know about out-of-sync status */ + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) + return 0; + s = _drbd_bm_total_weight(mdev); + put_ldev(mdev); + return s; +} + +size_t drbd_bm_words(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + return b->bm_words; +} + +unsigned long drbd_bm_bits(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return 0; + + return b->bm_bits; +} + +/* merge number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + * bitmap must be locked by drbd_bm_lock. + * currently only used from receive_bitmap. + */ +void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr, *bm; + unsigned long word, bits; + size_t end, do_now; + + end = offset + number; + + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; + if (number == 0) + return; + WARN_ON(offset >= b->bm_words); + WARN_ON(end > b->bm_words); + + spin_lock_irq(&b->bm_lock); + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + p_addr = bm_map_paddr(b, offset); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) { + bits = hweight_long(*bm); + word = *bm | lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + bm_unmap(p_addr); + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (end == b->bm_words) + b->bm_set -= bm_clear_surplus(b); + + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from the bitmap starting at offset into the buffer. + * buffer[i] will be little endian unsigned long. + */ +void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr, *bm; + size_t end, do_now; + + end = offset + number; + + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; + + spin_lock_irq(&b->bm_lock); + if ((offset >= b->bm_words) || + (end > b->bm_words) || + (number <= 0)) + dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n", + (unsigned long) offset, + (unsigned long) number, + (unsigned long) b->bm_words); + else { + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + p_addr = bm_map_paddr(b, offset); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) + *buffer++ = cpu_to_lel(*bm++); + bm_unmap(p_addr); + } + } + spin_unlock_irq(&b->bm_lock); +} + +/* set all bits in the bitmap */ +void drbd_bm_set_all(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0xff, b->bm_words); + (void)bm_clear_surplus(b); + b->bm_set = b->bm_bits; + spin_unlock_irq(&b->bm_lock); +} + +/* clear all bits in the bitmap */ +void drbd_bm_clear_all(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0, b->bm_words); + b->bm_set = 0; + spin_unlock_irq(&b->bm_lock); +} + +static void bm_async_io_complete(struct bio *bio, int error) +{ + struct drbd_bitmap *b = bio->bi_private; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! + * do we want to WARN() on this? */ + if (!error && !uptodate) + error = -EIO; + + if (error) { + /* doh. what now? + * for now, set all bits, and flag MD_IO_ERROR */ + __set_bit(BM_MD_IO_ERROR, &b->bm_flags); + } + if (atomic_dec_and_test(&b->bm_async_io)) + wake_up(&b->bm_io_wait); + + bio_put(bio); +} + +static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) +{ + /* we are process context. we always get a bio */ + struct bio *bio = bio_alloc(GFP_KERNEL, 1); + unsigned int len; + sector_t on_disk_sector = + mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset; + on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); + + /* this might happen with very small + * flexible external meta data device */ + len = min_t(unsigned int, PAGE_SIZE, + (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9); + + bio->bi_bdev = mdev->ldev->md_bdev; + bio->bi_sector = on_disk_sector; + bio_add_page(bio, b->bm_pages[page_nr], len, 0); + bio->bi_private = b; + bio->bi_end_io = bm_async_io_complete; + + if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { + bio->bi_rw |= rw; + bio_endio(bio, -EIO); + } else { + submit_bio(rw, bio); + } +} + +# if defined(__LITTLE_ENDIAN) + /* nothing to do, on disk == in memory */ +# define bm_cpu_to_lel(x) ((void)0) +# else +void bm_cpu_to_lel(struct drbd_bitmap *b) +{ + /* need to cpu_to_lel all the pages ... + * this may be optimized by using + * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0; + * the following is still not optimal, but better than nothing */ + unsigned int i; + unsigned long *p_addr, *bm; + if (b->bm_set == 0) { + /* no page at all; avoid swap if all is 0 */ + i = b->bm_number_of_pages; + } else if (b->bm_set == b->bm_bits) { + /* only the last page */ + i = b->bm_number_of_pages - 1; + } else { + /* all pages */ + i = 0; + } + for (; i < b->bm_number_of_pages; i++) { + p_addr = kmap_atomic(b->bm_pages[i], KM_USER0); + for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++) + *bm = cpu_to_lel(*bm); + kunmap_atomic(p_addr, KM_USER0); + } +} +# endif +/* lel_to_cpu == cpu_to_lel */ +# define bm_lel_to_cpu(x) bm_cpu_to_lel(x) + +/* + * bm_rw: read/write the whole bitmap from/to its on disk location. + */ +static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local) +{ + struct drbd_bitmap *b = mdev->bitmap; + /* sector_t sector; */ + int bm_words, num_pages, i; + unsigned long now; + char ppb[10]; + int err = 0; + + WARN_ON(!bm_is_locked(b)); + + /* no spinlock here, the drbd_bm_lock should be enough! */ + + bm_words = drbd_bm_words(mdev); + num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; + + /* on disk bitmap is little endian */ + if (rw == WRITE) + bm_cpu_to_lel(b); + + now = jiffies; + atomic_set(&b->bm_async_io, num_pages); + __clear_bit(BM_MD_IO_ERROR, &b->bm_flags); + + /* let the layers below us try to merge these bios... */ + for (i = 0; i < num_pages; i++) + bm_page_io_async(mdev, b, i, rw); + + drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); + wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); + + if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) { + dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); + drbd_chk_io_error(mdev, 1, TRUE); + err = -EIO; + } + + now = jiffies; + if (rw == WRITE) { + /* swap back endianness */ + bm_lel_to_cpu(b); + /* flush bitmap to stable storage */ + drbd_md_flush(mdev); + } else /* rw == READ */ { + /* just read, if necessary adjust endianness */ + b->bm_set = bm_count_bits_swap_endian(b); + dev_info(DEV, "recounting of set bits took additional %lu jiffies\n", + jiffies - now); + } + now = b->bm_set; + + dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", + ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + + return err; +} + +/** + * drbd_bm_read() - Read the whole bitmap from its on disk location. + * @mdev: DRBD device. + */ +int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, READ); +} + +/** + * drbd_bm_write() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + */ +int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE); +} + +/** + * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap + * @mdev: DRBD device. + * @enr: Extent number in the resync lru (happens to be sector offset) + * + * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered + * by a single sector write. Therefore enr == sector offset from the + * start of the bitmap. + */ +int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local) +{ + sector_t on_disk_sector = enr + mdev->ldev->md.md_offset + + mdev->ldev->md.bm_offset; + int bm_words, num_words, offset; + int err = 0; + + mutex_lock(&mdev->md_io_mutex); + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); /* word offset into bitmap */ + num_words = min(S2W(1), bm_words - offset); + if (num_words < S2W(1)) + memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE); + drbd_bm_get_lel(mdev, offset, num_words, + page_address(mdev->md_io_page)); + if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) { + int i; + err = -EIO; + dev_err(DEV, "IO ERROR writing bitmap sector %lu " + "(meta-disk sector %llus)\n", + enr, (unsigned long long)on_disk_sector); + drbd_chk_io_error(mdev, 1, TRUE); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i); + } + mdev->bm_writ_cnt++; + mutex_unlock(&mdev->md_io_mutex); + return err; +} + +/* NOTE + * find_first_bit returns int, we return unsigned long. + * should not make much difference anyways, but ... + * + * this returns a bit number, NOT a sector! + */ +#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1) +static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, + const int find_zero_bit, const enum km_type km) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long i = -1UL; + unsigned long *p_addr; + unsigned long bit_offset; /* bit offset of the mapped page. */ + + if (bm_fo > b->bm_bits) { + dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); + } else { + while (bm_fo < b->bm_bits) { + unsigned long offset; + bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */ + offset = bit_offset >> LN2_BPL; /* word offset of the page */ + p_addr = __bm_map_paddr(b, offset, km); + + if (find_zero_bit) + i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); + else + i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); + + __bm_unmap(p_addr, km); + if (i < PAGE_SIZE*8) { + i = bit_offset + i; + if (i >= b->bm_bits) + break; + goto found; + } + bm_fo = bit_offset + PAGE_SIZE*8; + } + i = -1UL; + } + found: + return i; +} + +static unsigned long bm_find_next(struct drbd_conf *mdev, + unsigned long bm_fo, const int find_zero_bit) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long i = -1UL; + + ERR_IF(!b) return i; + ERR_IF(!b->bm_pages) return i; + + spin_lock_irq(&b->bm_lock); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + + i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); + + spin_unlock_irq(&b->bm_lock); + return i; +} + +unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) +{ + return bm_find_next(mdev, bm_fo, 0); +} + +#if 0 +/* not yet needed for anything. */ +unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) +{ + return bm_find_next(mdev, bm_fo, 1); +} +#endif + +/* does not spin_lock_irqsave. + * you must take drbd_bm_lock() first */ +unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) +{ + /* WARN_ON(!bm_is_locked(mdev)); */ + return __bm_find_next(mdev, bm_fo, 0, KM_USER1); +} + +unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) +{ + /* WARN_ON(!bm_is_locked(mdev)); */ + return __bm_find_next(mdev, bm_fo, 1, KM_USER1); +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector. + * expected to be called for only a few bits (e - s about BITS_PER_LONG). + * Must hold bitmap lock already. */ +int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, + unsigned long e, int val, const enum km_type km) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr = NULL; + unsigned long bitnr; + unsigned long last_page_nr = -1UL; + int c = 0; + + if (e >= b->bm_bits) { + dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", + s, e, b->bm_bits); + e = b->bm_bits ? b->bm_bits -1 : 0; + } + for (bitnr = s; bitnr <= e; bitnr++) { + unsigned long offset = bitnr>>LN2_BPL; + unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); + if (page_nr != last_page_nr) { + if (p_addr) + __bm_unmap(p_addr, km); + p_addr = __bm_map_paddr(b, offset, km); + last_page_nr = page_nr; + } + if (val) + c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr)); + else + c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr)); + } + if (p_addr) + __bm_unmap(p_addr, km); + b->bm_set += c; + return c; +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector */ +int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, + const unsigned long e, int val) +{ + unsigned long flags; + struct drbd_bitmap *b = mdev->bitmap; + int c = 0; + + ERR_IF(!b) return 1; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + + c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); + + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + +/* returns number of bits changed 0 -> 1 */ +int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) +{ + return bm_change_bits_to(mdev, s, e, 1); +} + +/* returns number of bits changed 1 -> 0 */ +int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) +{ + return -bm_change_bits_to(mdev, s, e, 0); +} + +/* sets all bits in full words, + * from first_word up to, but not including, last_word */ +static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, + int page_nr, int first_word, int last_word) +{ + int i; + int bits; + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0); + for (i = first_word; i < last_word; i++) { + bits = hweight_long(paddr[i]); + paddr[i] = ~0UL; + b->bm_set += BITS_PER_LONG - bits; + } + kunmap_atomic(paddr, KM_USER0); +} + +/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave. + * You must first drbd_bm_lock(). + * Can be called to set the whole bitmap in one go. + * Sets bits from s to e _inclusive_. */ +void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) +{ + /* First set_bit from the first bit (s) + * up to the next long boundary (sl), + * then assign full words up to the last long boundary (el), + * then set_bit up to and including the last bit (e). + * + * Do not use memset, because we must account for changes, + * so we need to loop over the words with hweight() anyways. + */ + unsigned long sl = ALIGN(s,BITS_PER_LONG); + unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); + int first_page; + int last_page; + int page_nr; + int first_word; + int last_word; + + if (e - s <= 3*BITS_PER_LONG) { + /* don't bother; el and sl may even be wrong. */ + __bm_change_bits_to(mdev, s, e, 1, KM_USER0); + return; + } + + /* difference is large enough that we can trust sl and el */ + + /* bits filling the current long */ + if (sl) + __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0); + + first_page = sl >> (3 + PAGE_SHIFT); + last_page = el >> (3 + PAGE_SHIFT); + + /* MLPP: modulo longs per page */ + /* LWPP: long words per page */ + first_word = MLPP(sl >> LN2_BPL); + last_word = LWPP; + + /* first and full pages, unless first page == last page */ + for (page_nr = first_page; page_nr < last_page; page_nr++) { + bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); + cond_resched(); + first_word = 0; + } + + /* last page (respectively only page, for first page == last page) */ + last_word = MLPP(el >> LN2_BPL); + bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); + + /* possibly trailing bits. + * example: (e & 63) == 63, el will be e+1. + * if that even was the very last bit, + * it would trigger an assert in __bm_change_bits_to() + */ + if (el <= e) + __bm_change_bits_to(mdev, el, e, 1, KM_USER0); +} + +/* returns bit state + * wants bitnr, NOT sector. + * inherently racy... area needs to be locked by means of {al,rs}_lru + * 1 ... bit set + * 0 ... bit not set + * -1 ... first out of bounds access, stop testing for bits! + */ +int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) +{ + unsigned long flags; + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr; + int i; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + if (bitnr < b->bm_bits) { + unsigned long offset = bitnr>>LN2_BPL; + p_addr = bm_map_paddr(b, offset); + i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0; + bm_unmap(p_addr); + } else if (bitnr == b->bm_bits) { + i = -1; + } else { /* (bitnr > b->bm_bits) */ + dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits); + i = 0; + } + + spin_unlock_irqrestore(&b->bm_lock, flags); + return i; +} + +/* returns number of bits set in the range [s, e] */ +int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) +{ + unsigned long flags; + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr = NULL, page_nr = -1; + unsigned long bitnr; + int c = 0; + size_t w; + + /* If this is called without a bitmap, that is a bug. But just to be + * robust in case we screwed up elsewhere, in that case pretend there + * was one dirty bit in the requested area, so we won't try to do a + * local read there (no bitmap probably implies no disk) */ + ERR_IF(!b) return 1; + ERR_IF(!b->bm_pages) return 1; + + spin_lock_irqsave(&b->bm_lock, flags); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + for (bitnr = s; bitnr <= e; bitnr++) { + w = bitnr >> LN2_BPL; + if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) { + page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3); + if (p_addr) + bm_unmap(p_addr); + p_addr = bm_map_paddr(b, w); + } + ERR_IF (bitnr >= b->bm_bits) { + dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); + } else { + c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); + } + } + if (p_addr) + bm_unmap(p_addr); + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + + +/* inherently racy... + * return value may be already out-of-date when this function returns. + * but the general usage is that this is only use during a cstate when bits are + * only cleared, not set, and typically only care for the case when the return + * value is zero, or we already "locked" this "bitmap extent" by other means. + * + * enr is bm-extent number, since we chose to name one sector (512 bytes) + * worth of the bitmap a "bitmap extent". + * + * TODO + * I think since we use it like a reference count, we should use the real + * reference count of some bitmap extent element from some lru instead... + * + */ +int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int count, s, e; + unsigned long flags; + unsigned long *p_addr, *bm; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + + s = S2W(enr); + e = min((size_t)S2W(enr+1), b->bm_words); + count = 0; + if (s < b->bm_words) { + int n = e-s; + p_addr = bm_map_paddr(b, s); + bm = p_addr + MLPP(s); + while (n--) + count += hweight_long(*bm++); + bm_unmap(p_addr); + } else { + dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s); + } + spin_unlock_irqrestore(&b->bm_lock, flags); + return count; +} + +/* set all bits covered by the AL-extent al_enr */ +unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr, *bm; + unsigned long weight; + int count, s, e, i, do_now; + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irq(&b->bm_lock); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + weight = b->bm_set; + + s = al_enr * BM_WORDS_PER_AL_EXT; + e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); + /* assert that s and e are on the same page */ + D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) + == s >> (PAGE_SHIFT - LN2_BPL + 3)); + count = 0; + if (s < b->bm_words) { + i = do_now = e-s; + p_addr = bm_map_paddr(b, s); + bm = p_addr + MLPP(s); + while (i--) { + count += hweight_long(*bm); + *bm = -1UL; + bm++; + } + bm_unmap(p_addr); + b->bm_set += do_now*BITS_PER_LONG - count; + if (e == b->bm_words) + b->bm_set -= bm_clear_surplus(b); + } else { + dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s); + } + weight = b->bm_set - weight; + spin_unlock_irq(&b->bm_lock); + return weight; +} diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h new file mode 100644 index 0000000..2312d78 --- /dev/null +++ b/drivers/block/drbd/drbd_int.h @@ -0,0 +1,2252 @@ +/* + drbd_int.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#ifndef _DRBD_INT_H +#define _DRBD_INT_H + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/version.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/crypto.h> +#include <linux/ratelimit.h> +#include <linux/tcp.h> +#include <linux/mutex.h> +#include <linux/major.h> +#include <linux/blkdev.h> +#include <linux/genhd.h> +#include <net/tcp.h> +#include <linux/lru_cache.h> + +#ifdef __CHECKER__ +# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) +# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) +# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) +# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) +#else +# define __protected_by(x) +# define __protected_read_by(x) +# define __protected_write_by(x) +# define __must_hold(x) +#endif + +#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0) + +/* module parameter, defined in drbd_main.c */ +extern unsigned int minor_count; +extern int disable_sendpage; +extern int allow_oos; +extern unsigned int cn_idx; + +#ifdef CONFIG_DRBD_FAULT_INJECTION +extern int enable_faults; +extern int fault_rate; +extern int fault_devs; +#endif + +extern char usermode_helper[]; + + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. + */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + */ +#define DRBD_SIGKILL SIGHUP + +/* All EEs on the free list should have ID_VACANT (== 0) + * freshly allocated EEs get !ID_VACANT (== 1) + * so if it says "cannot dereference null pointer at adress 0x00000001", + * it is most likely one of these :( */ + +#define ID_IN_SYNC (4711ULL) +#define ID_OUT_OF_SYNC (4712ULL) + +#define ID_SYNCER (-1ULL) +#define ID_VACANT 0 +#define is_syncer_block_id(id) ((id) == ID_SYNCER) + +struct drbd_conf; + + +/* to shorten dev_warn(DEV, "msg"); and relatives statements */ +#define DEV (disk_to_dev(mdev->vdisk)) + +#define D_ASSERT(exp) if (!(exp)) \ + dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) + +#define ERR_IF(exp) if (({ \ + int _b = (exp) != 0; \ + if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ + __func__, #exp, __FILE__, __LINE__); \ + _b; \ + })) + +/* Defines to control fault insertion */ +enum { + DRBD_FAULT_MD_WR = 0, /* meta data write */ + DRBD_FAULT_MD_RD = 1, /* read */ + DRBD_FAULT_RS_WR = 2, /* resync */ + DRBD_FAULT_RS_RD = 3, + DRBD_FAULT_DT_WR = 4, /* data */ + DRBD_FAULT_DT_RD = 5, + DRBD_FAULT_DT_RA = 6, /* data read ahead */ + DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ + DRBD_FAULT_AL_EE = 8, /* alloc ee */ + + DRBD_FAULT_MAX, +}; + +#ifdef CONFIG_DRBD_FAULT_INJECTION +extern unsigned int +_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); +static inline int +drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { + return fault_rate && + (enable_faults & (1<<type)) && + _drbd_insert_fault(mdev, type); +} +#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t))) + +#else +#define FAULT_ACTIVE(_m, _t) (0) +#endif + +/* integer division, round _UP_ to the next integer */ +#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0)) +/* usual integer division */ +#define div_floor(A, B) ((A)/(B)) + +/* drbd_meta-data.c (still in drbd_main.c) */ +/* 4th incarnation of the disk layout. */ +#define DRBD_MD_MAGIC (DRBD_MAGIC+4) + +extern struct drbd_conf **minor_table; +extern struct ratelimit_state drbd_ratelimit_state; + +/* on the wire */ +enum drbd_packets { + /* receiver (data socket) */ + P_DATA = 0x00, + P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ + P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */ + P_BARRIER = 0x03, + P_BITMAP = 0x04, + P_BECOME_SYNC_TARGET = 0x05, + P_BECOME_SYNC_SOURCE = 0x06, + P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */ + P_DATA_REQUEST = 0x08, /* Used to ask for a data block */ + P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */ + P_SYNC_PARAM = 0x0a, + P_PROTOCOL = 0x0b, + P_UUIDS = 0x0c, + P_SIZES = 0x0d, + P_STATE = 0x0e, + P_SYNC_UUID = 0x0f, + P_AUTH_CHALLENGE = 0x10, + P_AUTH_RESPONSE = 0x11, + P_STATE_CHG_REQ = 0x12, + + /* asender (meta socket */ + P_PING = 0x13, + P_PING_ACK = 0x14, + P_RECV_ACK = 0x15, /* Used in protocol B */ + P_WRITE_ACK = 0x16, /* Used in protocol C */ + P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ + P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */ + P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ + P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ + P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ + P_BARRIER_ACK = 0x1c, + P_STATE_CHG_REPLY = 0x1d, + + /* "new" commands, no longer fitting into the ordering scheme above */ + + P_OV_REQUEST = 0x1e, /* data socket */ + P_OV_REPLY = 0x1f, + P_OV_RESULT = 0x20, /* meta socket */ + P_CSUM_RS_REQUEST = 0x21, /* data socket */ + P_RS_IS_IN_SYNC = 0x22, /* meta socket */ + P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ + P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ + + P_MAX_CMD = 0x25, + P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ + P_MAX_OPT_CMD = 0x101, + + /* special command ids for handshake */ + + P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ + P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ + + P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */ +}; + +static inline const char *cmdname(enum drbd_packets cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [P_DATA] = "Data", + [P_DATA_REPLY] = "DataReply", + [P_RS_DATA_REPLY] = "RSDataReply", + [P_BARRIER] = "Barrier", + [P_BITMAP] = "ReportBitMap", + [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", + [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", + [P_UNPLUG_REMOTE] = "UnplugRemote", + [P_DATA_REQUEST] = "DataRequest", + [P_RS_DATA_REQUEST] = "RSDataRequest", + [P_SYNC_PARAM] = "SyncParam", + [P_SYNC_PARAM89] = "SyncParam89", + [P_PROTOCOL] = "ReportProtocol", + [P_UUIDS] = "ReportUUIDs", + [P_SIZES] = "ReportSizes", + [P_STATE] = "ReportState", + [P_SYNC_UUID] = "ReportSyncUUID", + [P_AUTH_CHALLENGE] = "AuthChallenge", + [P_AUTH_RESPONSE] = "AuthResponse", + [P_PING] = "Ping", + [P_PING_ACK] = "PingAck", + [P_RECV_ACK] = "RecvAck", + [P_WRITE_ACK] = "WriteAck", + [P_RS_WRITE_ACK] = "RSWriteAck", + [P_DISCARD_ACK] = "DiscardAck", + [P_NEG_ACK] = "NegAck", + [P_NEG_DREPLY] = "NegDReply", + [P_NEG_RS_DREPLY] = "NegRSDReply", + [P_BARRIER_ACK] = "BarrierAck", + [P_STATE_CHG_REQ] = "StateChgRequest", + [P_STATE_CHG_REPLY] = "StateChgReply", + [P_OV_REQUEST] = "OVRequest", + [P_OV_REPLY] = "OVReply", + [P_OV_RESULT] = "OVResult", + [P_MAX_CMD] = NULL, + }; + + if (cmd == P_HAND_SHAKE_M) + return "HandShakeM"; + if (cmd == P_HAND_SHAKE_S) + return "HandShakeS"; + if (cmd == P_HAND_SHAKE) + return "HandShake"; + if (cmd >= P_MAX_CMD) + return "Unknown"; + return cmdnames[cmd]; +} + +/* for sending/receiving the bitmap, + * possibly in some encoding scheme */ +struct bm_xfer_ctx { + /* "const" + * stores total bits and long words + * of the bitmap, so we don't need to + * call the accessor functions over and again. */ + unsigned long bm_bits; + unsigned long bm_words; + /* during xfer, current position within the bitmap */ + unsigned long bit_offset; + unsigned long word_offset; + + /* statistics; index: (h->command == P_BITMAP) */ + unsigned packets[2]; + unsigned bytes[2]; +}; + +extern void INFO_bm_xfer_stats(struct drbd_conf *mdev, + const char *direction, struct bm_xfer_ctx *c); + +static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) +{ + /* word_offset counts "native long words" (32 or 64 bit), + * aligned at 64 bit. + * Encoded packet may end at an unaligned bit offset. + * In case a fallback clear text packet is transmitted in + * between, we adjust this offset back to the last 64bit + * aligned "native long word", which makes coding and decoding + * the plain text bitmap much more convenient. */ +#if BITS_PER_LONG == 64 + c->word_offset = c->bit_offset >> 6; +#elif BITS_PER_LONG == 32 + c->word_offset = c->bit_offset >> 5; + c->word_offset &= ~(1UL); +#else +# error "unsupported BITS_PER_LONG" +#endif +} + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! + */ +struct p_header { + u32 magic; + u16 command; + u16 length; /* bytes of data after this header */ + u8 payload[0]; +} __packed; +/* 8 bytes. packet FIXED for the next century! */ + +/* + * short commands, packets without payload, plain p_header: + * P_PING + * P_PING_ACK + * P_BECOME_SYNC_TARGET + * P_BECOME_SYNC_SOURCE + * P_UNPLUG_REMOTE + */ + +/* + * commands with out-of-struct payload: + * P_BITMAP (no additional fields) + * P_DATA, P_DATA_REPLY (see p_data) + * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) + */ + +/* these defines must not be changed without changing the protocol version */ +#define DP_HARDBARRIER 1 +#define DP_RW_SYNC 2 +#define DP_MAY_SET_IN_SYNC 4 + +struct p_data { + struct p_header head; + u64 sector; /* 64 bits sector number */ + u64 block_id; /* to identify the request in protocol B&C */ + u32 seq_num; + u32 dp_flags; +} __packed; + +/* + * commands which share a struct: + * p_block_ack: + * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), + * P_DISCARD_ACK (proto C, two-primaries conflict detection) + * p_block_req: + * P_DATA_REQUEST, P_RS_DATA_REQUEST + */ +struct p_block_ack { + struct p_header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 seq_num; +} __packed; + + +struct p_block_req { + struct p_header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +/* + * commands with their own struct for additional fields: + * P_HAND_SHAKE + * P_BARRIER + * P_BARRIER_ACK + * P_SYNC_PARAM + * ReportParams + */ + +struct p_handshake { + struct p_header head; /* 8 bytes */ + u32 protocol_min; + u32 feature_flags; + u32 protocol_max; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserverd array shall be zero. + */ + + u32 _pad; + u64 reserverd[7]; +} __packed; +/* 80 bytes, FIXED for the next century */ + +struct p_barrier { + struct p_header head; + u32 barrier; /* barrier number _handle_ only */ + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +struct p_barrier_ack { + struct p_header head; + u32 barrier; + u32 set_size; +} __packed; + +struct p_rs_param { + struct p_header head; + u32 rate; + + /* Since protocol version 88 and higher. */ + char verify_alg[0]; +} __packed; + +struct p_rs_param_89 { + struct p_header head; + u32 rate; + /* protocol version 89: */ + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; +} __packed; + +struct p_protocol { + struct p_header head; + u32 protocol; + u32 after_sb_0p; + u32 after_sb_1p; + u32 after_sb_2p; + u32 want_lose; + u32 two_primaries; + + /* Since protocol version 87 and higher. */ + char integrity_alg[0]; + +} __packed; + +struct p_uuids { + struct p_header head; + u64 uuid[UI_EXTENDED_SIZE]; +} __packed; + +struct p_rs_uuid { + struct p_header head; + u64 uuid; +} __packed; + +struct p_sizes { + struct p_header head; + u64 d_size; /* size of disk */ + u64 u_size; /* user requested size */ + u64 c_size; /* current exported size */ + u32 max_segment_size; /* Maximal size of a BIO */ + u32 queue_order_type; +} __packed; + +struct p_state { + struct p_header head; + u32 state; +} __packed; + +struct p_req_state { + struct p_header head; + u32 mask; + u32 val; +} __packed; + +struct p_req_state_reply { + struct p_header head; + u32 retcode; +} __packed; + +struct p_drbd06_param { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __packed; + +struct p_discard { + struct p_header head; + u64 block_id; + u32 seq_num; + u32 pad; +} __packed; + +/* Valid values for the encoding field. + * Bump proto version when changing this. */ +enum drbd_bitmap_code { + /* RLE_VLI_Bytes = 0, + * and other bit variants had been defined during + * algorithm evaluation. */ + RLE_VLI_Bits = 2, +}; + +struct p_compressed_bm { + struct p_header head; + /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code + * (encoding & 0x80): polarity (set/unset) of first runlength + * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits + * used to pad up to head.length bytes + */ + u8 encoding; + + u8 code[0]; +} __packed; + +/* DCBP: Drbd Compressed Bitmap Packet ... */ +static inline enum drbd_bitmap_code +DCBP_get_code(struct p_compressed_bm *p) +{ + return (enum drbd_bitmap_code)(p->encoding & 0x0f); +} + +static inline void +DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) +{ + BUG_ON(code & ~0xf); + p->encoding = (p->encoding & ~0xf) | code; +} + +static inline int +DCBP_get_start(struct p_compressed_bm *p) +{ + return (p->encoding & 0x80) != 0; +} + +static inline void +DCBP_set_start(struct p_compressed_bm *p, int set) +{ + p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); +} + +static inline int +DCBP_get_pad_bits(struct p_compressed_bm *p) +{ + return (p->encoding >> 4) & 0x7; +} + +static inline void +DCBP_set_pad_bits(struct p_compressed_bm *p, int n) +{ + BUG_ON(n & ~0x7); + p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); +} + +/* one bitmap packet, including the p_header, + * should fit within one _architecture independend_ page. + * so we need to use the fixed size 4KiB page size + * most architechtures have used for a long time. + */ +#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) +#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) +#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) +#if (PAGE_SIZE < 4096) +/* drbd_send_bitmap / receive_bitmap would break horribly */ +#error "PAGE_SIZE too small" +#endif + +union p_polymorph { + struct p_header header; + struct p_handshake handshake; + struct p_data data; + struct p_block_ack block_ack; + struct p_barrier barrier; + struct p_barrier_ack barrier_ack; + struct p_rs_param_89 rs_param_89; + struct p_protocol protocol; + struct p_sizes sizes; + struct p_uuids uuids; + struct p_state state; + struct p_req_state req_state; + struct p_req_state_reply req_state_reply; + struct p_block_req block_req; +} __packed; + +/**********************************************************************/ +enum drbd_thread_state { + None, + Running, + Exiting, + Restarting +}; + +struct drbd_thread { + spinlock_t t_lock; + struct task_struct *task; + struct completion stop; + enum drbd_thread_state t_state; + int (*function) (struct drbd_thread *); + struct drbd_conf *mdev; + int reset_cpu_mask; +}; + +static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return thi->t_state; +} + + +/* + * Having this as the first member of a struct provides sort of "inheritance". + * "derived" structs can be "drbd_queue_work()"ed. + * The callback should know and cast back to the descendant struct. + * drbd_request and drbd_epoch_entry are descendants of drbd_work. + */ +struct drbd_work; +typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); +struct drbd_work { + struct list_head list; + drbd_work_cb cb; +}; + +struct drbd_tl_epoch; +struct drbd_request { + struct drbd_work w; + struct drbd_conf *mdev; + + /* if local IO is not allowed, will be NULL. + * if local IO _is_ allowed, holds the locally submitted bio clone, + * or, after local IO completion, the ERR_PTR(error). + * see drbd_endio_pri(). */ + struct bio *private_bio; + + struct hlist_node colision; + sector_t sector; + unsigned int size; + unsigned int epoch; /* barrier_nr */ + + /* barrier_nr: used to check on "completion" whether this req was in + * the current epoch, and we therefore have to close it, + * starting a new epoch... + */ + + /* up to here, the struct layout is identical to drbd_epoch_entry; + * we might be able to use that to our advantage... */ + + struct list_head tl_requests; /* ring list in the transfer log */ + struct bio *master_bio; /* master bio pointer */ + unsigned long rq_state; /* see comments above _req_mod() */ + int seq_num; + unsigned long start_time; +}; + +struct drbd_tl_epoch { + struct drbd_work w; + struct list_head requests; /* requests before */ + struct drbd_tl_epoch *next; /* pointer to the next barrier */ + unsigned int br_number; /* the barriers identifier. */ + int n_req; /* number of requests attached before this barrier */ +}; + +struct drbd_request; + +/* These Tl_epoch_entries may be in one of 6 lists: + active_ee .. data packet being written + sync_ee .. syncer block being written + done_ee .. block written, need to send P_WRITE_ACK + read_ee .. [RS]P_DATA_REQUEST being read +*/ + +struct drbd_epoch { + struct list_head list; + unsigned int barrier_nr; + atomic_t epoch_size; /* increased on every request added. */ + atomic_t active; /* increased on every req. added, and dec on every finished. */ + unsigned long flags; +}; + +/* drbd_epoch flag bits */ +enum { + DE_BARRIER_IN_NEXT_EPOCH_ISSUED, + DE_BARRIER_IN_NEXT_EPOCH_DONE, + DE_CONTAINS_A_BARRIER, + DE_HAVE_BARRIER_NUMBER, + DE_IS_FINISHING, +}; + +enum epoch_event { + EV_PUT, + EV_GOT_BARRIER_NR, + EV_BARRIER_DONE, + EV_BECAME_LAST, + EV_CLEANUP = 32, /* used as flag */ +}; + +struct drbd_epoch_entry { + struct drbd_work w; + struct drbd_conf *mdev; + struct bio *private_bio; + struct hlist_node colision; + sector_t sector; + unsigned int size; + struct drbd_epoch *epoch; + + /* up to here, the struct layout is identical to drbd_request; + * we might be able to use that to our advantage... */ + + unsigned int flags; + u64 block_id; +}; + +struct drbd_wq_barrier { + struct drbd_work w; + struct completion done; +}; + +struct digest_info { + int digest_size; + void *digest; +}; + +/* ee flag bits */ +enum { + __EE_CALL_AL_COMPLETE_IO, + __EE_CONFLICT_PENDING, + __EE_MAY_SET_IN_SYNC, + __EE_IS_BARRIER, +}; +#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) +#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) +#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) +#define EE_IS_BARRIER (1<<__EE_IS_BARRIER) + +/* global flag bits */ +enum { + CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */ + SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + SEND_PING, /* whether asender should send a ping asap */ + + STOP_SYNC_TIMER, /* tell timer to cancel itself */ + UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ + UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ + MD_DIRTY, /* current uuids and flags not yet on disk */ + DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ + USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ + CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ + CL_ST_CHG_SUCCESS, + CL_ST_CHG_FAIL, + CRASHED_PRIMARY, /* This node was a crashed primary. + * Gets cleared when the state.conn + * goes into C_CONNECTED state. */ + WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ + NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ + CONSIDER_RESYNC, + + MD_NO_BARRIER, /* meta data device does not support barriers, + so don't even try */ + SUSPEND_IO, /* suspend application io */ + BITMAP_IO, /* suspend application io; + once no more io in flight, start bitmap io */ + BITMAP_IO_QUEUED, /* Started bitmap IO */ + RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ + NET_CONGESTED, /* The data socket is congested */ + + CONFIG_PENDING, /* serialization of (re)configuration requests. + * if set, also prevents the device from dying */ + DEVICE_DYING, /* device became unconfigured, + * but worker thread is still handling the cleanup. + * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed, + * while this is set. */ + RESIZE_PENDING, /* Size change detected locally, waiting for the response from + * the peer, if it changed there as well. */ +}; + +struct drbd_bitmap; /* opaque for drbd_conf */ + +/* TODO sort members for performance + * MAYBE group them further */ + +/* THINK maybe we actually want to use the default "event/%s" worker threads + * or similar in linux 2.6, which uses per cpu data and threads. + * + * To be general, this might need a spin_lock member. + * For now, please use the mdev->req_lock to protect list_head, + * see drbd_queue_work below. + */ +struct drbd_work_queue { + struct list_head q; + struct semaphore s; /* producers up it, worker down()s it */ + spinlock_t q_lock; /* to protect the list. */ +}; + +struct drbd_socket { + struct drbd_work_queue work; + struct mutex mutex; + struct socket *socket; + /* this way we get our + * send/receive buffers off the stack */ + union p_polymorph sbuf; + union p_polymorph rbuf; +}; + +struct drbd_md { + u64 md_offset; /* sector offset to 'super' block */ + + u64 la_size_sect; /* last agreed size, unit sectors */ + u64 uuid[UI_SIZE]; + u64 device_uuid; + u32 flags; + u32 md_size_sect; + + s32 al_offset; /* signed relative sector offset to al area */ + s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* u32 al_nr_extents; important for restoring the AL + * is stored into sync_conf.al_extents, which in turn + * gets applied to act_log->nr_elements + */ +}; + +/* for sync_conf and other types... */ +#define NL_PACKET(name, number, fields) struct name { fields }; +#define NL_INTEGER(pn,pr,member) int member; +#define NL_INT64(pn,pr,member) __u64 member; +#define NL_BIT(pn,pr,member) unsigned member:1; +#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; +#include "linux/drbd_nl.h" + +struct drbd_backing_dev { + struct block_device *backing_bdev; + struct block_device *md_bdev; + struct file *lo_file; + struct file *md_file; + struct drbd_md md; + struct disk_conf dc; /* The user provided config... */ + sector_t known_size; /* last known size of that backing device */ +}; + +struct drbd_md_io { + struct drbd_conf *mdev; + struct completion event; + int error; +}; + +struct bm_io_work { + struct drbd_work w; + char *why; + int (*io_fn)(struct drbd_conf *mdev); + void (*done)(struct drbd_conf *mdev, int rv); +}; + +enum write_ordering_e { + WO_none, + WO_drain_io, + WO_bdev_flush, + WO_bio_barrier +}; + +struct drbd_conf { + /* things that are stored as / read from meta data on disk */ + unsigned long flags; + + /* configured by drbdsetup */ + struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ + struct syncer_conf sync_conf; + struct drbd_backing_dev *ldev __protected_by(local); + + sector_t p_size; /* partner's disk size */ + struct request_queue *rq_queue; + struct block_device *this_bdev; + struct gendisk *vdisk; + + struct drbd_socket data; /* data/barrier/cstate/parameter packets */ + struct drbd_socket meta; /* ping/ack (metadata) packets */ + int agreed_pro_version; /* actually used protocol version */ + unsigned long last_received; /* in jiffies, either socket */ + unsigned int ko_count; + struct drbd_work resync_work, + unplug_work, + md_sync_work; + struct timer_list resync_timer; + struct timer_list md_sync_timer; + + /* Used after attach while negotiating new disk state. */ + union drbd_state new_state_tmp; + + union drbd_state state; + wait_queue_head_t misc_wait; + wait_queue_head_t state_wait; /* upon each state change. */ + unsigned int send_cnt; + unsigned int recv_cnt; + unsigned int read_cnt; + unsigned int writ_cnt; + unsigned int al_writ_cnt; + unsigned int bm_writ_cnt; + atomic_t ap_bio_cnt; /* Requests we need to complete */ + atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ + atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ + atomic_t unacked_cnt; /* Need to send replys for */ + atomic_t local_cnt; /* Waiting for local completion */ + atomic_t net_cnt; /* Users of net_conf */ + spinlock_t req_lock; + struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ + struct drbd_tl_epoch *newest_tle; + struct drbd_tl_epoch *oldest_tle; + struct list_head out_of_sequence_requests; + struct hlist_head *tl_hash; + unsigned int tl_hash_s; + + /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ + unsigned long rs_total; + /* number of sync IOs that failed in this run */ + unsigned long rs_failed; + /* Syncer's start time [unit jiffies] */ + unsigned long rs_start; + /* cumulated time in PausedSyncX state [unit jiffies] */ + unsigned long rs_paused; + /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ + unsigned long rs_mark_left; + /* marks's time [unit jiffies] */ + unsigned long rs_mark_time; + /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; + + /* where does the admin want us to start? (sector) */ + sector_t ov_start_sector; + /* where are we now? (sector) */ + sector_t ov_position; + /* Start sector of out of sync range (to merge printk reporting). */ + sector_t ov_last_oos_start; + /* size of out-of-sync range in sectors. */ + sector_t ov_last_oos_size; + unsigned long ov_left; /* in bits */ + struct crypto_hash *csums_tfm; + struct crypto_hash *verify_tfm; + + struct drbd_thread receiver; + struct drbd_thread worker; + struct drbd_thread asender; + struct drbd_bitmap *bitmap; + unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ + + /* Used to track operations of resync... */ + struct lru_cache *resync; + /* Number of locked elements in resync LRU */ + unsigned int resync_locked; + /* resync extent number waiting for application requests */ + unsigned int resync_wenr; + + int open_cnt; + u64 *p_uuid; + struct drbd_epoch *current_epoch; + spinlock_t epoch_lock; + unsigned int epochs; + enum write_ordering_e write_ordering; + struct list_head active_ee; /* IO in progress */ + struct list_head sync_ee; /* IO in progress */ + struct list_head done_ee; /* send ack */ + struct list_head read_ee; /* IO in progress */ + struct list_head net_ee; /* zero-copy network send in progress */ + struct hlist_head *ee_hash; /* is proteced by req_lock! */ + unsigned int ee_hash_s; + + /* this one is protected by ee_lock, single thread */ + struct drbd_epoch_entry *last_write_w_barrier; + + int next_barrier_nr; + struct hlist_head *app_reads_hash; /* is proteced by req_lock */ + struct list_head resync_reads; + atomic_t pp_in_use; + wait_queue_head_t ee_wait; + struct page *md_io_page; /* one page buffer for md_io */ + struct page *md_io_tmpp; /* for logical_block_size != 512 */ + struct mutex md_io_mutex; /* protects the md_io_buffer */ + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache *act_log; /* activity log */ + unsigned int al_tr_number; + int al_tr_cycle; + int al_tr_pos; /* position of the next transaction in the journal */ + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ + struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ + void *int_dig_out; + void *int_dig_in; + void *int_dig_vv; + wait_queue_head_t seq_wait; + atomic_t packet_seq; + unsigned int peer_seq; + spinlock_t peer_seq_lock; + unsigned int minor; + unsigned long comm_bm_set; /* communicated number of set bits. */ + cpumask_var_t cpu_mask; + struct bm_io_work bm_io_work; + u64 ed_uuid; /* UUID of the exposed data */ + struct mutex state_mutex; + char congestion_reason; /* Why we where congested... */ +}; + +static inline struct drbd_conf *minor_to_mdev(unsigned int minor) +{ + struct drbd_conf *mdev; + + mdev = minor < minor_count ? minor_table[minor] : NULL; + + return mdev; +} + +static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) +{ + return mdev->minor; +} + +/* returns 1 if it was successfull, + * returns 0 if there was no data socket. + * so wherever you are going to use the data.socket, e.g. do + * if (!drbd_get_data_sock(mdev)) + * return 0; + * CODE(); + * drbd_put_data_sock(mdev); + */ +static inline int drbd_get_data_sock(struct drbd_conf *mdev) +{ + mutex_lock(&mdev->data.mutex); + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... */ + if (unlikely(mdev->data.socket == NULL)) { + mutex_unlock(&mdev->data.mutex); + return 0; + } + return 1; +} + +static inline void drbd_put_data_sock(struct drbd_conf *mdev) +{ + mutex_unlock(&mdev->data.mutex); +} + +/* + * function declarations + *************************/ + +/* drbd_main.c */ + +enum chg_state_flags { + CS_HARD = 1, + CS_VERBOSE = 2, + CS_WAIT_COMPLETE = 4, + CS_SERIALIZE = 8, + CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, +}; + +extern void drbd_init_set_defaults(struct drbd_conf *mdev); +extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, + union drbd_state mask, union drbd_state val); +extern void drbd_force_state(struct drbd_conf *, union drbd_state, + union drbd_state); +extern int _drbd_request_state(struct drbd_conf *, union drbd_state, + union drbd_state, enum chg_state_flags); +extern int __drbd_set_state(struct drbd_conf *, union drbd_state, + enum chg_state_flags, struct completion *done); +extern void print_st_err(struct drbd_conf *, union drbd_state, + union drbd_state, int); +extern int drbd_thread_start(struct drbd_thread *thi); +extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); +#ifdef CONFIG_SMP +extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); +extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); +#else +#define drbd_thread_current_set_cpu(A) ({}) +#define drbd_calc_cpu_mask(A) ({}) +#endif +extern void drbd_free_resources(struct drbd_conf *mdev); +extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(struct drbd_conf *mdev); +extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); +extern void drbd_free_sock(struct drbd_conf *mdev); +extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, + void *buf, size_t size, unsigned msg_flags); +extern int drbd_send_protocol(struct drbd_conf *mdev); +extern int drbd_send_uuids(struct drbd_conf *mdev); +extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); +extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); +extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); +extern int _drbd_send_state(struct drbd_conf *mdev); +extern int drbd_send_state(struct drbd_conf *mdev); +extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, + enum drbd_packets cmd, struct p_header *h, + size_t size, unsigned msg_flags); +#define USE_DATA_SOCKET 1 +#define USE_META_SOCKET 0 +extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, + enum drbd_packets cmd, struct p_header *h, + size_t size); +extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, + char *data, size_t size); +extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); +extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e); +extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_block_req *rp); +extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_data *dp); +extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, + sector_t sector, int blksize, u64 block_id); +extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e); +extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); +extern int _drbd_send_barrier(struct drbd_conf *mdev, + struct drbd_tl_epoch *barrier); +extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, + sector_t sector, int size, u64 block_id); +extern int drbd_send_drequest_csum(struct drbd_conf *mdev, + sector_t sector,int size, + void *digest, int digest_size, + enum drbd_packets cmd); +extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); + +extern int drbd_send_bitmap(struct drbd_conf *mdev); +extern int _drbd_send_bitmap(struct drbd_conf *mdev); +extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode); +extern void drbd_free_bc(struct drbd_backing_dev *ldev); +extern void drbd_mdev_cleanup(struct drbd_conf *mdev); + +/* drbd_meta-data.c (still in drbd_main.c) */ +extern void drbd_md_sync(struct drbd_conf *mdev); +extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); +/* maybe define them below as inline? */ +extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); +extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); +extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); +extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); +extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); +extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); +extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); +extern int drbd_md_test_flag(struct drbd_backing_dev *, int); +extern void drbd_md_mark_dirty(struct drbd_conf *mdev); +extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, + int (*io_fn)(struct drbd_conf *), + void (*done)(struct drbd_conf *, int), + char *why); +extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); +extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); +extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); + + +/* Meta data layout + We reserve a 128MB Block (4k aligned) + * either at the end of the backing device + * or on a seperate meta data device. */ + +#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ +/* The following numbers are sectors */ +#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ +#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ +/* Allows up to about 3.8TB */ +#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) + +/* Since the smalles IO unit is usually 512 byte */ +#define MD_SECTOR_SHIFT 9 +#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT) + +/* activity log */ +#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */ +#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */ +#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) + +#if BITS_PER_LONG == 32 +#define LN2_BPL 5 +#define cpu_to_lel(A) cpu_to_le32(A) +#define lel_to_cpu(A) le32_to_cpu(A) +#elif BITS_PER_LONG == 64 +#define LN2_BPL 6 +#define cpu_to_lel(A) cpu_to_le64(A) +#define lel_to_cpu(A) le64_to_cpu(A) +#else +#error "LN2 of BITS_PER_LONG unknown!" +#endif + +/* resync bitmap */ +/* 16MB sized 'bitmap extent' to track syncer usage */ +struct bm_extent { + int rs_left; /* number of bits set (out of sync) in this extent. */ + int rs_failed; /* number of failed resync requests in this extent. */ + unsigned long flags; + struct lc_element lce; +}; + +#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ +#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ + +/* drbd_bitmap.c */ +/* + * We need to store one bit for a block. + * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap. + * Bit 0 ==> local node thinks this block is binary identical on both nodes + * Bit 1 ==> local node thinks this block needs to be synced. + */ + +#define BM_BLOCK_SHIFT 12 /* 4k per bit */ +#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) +/* (9+3) : 512 bytes @ 8 bits; representing 16M storage + * per sector of on disk bitmap */ +#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */ +#define BM_EXT_SIZE (1<<BM_EXT_SHIFT) + +#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) +#error "HAVE YOU FIXED drbdmeta AS WELL??" +#endif + +/* thus many _storage_ sectors are described by one bit */ +#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9)) +#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9)) +#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) + +/* bit to represented kilo byte conversion */ +#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10)) + +/* in which _bitmap_ extent (resp. sector) the bit for a certain + * _storage_ sector is located in */ +#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) + +/* how much _storage_ sectors we have per bitmap sector */ +#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) +#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) + +/* in one sector of the bitmap, we have this many activity_log extents. */ +#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) +#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) + +#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) +#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) + +/* the extent in "PER_EXTENT" below is an activity log extent + * we need that many (long words/bytes) to store the bitmap + * of one AL_EXTENT_SIZE chunk of storage. + * we can store the bitmap for that many AL_EXTENTS within + * one sector of the _on_disk_ bitmap: + * bit 0 bit 37 bit 38 bit (512*8)-1 + * ...|........|........|.. // ..|........| + * sect. 0 `296 `304 ^(512*8*8)-1 + * +#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) +#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 +#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 + */ + +#define DRBD_MAX_SECTORS_32 (0xffffffffLU) +#define DRBD_MAX_SECTORS_BM \ + ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) +#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM +#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM +#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32 +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 +#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 +#else +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM +/* 16 TB in units of sectors */ +#if BITS_PER_LONG == 32 +/* adjust by one page worth of bitmap, + * so we won't wrap around in drbd_bm_find_next_bit. + * you should use 64bit OS for that much storage, anyways. */ +#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff) +#else +#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32) +#endif +#endif + +/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. + * With a value of 6 all IO in one 32K block make it to the same slot of the + * hash table. */ +#define HT_SHIFT 6 +#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) + +/* Number of elements in the app_reads_hash */ +#define APP_R_HSIZE 15 + +extern int drbd_bm_init(struct drbd_conf *mdev); +extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); +extern void drbd_bm_cleanup(struct drbd_conf *mdev); +extern void drbd_bm_set_all(struct drbd_conf *mdev); +extern void drbd_bm_clear_all(struct drbd_conf *mdev); +extern int drbd_bm_set_bits( + struct drbd_conf *mdev, unsigned long s, unsigned long e); +extern int drbd_bm_clear_bits( + struct drbd_conf *mdev, unsigned long s, unsigned long e); +/* bm_set_bits variant for use while holding drbd_bm_lock */ +extern void _drbd_bm_set_bits(struct drbd_conf *mdev, + const unsigned long s, const unsigned long e); +extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); +extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); +extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local); +extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); +extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, + unsigned long al_enr); +extern size_t drbd_bm_words(struct drbd_conf *mdev); +extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); +extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); +extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); +/* bm_find_next variants for use while you hold drbd_bm_lock() */ +extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); +extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); +extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); +extern int drbd_bm_rs_done(struct drbd_conf *mdev); +/* for receive_bitmap */ +extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, + size_t number, unsigned long *buffer); +/* for _drbd_send_bitmap and drbd_bm_write_sect */ +extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, + size_t number, unsigned long *buffer); + +extern void drbd_bm_lock(struct drbd_conf *mdev, char *why); +extern void drbd_bm_unlock(struct drbd_conf *mdev); + +extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e); +/* drbd_main.c */ + +extern struct kmem_cache *drbd_request_cache; +extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ +extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ +extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ +extern mempool_t *drbd_request_mempool; +extern mempool_t *drbd_ee_mempool; + +extern struct page *drbd_pp_pool; /* drbd's page pool */ +extern spinlock_t drbd_pp_lock; +extern int drbd_pp_vacant; +extern wait_queue_head_t drbd_pp_wait; + +extern rwlock_t global_state_lock; + +extern struct drbd_conf *drbd_new_device(unsigned int minor); +extern void drbd_free_mdev(struct drbd_conf *mdev); + +extern int proc_details; + +/* drbd_req */ +extern int drbd_make_request_26(struct request_queue *q, struct bio *bio); +extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); +extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); +extern int is_valid_ar_handle(struct drbd_request *, sector_t); + + +/* drbd_nl.c */ +extern void drbd_suspend_io(struct drbd_conf *mdev); +extern void drbd_resume_io(struct drbd_conf *mdev); +extern char *ppsize(char *buf, unsigned long long size); +extern sector_t drbd_new_dev_size(struct drbd_conf *, + struct drbd_backing_dev *); +enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; +extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local); +extern void resync_after_online_grow(struct drbd_conf *); +extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); +extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, + int force); +enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); +extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); + +/* drbd_worker.c */ +extern int drbd_worker(struct drbd_thread *thi); +extern int drbd_alter_sa(struct drbd_conf *mdev, int na); +extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); +extern void resume_next_sg(struct drbd_conf *mdev); +extern void suspend_other_sg(struct drbd_conf *mdev); +extern int drbd_resync_finished(struct drbd_conf *mdev); +/* maybe rather drbd_main.c ? */ +extern int drbd_md_sync_page_io(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); + +static inline void ov_oos_print(struct drbd_conf *mdev) +{ + if (mdev->ov_last_oos_size) { + dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", + (unsigned long long)mdev->ov_last_oos_start, + (unsigned long)mdev->ov_last_oos_size); + } + mdev->ov_last_oos_size=0; +} + + +extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); +/* worker callbacks */ +extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); +extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); +extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); +extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); +extern int w_io_error(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); +extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); + +extern void resync_timer_fn(unsigned long data); + +/* drbd_receiver.c */ +extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); +extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + gfp_t gfp_mask) __must_hold(local); +extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); +extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, + struct list_head *head); +extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, + struct list_head *head); +extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); +extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); +extern void drbd_flush_workqueue(struct drbd_conf *mdev); + +/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to + * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ +static inline int drbd_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + int err; + if (level == SOL_SOCKET) + err = sock_setsockopt(sock, level, optname, optval, optlen); + else + err = sock->ops->setsockopt(sock, level, optname, optval, + optlen); + return err; +} + +static inline void drbd_tcp_cork(struct socket *sock) +{ + int __user val = 1; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (char __user *)&val, sizeof(val)); +} + +static inline void drbd_tcp_uncork(struct socket *sock) +{ + int __user val = 0; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (char __user *)&val, sizeof(val)); +} + +static inline void drbd_tcp_nodelay(struct socket *sock) +{ + int __user val = 1; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char __user *)&val, sizeof(val)); +} + +static inline void drbd_tcp_quickack(struct socket *sock) +{ + int __user val = 1; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char __user *)&val, sizeof(val)); +} + +void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); + +/* drbd_proc.c */ +extern struct proc_dir_entry *drbd_proc; +extern struct file_operations drbd_proc_fops; +extern const char *drbd_conn_str(enum drbd_conns s); +extern const char *drbd_role_str(enum drbd_role s); + +/* drbd_actlog.c */ +extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); +extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); +extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); +extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); +extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); +extern void drbd_rs_cancel_all(struct drbd_conf *mdev); +extern int drbd_rs_del_all(struct drbd_conf *mdev); +extern void drbd_rs_failed_io(struct drbd_conf *mdev, + sector_t sector, int size); +extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); +extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, + int size, const char *file, const unsigned int line); +#define drbd_set_in_sync(mdev, sector, size) \ + __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) +extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, + int size, const char *file, const unsigned int line); +#define drbd_set_out_of_sync(mdev, sector, size) \ + __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) +extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); +extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev); +extern void drbd_al_shrink(struct drbd_conf *mdev); + + +/* drbd_nl.c */ + +void drbd_nl_cleanup(void); +int __init drbd_nl_init(void); +void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); +void drbd_bcast_sync_progress(struct drbd_conf *mdev); +void drbd_bcast_ee(struct drbd_conf *mdev, + const char *reason, const int dgs, + const char* seen_hash, const char* calc_hash, + const struct drbd_epoch_entry* e); + + +/** + * DOC: DRBD State macros + * + * These macros are used to express state changes in easily readable form. + * + * The NS macros expand to a mask and a value, that can be bit ored onto the + * current state as soon as the spinlock (req_lock) was taken. + * + * The _NS macros are used for state functions that get called with the + * spinlock. These macros expand directly to the new state value. + * + * Besides the basic forms NS() and _NS() additional _?NS[23] are defined + * to express state changes that affect more than one aspect of the state. + * + * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) + * Means that the network connection was established and that the peer + * is in secondary role. + */ +#define role_MASK R_MASK +#define peer_MASK R_MASK +#define disk_MASK D_MASK +#define pdsk_MASK D_MASK +#define conn_MASK C_MASK +#define susp_MASK 1 +#define user_isp_MASK 1 +#define aftr_isp_MASK 1 + +#define NS(T, S) \ + ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T = (S); val; }) +#define NS2(T1, S1, T2, S2) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val; }) +#define NS3(T1, S1, T2, S2, T3, S3) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val.T3 = (S3); val; }) + +#define _NS(D, T, S) \ + D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; }) +#define _NS2(D, T1, S1, T2, S2) \ + D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns; }) +#define _NS3(D, T1, S1, T2, S2, T3, S3) \ + D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) + +/* + * inline helper functions + *************************/ + +static inline void drbd_state_lock(struct drbd_conf *mdev) +{ + wait_event(mdev->misc_wait, + !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags)); +} + +static inline void drbd_state_unlock(struct drbd_conf *mdev) +{ + clear_bit(CLUSTER_ST_CHANGE, &mdev->flags); + wake_up(&mdev->misc_wait); +} + +static inline int _drbd_set_state(struct drbd_conf *mdev, + union drbd_state ns, enum chg_state_flags flags, + struct completion *done) +{ + int rv; + + read_lock(&global_state_lock); + rv = __drbd_set_state(mdev, ns, flags, done); + read_unlock(&global_state_lock); + + return rv; +} + +/** + * drbd_request_state() - Reqest a state change + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * + * This is the most graceful way of requesting a state change. It is verbose + * quite verbose in case the state change is not possible, and all those + * state changes are globally serialized. + */ +static inline int drbd_request_state(struct drbd_conf *mdev, + union drbd_state mask, + union drbd_state val) +{ + return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); +} + +#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) +static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where) +{ + switch (mdev->ldev->dc.on_io_error) { + case EP_PASS_ON: + if (!forcedetach) { + if (printk_ratelimit()) + dev_err(DEV, "Local IO failed in %s." + "Passing error on...\n", where); + break; + } + /* NOTE fall through to detach case if forcedetach set */ + case EP_DETACH: + case EP_CALL_HELPER: + if (mdev->state.disk > D_FAILED) { + _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); + dev_err(DEV, "Local IO failed in %s." + "Detaching...\n", where); + } + break; + } +} + +/** + * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers + * @mdev: DRBD device. + * @error: Error code passed to the IO completion callback + * @forcedetach: Force detach. I.e. the error happened while accessing the meta data + * + * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED) + */ +#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) +static inline void drbd_chk_io_error_(struct drbd_conf *mdev, + int error, int forcedetach, const char *where) +{ + if (error) { + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock, flags); + __drbd_chk_io_error_(mdev, forcedetach, where); + spin_unlock_irqrestore(&mdev->req_lock, flags); + } +} + + +/** + * drbd_md_first_sector() - Returns the first sector number of the meta data area + * @bdev: Meta data block device. + * + * BTW, for internal meta data, this happens to be the maximum capacity + * we could agree upon with our peer node. + */ +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + bdev->md.bm_offset; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset; + } +} + +/** + * drbd_md_last_sector() - Return the last sector number of the meta data area + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + MD_AL_OFFSET - 1; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset + bdev->md.md_size_sect; + } +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(struct block_device *bdev) +{ + /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ + return bdev ? bdev->bd_inode->i_size >> 9 : 0; +} + +/** + * drbd_get_max_capacity() - Returns the capacity we announce to out peer + * @bdev: Meta data block device. + * + * returns the capacity we announce to out peer. we clip ourselves at the + * various MAX_SECTORS, because if we don't, current implementation will + * oops sooner or later + */ +static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) +{ + sector_t s; + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + s = drbd_get_capacity(bdev->backing_bdev) + ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_md_first_sector(bdev)) + : 0; + break; + case DRBD_MD_INDEX_FLEX_EXT: + s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_get_capacity(bdev->backing_bdev)); + /* clip at maximum size the meta device can support */ + s = min_t(sector_t, s, + BM_EXT_TO_SECT(bdev->md.md_size_sect + - bdev->md.bm_offset)); + break; + default: + s = min_t(sector_t, DRBD_MAX_SECTORS, + drbd_get_capacity(bdev->backing_bdev)); + } + return s; +} + +/** + * drbd_md_ss__() - Return the sector number of our meta data super block + * @mdev: DRBD device. + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + default: /* external, some index */ + return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; + case DRBD_MD_INDEX_INTERNAL: + /* with drbd08, internal meta data is always "flexible" */ + case DRBD_MD_INDEX_FLEX_INT: + /* sizeof(struct md_on_disk_07) == 4k + * position: last 4k aligned block of 4k size */ + if (!bdev->backing_bdev) { + if (__ratelimit(&drbd_ratelimit_state)) { + dev_err(DEV, "bdev->backing_bdev==NULL\n"); + dump_stack(); + } + return 0; + } + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) + - MD_AL_OFFSET; + case DRBD_MD_INDEX_FLEX_EXT: + return 0; + } +} + +static inline void +_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + list_add_tail(&w->list, &q->q); + up(&q->s); +} + +static inline void +drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + list_add(&w->list, &q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ + spin_unlock_irqrestore(&q->q_lock, flags); +} + +static inline void +drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + list_add_tail(&w->list, &q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ + spin_unlock_irqrestore(&q->q_lock, flags); +} + +static inline void wake_asender(struct drbd_conf *mdev) +{ + if (test_bit(SIGNAL_ASENDER, &mdev->flags)) + force_sig(DRBD_SIG, mdev->asender.task); +} + +static inline void request_ping(struct drbd_conf *mdev) +{ + set_bit(SEND_PING, &mdev->flags); + wake_asender(mdev); +} + +static inline int drbd_send_short_cmd(struct drbd_conf *mdev, + enum drbd_packets cmd) +{ + struct p_header h; + return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); +} + +static inline int drbd_send_ping(struct drbd_conf *mdev) +{ + struct p_header h; + return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); +} + +static inline int drbd_send_ping_ack(struct drbd_conf *mdev) +{ + struct p_header h; + return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); +} + +static inline void drbd_thread_stop(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, FALSE, TRUE); +} + +static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, FALSE, FALSE); +} + +static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, TRUE, FALSE); +} + +/* counts how many answer packets packets we expect from our peer, + * for either explicit application requests, + * or implicit barrier packets as necessary. + * increased: + * w_send_barrier + * _req_mod(req, queue_for_net_write or queue_for_net_read); + * it is much easier and equally valid to count what we queue for the + * worker, even before it actually was queued or send. + * (drbd_make_request_common; recovery path on read io-error) + * decreased: + * got_BarrierAck (respective tl_clear, tl_clear_barrier) + * _req_mod(req, data_received) + * [from receive_DataReply] + * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) + * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] + * for some reason it is NOT decreased in got_NegAck, + * but in the resulting cleanup code from report_params. + * we should try to remember the reason for that... + * _req_mod(req, send_failed or send_canceled) + * _req_mod(req, connection_lost_while_pending) + * [from tl_clear_barrier] + */ +static inline void inc_ap_pending(struct drbd_conf *mdev) +{ + atomic_inc(&mdev->ap_pending_cnt); +} + +#define ERR_IF_CNT_IS_NEGATIVE(which) \ + if (atomic_read(&mdev->which) < 0) \ + dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ + __func__ , __LINE__ , \ + atomic_read(&mdev->which)) + +#define dec_ap_pending(mdev) do { \ + typecheck(struct drbd_conf *, mdev); \ + if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \ + wake_up(&mdev->misc_wait); \ + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) + +/* counts how many resync-related answers we still expect from the peer + * increase decrease + * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) + * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER) + * (or P_NEG_ACK with ID_SYNCER) + */ +static inline void inc_rs_pending(struct drbd_conf *mdev) +{ + atomic_inc(&mdev->rs_pending_cnt); +} + +#define dec_rs_pending(mdev) do { \ + typecheck(struct drbd_conf *, mdev); \ + atomic_dec(&mdev->rs_pending_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) + +/* counts how many answers we still need to send to the peer. + * increased on + * receive_Data unless protocol A; + * we need to send a P_RECV_ACK (proto B) + * or P_WRITE_ACK (proto C) + * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK + * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA + * receive_Barrier_* we need to send a P_BARRIER_ACK + */ +static inline void inc_unacked(struct drbd_conf *mdev) +{ + atomic_inc(&mdev->unacked_cnt); +} + +#define dec_unacked(mdev) do { \ + typecheck(struct drbd_conf *, mdev); \ + atomic_dec(&mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + +#define sub_unacked(mdev, n) do { \ + typecheck(struct drbd_conf *, mdev); \ + atomic_sub(n, &mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + + +static inline void put_net_conf(struct drbd_conf *mdev) +{ + if (atomic_dec_and_test(&mdev->net_cnt)) + wake_up(&mdev->misc_wait); +} + +/** + * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there + * @mdev: DRBD device. + * + * You have to call put_net_conf() when finished working with mdev->net_conf. + */ +static inline int get_net_conf(struct drbd_conf *mdev) +{ + int have_net_conf; + + atomic_inc(&mdev->net_cnt); + have_net_conf = mdev->state.conn >= C_UNCONNECTED; + if (!have_net_conf) + put_net_conf(mdev); + return have_net_conf; +} + +/** + * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev + * @M: DRBD device. + * + * You have to call put_ldev() when finished working with mdev->ldev. + */ +#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT)) +#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS)) + +static inline void put_ldev(struct drbd_conf *mdev) +{ + __release(local); + if (atomic_dec_and_test(&mdev->local_cnt)) + wake_up(&mdev->misc_wait); + D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); +} + +#ifndef __CHECKER__ +static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = (mdev->state.disk >= mins); + if (!io_allowed) + put_ldev(mdev); + return io_allowed; +} +#else +extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins); +#endif + +/* you must have an "get_ldev" reference */ +static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, + unsigned long *bits_left, unsigned int *per_mil_done) +{ + /* + * this is to break it at compile time when we change that + * (we may feel 4TB maximum storage per drbd is not enough) + */ + typecheck(unsigned long, mdev->rs_total); + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (*bits_left > mdev->rs_total) { + /* doh. maybe a logic bug somewhere. + * may also be just a race condition + * between this and a disconnect during sync. + * for now, just prevent in-kernel buffer overflow. + */ + smp_rmb(); + dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", + drbd_conn_str(mdev->state.conn), + *bits_left, mdev->rs_total, mdev->rs_failed); + *per_mil_done = 0; + } else { + /* make sure the calculation happens in long context */ + unsigned long tmp = 1000UL - + (*bits_left >> 10)*1000UL + / ((mdev->rs_total >> 10) + 1UL); + *per_mil_done = tmp; + } +} + + +/* this throttles on-the-fly application requests + * according to max_buffers settings; + * maybe re-implement using semaphores? */ +static inline int drbd_get_max_buffers(struct drbd_conf *mdev) +{ + int mxb = 1000000; /* arbitrary limit on open requests */ + if (get_net_conf(mdev)) { + mxb = mdev->net_conf->max_buffers; + put_net_conf(mdev); + } + return mxb; +} + +static inline int drbd_state_is_stable(union drbd_state s) +{ + + /* DO NOT add a default clause, we want the compiler to warn us + * for any newly introduced state we may have forgotten to add here */ + + switch ((enum drbd_conns)s.conn) { + /* new io only accepted when there is no connection, ... */ + case C_STANDALONE: + case C_WF_CONNECTION: + /* ... or there is a well established connection. */ + case C_CONNECTED: + case C_SYNC_SOURCE: + case C_SYNC_TARGET: + case C_VERIFY_S: + case C_VERIFY_T: + case C_PAUSED_SYNC_S: + case C_PAUSED_SYNC_T: + /* maybe stable, look at the disk state */ + break; + + /* no new io accepted during tansitional states + * like handshake or teardown */ + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_REPORT_PARAMS: + case C_STARTING_SYNC_S: + case C_STARTING_SYNC_T: + case C_WF_BITMAP_S: + case C_WF_BITMAP_T: + case C_WF_SYNC_UUID: + case C_MASK: + /* not "stable" */ + return 0; + } + + switch ((enum drbd_disk_state)s.disk) { + case D_DISKLESS: + case D_INCONSISTENT: + case D_OUTDATED: + case D_CONSISTENT: + case D_UP_TO_DATE: + /* disk state is stable as well. */ + break; + + /* no new io accepted during tansitional states */ + case D_ATTACHING: + case D_FAILED: + case D_NEGOTIATING: + case D_UNKNOWN: + case D_MASK: + /* not "stable" */ + return 0; + } + + return 1; +} + +static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) +{ + int mxb = drbd_get_max_buffers(mdev); + + if (mdev->state.susp) + return 0; + if (test_bit(SUSPEND_IO, &mdev->flags)) + return 0; + + /* to avoid potential deadlock or bitmap corruption, + * in various places, we only allow new application io + * to start during "stable" states. */ + + /* no new io accepted when attaching or detaching the disk */ + if (!drbd_state_is_stable(mdev->state)) + return 0; + + /* since some older kernels don't have atomic_add_unless, + * and we are within the spinlock anyways, we have this workaround. */ + if (atomic_read(&mdev->ap_bio_cnt) > mxb) + return 0; + if (test_bit(BITMAP_IO, &mdev->flags)) + return 0; + return 1; +} + +/* I'd like to use wait_event_lock_irq, + * but I'm not sure when it got introduced, + * and not sure when it has 3 or 4 arguments */ +static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) +{ + /* compare with after_state_ch, + * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ + DEFINE_WAIT(wait); + + /* we wait here + * as long as the device is suspended + * until the bitmap is no longer on the fly during connection + * handshake as long as we would exeed the max_buffer limit. + * + * to avoid races with the reconnect code, + * we need to atomic_inc within the spinlock. */ + + spin_lock_irq(&mdev->req_lock); + while (!__inc_ap_bio_cond(mdev)) { + prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mdev->req_lock); + schedule(); + finish_wait(&mdev->misc_wait, &wait); + spin_lock_irq(&mdev->req_lock); + } + atomic_add(one_or_two, &mdev->ap_bio_cnt); + spin_unlock_irq(&mdev->req_lock); +} + +static inline void dec_ap_bio(struct drbd_conf *mdev) +{ + int mxb = drbd_get_max_buffers(mdev); + int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); + + D_ASSERT(ap_bio >= 0); + /* this currently does wake_up for every dec_ap_bio! + * maybe rather introduce some type of hysteresis? + * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ + if (ap_bio < mxb) + wake_up(&mdev->misc_wait); + if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + } +} + +static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) +{ + mdev->ed_uuid = val; +} + +static inline int seq_cmp(u32 a, u32 b) +{ + /* we assume wrap around at 32bit. + * for wrap around at 24bit (old atomic_t), + * we'd have to + * a <<= 8; b <<= 8; + */ + return (s32)(a) - (s32)(b); +} +#define seq_lt(a, b) (seq_cmp((a), (b)) < 0) +#define seq_gt(a, b) (seq_cmp((a), (b)) > 0) +#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) +#define seq_le(a, b) (seq_cmp((a), (b)) <= 0) +/* CAUTION: please no side effects in arguments! */ +#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b))) + +static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) +{ + unsigned int m; + spin_lock(&mdev->peer_seq_lock); + m = seq_max(mdev->peer_seq, new_seq); + mdev->peer_seq = m; + spin_unlock(&mdev->peer_seq_lock); + if (m == new_seq) + wake_up(&mdev->seq_wait); +} + +static inline void drbd_update_congested(struct drbd_conf *mdev) +{ + struct sock *sk = mdev->data.socket->sk; + if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) + set_bit(NET_CONGESTED, &mdev->flags); +} + +static inline int drbd_queue_order_type(struct drbd_conf *mdev) +{ + /* sorry, we currently have no working implementation + * of distributed TCQ stuff */ +#ifndef QUEUE_ORDERED_NONE +#define QUEUE_ORDERED_NONE 0 +#endif + return QUEUE_ORDERED_NONE; +} + +static inline void drbd_blk_run_queue(struct request_queue *q) +{ + if (q && q->unplug_fn) + q->unplug_fn(q); +} + +static inline void drbd_kick_lo(struct drbd_conf *mdev) +{ + if (get_ldev(mdev)) { + drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev)); + put_ldev(mdev); + } +} + +static inline void drbd_md_flush(struct drbd_conf *mdev) +{ + int r; + + if (test_bit(MD_NO_BARRIER, &mdev->flags)) + return; + + r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); + if (r) { + set_bit(MD_NO_BARRIER, &mdev->flags); + dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); + } +} + +#endif diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c new file mode 100644 index 0000000..157d1e4 --- /dev/null +++ b/drivers/block/drbd/drbd_main.c @@ -0,0 +1,3699 @@ +/* + drbd.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev + from Logicworks, Inc. for making SDP replication support possible. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/module.h> +#include <linux/version.h> +#include <linux/drbd.h> +#include <asm/uaccess.h> +#include <asm/types.h> +#include <net/sock.h> +#include <linux/ctype.h> +#include <linux/smp_lock.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/reboot.h> +#include <linux/notifier.h> +#include <linux/kthread.h> + +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> +#include <linux/vmalloc.h> + +#include <linux/drbd_limits.h> +#include "drbd_int.h" +#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ + +#include "drbd_vli.h" + +struct after_state_chg_work { + struct drbd_work w; + union drbd_state os; + union drbd_state ns; + enum chg_state_flags flags; + struct completion *done; +}; + +int drbdd_init(struct drbd_thread *); +int drbd_worker(struct drbd_thread *); +int drbd_asender(struct drbd_thread *); + +int drbd_init(void); +static int drbd_open(struct block_device *bdev, fmode_t mode); +static int drbd_release(struct gendisk *gd, fmode_t mode); +static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags); +static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static void md_sync_timer_fn(unsigned long data); +static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); + +MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " + "Lars Ellenberg <lars@linbit.com>"); +MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); +MODULE_VERSION(REL_VERSION); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); +MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); + +#include <linux/moduleparam.h> +/* allow_open_on_secondary */ +MODULE_PARM_DESC(allow_oos, "DONT USE!"); +/* thanks to these macros, if compiled into the kernel (not-module), + * this becomes the boot parameter drbd.minor_count */ +module_param(minor_count, uint, 0444); +module_param(disable_sendpage, bool, 0644); +module_param(allow_oos, bool, 0); +module_param(cn_idx, uint, 0444); +module_param(proc_details, int, 0644); + +#ifdef CONFIG_DRBD_FAULT_INJECTION +int enable_faults; +int fault_rate; +static int fault_count; +int fault_devs; +/* bitmap of enabled faults */ +module_param(enable_faults, int, 0664); +/* fault rate % value - applies to all enabled faults */ +module_param(fault_rate, int, 0664); +/* count of faults inserted */ +module_param(fault_count, int, 0664); +/* bitmap of devices to insert faults on */ +module_param(fault_devs, int, 0644); +#endif + +/* module parameter, defined */ +unsigned int minor_count = 32; +int disable_sendpage; +int allow_oos; +unsigned int cn_idx = CN_IDX_DRBD; +int proc_details; /* Detail level in proc drbd*/ + +/* Module parameter for setting the user mode helper program + * to run. Default is /sbin/drbdadm */ +char usermode_helper[80] = "/sbin/drbdadm"; + +module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); + +/* in 2.6.x, our device mapping and config info contains our virtual gendisks + * as member "struct gendisk *vdisk;" + */ +struct drbd_conf **minor_table; + +struct kmem_cache *drbd_request_cache; +struct kmem_cache *drbd_ee_cache; /* epoch entries */ +struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ +struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ +mempool_t *drbd_request_mempool; +mempool_t *drbd_ee_mempool; + +/* I do not use a standard mempool, because: + 1) I want to hand out the pre-allocated objects first. + 2) I want to be able to interrupt sleeping allocation with a signal. + Note: This is a single linked list, the next pointer is the private + member of struct page. + */ +struct page *drbd_pp_pool; +spinlock_t drbd_pp_lock; +int drbd_pp_vacant; +wait_queue_head_t drbd_pp_wait; + +DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); + +static struct block_device_operations drbd_ops = { + .owner = THIS_MODULE, + .open = drbd_open, + .release = drbd_release, +}; + +#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) + +#ifdef __CHECKER__ +/* When checking with sparse, and this is an inline function, sparse will + give tons of false positives. When this is a real functions sparse works. + */ +int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = (mdev->state.disk >= mins); + if (!io_allowed) { + if (atomic_dec_and_test(&mdev->local_cnt)) + wake_up(&mdev->misc_wait); + } + return io_allowed; +} + +#endif + +/** + * DOC: The transfer log + * + * The transfer log is a single linked list of &struct drbd_tl_epoch objects. + * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail + * of the list. There is always at least one &struct drbd_tl_epoch object. + * + * Each &struct drbd_tl_epoch has a circular double linked list of requests + * attached. + */ +static int tl_init(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + + /* during device minor initialization, we may well use GFP_KERNEL */ + b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); + if (!b) + return 0; + INIT_LIST_HEAD(&b->requests); + INIT_LIST_HEAD(&b->w.list); + b->next = NULL; + b->br_number = 4711; + b->n_req = 0; + b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ + + mdev->oldest_tle = b; + mdev->newest_tle = b; + INIT_LIST_HEAD(&mdev->out_of_sequence_requests); + + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + + return 1; +} + +static void tl_cleanup(struct drbd_conf *mdev) +{ + D_ASSERT(mdev->oldest_tle == mdev->newest_tle); + D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); + kfree(mdev->oldest_tle); + mdev->oldest_tle = NULL; + kfree(mdev->unused_spare_tle); + mdev->unused_spare_tle = NULL; + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; +} + +/** + * _tl_add_barrier() - Adds a barrier to the transfer log + * @mdev: DRBD device. + * @new: Barrier to be added before the current head of the TL. + * + * The caller must hold the req_lock. + */ +void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) +{ + struct drbd_tl_epoch *newest_before; + + INIT_LIST_HEAD(&new->requests); + INIT_LIST_HEAD(&new->w.list); + new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ + new->next = NULL; + new->n_req = 0; + + newest_before = mdev->newest_tle; + /* never send a barrier number == 0, because that is special-cased + * when using TCQ for our write ordering code */ + new->br_number = (newest_before->br_number+1) ?: 1; + if (mdev->newest_tle != new) { + mdev->newest_tle->next = new; + mdev->newest_tle = new; + } +} + +/** + * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL + * @mdev: DRBD device. + * @barrier_nr: Expected identifier of the DRBD write barrier packet. + * @set_size: Expected number of requests before that barrier. + * + * In case the passed barrier_nr or set_size does not match the oldest + * &struct drbd_tl_epoch objects this function will cause a termination + * of the connection. + */ +void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_tl_epoch *b, *nob; /* next old barrier */ + struct list_head *le, *tle; + struct drbd_request *r; + + spin_lock_irq(&mdev->req_lock); + + b = mdev->oldest_tle; + + /* first some paranoia code */ + if (b == NULL) { + dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", + barrier_nr); + goto bail; + } + if (b->br_number != barrier_nr) { + dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", + barrier_nr, b->br_number); + goto bail; + } + if (b->n_req != set_size) { + dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", + barrier_nr, set_size, b->n_req); + goto bail; + } + + /* Clean up list of requests processed during current epoch */ + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request, tl_requests); + _req_mod(r, barrier_acked); + } + /* There could be requests on the list waiting for completion + of the write to the local disk. To avoid corruptions of + slab's data structures we have to remove the lists head. + + Also there could have been a barrier ack out of sequence, overtaking + the write acks - which would be a bug and violating write ordering. + To not deadlock in case we lose connection while such requests are + still pending, we need some way to find them for the + _req_mode(connection_lost_while_pending). + + These have been list_move'd to the out_of_sequence_requests list in + _req_mod(, barrier_acked) above. + */ + list_del_init(&b->requests); + + nob = b->next; + if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { + _tl_add_barrier(mdev, b); + if (nob) + mdev->oldest_tle = nob; + /* if nob == NULL b was the only barrier, and becomes the new + barrier. Therefore mdev->oldest_tle points already to b */ + } else { + D_ASSERT(nob != NULL); + mdev->oldest_tle = nob; + kfree(b); + } + + spin_unlock_irq(&mdev->req_lock); + dec_ap_pending(mdev); + + return; + +bail: + spin_unlock_irq(&mdev->req_lock); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); +} + + +/** + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * @mdev: DRBD device. + * + * This is called after the connection to the peer was lost. The storage covered + * by the requests on the transfer gets marked as our of sync. Called from the + * receiver thread and the worker thread. + */ +void tl_clear(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b, *tmp; + struct list_head *le, *tle; + struct drbd_request *r; + int new_initial_bnr = net_random(); + + spin_lock_irq(&mdev->req_lock); + + b = mdev->oldest_tle; + while (b) { + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request, tl_requests); + /* It would be nice to complete outside of spinlock. + * But this is easier for now. */ + _req_mod(r, connection_lost_while_pending); + } + tmp = b->next; + + /* there could still be requests on that ring list, + * in case local io is still pending */ + list_del(&b->requests); + + /* dec_ap_pending corresponding to queue_barrier. + * the newest barrier may not have been queued yet, + * in which case w.cb is still NULL. */ + if (b->w.cb != NULL) + dec_ap_pending(mdev); + + if (b == mdev->newest_tle) { + /* recycle, but reinit! */ + D_ASSERT(tmp == NULL); + INIT_LIST_HEAD(&b->requests); + INIT_LIST_HEAD(&b->w.list); + b->w.cb = NULL; + b->br_number = new_initial_bnr; + b->n_req = 0; + + mdev->oldest_tle = b; + break; + } + kfree(b); + b = tmp; + } + + /* we expect this list to be empty. */ + D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); + + /* but just in case, clean it up anyways! */ + list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { + r = list_entry(le, struct drbd_request, tl_requests); + /* It would be nice to complete outside of spinlock. + * But this is easier for now. */ + _req_mod(r, connection_lost_while_pending); + } + + /* ensure bit indicating barrier is required is clear */ + clear_bit(CREATE_BARRIER, &mdev->flags); + + spin_unlock_irq(&mdev->req_lock); +} + +/** + * cl_wide_st_chg() - TRUE if the state change is a cluster wide one + * @mdev: DRBD device. + * @os: old (current) state. + * @ns: new (wanted) state. + */ +static int cl_wide_st_chg(struct drbd_conf *mdev, + union drbd_state os, union drbd_state ns) +{ + return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && + ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || + (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || + (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || + (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); +} + +int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, + union drbd_state mask, union drbd_state val) +{ + unsigned long flags; + union drbd_state os, ns; + int rv; + + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + rv = _drbd_set_state(mdev, ns, f, NULL); + ns = mdev->state; + spin_unlock_irqrestore(&mdev->req_lock, flags); + + return rv; +} + +/** + * drbd_force_state() - Impose a change which happens outside our control on our state + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + */ +void drbd_force_state(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val) +{ + drbd_change_state(mdev, CS_HARD, mask, val); +} + +static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); +static int is_valid_state_transition(struct drbd_conf *, + union drbd_state, union drbd_state); +static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, int *warn_sync_abort); +int drbd_send_state_req(struct drbd_conf *, + union drbd_state, union drbd_state); + +static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val) +{ + union drbd_state os, ns; + unsigned long flags; + int rv; + + if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) + return SS_CW_SUCCESS; + + if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) + return SS_CW_FAILED_BY_PEER; + + rv = 0; + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + ns = sanitize_state(mdev, os, ns, NULL); + + if (!cl_wide_st_chg(mdev, os, ns)) + rv = SS_CW_NO_NEED; + if (!rv) { + rv = is_valid_state(mdev, ns); + if (rv == SS_SUCCESS) { + rv = is_valid_state_transition(mdev, ns, os); + if (rv == SS_SUCCESS) + rv = 0; /* cont waiting, otherwise fail. */ + } + } + spin_unlock_irqrestore(&mdev->req_lock, flags); + + return rv; +} + +/** + * drbd_req_state() - Perform an eventually cluster wide state change + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Should not be called directly, use drbd_request_state() or + * _drbd_request_state(). + */ +static int drbd_req_state(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val, + enum chg_state_flags f) +{ + struct completion done; + unsigned long flags; + union drbd_state os, ns; + int rv; + + init_completion(&done); + + if (f & CS_SERIALIZE) + mutex_lock(&mdev->state_mutex); + + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + ns = sanitize_state(mdev, os, ns, NULL); + + if (cl_wide_st_chg(mdev, os, ns)) { + rv = is_valid_state(mdev, ns); + if (rv == SS_SUCCESS) + rv = is_valid_state_transition(mdev, ns, os); + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + + drbd_state_lock(mdev); + if (!drbd_send_state_req(mdev, mask, val)) { + drbd_state_unlock(mdev); + rv = SS_CW_FAILED_BY_PEER; + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + + wait_event(mdev->state_wait, + (rv = _req_st_cond(mdev, mask, val))); + + if (rv < SS_SUCCESS) { + drbd_state_unlock(mdev); + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + rv = _drbd_set_state(mdev, ns, f, &done); + drbd_state_unlock(mdev); + } else { + rv = _drbd_set_state(mdev, ns, f, &done); + } + + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { + D_ASSERT(current != mdev->worker.task); + wait_for_completion(&done); + } + +abort: + if (f & CS_SERIALIZE) + mutex_unlock(&mdev->state_mutex); + + return rv; +} + +/** + * _drbd_request_state() - Request a state change (with flags) + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE + * flag, or when logging of failed state change requests is not desired. + */ +int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + int rv; + + wait_event(mdev->state_wait, + (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); + + return rv; +} + +static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) +{ + dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", + name, + drbd_conn_str(ns.conn), + drbd_role_str(ns.role), + drbd_role_str(ns.peer), + drbd_disk_str(ns.disk), + drbd_disk_str(ns.pdsk), + ns.susp ? 's' : 'r', + ns.aftr_isp ? 'a' : '-', + ns.peer_isp ? 'p' : '-', + ns.user_isp ? 'u' : '-' + ); +} + +void print_st_err(struct drbd_conf *mdev, + union drbd_state os, union drbd_state ns, int err) +{ + if (err == SS_IN_TRANSIENT_STATE) + return; + dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); + print_st(mdev, " state", os); + print_st(mdev, "wanted", ns); +} + + +#define drbd_peer_str drbd_role_str +#define drbd_pdsk_str drbd_disk_str + +#define drbd_susp_str(A) ((A) ? "1" : "0") +#define drbd_aftr_isp_str(A) ((A) ? "1" : "0") +#define drbd_peer_isp_str(A) ((A) ? "1" : "0") +#define drbd_user_isp_str(A) ((A) ? "1" : "0") + +#define PSC(A) \ + ({ if (ns.A != os.A) { \ + pbp += sprintf(pbp, #A "( %s -> %s ) ", \ + drbd_##A##_str(os.A), \ + drbd_##A##_str(ns.A)); \ + } }) + +/** + * is_valid_state() - Returns an SS_ error code if ns is not valid + * @mdev: DRBD device. + * @ns: State to consider. + */ +static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) +{ + /* See drbd_state_sw_errors in drbd_strings.c */ + + enum drbd_fencing_p fp; + int rv = SS_SUCCESS; + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + if (get_net_conf(mdev)) { + if (!mdev->net_conf->two_primaries && + ns.role == R_PRIMARY && ns.peer == R_PRIMARY) + rv = SS_TWO_PRIMARIES; + put_net_conf(mdev); + } + + if (rv <= 0) + /* already found a reason to abort */; + else if (ns.role == R_SECONDARY && mdev->open_cnt) + rv = SS_DEVICE_IN_USE; + + else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (fp >= FP_RESOURCE && + ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) + rv = SS_PRIMARY_NOP; + + else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) + rv = SS_NO_LOCAL_DISK; + + else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) + rv = SS_NO_REMOTE_DISK; + + else if ((ns.conn == C_CONNECTED || + ns.conn == C_WF_BITMAP_S || + ns.conn == C_SYNC_SOURCE || + ns.conn == C_PAUSED_SYNC_S) && + ns.disk == D_OUTDATED) + rv = SS_CONNECTED_OUTDATES; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + (mdev->sync_conf.verify_alg[0] == 0)) + rv = SS_NO_VERIFY_ALG; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + mdev->agreed_pro_version < 88) + rv = SS_NOT_SUPPORTED; + + return rv; +} + +/** + * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible + * @mdev: DRBD device. + * @ns: new state. + * @os: old state. + */ +static int is_valid_state_transition(struct drbd_conf *mdev, + union drbd_state ns, union drbd_state os) +{ + int rv = SS_SUCCESS; + + if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && + os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) + rv = SS_ALREADY_STANDALONE; + + if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) + rv = SS_NO_NET_CONFIG; + + if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) + rv = SS_LOWER_THAN_OUTDATED; + + if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) + rv = SS_IN_TRANSIENT_STATE; + + if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) + rv = SS_IN_TRANSIENT_STATE; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + ns.conn != os.conn && os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + return rv; +} + +/** + * sanitize_state() - Resolves implicitly necessary additional changes to a state transition + * @mdev: DRBD device. + * @os: old state. + * @ns: new state. + * @warn_sync_abort: + * + * When we loose connection, we have to set the state of the peers disk (pdsk) + * to D_UNKNOWN. This rule and many more along those lines are in this function. + */ +static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, int *warn_sync_abort) +{ + enum drbd_fencing_p fp; + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + /* Disallow Network errors to configure a device's network part */ + if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && + os.conn <= C_DISCONNECTING) + ns.conn = os.conn; + + /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ + if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) + ns.conn = os.conn; + + /* After C_DISCONNECTING only C_STANDALONE may follow */ + if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) + ns.conn = os.conn; + + if (ns.conn < C_CONNECTED) { + ns.peer_isp = 0; + ns.peer = R_UNKNOWN; + if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) + ns.pdsk = D_UNKNOWN; + } + + /* Clear the aftr_isp when becoming unconfigured */ + if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) + ns.aftr_isp = 0; + + if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS) + ns.pdsk = D_UNKNOWN; + + /* Abort resync if a disk fails/detaches */ + if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && + (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { + if (warn_sync_abort) + *warn_sync_abort = 1; + ns.conn = C_CONNECTED; + } + + if (ns.conn >= C_CONNECTED && + ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) || + (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) { + switch (ns.conn) { + case C_WF_BITMAP_T: + case C_PAUSED_SYNC_T: + ns.disk = D_OUTDATED; + break; + case C_CONNECTED: + case C_WF_BITMAP_S: + case C_SYNC_SOURCE: + case C_PAUSED_SYNC_S: + ns.disk = D_UP_TO_DATE; + break; + case C_SYNC_TARGET: + ns.disk = D_INCONSISTENT; + dev_warn(DEV, "Implicitly set disk state Inconsistent!\n"); + break; + } + if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE) + dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n"); + } + + if (ns.conn >= C_CONNECTED && + (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) { + switch (ns.conn) { + case C_CONNECTED: + case C_WF_BITMAP_T: + case C_PAUSED_SYNC_T: + case C_SYNC_TARGET: + ns.pdsk = D_UP_TO_DATE; + break; + case C_WF_BITMAP_S: + case C_PAUSED_SYNC_S: + ns.pdsk = D_OUTDATED; + break; + case C_SYNC_SOURCE: + ns.pdsk = D_INCONSISTENT; + dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n"); + break; + } + if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE) + dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n"); + } + + /* Connection breaks down before we finished "Negotiating" */ + if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && + get_ldev_if_state(mdev, D_NEGOTIATING)) { + if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { + ns.disk = mdev->new_state_tmp.disk; + ns.pdsk = mdev->new_state_tmp.pdsk; + } else { + dev_alert(DEV, "Connection lost while negotiating, no data!\n"); + ns.disk = D_DISKLESS; + ns.pdsk = D_UNKNOWN; + } + put_ldev(mdev); + } + + if (fp == FP_STONITH && + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) + ns.susp = 1; + + if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { + if (ns.conn == C_SYNC_SOURCE) + ns.conn = C_PAUSED_SYNC_S; + if (ns.conn == C_SYNC_TARGET) + ns.conn = C_PAUSED_SYNC_T; + } else { + if (ns.conn == C_PAUSED_SYNC_S) + ns.conn = C_SYNC_SOURCE; + if (ns.conn == C_PAUSED_SYNC_T) + ns.conn = C_SYNC_TARGET; + } + + return ns; +} + +/* helper for __drbd_set_state */ +static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) +{ + if (cs == C_VERIFY_T) { + /* starting online verify from an arbitrary position + * does not fit well into the existing protocol. + * on C_VERIFY_T, we initialize ov_left and friends + * implicitly in receive_DataRequest once the + * first P_OV_REQUEST is received */ + mdev->ov_start_sector = ~(sector_t)0; + } else { + unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); + if (bit >= mdev->rs_total) + mdev->ov_start_sector = + BM_BIT_TO_SECT(mdev->rs_total - 1); + mdev->ov_position = mdev->ov_start_sector; + } +} + +/** + * __drbd_set_state() - Set a new DRBD state + * @mdev: DRBD device. + * @ns: new state. + * @flags: Flags + * @done: Optional completion, that will get completed after the after_state_ch() finished + * + * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + */ +int __drbd_set_state(struct drbd_conf *mdev, + union drbd_state ns, enum chg_state_flags flags, + struct completion *done) +{ + union drbd_state os; + int rv = SS_SUCCESS; + int warn_sync_abort = 0; + struct after_state_chg_work *ascw; + + os = mdev->state; + + ns = sanitize_state(mdev, os, ns, &warn_sync_abort); + + if (ns.i == os.i) + return SS_NOTHING_TO_DO; + + if (!(flags & CS_HARD)) { + /* pre-state-change checks ; only look at ns */ + /* See drbd_state_sw_errors in drbd_strings.c */ + + rv = is_valid_state(mdev, ns); + if (rv < SS_SUCCESS) { + /* If the old state was illegal as well, then let + this happen...*/ + + if (is_valid_state(mdev, os) == rv) { + dev_err(DEV, "Considering state change from bad state. " + "Error would be: '%s'\n", + drbd_set_st_err_str(rv)); + print_st(mdev, "old", os); + print_st(mdev, "new", ns); + rv = is_valid_state_transition(mdev, ns, os); + } + } else + rv = is_valid_state_transition(mdev, ns, os); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + return rv; + } + + if (warn_sync_abort) + dev_warn(DEV, "Resync aborted.\n"); + + { + char *pbp, pb[300]; + pbp = pb; + *pbp = 0; + PSC(role); + PSC(peer); + PSC(conn); + PSC(disk); + PSC(pdsk); + PSC(susp); + PSC(aftr_isp); + PSC(peer_isp); + PSC(user_isp); + dev_info(DEV, "%s\n", pb); + } + + /* solve the race between becoming unconfigured, + * worker doing the cleanup, and + * admin reconfiguring us: + * on (re)configure, first set CONFIG_PENDING, + * then wait for a potentially exiting worker, + * start the worker, and schedule one no_op. + * then proceed with configuration. + */ + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY && + !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) + set_bit(DEVICE_DYING, &mdev->flags); + + mdev->state.i = ns.i; + wake_up(&mdev->misc_wait); + wake_up(&mdev->state_wait); + + /* post-state-change actions */ + if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) { + set_bit(STOP_SYNC_TIMER, &mdev->flags); + mod_timer(&mdev->resync_timer, jiffies); + } + + /* aborted verify run. log the last position */ + if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && + ns.conn < C_CONNECTED) { + mdev->ov_start_sector = + BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left); + dev_info(DEV, "Online Verify reached sector %llu\n", + (unsigned long long)mdev->ov_start_sector); + } + + if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { + dev_info(DEV, "Syncer continues.\n"); + mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; + if (ns.conn == C_SYNC_TARGET) { + if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) + mod_timer(&mdev->resync_timer, jiffies); + /* This if (!test_bit) is only needed for the case + that a device that has ceased to used its timer, + i.e. it is already in drbd_resync_finished() gets + paused and resumed. */ + } + } + + if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && + (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { + dev_info(DEV, "Resync suspended\n"); + mdev->rs_mark_time = jiffies; + if (ns.conn == C_PAUSED_SYNC_T) + set_bit(STOP_SYNC_TIMER, &mdev->flags); + } + + if (os.conn == C_CONNECTED && + (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { + mdev->ov_position = 0; + mdev->rs_total = + mdev->rs_mark_left = drbd_bm_bits(mdev); + if (mdev->agreed_pro_version >= 90) + set_ov_position(mdev, ns.conn); + else + mdev->ov_start_sector = 0; + mdev->ov_left = mdev->rs_total + - BM_SECT_TO_BIT(mdev->ov_position); + mdev->rs_start = + mdev->rs_mark_time = jiffies; + mdev->ov_last_oos_size = 0; + mdev->ov_last_oos_start = 0; + + if (ns.conn == C_VERIFY_S) { + dev_info(DEV, "Starting Online Verify from sector %llu\n", + (unsigned long long)mdev->ov_position); + mod_timer(&mdev->resync_timer, jiffies); + } + } + + if (get_ldev(mdev)) { + u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| + MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| + MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); + + if (test_bit(CRASHED_PRIMARY, &mdev->flags)) + mdf |= MDF_CRASHED_PRIMARY; + if (mdev->state.role == R_PRIMARY || + (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) + mdf |= MDF_PRIMARY_IND; + if (mdev->state.conn > C_WF_REPORT_PARAMS) + mdf |= MDF_CONNECTED_IND; + if (mdev->state.disk > D_INCONSISTENT) + mdf |= MDF_CONSISTENT; + if (mdev->state.disk > D_OUTDATED) + mdf |= MDF_WAS_UP_TO_DATE; + if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) + mdf |= MDF_PEER_OUT_DATED; + if (mdf != mdev->ldev->md.flags) { + mdev->ldev->md.flags = mdf; + drbd_md_mark_dirty(mdev); + } + if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) + drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); + put_ldev(mdev); + } + + /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ + if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && + os.peer == R_SECONDARY && ns.peer == R_PRIMARY) + set_bit(CONSIDER_RESYNC, &mdev->flags); + + /* Receiver should clean up itself */ + if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) + drbd_thread_stop_nowait(&mdev->receiver); + + /* Now the receiver finished cleaning up itself, it should die */ + if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) + drbd_thread_stop_nowait(&mdev->receiver); + + /* Upon network failure, we need to restart the receiver. */ + if (os.conn > C_TEAR_DOWN && + ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) + drbd_thread_restart_nowait(&mdev->receiver); + + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); + if (ascw) { + ascw->os = os; + ascw->ns = ns; + ascw->flags = flags; + ascw->w.cb = w_after_state_ch; + ascw->done = done; + drbd_queue_work(&mdev->data.work, &ascw->w); + } else { + dev_warn(DEV, "Could not kmalloc an ascw\n"); + } + + return rv; +} + +static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct after_state_chg_work *ascw = + container_of(w, struct after_state_chg_work, w); + after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); + if (ascw->flags & CS_WAIT_COMPLETE) { + D_ASSERT(ascw->done != NULL); + complete(ascw->done); + } + kfree(ascw); + + return 1; +} + +static void abw_start_sync(struct drbd_conf *mdev, int rv) +{ + if (rv) { + dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); + _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); + return; + } + + switch (mdev->state.conn) { + case C_STARTING_SYNC_T: + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + break; + case C_STARTING_SYNC_S: + drbd_start_resync(mdev, C_SYNC_SOURCE); + break; + } +} + +/** + * after_state_ch() - Perform after state change actions that may sleep + * @mdev: DRBD device. + * @os: old state. + * @ns: new state. + * @flags: Flags + */ +static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags) +{ + enum drbd_fencing_p fp; + + if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { + clear_bit(CRASHED_PRIMARY, &mdev->flags); + if (mdev->p_uuid) + mdev->p_uuid[UI_FLAGS] &= ~((u64)2); + } + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + /* Inform userspace about the change... */ + drbd_bcast_state(mdev, ns); + + if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + drbd_khelper(mdev, "pri-on-incon-degr"); + + /* Here we have the actions that are performed after a + state change. This function might sleep */ + + if (fp == FP_STONITH && ns.susp) { + /* case1: The outdate peer handler is successful: + * case2: The connection was established again: */ + if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { + tl_clear(mdev); + spin_lock_irq(&mdev->req_lock); + _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + } + } + /* Do not change the order of the if above and the two below... */ + if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ + drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S) + drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)"); + + /* Lost contact to peer's copy of the data */ + if ((os.pdsk >= D_INCONSISTENT && + os.pdsk != D_UNKNOWN && + os.pdsk != D_OUTDATED) + && (ns.pdsk < D_INCONSISTENT || + ns.pdsk == D_UNKNOWN || + ns.pdsk == D_OUTDATED)) { + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + if (get_ldev(mdev)) { + if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } + put_ldev(mdev); + } + } + + if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) + drbd_uuid_new_current(mdev); + + /* D_DISKLESS Peer becomes secondary */ + if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) + drbd_al_to_on_disk_bm(mdev); + put_ldev(mdev); + } + + /* Last part of the attaching process ... */ + if (ns.conn >= C_CONNECTED && + os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { + kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ + mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ + drbd_send_sizes(mdev, 0); /* to start sync... */ + drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + + /* We want to pause/continue resync, tell peer. */ + if (ns.conn >= C_CONNECTED && + ((os.aftr_isp != ns.aftr_isp) || + (os.user_isp != ns.user_isp))) + drbd_send_state(mdev); + + /* In case one of the isp bits got set, suspend other devices. */ + if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && + (ns.aftr_isp || ns.peer_isp || ns.user_isp)) + suspend_other_sg(mdev); + + /* Make sure the peer gets informed about eventual state + changes (ISP bits) while we were in WFReportParams. */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) + drbd_send_state(mdev); + + /* We are in the progress to start a full sync... */ + if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) + drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync"); + + /* We are invalidating our self... */ + if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && + os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) + drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); + + if (os.disk > D_FAILED && ns.disk == D_FAILED) { + enum drbd_io_error_p eh; + + eh = EP_PASS_ON; + if (get_ldev_if_state(mdev, D_FAILED)) { + eh = mdev->ldev->dc.on_io_error; + put_ldev(mdev); + } + + drbd_rs_cancel_all(mdev); + /* since get_ldev() only works as long as disk>=D_INCONSISTENT, + and it is D_DISKLESS here, local_cnt can only go down, it can + not increase... It will reach zero */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + + spin_lock_irq(&mdev->req_lock); + _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); + spin_unlock_irq(&mdev->req_lock); + + if (eh == EP_CALL_HELPER) + drbd_khelper(mdev, "local-io-error"); + } + + if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { + + if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { + if (drbd_send_state(mdev)) + dev_warn(DEV, "Notified peer that my disk is broken.\n"); + else + dev_err(DEV, "Sending state in drbd_io_error() failed\n"); + } + + lc_destroy(mdev->resync); + mdev->resync = NULL; + lc_destroy(mdev->act_log); + mdev->act_log = NULL; + __no_warn(local, + drbd_free_bc(mdev->ldev); + mdev->ldev = NULL;); + + if (mdev->md_io_tmpp) + __free_page(mdev->md_io_tmpp); + } + + /* Disks got bigger while they were detached */ + if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && + test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { + if (ns.conn == C_CONNECTED) + resync_after_online_grow(mdev); + } + + /* A resync finished or aborted, wake paused devices... */ + if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || + (os.peer_isp && !ns.peer_isp) || + (os.user_isp && !ns.user_isp)) + resume_next_sg(mdev); + + /* Upon network connection, we need to start the receiver */ + if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) + drbd_thread_start(&mdev->receiver); + + /* Terminate worker thread if we are unconfigured - it will be + restarted as needed... */ + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY) { + if (os.aftr_isp != ns.aftr_isp) + resume_next_sg(mdev); + /* set in __drbd_set_state, unless CONFIG_PENDING was set */ + if (test_bit(DEVICE_DYING, &mdev->flags)) + drbd_thread_stop_nowait(&mdev->worker); + } + + drbd_md_sync(mdev); +} + + +static int drbd_thread_setup(void *arg) +{ + struct drbd_thread *thi = (struct drbd_thread *) arg; + struct drbd_conf *mdev = thi->mdev; + unsigned long flags; + int retval; + +restart: + retval = thi->function(thi); + + spin_lock_irqsave(&thi->t_lock, flags); + + /* if the receiver has been "Exiting", the last thing it did + * was set the conn state to "StandAlone", + * if now a re-connect request comes in, conn state goes C_UNCONNECTED, + * and receiver thread will be "started". + * drbd_thread_start needs to set "Restarting" in that case. + * t_state check and assignment needs to be within the same spinlock, + * so either thread_start sees Exiting, and can remap to Restarting, + * or thread_start see None, and can proceed as normal. + */ + + if (thi->t_state == Restarting) { + dev_info(DEV, "Restarting %s\n", current->comm); + thi->t_state = Running; + spin_unlock_irqrestore(&thi->t_lock, flags); + goto restart; + } + + thi->task = NULL; + thi->t_state = None; + smp_mb(); + complete(&thi->stop); + spin_unlock_irqrestore(&thi->t_lock, flags); + + dev_info(DEV, "Terminating %s\n", current->comm); + + /* Release mod reference taken when thread was started */ + module_put(THIS_MODULE); + return retval; +} + +static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, + int (*func) (struct drbd_thread *)) +{ + spin_lock_init(&thi->t_lock); + thi->task = NULL; + thi->t_state = None; + thi->function = func; + thi->mdev = mdev; +} + +int drbd_thread_start(struct drbd_thread *thi) +{ + struct drbd_conf *mdev = thi->mdev; + struct task_struct *nt; + unsigned long flags; + + const char *me = + thi == &mdev->receiver ? "receiver" : + thi == &mdev->asender ? "asender" : + thi == &mdev->worker ? "worker" : "NONSENSE"; + + /* is used from state engine doing drbd_thread_stop_nowait, + * while holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + switch (thi->t_state) { + case None: + dev_info(DEV, "Starting %s thread (from %s [%d])\n", + me, current->comm, current->pid); + + /* Get ref on module for thread - this is released when thread exits */ + if (!try_module_get(THIS_MODULE)) { + dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); + spin_unlock_irqrestore(&thi->t_lock, flags); + return FALSE; + } + + init_completion(&thi->stop); + D_ASSERT(thi->task == NULL); + thi->reset_cpu_mask = 1; + thi->t_state = Running; + spin_unlock_irqrestore(&thi->t_lock, flags); + flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ + + nt = kthread_create(drbd_thread_setup, (void *) thi, + "drbd%d_%s", mdev_to_minor(mdev), me); + + if (IS_ERR(nt)) { + dev_err(DEV, "Couldn't start thread\n"); + + module_put(THIS_MODULE); + return FALSE; + } + spin_lock_irqsave(&thi->t_lock, flags); + thi->task = nt; + thi->t_state = Running; + spin_unlock_irqrestore(&thi->t_lock, flags); + wake_up_process(nt); + break; + case Exiting: + thi->t_state = Restarting; + dev_info(DEV, "Restarting %s thread (from %s [%d])\n", + me, current->comm, current->pid); + /* fall through */ + case Running: + case Restarting: + default: + spin_unlock_irqrestore(&thi->t_lock, flags); + break; + } + + return TRUE; +} + + +void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) +{ + unsigned long flags; + + enum drbd_thread_state ns = restart ? Restarting : Exiting; + + /* may be called from state engine, holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + if (thi->t_state == None) { + spin_unlock_irqrestore(&thi->t_lock, flags); + if (restart) + drbd_thread_start(thi); + return; + } + + if (thi->t_state != ns) { + if (thi->task == NULL) { + spin_unlock_irqrestore(&thi->t_lock, flags); + return; + } + + thi->t_state = ns; + smp_mb(); + init_completion(&thi->stop); + if (thi->task != current) + force_sig(DRBD_SIGKILL, thi->task); + + } + + spin_unlock_irqrestore(&thi->t_lock, flags); + + if (wait) + wait_for_completion(&thi->stop); +} + +#ifdef CONFIG_SMP +/** + * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs + * @mdev: DRBD device. + * + * Forces all threads of a device onto the same CPU. This is beneficial for + * DRBD's performance. May be overwritten by user's configuration. + */ +void drbd_calc_cpu_mask(struct drbd_conf *mdev) +{ + int ord, cpu; + + /* user override. */ + if (cpumask_weight(mdev->cpu_mask)) + return; + + ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); + for_each_online_cpu(cpu) { + if (ord-- == 0) { + cpumask_set_cpu(cpu, mdev->cpu_mask); + return; + } + } + /* should not be reached */ + cpumask_setall(mdev->cpu_mask); +} + +/** + * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread + * @mdev: DRBD device. + * + * call in the "main loop" of _all_ threads, no need for any mutex, current won't die + * prematurely. + */ +void drbd_thread_current_set_cpu(struct drbd_conf *mdev) +{ + struct task_struct *p = current; + struct drbd_thread *thi = + p == mdev->asender.task ? &mdev->asender : + p == mdev->receiver.task ? &mdev->receiver : + p == mdev->worker.task ? &mdev->worker : + NULL; + ERR_IF(thi == NULL) + return; + if (!thi->reset_cpu_mask) + return; + thi->reset_cpu_mask = 0; + set_cpus_allowed_ptr(p, mdev->cpu_mask); +} +#endif + +/* the appropriate socket mutex must be held already */ +int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, + enum drbd_packets cmd, struct p_header *h, + size_t size, unsigned msg_flags) +{ + int sent, ok; + + ERR_IF(!h) return FALSE; + ERR_IF(!size) return FALSE; + + h->magic = BE_DRBD_MAGIC; + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size-sizeof(struct p_header)); + + sent = drbd_send(mdev, sock, h, size, msg_flags); + + ok = (sent == size); + if (!ok) + dev_err(DEV, "short sent %s size=%d sent=%d\n", + cmdname(cmd), (int)size, sent); + return ok; +} + +/* don't pass the socket. we may only look at it + * when we hold the appropriate socket mutex. + */ +int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, + enum drbd_packets cmd, struct p_header *h, size_t size) +{ + int ok = 0; + struct socket *sock; + + if (use_data_socket) { + mutex_lock(&mdev->data.mutex); + sock = mdev->data.socket; + } else { + mutex_lock(&mdev->meta.mutex); + sock = mdev->meta.socket; + } + + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... */ + if (likely(sock != NULL)) + ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); + + if (use_data_socket) + mutex_unlock(&mdev->data.mutex); + else + mutex_unlock(&mdev->meta.mutex); + return ok; +} + +int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, + size_t size) +{ + struct p_header h; + int ok; + + h.magic = BE_DRBD_MAGIC; + h.command = cpu_to_be16(cmd); + h.length = cpu_to_be16(size); + + if (!drbd_get_data_sock(mdev)) + return 0; + + ok = (sizeof(h) == + drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); + ok = ok && (size == + drbd_send(mdev, mdev->data.socket, data, size, 0)); + + drbd_put_data_sock(mdev); + + return ok; +} + +int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) +{ + struct p_rs_param_89 *p; + struct socket *sock; + int size, rv; + const int apv = mdev->agreed_pro_version; + + size = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + strlen(mdev->sync_conf.verify_alg) + 1 + : /* 89 */ sizeof(struct p_rs_param_89); + + /* used from admin command context and receiver/worker context. + * to avoid kmalloc, grab the socket right here, + * then use the pre-allocated sbuf there */ + mutex_lock(&mdev->data.mutex); + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; + + p = &mdev->data.sbuf.rs_param_89; + + /* initialize verify_alg and csums_alg */ + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + p->rate = cpu_to_be32(sc->rate); + + if (apv >= 88) + strcpy(p->verify_alg, mdev->sync_conf.verify_alg); + if (apv >= 89) + strcpy(p->csums_alg, mdev->sync_conf.csums_alg); + + rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); + } else + rv = 0; /* not ok */ + + mutex_unlock(&mdev->data.mutex); + + return rv; +} + +int drbd_send_protocol(struct drbd_conf *mdev) +{ + struct p_protocol *p; + int size, rv; + + size = sizeof(struct p_protocol); + + if (mdev->agreed_pro_version >= 87) + size += strlen(mdev->net_conf->integrity_alg) + 1; + + /* we must not recurse into our own queue, + * as that is blocked during handshake */ + p = kmalloc(size, GFP_NOIO); + if (p == NULL) + return 0; + + p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); + p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); + p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); + p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); + p->want_lose = cpu_to_be32(mdev->net_conf->want_lose); + p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); + + if (mdev->agreed_pro_version >= 87) + strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); + + rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, + (struct p_header *)p, size); + kfree(p); + return rv; +} + +int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) +{ + struct p_uuids p; + int i; + + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) + return 1; + + for (i = UI_CURRENT; i < UI_SIZE; i++) + p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; + + mdev->comm_bm_set = drbd_bm_total_weight(mdev); + p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); + uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; + uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; + uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; + p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); + + put_ldev(mdev); + + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, + (struct p_header *)&p, sizeof(p)); +} + +int drbd_send_uuids(struct drbd_conf *mdev) +{ + return _drbd_send_uuids(mdev, 0); +} + +int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev) +{ + return _drbd_send_uuids(mdev, 8); +} + + +int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) +{ + struct p_rs_uuid p; + + p.uuid = cpu_to_be64(val); + + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, + (struct p_header *)&p, sizeof(p)); +} + +int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) +{ + struct p_sizes p; + sector_t d_size, u_size; + int q_order_type; + int ok; + + if (get_ldev_if_state(mdev, D_NEGOTIATING)) { + D_ASSERT(mdev->ldev->backing_bdev); + d_size = drbd_get_max_capacity(mdev->ldev); + u_size = mdev->ldev->dc.disk_size; + q_order_type = drbd_queue_order_type(mdev); + p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); + put_ldev(mdev); + } else { + d_size = 0; + u_size = 0; + q_order_type = QUEUE_ORDERED_NONE; + } + + p.d_size = cpu_to_be64(d_size); + p.u_size = cpu_to_be64(u_size); + p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); + p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); + p.queue_order_type = cpu_to_be32(q_order_type); + + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +/** + * drbd_send_state() - Sends the drbd state to the peer + * @mdev: DRBD device. + */ +int drbd_send_state(struct drbd_conf *mdev) +{ + struct socket *sock; + struct p_state p; + int ok = 0; + + /* Grab state lock so we wont send state if we're in the middle + * of a cluster wide state change on another thread */ + drbd_state_lock(mdev); + + mutex_lock(&mdev->data.mutex); + + p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, P_STATE, + (struct p_header *)&p, sizeof(p), 0); + } + + mutex_unlock(&mdev->data.mutex); + + drbd_state_unlock(mdev); + return ok; +} + +int drbd_send_state_req(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val) +{ + struct p_req_state p; + + p.mask = cpu_to_be32(mask.i); + p.val = cpu_to_be32(val.i); + + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, + (struct p_header *)&p, sizeof(p)); +} + +int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) +{ + struct p_req_state_reply p; + + p.retcode = cpu_to_be32(retcode); + + return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, + (struct p_header *)&p, sizeof(p)); +} + +int fill_bitmap_rle_bits(struct drbd_conf *mdev, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c) +{ + struct bitstream bs; + unsigned long plain_bits; + unsigned long tmp; + unsigned long rl; + unsigned len; + unsigned toggle; + int bits; + + /* may we use this feature? */ + if ((mdev->sync_conf.use_rle == 0) || + (mdev->agreed_pro_version < 90)) + return 0; + + if (c->bit_offset >= c->bm_bits) + return 0; /* nothing to do. */ + + /* use at most thus many bytes */ + bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); + memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); + /* plain bits covered in this code string */ + plain_bits = 0; + + /* p->encoding & 0x80 stores whether the first run length is set. + * bit offset is implicit. + * start with toggle == 2 to be able to tell the first iteration */ + toggle = 2; + + /* see how much plain bits we can stuff into one packet + * using RLE and VLI. */ + do { + tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset) + : _drbd_bm_find_next(mdev, c->bit_offset); + if (tmp == -1UL) + tmp = c->bm_bits; + rl = tmp - c->bit_offset; + + if (toggle == 2) { /* first iteration */ + if (rl == 0) { + /* the first checked bit was set, + * store start value, */ + DCBP_set_start(p, 1); + /* but skip encoding of zero run length */ + toggle = !toggle; + continue; + } + DCBP_set_start(p, 0); + } + + /* paranoia: catch zero runlength. + * can only happen if bitmap is modified while we scan it. */ + if (rl == 0) { + dev_err(DEV, "unexpected zero runlength while encoding bitmap " + "t:%u bo:%lu\n", toggle, c->bit_offset); + return -1; + } + + bits = vli_encode_bits(&bs, rl); + if (bits == -ENOBUFS) /* buffer full */ + break; + if (bits <= 0) { + dev_err(DEV, "error while encoding bitmap: %d\n", bits); + return 0; + } + + toggle = !toggle; + plain_bits += rl; + c->bit_offset = tmp; + } while (c->bit_offset < c->bm_bits); + + len = bs.cur.b - p->code + !!bs.cur.bit; + + if (plain_bits < (len << 3)) { + /* incompressible with this method. + * we need to rewind both word and bit position. */ + c->bit_offset -= plain_bits; + bm_xfer_ctx_bit_to_word_offset(c); + c->bit_offset = c->word_offset * BITS_PER_LONG; + return 0; + } + + /* RLE + VLI was able to compress it just fine. + * update c->word_offset. */ + bm_xfer_ctx_bit_to_word_offset(c); + + /* store pad_bits */ + DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); + + return len; +} + +enum { OK, FAILED, DONE } +send_bitmap_rle_or_plain(struct drbd_conf *mdev, + struct p_header *h, struct bm_xfer_ctx *c) +{ + struct p_compressed_bm *p = (void*)h; + unsigned long num_words; + int len; + int ok; + + len = fill_bitmap_rle_bits(mdev, p, c); + + if (len < 0) + return FAILED; + + if (len) { + DCBP_set_code(p, RLE_VLI_Bits); + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, + sizeof(*p) + len, 0); + + c->packets[0]++; + c->bytes[0] += sizeof(*p) + len; + + if (c->bit_offset >= c->bm_bits) + len = 0; /* DONE */ + } else { + /* was not compressible. + * send a buffer full of plain text bits instead. */ + num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); + len = num_words * sizeof(long); + if (len) + drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, + h, sizeof(struct p_header) + len, 0); + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + + c->packets[1]++; + c->bytes[1] += sizeof(struct p_header) + len; + + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + } + ok = ok ? ((len == 0) ? DONE : OK) : FAILED; + + if (ok == DONE) + INFO_bm_xfer_stats(mdev, "send", c); + return ok; +} + +/* See the comment at receive_bitmap() */ +int _drbd_send_bitmap(struct drbd_conf *mdev) +{ + struct bm_xfer_ctx c; + struct p_header *p; + int ret; + + ERR_IF(!mdev->bitmap) return FALSE; + + /* maybe we should use some per thread scratch page, + * and allocate that during initial device creation? */ + p = (struct p_header *) __get_free_page(GFP_NOIO); + if (!p) { + dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); + return FALSE; + } + + if (get_ldev(mdev)) { + if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { + dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n"); + drbd_bm_set_all(mdev); + if (drbd_bm_write(mdev)) { + /* write_bm did fail! Leave full sync flag set in Meta P_DATA + * but otherwise process as per normal - need to tell other + * side that a full resync is required! */ + dev_err(DEV, "Failed to write bitmap to disk!\n"); + } else { + drbd_md_clear_flag(mdev, MDF_FULL_SYNC); + drbd_md_sync(mdev); + } + } + put_ldev(mdev); + } + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(mdev), + .bm_words = drbd_bm_words(mdev), + }; + + do { + ret = send_bitmap_rle_or_plain(mdev, p, &c); + } while (ret == OK); + + free_page((unsigned long) p); + return (ret == DONE); +} + +int drbd_send_bitmap(struct drbd_conf *mdev) +{ + int err; + + if (!drbd_get_data_sock(mdev)) + return -1; + err = !_drbd_send_bitmap(mdev); + drbd_put_data_sock(mdev); + return err; +} + +int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) +{ + int ok; + struct p_barrier_ack p; + + p.barrier = barrier_nr; + p.set_size = cpu_to_be32(set_size); + + if (mdev->state.conn < C_CONNECTED) + return FALSE; + ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +/** + * _drbd_send_ack() - Sends an ack packet + * @mdev: DRBD device. + * @cmd: Packet command code. + * @sector: sector, needs to be in big endian byte order + * @blksize: size in byte, needs to be in big endian byte order + * @block_id: Id, big endian byte order + */ +static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, + u64 sector, + u32 blksize, + u64 block_id) +{ + int ok; + struct p_block_ack p; + + p.sector = sector; + p.block_id = block_id; + p.blksize = blksize; + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); + + if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) + return FALSE; + ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_data *dp) +{ + const int header_size = sizeof(struct p_data) + - sizeof(struct p_header); + int data_size = ((struct p_header *)dp)->length - header_size; + + return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), + dp->block_id); +} + +int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_block_req *rp) +{ + return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); +} + +/** + * drbd_send_ack() - Sends an ack packet + * @mdev: DRBD device. + * @cmd: Packet command code. + * @e: Epoch entry. + */ +int drbd_send_ack(struct drbd_conf *mdev, + enum drbd_packets cmd, struct drbd_epoch_entry *e) +{ + return _drbd_send_ack(mdev, cmd, + cpu_to_be64(e->sector), + cpu_to_be32(e->size), + e->block_id); +} + +/* This function misuses the block_id field to signal if the blocks + * are is sync or not. */ +int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, + sector_t sector, int blksize, u64 block_id) +{ + return _drbd_send_ack(mdev, cmd, + cpu_to_be64(sector), + cpu_to_be32(blksize), + cpu_to_be64(block_id)); +} + +int drbd_send_drequest(struct drbd_conf *mdev, int cmd, + sector_t sector, int size, u64 block_id) +{ + int ok; + struct p_block_req p; + + p.sector = cpu_to_be64(sector); + p.block_id = block_id; + p.blksize = cpu_to_be32(size); + + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +int drbd_send_drequest_csum(struct drbd_conf *mdev, + sector_t sector, int size, + void *digest, int digest_size, + enum drbd_packets cmd) +{ + int ok; + struct p_block_req p; + + p.sector = cpu_to_be64(sector); + p.block_id = BE_DRBD_MAGIC + 0xbeef; + p.blksize = cpu_to_be32(size); + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(cmd); + p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); + + mutex_lock(&mdev->data.mutex); + + ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); + ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); + + mutex_unlock(&mdev->data.mutex); + + return ok; +} + +int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) +{ + int ok; + struct p_block_req p; + + p.sector = cpu_to_be64(sector); + p.block_id = BE_DRBD_MAGIC + 0xbabe; + p.blksize = cpu_to_be32(size); + + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +/* called on sndtimeo + * returns FALSE if we should retry, + * TRUE if we think connection is dead + */ +static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) +{ + int drop_it; + /* long elapsed = (long)(jiffies - mdev->last_received); */ + + drop_it = mdev->meta.socket == sock + || !mdev->asender.task + || get_t_state(&mdev->asender) != Running + || mdev->state.conn < C_CONNECTED; + + if (drop_it) + return TRUE; + + drop_it = !--mdev->ko_count; + if (!drop_it) { + dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, mdev->ko_count); + request_ping(mdev); + } + + return drop_it; /* && (mdev->state == R_PRIMARY) */; +} + +/* The idea of sendpage seems to be to put some kind of reference + * to the page into the skb, and to hand it over to the NIC. In + * this process get_page() gets called. + * + * As soon as the page was really sent over the network put_page() + * gets called by some part of the network layer. [ NIC driver? ] + * + * [ get_page() / put_page() increment/decrement the count. If count + * reaches 0 the page will be freed. ] + * + * This works nicely with pages from FSs. + * But this means that in protocol A we might signal IO completion too early! + * + * In order not to corrupt data during a resync we must make sure + * that we do not reuse our own buffer pages (EEs) to early, therefore + * we have the net_ee list. + * + * XFS seems to have problems, still, it submits pages with page_count == 0! + * As a workaround, we disable sendpage on pages + * with page_count == 0 or PageSlab. + */ +static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, + int offset, size_t size) +{ + int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + if (sent == size) + mdev->send_cnt += size>>9; + return sent == size; +} + +static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, + int offset, size_t size) +{ + mm_segment_t oldfs = get_fs(); + int sent, ok; + int len = size; + + /* e.g. XFS meta- & log-data is in slab pages, which have a + * page_count of 0 and/or have PageSlab() set. + * we cannot use send_page for those, as that does get_page(); + * put_page(); and would cause either a VM_BUG directly, or + * __page_cache_release a page that would actually still be referenced + * by someone, leading to some obscure delayed Oops somewhere else. */ + if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) + return _drbd_no_send_page(mdev, page, offset, size); + + drbd_update_congested(mdev); + set_fs(KERNEL_DS); + do { + sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, + offset, len, + MSG_NOSIGNAL); + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(mdev, + mdev->data.socket)) + break; + else + continue; + } + if (sent <= 0) { + dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", + __func__, (int)size, len, sent); + break; + } + len -= sent; + offset += sent; + } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); + set_fs(oldfs); + clear_bit(NET_CONGESTED, &mdev->flags); + + ok = (len == 0); + if (likely(ok)) + mdev->send_cnt += size>>9; + return ok; +} + +static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) +{ + struct bio_vec *bvec; + int i; + __bio_for_each_segment(bvec, bio, i, 0) { + if (!_drbd_no_send_page(mdev, bvec->bv_page, + bvec->bv_offset, bvec->bv_len)) + return 0; + } + return 1; +} + +static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) +{ + struct bio_vec *bvec; + int i; + __bio_for_each_segment(bvec, bio, i, 0) { + if (!_drbd_send_page(mdev, bvec->bv_page, + bvec->bv_offset, bvec->bv_len)) + return 0; + } + + return 1; +} + +/* Used to send write requests + * R_PRIMARY -> Peer (P_DATA) + */ +int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) +{ + int ok = 1; + struct p_data p; + unsigned int dp_flags = 0; + void *dgb; + int dgs; + + if (!drbd_get_data_sock(mdev)) + return 0; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? + crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(P_DATA); + p.head.length = + cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); + + p.sector = cpu_to_be64(req->sector); + p.block_id = (unsigned long)req; + p.seq_num = cpu_to_be32(req->seq_num = + atomic_add_return(1, &mdev->packet_seq)); + dp_flags = 0; + + /* NOTE: no need to check if barriers supported here as we would + * not pass the test in make_request_common in that case + */ + if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) { + dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); + /* dp_flags |= DP_HARDBARRIER; */ + } + if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO)) + dp_flags |= DP_RW_SYNC; + /* for now handle SYNCIO and UNPLUG + * as if they still were one and the same flag */ + if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG)) + dp_flags |= DP_RW_SYNC; + if (mdev->state.conn >= C_SYNC_SOURCE && + mdev->state.conn <= C_PAUSED_SYNC_T) + dp_flags |= DP_MAY_SET_IN_SYNC; + + p.dp_flags = cpu_to_be32(dp_flags); + set_bit(UNPLUG_REMOTE, &mdev->flags); + ok = (sizeof(p) == + drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); + if (ok && dgs) { + dgb = mdev->int_dig_out; + drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); + } + if (ok) { + if (mdev->net_conf->wire_protocol == DRBD_PROT_A) + ok = _drbd_send_bio(mdev, req->master_bio); + else + ok = _drbd_send_zc_bio(mdev, req->master_bio); + } + + drbd_put_data_sock(mdev); + return ok; +} + +/* answer packet, used to send data back for read requests: + * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) + * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) + */ +int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e) +{ + int ok; + struct p_data p; + void *dgb; + int dgs; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? + crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(cmd); + p.head.length = + cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); + + p.sector = cpu_to_be64(e->sector); + p.block_id = e->block_id; + /* p.seq_num = 0; No sequence numbers here.. */ + + /* Only called by our kernel thread. + * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL + * in response to admin command or module unload. + */ + if (!drbd_get_data_sock(mdev)) + return 0; + + ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, + sizeof(p), MSG_MORE); + if (ok && dgs) { + dgb = mdev->int_dig_out; + drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); + } + if (ok) + ok = _drbd_send_zc_bio(mdev, e->private_bio); + + drbd_put_data_sock(mdev); + return ok; +} + +/* + drbd_send distinguishes two cases: + + Packets sent via the data socket "sock" + and packets sent via the meta data socket "msock" + + sock msock + -----------------+-------------------------+------------------------------ + timeout conf.timeout / 2 conf.timeout / 2 + timeout action send a ping via msock Abort communication + and close all sockets +*/ + +/* + * you must have down()ed the appropriate [m]sock_mutex elsewhere! + */ +int drbd_send(struct drbd_conf *mdev, struct socket *sock, + void *buf, size_t size, unsigned msg_flags) +{ + struct kvec iov; + struct msghdr msg; + int rv, sent = 0; + + if (!sock) + return -1000; + + /* THINK if (signal_pending) return ... ? */ + + iov.iov_base = buf; + iov.iov_len = size; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + + if (sock == mdev->data.socket) { + mdev->ko_count = mdev->net_conf->ko_count; + drbd_update_congested(mdev); + } + do { + /* STRANGE + * tcp_sendmsg does _not_ use its size parameter at all ? + * + * -EAGAIN on timeout, -EINTR on signal. + */ +/* THINK + * do we need to block DRBD_SIG if sock == &meta.socket ?? + * otherwise wake_asender() might interrupt some send_*Ack ! + */ + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); + if (rv == -EAGAIN) { + if (we_should_drop_the_connection(mdev, sock)) + break; + else + continue; + } + D_ASSERT(rv != 0); + if (rv == -EINTR) { + flush_signals(current); + rv = 0; + } + if (rv < 0) + break; + sent += rv; + iov.iov_base += rv; + iov.iov_len -= rv; + } while (sent < size); + + if (sock == mdev->data.socket) + clear_bit(NET_CONGESTED, &mdev->flags); + + if (rv <= 0) { + if (rv != -EAGAIN) { + dev_err(DEV, "%s_sendmsg returned %d\n", + sock == mdev->meta.socket ? "msock" : "sock", + rv); + drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); + } else + drbd_force_state(mdev, NS(conn, C_TIMEOUT)); + } + + return sent; +} + +static int drbd_open(struct block_device *bdev, fmode_t mode) +{ + struct drbd_conf *mdev = bdev->bd_disk->private_data; + unsigned long flags; + int rv = 0; + + spin_lock_irqsave(&mdev->req_lock, flags); + /* to have a stable mdev->state.role + * and no race with updating open_cnt */ + + if (mdev->state.role != R_PRIMARY) { + if (mode & FMODE_WRITE) + rv = -EROFS; + else if (!allow_oos) + rv = -EMEDIUMTYPE; + } + + if (!rv) + mdev->open_cnt++; + spin_unlock_irqrestore(&mdev->req_lock, flags); + + return rv; +} + +static int drbd_release(struct gendisk *gd, fmode_t mode) +{ + struct drbd_conf *mdev = gd->private_data; + mdev->open_cnt--; + return 0; +} + +static void drbd_unplug_fn(struct request_queue *q) +{ + struct drbd_conf *mdev = q->queuedata; + + /* unplug FIRST */ + spin_lock_irq(q->queue_lock); + blk_remove_plug(q); + spin_unlock_irq(q->queue_lock); + + /* only if connected */ + spin_lock_irq(&mdev->req_lock); + if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) { + D_ASSERT(mdev->state.role == R_PRIMARY); + if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) { + /* add to the data.work queue, + * unless already queued. + * XXX this might be a good addition to drbd_queue_work + * anyways, to detect "double queuing" ... */ + if (list_empty(&mdev->unplug_work.list)) + drbd_queue_work(&mdev->data.work, + &mdev->unplug_work); + } + } + spin_unlock_irq(&mdev->req_lock); + + if (mdev->state.disk >= D_INCONSISTENT) + drbd_kick_lo(mdev); +} + +static void drbd_set_defaults(struct drbd_conf *mdev) +{ + mdev->sync_conf.after = DRBD_AFTER_DEF; + mdev->sync_conf.rate = DRBD_RATE_DEF; + mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF; + mdev->state = (union drbd_state) { + { .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = C_STANDALONE, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN, + .susp = 0 + } }; +} + +void drbd_init_set_defaults(struct drbd_conf *mdev) +{ + /* the memset(,0,) did most of this. + * note: only assignments, no allocation in here */ + + drbd_set_defaults(mdev); + + /* for now, we do NOT yet support it, + * even though we start some framework + * to eventually support barriers */ + set_bit(NO_BARRIER_SUPP, &mdev->flags); + + atomic_set(&mdev->ap_bio_cnt, 0); + atomic_set(&mdev->ap_pending_cnt, 0); + atomic_set(&mdev->rs_pending_cnt, 0); + atomic_set(&mdev->unacked_cnt, 0); + atomic_set(&mdev->local_cnt, 0); + atomic_set(&mdev->net_cnt, 0); + atomic_set(&mdev->packet_seq, 0); + atomic_set(&mdev->pp_in_use, 0); + + mutex_init(&mdev->md_io_mutex); + mutex_init(&mdev->data.mutex); + mutex_init(&mdev->meta.mutex); + sema_init(&mdev->data.work.s, 0); + sema_init(&mdev->meta.work.s, 0); + mutex_init(&mdev->state_mutex); + + spin_lock_init(&mdev->data.work.q_lock); + spin_lock_init(&mdev->meta.work.q_lock); + + spin_lock_init(&mdev->al_lock); + spin_lock_init(&mdev->req_lock); + spin_lock_init(&mdev->peer_seq_lock); + spin_lock_init(&mdev->epoch_lock); + + INIT_LIST_HEAD(&mdev->active_ee); + INIT_LIST_HEAD(&mdev->sync_ee); + INIT_LIST_HEAD(&mdev->done_ee); + INIT_LIST_HEAD(&mdev->read_ee); + INIT_LIST_HEAD(&mdev->net_ee); + INIT_LIST_HEAD(&mdev->resync_reads); + INIT_LIST_HEAD(&mdev->data.work.q); + INIT_LIST_HEAD(&mdev->meta.work.q); + INIT_LIST_HEAD(&mdev->resync_work.list); + INIT_LIST_HEAD(&mdev->unplug_work.list); + INIT_LIST_HEAD(&mdev->md_sync_work.list); + INIT_LIST_HEAD(&mdev->bm_io_work.w.list); + mdev->resync_work.cb = w_resync_inactive; + mdev->unplug_work.cb = w_send_write_hint; + mdev->md_sync_work.cb = w_md_sync; + mdev->bm_io_work.w.cb = w_bitmap_io; + init_timer(&mdev->resync_timer); + init_timer(&mdev->md_sync_timer); + mdev->resync_timer.function = resync_timer_fn; + mdev->resync_timer.data = (unsigned long) mdev; + mdev->md_sync_timer.function = md_sync_timer_fn; + mdev->md_sync_timer.data = (unsigned long) mdev; + + init_waitqueue_head(&mdev->misc_wait); + init_waitqueue_head(&mdev->state_wait); + init_waitqueue_head(&mdev->ee_wait); + init_waitqueue_head(&mdev->al_wait); + init_waitqueue_head(&mdev->seq_wait); + + drbd_thread_init(mdev, &mdev->receiver, drbdd_init); + drbd_thread_init(mdev, &mdev->worker, drbd_worker); + drbd_thread_init(mdev, &mdev->asender, drbd_asender); + + mdev->agreed_pro_version = PRO_VERSION_MAX; + mdev->write_ordering = WO_bio_barrier; + mdev->resync_wenr = LC_FREE; +} + +void drbd_mdev_cleanup(struct drbd_conf *mdev) +{ + if (mdev->receiver.t_state != None) + dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", + mdev->receiver.t_state); + + /* no need to lock it, I'm the only thread alive */ + if (atomic_read(&mdev->current_epoch->epoch_size) != 0) + dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); + mdev->al_writ_cnt = + mdev->bm_writ_cnt = + mdev->read_cnt = + mdev->recv_cnt = + mdev->send_cnt = + mdev->writ_cnt = + mdev->p_size = + mdev->rs_start = + mdev->rs_total = + mdev->rs_failed = + mdev->rs_mark_left = + mdev->rs_mark_time = 0; + D_ASSERT(mdev->net_conf == NULL); + + drbd_set_my_capacity(mdev, 0); + if (mdev->bitmap) { + /* maybe never allocated. */ + drbd_bm_resize(mdev, 0); + drbd_bm_cleanup(mdev); + } + + drbd_free_resources(mdev); + + /* + * currently we drbd_init_ee only on module load, so + * we may do drbd_release_ee only on module unload! + */ + D_ASSERT(list_empty(&mdev->active_ee)); + D_ASSERT(list_empty(&mdev->sync_ee)); + D_ASSERT(list_empty(&mdev->done_ee)); + D_ASSERT(list_empty(&mdev->read_ee)); + D_ASSERT(list_empty(&mdev->net_ee)); + D_ASSERT(list_empty(&mdev->resync_reads)); + D_ASSERT(list_empty(&mdev->data.work.q)); + D_ASSERT(list_empty(&mdev->meta.work.q)); + D_ASSERT(list_empty(&mdev->resync_work.list)); + D_ASSERT(list_empty(&mdev->unplug_work.list)); + +} + + +static void drbd_destroy_mempools(void) +{ + struct page *page; + + while (drbd_pp_pool) { + page = drbd_pp_pool; + drbd_pp_pool = (struct page *)page_private(page); + __free_page(page); + drbd_pp_vacant--; + } + + /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + + if (drbd_ee_mempool) + mempool_destroy(drbd_ee_mempool); + if (drbd_request_mempool) + mempool_destroy(drbd_request_mempool); + if (drbd_ee_cache) + kmem_cache_destroy(drbd_ee_cache); + if (drbd_request_cache) + kmem_cache_destroy(drbd_request_cache); + if (drbd_bm_ext_cache) + kmem_cache_destroy(drbd_bm_ext_cache); + if (drbd_al_ext_cache) + kmem_cache_destroy(drbd_al_ext_cache); + + drbd_ee_mempool = NULL; + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + + return; +} + +static int drbd_create_mempools(void) +{ + struct page *page; + const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; + int i; + + /* prepare our caches and mempools */ + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + drbd_pp_pool = NULL; + + /* caches */ + drbd_request_cache = kmem_cache_create( + "drbd_req", sizeof(struct drbd_request), 0, 0, NULL); + if (drbd_request_cache == NULL) + goto Enomem; + + drbd_ee_cache = kmem_cache_create( + "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); + if (drbd_ee_cache == NULL) + goto Enomem; + + drbd_bm_ext_cache = kmem_cache_create( + "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL); + if (drbd_bm_ext_cache == NULL) + goto Enomem; + + drbd_al_ext_cache = kmem_cache_create( + "drbd_al", sizeof(struct lc_element), 0, 0, NULL); + if (drbd_al_ext_cache == NULL) + goto Enomem; + + /* mempools */ + drbd_request_mempool = mempool_create(number, + mempool_alloc_slab, mempool_free_slab, drbd_request_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + drbd_ee_mempool = mempool_create(number, + mempool_alloc_slab, mempool_free_slab, drbd_ee_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + /* drbd's page pool */ + spin_lock_init(&drbd_pp_lock); + + for (i = 0; i < number; i++) { + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto Enomem; + set_page_private(page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + } + drbd_pp_vacant = number; + + return 0; + +Enomem: + drbd_destroy_mempools(); /* in case we allocated some */ + return -ENOMEM; +} + +static int drbd_notify_sys(struct notifier_block *this, unsigned long code, + void *unused) +{ + /* just so we have it. you never know what interesting things we + * might want to do here some day... + */ + + return NOTIFY_DONE; +} + +static struct notifier_block drbd_notifier = { + .notifier_call = drbd_notify_sys, +}; + +static void drbd_release_ee_lists(struct drbd_conf *mdev) +{ + int rr; + + rr = drbd_release_ee(mdev, &mdev->active_ee); + if (rr) + dev_err(DEV, "%d EEs in active list found!\n", rr); + + rr = drbd_release_ee(mdev, &mdev->sync_ee); + if (rr) + dev_err(DEV, "%d EEs in sync list found!\n", rr); + + rr = drbd_release_ee(mdev, &mdev->read_ee); + if (rr) + dev_err(DEV, "%d EEs in read list found!\n", rr); + + rr = drbd_release_ee(mdev, &mdev->done_ee); + if (rr) + dev_err(DEV, "%d EEs in done list found!\n", rr); + + rr = drbd_release_ee(mdev, &mdev->net_ee); + if (rr) + dev_err(DEV, "%d EEs in net list found!\n", rr); +} + +/* caution. no locking. + * currently only used from module cleanup code. */ +static void drbd_delete_device(unsigned int minor) +{ + struct drbd_conf *mdev = minor_to_mdev(minor); + + if (!mdev) + return; + + /* paranoia asserts */ + if (mdev->open_cnt != 0) + dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, + __FILE__ , __LINE__); + + ERR_IF (!list_empty(&mdev->data.work.q)) { + struct list_head *lp; + list_for_each(lp, &mdev->data.work.q) { + dev_err(DEV, "lp = %p\n", lp); + } + }; + /* end paranoia asserts */ + + del_gendisk(mdev->vdisk); + + /* cleanup stuff that may have been allocated during + * device (re-)configuration or state changes */ + + if (mdev->this_bdev) + bdput(mdev->this_bdev); + + drbd_free_resources(mdev); + + drbd_release_ee_lists(mdev); + + /* should be free'd on disconnect? */ + kfree(mdev->ee_hash); + /* + mdev->ee_hash_s = 0; + mdev->ee_hash = NULL; + */ + + lc_destroy(mdev->act_log); + lc_destroy(mdev->resync); + + kfree(mdev->p_uuid); + /* mdev->p_uuid = NULL; */ + + kfree(mdev->int_dig_out); + kfree(mdev->int_dig_in); + kfree(mdev->int_dig_vv); + + /* cleanup the rest that has been + * allocated from drbd_new_device + * and actually free the mdev itself */ + drbd_free_mdev(mdev); +} + +static void drbd_cleanup(void) +{ + unsigned int i; + + unregister_reboot_notifier(&drbd_notifier); + + drbd_nl_cleanup(); + + if (minor_table) { + if (drbd_proc) + remove_proc_entry("drbd", NULL); + i = minor_count; + while (i--) + drbd_delete_device(i); + drbd_destroy_mempools(); + } + + kfree(minor_table); + + unregister_blkdev(DRBD_MAJOR, "drbd"); + + printk(KERN_INFO "drbd: module cleanup done.\n"); +} + +/** + * drbd_congested() - Callback for pdflush + * @congested_data: User data + * @bdi_bits: Bits pdflush is currently interested in + * + * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. + */ +static int drbd_congested(void *congested_data, int bdi_bits) +{ + struct drbd_conf *mdev = congested_data; + struct request_queue *q; + char reason = '-'; + int r = 0; + + if (!__inc_ap_bio_cond(mdev)) { + /* DRBD has frozen IO */ + r = bdi_bits; + reason = 'd'; + goto out; + } + + if (get_ldev(mdev)) { + q = bdev_get_queue(mdev->ldev->backing_bdev); + r = bdi_congested(&q->backing_dev_info, bdi_bits); + put_ldev(mdev); + if (r) + reason = 'b'; + } + + if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { + r |= (1 << BDI_async_congested); + reason = reason == 'b' ? 'a' : 'n'; + } + +out: + mdev->congestion_reason = reason; + return r; +} + +struct drbd_conf *drbd_new_device(unsigned int minor) +{ + struct drbd_conf *mdev; + struct gendisk *disk; + struct request_queue *q; + + /* GFP_KERNEL, we are outside of all write-out paths */ + mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); + if (!mdev) + return NULL; + if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) + goto out_no_cpumask; + + mdev->minor = minor; + + drbd_init_set_defaults(mdev); + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + goto out_no_q; + mdev->rq_queue = q; + q->queuedata = mdev; + blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); + + disk = alloc_disk(1); + if (!disk) + goto out_no_disk; + mdev->vdisk = disk; + + set_disk_ro(disk, TRUE); + + disk->queue = q; + disk->major = DRBD_MAJOR; + disk->first_minor = minor; + disk->fops = &drbd_ops; + sprintf(disk->disk_name, "drbd%d", minor); + disk->private_data = mdev; + + mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor)); + /* we have no partitions. we contain only ourselves. */ + mdev->this_bdev->bd_contains = mdev->this_bdev; + + q->backing_dev_info.congested_fn = drbd_congested; + q->backing_dev_info.congested_data = mdev; + + blk_queue_make_request(q, drbd_make_request_26); + blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); + blk_queue_merge_bvec(q, drbd_merge_bvec); + q->queue_lock = &mdev->req_lock; /* needed since we use */ + /* plugging on a queue, that actually has no requests! */ + q->unplug_fn = drbd_unplug_fn; + + mdev->md_io_page = alloc_page(GFP_KERNEL); + if (!mdev->md_io_page) + goto out_no_io_page; + + if (drbd_bm_init(mdev)) + goto out_no_bitmap; + /* no need to lock access, we are still initializing this minor device. */ + if (!tl_init(mdev)) + goto out_no_tl; + + mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); + if (!mdev->app_reads_hash) + goto out_no_app_reads; + + mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); + if (!mdev->current_epoch) + goto out_no_epoch; + + INIT_LIST_HEAD(&mdev->current_epoch->list); + mdev->epochs = 1; + + return mdev; + +/* out_whatever_else: + kfree(mdev->current_epoch); */ +out_no_epoch: + kfree(mdev->app_reads_hash); +out_no_app_reads: + tl_cleanup(mdev); +out_no_tl: + drbd_bm_cleanup(mdev); +out_no_bitmap: + __free_page(mdev->md_io_page); +out_no_io_page: + put_disk(disk); +out_no_disk: + blk_cleanup_queue(q); +out_no_q: + free_cpumask_var(mdev->cpu_mask); +out_no_cpumask: + kfree(mdev); + return NULL; +} + +/* counterpart of drbd_new_device. + * last part of drbd_delete_device. */ +void drbd_free_mdev(struct drbd_conf *mdev) +{ + kfree(mdev->current_epoch); + kfree(mdev->app_reads_hash); + tl_cleanup(mdev); + if (mdev->bitmap) /* should no longer be there. */ + drbd_bm_cleanup(mdev); + __free_page(mdev->md_io_page); + put_disk(mdev->vdisk); + blk_cleanup_queue(mdev->rq_queue); + free_cpumask_var(mdev->cpu_mask); + kfree(mdev); +} + + +int __init drbd_init(void) +{ + int err; + + if (sizeof(struct p_handshake) != 80) { + printk(KERN_ERR + "drbd: never change the size or layout " + "of the HandShake packet.\n"); + return -EINVAL; + } + + if (1 > minor_count || minor_count > 255) { + printk(KERN_ERR + "drbd: invalid minor_count (%d)\n", minor_count); +#ifdef MODULE + return -EINVAL; +#else + minor_count = 8; +#endif + } + + err = drbd_nl_init(); + if (err) + return err; + + err = register_blkdev(DRBD_MAJOR, "drbd"); + if (err) { + printk(KERN_ERR + "drbd: unable to register block device major %d\n", + DRBD_MAJOR); + return err; + } + + register_reboot_notifier(&drbd_notifier); + + /* + * allocate all necessary structs + */ + err = -ENOMEM; + + init_waitqueue_head(&drbd_pp_wait); + + drbd_proc = NULL; /* play safe for drbd_cleanup */ + minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, + GFP_KERNEL); + if (!minor_table) + goto Enomem; + + err = drbd_create_mempools(); + if (err) + goto Enomem; + + drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); + if (!drbd_proc) { + printk(KERN_ERR "drbd: unable to register proc file\n"); + goto Enomem; + } + + rwlock_init(&global_state_lock); + + printk(KERN_INFO "drbd: initialized. " + "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); + printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); + printk(KERN_INFO "drbd: registered as block device major %d\n", + DRBD_MAJOR); + printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); + + return 0; /* Success! */ + +Enomem: + drbd_cleanup(); + if (err == -ENOMEM) + /* currently always the case */ + printk(KERN_ERR "drbd: ran out of memory\n"); + else + printk(KERN_ERR "drbd: initialization failure\n"); + return err; +} + +void drbd_free_bc(struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + bd_release(ldev->backing_bdev); + bd_release(ldev->md_bdev); + + fput(ldev->lo_file); + fput(ldev->md_file); + + kfree(ldev); +} + +void drbd_free_sock(struct drbd_conf *mdev) +{ + if (mdev->data.socket) { + kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); + sock_release(mdev->data.socket); + mdev->data.socket = NULL; + } + if (mdev->meta.socket) { + kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); + sock_release(mdev->meta.socket); + mdev->meta.socket = NULL; + } +} + + +void drbd_free_resources(struct drbd_conf *mdev) +{ + crypto_free_hash(mdev->csums_tfm); + mdev->csums_tfm = NULL; + crypto_free_hash(mdev->verify_tfm); + mdev->verify_tfm = NULL; + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + crypto_free_hash(mdev->integrity_w_tfm); + mdev->integrity_w_tfm = NULL; + crypto_free_hash(mdev->integrity_r_tfm); + mdev->integrity_r_tfm = NULL; + + drbd_free_sock(mdev); + + __no_warn(local, + drbd_free_bc(mdev->ldev); + mdev->ldev = NULL;); +} + +/* meta data management */ + +struct meta_data_on_disk { + u64 la_size; /* last agreed size. */ + u64 uuid[UI_SIZE]; /* UUIDs. */ + u64 device_uuid; + u64 reserved_u64_1; + u32 flags; /* MDF */ + u32 magic; + u32 md_size_sect; + u32 al_offset; /* offset to this block */ + u32 al_nr_extents; /* important for restoring the AL */ + /* `-- act_log->nr_elements <-- sync_conf.al_extents */ + u32 bm_offset; /* offset to the bitmap, from here */ + u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ + u32 reserved_u32[4]; + +} __packed; + +/** + * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set + * @mdev: DRBD device. + */ +void drbd_md_sync(struct drbd_conf *mdev) +{ + struct meta_data_on_disk *buffer; + sector_t sector; + int i; + + if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) + return; + del_timer(&mdev->md_sync_timer); + + /* We use here D_FAILED and not D_ATTACHING because we try to write + * metadata even if we detach due to a disk failure! */ + if (!get_ldev_if_state(mdev, D_FAILED)) + return; + + mutex_lock(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + memset(buffer, 0, 512); + + buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); + for (i = UI_CURRENT; i < UI_SIZE; i++) + buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); + buffer->flags = cpu_to_be32(mdev->ldev->md.flags); + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); + + buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); + buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); + buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); + buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); + buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); + + buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); + + D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); + sector = mdev->ldev->md.md_offset; + + if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { + clear_bit(MD_DIRTY, &mdev->flags); + } else { + /* this was a try anyways ... */ + dev_err(DEV, "meta data update failed!\n"); + + drbd_chk_io_error(mdev, 1, TRUE); + } + + /* Update mdev->ldev->md.la_size_sect, + * since we updated it on metadata. */ + mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); + + mutex_unlock(&mdev->md_io_mutex); + put_ldev(mdev); +} + +/** + * drbd_md_read() - Reads in the meta data super block + * @mdev: DRBD device. + * @bdev: Device from which the meta data should be read in. + * + * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case + * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. + */ +int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +{ + struct meta_data_on_disk *buffer; + int i, rv = NO_ERROR; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) + return ERR_IO_MD_DISK; + + mutex_lock(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + + if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { + /* NOTE: cant do normal error processing here as this is + called BEFORE disk is attached */ + dev_err(DEV, "Error while reading metadata.\n"); + rv = ERR_IO_MD_DISK; + goto err; + } + + if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { + dev_err(DEV, "Error while reading metadata, magic not found.\n"); + rv = ERR_MD_INVALID; + goto err; + } + if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { + dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", + be32_to_cpu(buffer->al_offset), bdev->md.al_offset); + rv = ERR_MD_INVALID; + goto err; + } + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { + dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", + be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); + rv = ERR_MD_INVALID; + goto err; + } + if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { + dev_err(DEV, "unexpected md_size: %u (expected %u)\n", + be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); + rv = ERR_MD_INVALID; + goto err; + } + + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { + dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); + rv = ERR_MD_INVALID; + goto err; + } + + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); + for (i = UI_CURRENT; i < UI_SIZE; i++) + bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); + bdev->md.flags = be32_to_cpu(buffer->flags); + mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + + if (mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + + err: + mutex_unlock(&mdev->md_io_mutex); + put_ldev(mdev); + + return rv; +} + +/** + * drbd_md_mark_dirty() - Mark meta data super block as dirty + * @mdev: DRBD device. + * + * Call this function if you change anything that should be written to + * the meta-data super block. This function sets MD_DIRTY, and starts a + * timer that ensures that within five seconds you have to call drbd_md_sync(). + */ +void drbd_md_mark_dirty(struct drbd_conf *mdev) +{ + set_bit(MD_DIRTY, &mdev->flags); + mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); +} + + +static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) +{ + int i; + + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) + mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; +} + +void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) +{ + if (idx == UI_CURRENT) { + if (mdev->state.role == R_PRIMARY) + val |= 1; + else + val &= ~((u64)1); + + drbd_set_ed_uuid(mdev, val); + } + + mdev->ldev->md.uuid[idx] = val; + drbd_md_mark_dirty(mdev); +} + + +void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) +{ + if (mdev->ldev->md.uuid[idx]) { + drbd_uuid_move_history(mdev); + mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; + } + _drbd_uuid_set(mdev, idx, val); +} + +/** + * drbd_uuid_new_current() - Creates a new current UUID + * @mdev: DRBD device. + * + * Creates a new current UUID, and rotates the old current UUID into + * the bitmap slot. Causes an incremental resync upon next connect. + */ +void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) +{ + u64 val; + + dev_info(DEV, "Creating new current UUID\n"); + D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); + mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; + + get_random_bytes(&val, sizeof(u64)); + _drbd_uuid_set(mdev, UI_CURRENT, val); +} + +void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) +{ + if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) + return; + + if (val == 0) { + drbd_uuid_move_history(mdev); + mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; + mdev->ldev->md.uuid[UI_BITMAP] = 0; + } else { + if (mdev->ldev->md.uuid[UI_BITMAP]) + dev_warn(DEV, "bm UUID already set"); + + mdev->ldev->md.uuid[UI_BITMAP] = val; + mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); + + } + drbd_md_mark_dirty(mdev); +} + +/** + * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @mdev: DRBD device. + * + * Sets all bits in the bitmap and writes the whole bitmap to stable storage. + */ +int drbd_bmio_set_n_write(struct drbd_conf *mdev) +{ + int rv = -EIO; + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + drbd_md_set_flag(mdev, MDF_FULL_SYNC); + drbd_md_sync(mdev); + drbd_bm_set_all(mdev); + + rv = drbd_bm_write(mdev); + + if (!rv) { + drbd_md_clear_flag(mdev, MDF_FULL_SYNC); + drbd_md_sync(mdev); + } + + put_ldev(mdev); + } + + return rv; +} + +/** + * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @mdev: DRBD device. + * + * Clears all bits in the bitmap and writes the whole bitmap to stable storage. + */ +int drbd_bmio_clear_n_write(struct drbd_conf *mdev) +{ + int rv = -EIO; + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + drbd_bm_clear_all(mdev); + rv = drbd_bm_write(mdev); + put_ldev(mdev); + } + + return rv; +} + +static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct bm_io_work *work = container_of(w, struct bm_io_work, w); + int rv; + + D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); + + drbd_bm_lock(mdev, work->why); + rv = work->io_fn(mdev); + drbd_bm_unlock(mdev); + + clear_bit(BITMAP_IO, &mdev->flags); + wake_up(&mdev->misc_wait); + + if (work->done) + work->done(mdev, rv); + + clear_bit(BITMAP_IO_QUEUED, &mdev->flags); + work->why = NULL; + + return 1; +} + +/** + * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap + * @mdev: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @done: callback to be called after the bitmap IO was performed + * @why: Descriptive text of the reason for doing the IO + * + * While IO on the bitmap happens we freeze application IO thus we ensure + * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be + * called from worker context. It MUST NOT be used while a previous such + * work is still pending! + */ +void drbd_queue_bitmap_io(struct drbd_conf *mdev, + int (*io_fn)(struct drbd_conf *), + void (*done)(struct drbd_conf *, int), + char *why) +{ + D_ASSERT(current == mdev->worker.task); + + D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); + D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); + D_ASSERT(list_empty(&mdev->bm_io_work.w.list)); + if (mdev->bm_io_work.why) + dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n", + why, mdev->bm_io_work.why); + + mdev->bm_io_work.io_fn = io_fn; + mdev->bm_io_work.done = done; + mdev->bm_io_work.why = why; + + set_bit(BITMAP_IO, &mdev->flags); + if (atomic_read(&mdev->ap_bio_cnt) == 0) { + if (list_empty(&mdev->bm_io_work.w.list)) { + set_bit(BITMAP_IO_QUEUED, &mdev->flags); + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + } else + dev_err(DEV, "FIXME avoided double queuing bm_io_work\n"); + } +} + +/** + * drbd_bitmap_io() - Does an IO operation on the whole bitmap + * @mdev: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @why: Descriptive text of the reason for doing the IO + * + * freezes application IO while that the actual IO operations runs. This + * functions MAY NOT be called from worker context. + */ +int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why) +{ + int rv; + + D_ASSERT(current != mdev->worker.task); + + drbd_suspend_io(mdev); + + drbd_bm_lock(mdev, why); + rv = io_fn(mdev); + drbd_bm_unlock(mdev); + + drbd_resume_io(mdev); + + return rv; +} + +void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local) +{ + if ((mdev->ldev->md.flags & flag) != flag) { + drbd_md_mark_dirty(mdev); + mdev->ldev->md.flags |= flag; + } +} + +void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local) +{ + if ((mdev->ldev->md.flags & flag) != 0) { + drbd_md_mark_dirty(mdev); + mdev->ldev->md.flags &= ~flag; + } +} +int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) +{ + return (bdev->md.flags & flag) != 0; +} + +static void md_sync_timer_fn(unsigned long data) +{ + struct drbd_conf *mdev = (struct drbd_conf *) data; + + drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); +} + +static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); + drbd_md_sync(mdev); + + return 1; +} + +#ifdef CONFIG_DRBD_FAULT_INJECTION +/* Fault insertion support including random number generator shamelessly + * stolen from kernel/rcutorture.c */ +struct fault_random_state { + unsigned long state; + unsigned long count; +}; + +#define FAULT_RANDOM_MULT 39916801 /* prime */ +#define FAULT_RANDOM_ADD 479001701 /* prime */ +#define FAULT_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). + */ +static unsigned long +_drbd_fault_random(struct fault_random_state *rsp) +{ + long refresh; + + if (--rsp->count < 0) { + get_random_bytes(&refresh, sizeof(refresh)); + rsp->state += refresh; + rsp->count = FAULT_RANDOM_REFRESH; + } + rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; + return swahw32(rsp->state); +} + +static char * +_drbd_fault_str(unsigned int type) { + static char *_faults[] = { + [DRBD_FAULT_MD_WR] = "Meta-data write", + [DRBD_FAULT_MD_RD] = "Meta-data read", + [DRBD_FAULT_RS_WR] = "Resync write", + [DRBD_FAULT_RS_RD] = "Resync read", + [DRBD_FAULT_DT_WR] = "Data write", + [DRBD_FAULT_DT_RD] = "Data read", + [DRBD_FAULT_DT_RA] = "Data read ahead", + [DRBD_FAULT_BM_ALLOC] = "BM allocation", + [DRBD_FAULT_AL_EE] = "EE allocation" + }; + + return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; +} + +unsigned int +_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) +{ + static struct fault_random_state rrs = {0, 0}; + + unsigned int ret = ( + (fault_devs == 0 || + ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) && + (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); + + if (ret) { + fault_count++; + + if (printk_ratelimit()) + dev_warn(DEV, "***Simulating %s failure\n", + _drbd_fault_str(type)); + } + + return ret; +} +#endif + +const char *drbd_buildtag(void) +{ + /* DRBD built from external sources has here a reference to the + git hash of the source code. */ + + static char buildtag[38] = "\0uilt-in"; + + if (buildtag[0] == 0) { +#ifdef CONFIG_MODULES + if (THIS_MODULE != NULL) + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); + else +#endif + buildtag[0] = 'b'; + } + + return buildtag; +} + +module_init(drbd_init) +module_exit(drbd_cleanup) + +EXPORT_SYMBOL(drbd_conn_str); +EXPORT_SYMBOL(drbd_role_str); +EXPORT_SYMBOL(drbd_disk_str); +EXPORT_SYMBOL(drbd_set_st_err_str); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c new file mode 100644 index 0000000..436a090 --- /dev/null +++ b/drivers/block/drbd/drbd_nl.c @@ -0,0 +1,2364 @@ +/* + drbd_nl.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/module.h> +#include <linux/drbd.h> +#include <linux/in.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/connector.h> +#include <linux/blkpg.h> +#include <linux/cpumask.h> +#include "drbd_int.h" +#include "drbd_wrappers.h" +#include <asm/unaligned.h> +#include <linux/drbd_tag_magic.h> +#include <linux/drbd_limits.h> + +static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); +static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); +static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); + +/* see get_sb_bdev and bd_claim */ +static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; + +/* Generate the tag_list to struct functions */ +#define NL_PACKET(name, number, fields) \ +static int name ## _from_tags(struct drbd_conf *mdev, \ + unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ +static int name ## _from_tags(struct drbd_conf *mdev, \ + unsigned short *tags, struct name *arg) \ +{ \ + int tag; \ + int dlen; \ + \ + while ((tag = get_unaligned(tags++)) != TT_END) { \ + dlen = get_unaligned(tags++); \ + switch (tag_number(tag)) { \ + fields \ + default: \ + if (tag & T_MANDATORY) { \ + dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ + return 0; \ + } \ + } \ + tags = (unsigned short *)((char *)tags + dlen); \ + } \ + return 1; \ +} +#define NL_INTEGER(pn, pr, member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ + arg->member = get_unaligned((int *)(tags)); \ + break; +#define NL_INT64(pn, pr, member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ + arg->member = get_unaligned((u64 *)(tags)); \ + break; +#define NL_BIT(pn, pr, member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ + arg->member = *(char *)(tags) ? 1 : 0; \ + break; +#define NL_STRING(pn, pr, member, len) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ + if (dlen > len) { \ + dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ + #member, dlen, (unsigned int)len); \ + return 0; \ + } \ + arg->member ## _len = dlen; \ + memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ + break; +#include "linux/drbd_nl.h" + +/* Generate the struct to tag_list functions */ +#define NL_PACKET(name, number, fields) \ +static unsigned short* \ +name ## _to_tags(struct drbd_conf *mdev, \ + struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ +static unsigned short* \ +name ## _to_tags(struct drbd_conf *mdev, \ + struct name *arg, unsigned short *tags) \ +{ \ + fields \ + return tags; \ +} + +#define NL_INTEGER(pn, pr, member) \ + put_unaligned(pn | pr | TT_INTEGER, tags++); \ + put_unaligned(sizeof(int), tags++); \ + put_unaligned(arg->member, (int *)tags); \ + tags = (unsigned short *)((char *)tags+sizeof(int)); +#define NL_INT64(pn, pr, member) \ + put_unaligned(pn | pr | TT_INT64, tags++); \ + put_unaligned(sizeof(u64), tags++); \ + put_unaligned(arg->member, (u64 *)tags); \ + tags = (unsigned short *)((char *)tags+sizeof(u64)); +#define NL_BIT(pn, pr, member) \ + put_unaligned(pn | pr | TT_BIT, tags++); \ + put_unaligned(sizeof(char), tags++); \ + *(char *)tags = arg->member; \ + tags = (unsigned short *)((char *)tags+sizeof(char)); +#define NL_STRING(pn, pr, member, len) \ + put_unaligned(pn | pr | TT_STRING, tags++); \ + put_unaligned(arg->member ## _len, tags++); \ + memcpy(tags, arg->member, arg->member ## _len); \ + tags = (unsigned short *)((char *)tags + arg->member ## _len); +#include "linux/drbd_nl.h" + +void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); +void drbd_nl_send_reply(struct cn_msg *, int); + +int drbd_khelper(struct drbd_conf *mdev, char *cmd) +{ + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL, /* Will be set to address family */ + NULL, /* Will be set to address */ + NULL }; + + char mb[12], af[20], ad[60], *afs; + char *argv[] = {usermode_helper, cmd, mb, NULL }; + int ret; + + snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); + + if (get_net_conf(mdev)) { + switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { + case AF_INET6: + afs = "ipv6"; + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", + &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); + break; + case AF_INET: + afs = "ipv4"; + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); + break; + default: + afs = "ssocks"; + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); + } + snprintf(af, 20, "DRBD_PEER_AF=%s", afs); + envp[3]=af; + envp[4]=ad; + put_net_conf(mdev); + } + + dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); + + drbd_bcast_ev_helper(mdev, cmd); + ret = call_usermodehelper(usermode_helper, argv, envp, 1); + if (ret) + dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + else + dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + + if (ret < 0) /* Ignore any ERRNOs we got. */ + ret = 0; + + return ret; +} + +enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) +{ + char *ex_to_string; + int r; + enum drbd_disk_state nps; + enum drbd_fencing_p fp; + + D_ASSERT(mdev->state.pdsk == D_UNKNOWN); + + if (get_ldev_if_state(mdev, D_CONSISTENT)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } else { + dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); + return mdev->state.pdsk; + } + + if (fp == FP_STONITH) + _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE); + + r = drbd_khelper(mdev, "fence-peer"); + + switch ((r>>8) & 0xff) { + case 3: /* peer is inconsistent */ + ex_to_string = "peer is inconsistent or worse"; + nps = D_INCONSISTENT; + break; + case 4: /* peer got outdated, or was already outdated */ + ex_to_string = "peer was fenced"; + nps = D_OUTDATED; + break; + case 5: /* peer was down */ + if (mdev->state.disk == D_UP_TO_DATE) { + /* we will(have) create(d) a new UUID anyways... */ + ex_to_string = "peer is unreachable, assumed to be dead"; + nps = D_OUTDATED; + } else { + ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; + nps = mdev->state.pdsk; + } + break; + case 6: /* Peer is primary, voluntarily outdate myself. + * This is useful when an unconnected R_SECONDARY is asked to + * become R_PRIMARY, but finds the other peer being active. */ + ex_to_string = "peer is active"; + dev_warn(DEV, "Peer is primary, outdating myself.\n"); + nps = D_UNKNOWN; + _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); + break; + case 7: + if (fp != FP_STONITH) + dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); + ex_to_string = "peer was stonithed"; + nps = D_OUTDATED; + break; + default: + /* The script is broken ... */ + nps = D_UNKNOWN; + dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); + return nps; + } + + dev_info(DEV, "fence-peer helper returned %d (%s)\n", + (r>>8) & 0xff, ex_to_string); + return nps; +} + + +int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) +{ + const int max_tries = 4; + int r = 0; + int try = 0; + int forced = 0; + union drbd_state mask, val; + enum drbd_disk_state nps; + + if (new_role == R_PRIMARY) + request_ping(mdev); /* Detect a dead peer ASAP */ + + mutex_lock(&mdev->state_mutex); + + mask.i = 0; mask.role = R_MASK; + val.i = 0; val.role = new_role; + + while (try++ < max_tries) { + r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); + + /* in case we first succeeded to outdate, + * but now suddenly could establish a connection */ + if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { + val.pdsk = 0; + mask.pdsk = 0; + continue; + } + + if (r == SS_NO_UP_TO_DATE_DISK && force && + (mdev->state.disk == D_INCONSISTENT || + mdev->state.disk == D_OUTDATED)) { + mask.disk = D_MASK; + val.disk = D_UP_TO_DATE; + forced = 1; + continue; + } + + if (r == SS_NO_UP_TO_DATE_DISK && + mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { + D_ASSERT(mdev->state.pdsk == D_UNKNOWN); + nps = drbd_try_outdate_peer(mdev); + + if (nps == D_OUTDATED || nps == D_INCONSISTENT) { + val.disk = D_UP_TO_DATE; + mask.disk = D_MASK; + } + + val.pdsk = nps; + mask.pdsk = D_MASK; + + continue; + } + + if (r == SS_NOTHING_TO_DO) + goto fail; + if (r == SS_PRIMARY_NOP && mask.pdsk == 0) { + nps = drbd_try_outdate_peer(mdev); + + if (force && nps > D_OUTDATED) { + dev_warn(DEV, "Forced into split brain situation!\n"); + nps = D_OUTDATED; + } + + mask.pdsk = D_MASK; + val.pdsk = nps; + + continue; + } + if (r == SS_TWO_PRIMARIES) { + /* Maybe the peer is detected as dead very soon... + retry at most once more in this case. */ + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); + if (try < max_tries) + try = max_tries - 1; + continue; + } + if (r < SS_SUCCESS) { + r = _drbd_request_state(mdev, mask, val, + CS_VERBOSE + CS_WAIT_COMPLETE); + if (r < SS_SUCCESS) + goto fail; + } + break; + } + + if (r < SS_SUCCESS) + goto fail; + + if (forced) + dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); + + /* Wait until nothing is on the fly :) */ + wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); + + if (new_role == R_SECONDARY) { + set_disk_ro(mdev->vdisk, TRUE); + if (get_ldev(mdev)) { + mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + put_ldev(mdev); + } + } else { + if (get_net_conf(mdev)) { + mdev->net_conf->want_lose = 0; + put_net_conf(mdev); + } + set_disk_ro(mdev->vdisk, FALSE); + if (get_ldev(mdev)) { + if (((mdev->state.conn < C_CONNECTED || + mdev->state.pdsk <= D_FAILED) + && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced) + drbd_uuid_new_current(mdev); + + mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; + put_ldev(mdev); + } + } + + if ((new_role == R_SECONDARY) && get_ldev(mdev)) { + drbd_al_to_on_disk_bm(mdev); + put_ldev(mdev); + } + + if (mdev->state.conn >= C_WF_REPORT_PARAMS) { + /* if this was forced, we should consider sync */ + if (forced) + drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + + drbd_md_sync(mdev); + + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); + fail: + mutex_unlock(&mdev->state_mutex); + return r; +} + + +static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + struct primary primary_args; + + memset(&primary_args, 0, sizeof(struct primary)); + if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { + reply->ret_code = ERR_MANDATORY_TAG; + return 0; + } + + reply->ret_code = + drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer); + + return 0; +} + +static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); + + return 0; +} + +/* initializes the md.*_offset members, so we are able to find + * the on disk meta data */ +static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev) +{ + sector_t md_size_sect = 0; + switch (bdev->dc.meta_dev_idx) { + default: + /* v07 style fixed size indexed meta data */ + bdev->md.md_size_sect = MD_RESERVED_SECT; + bdev->md.md_offset = drbd_md_ss__(mdev, bdev); + bdev->md.al_offset = MD_AL_OFFSET; + bdev->md.bm_offset = MD_BM_OFFSET; + break; + case DRBD_MD_INDEX_FLEX_EXT: + /* just occupy the full device; unit: sectors */ + bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); + bdev->md.md_offset = 0; + bdev->md.al_offset = MD_AL_OFFSET; + bdev->md.bm_offset = MD_BM_OFFSET; + break; + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + bdev->md.md_offset = drbd_md_ss__(mdev, bdev); + /* al size is still fixed */ + bdev->md.al_offset = -MD_AL_MAX_SIZE; + /* we need (slightly less than) ~ this much bitmap sectors: */ + md_size_sect = drbd_get_capacity(bdev->backing_bdev); + md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); + md_size_sect = BM_SECT_TO_EXT(md_size_sect); + md_size_sect = ALIGN(md_size_sect, 8); + + /* plus the "drbd meta data super block", + * and the activity log; */ + md_size_sect += MD_BM_OFFSET; + + bdev->md.md_size_sect = md_size_sect; + /* bitmap offset is adjusted by 'super' block size */ + bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; + break; + } +} + +char *ppsize(char *buf, unsigned long long size) +{ + /* Needs 9 bytes at max. */ + static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; + int base = 0; + while (size >= 10000) { + /* shift + round */ + size = (size >> 10) + !!(size & (1<<9)); + base++; + } + sprintf(buf, "%lu %cB", (long)size, units[base]); + + return buf; +} + +/* there is still a theoretical deadlock when called from receiver + * on an D_INCONSISTENT R_PRIMARY: + * remote READ does inc_ap_bio, receiver would need to receive answer + * packet from remote to dec_ap_bio again. + * receiver receive_sizes(), comes here, + * waits for ap_bio_cnt == 0. -> deadlock. + * but this cannot happen, actually, because: + * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable + * (not connected, or bad/no disk on peer): + * see drbd_fail_request_early, ap_bio_cnt is zero. + * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: + * peer may not initiate a resize. + */ +void drbd_suspend_io(struct drbd_conf *mdev) +{ + set_bit(SUSPEND_IO, &mdev->flags); + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); +} + +void drbd_resume_io(struct drbd_conf *mdev) +{ + clear_bit(SUSPEND_IO, &mdev->flags); + wake_up(&mdev->misc_wait); +} + +/** + * drbd_determine_dev_size() - Sets the right device size obeying all constraints + * @mdev: DRBD device. + * + * Returns 0 on success, negative return values indicate errors. + * You should call drbd_md_sync() after calling this function. + */ +enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local) +{ + sector_t prev_first_sect, prev_size; /* previous meta location */ + sector_t la_size; + sector_t size; + char ppb[10]; + + int md_moved, la_size_changed; + enum determine_dev_size rv = unchanged; + + /* race: + * application request passes inc_ap_bio, + * but then cannot get an AL-reference. + * this function later may wait on ap_bio_cnt == 0. -> deadlock. + * + * to avoid that: + * Suspend IO right here. + * still lock the act_log to not trigger ASSERTs there. + */ + drbd_suspend_io(mdev); + + /* no wait necessary anymore, actually we could assert that */ + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + prev_first_sect = drbd_md_first_sector(mdev->ldev); + prev_size = mdev->ldev->md.md_size_sect; + la_size = mdev->ldev->md.la_size_sect; + + /* TODO: should only be some assert here, not (re)init... */ + drbd_md_set_sector_offsets(mdev, mdev->ldev); + + size = drbd_new_dev_size(mdev, mdev->ldev); + + if (drbd_get_capacity(mdev->this_bdev) != size || + drbd_bm_capacity(mdev) != size) { + int err; + err = drbd_bm_resize(mdev, size); + if (unlikely(err)) { + /* currently there is only one error: ENOMEM! */ + size = drbd_bm_capacity(mdev)>>1; + if (size == 0) { + dev_err(DEV, "OUT OF MEMORY! " + "Could not allocate bitmap!\n"); + } else { + dev_err(DEV, "BM resizing failed. " + "Leaving size unchanged at size = %lu KB\n", + (unsigned long)size); + } + rv = dev_size_error; + } + /* racy, see comments above. */ + drbd_set_my_capacity(mdev, size); + mdev->ldev->md.la_size_sect = size; + dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), + (unsigned long long)size>>1); + } + if (rv == dev_size_error) + goto out; + + la_size_changed = (la_size != mdev->ldev->md.la_size_sect); + + md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) + || prev_size != mdev->ldev->md.md_size_sect; + + if (la_size_changed || md_moved) { + drbd_al_shrink(mdev); /* All extents inactive. */ + dev_info(DEV, "Writing the whole bitmap, %s\n", + la_size_changed && md_moved ? "size changed and md moved" : + la_size_changed ? "size changed" : "md moved"); + rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */ + drbd_md_mark_dirty(mdev); + } + + if (size > la_size) + rv = grew; + if (size < la_size) + rv = shrunk; +out: + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + drbd_resume_io(mdev); + + return rv; +} + +sector_t +drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +{ + sector_t p_size = mdev->p_size; /* partner's disk size. */ + sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ + sector_t m_size; /* my size */ + sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ + sector_t size = 0; + + m_size = drbd_get_max_capacity(bdev); + + if (p_size && m_size) { + size = min_t(sector_t, p_size, m_size); + } else { + if (la_size) { + size = la_size; + if (m_size && m_size < size) + size = m_size; + if (p_size && p_size < size) + size = p_size; + } else { + if (m_size) + size = m_size; + if (p_size) + size = p_size; + } + } + + if (size == 0) + dev_err(DEV, "Both nodes diskless!\n"); + + if (u_size) { + if (u_size > size) + dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n", + (unsigned long)u_size>>1, (unsigned long)size>>1); + else + size = u_size; + } + + return size; +} + +/** + * drbd_check_al_size() - Ensures that the AL is of the right size + * @mdev: DRBD device. + * + * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation + * failed, and 0 on success. You should call drbd_md_sync() after you called + * this function. + */ +static int drbd_check_al_size(struct drbd_conf *mdev) +{ + struct lru_cache *n, *t; + struct lc_element *e; + unsigned int in_use; + int i; + + ERR_IF(mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + + if (mdev->act_log && + mdev->act_log->nr_elements == mdev->sync_conf.al_extents) + return 0; + + in_use = 0; + t = mdev->act_log; + n = lc_create("act_log", drbd_al_ext_cache, + mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); + + if (n == NULL) { + dev_err(DEV, "Cannot allocate act_log lru!\n"); + return -ENOMEM; + } + spin_lock_irq(&mdev->al_lock); + if (t) { + for (i = 0; i < t->nr_elements; i++) { + e = lc_element_by_index(t, i); + if (e->refcnt) + dev_err(DEV, "refcnt(%d)==%d\n", + e->lc_number, e->refcnt); + in_use += e->refcnt; + } + } + if (!in_use) + mdev->act_log = n; + spin_unlock_irq(&mdev->al_lock); + if (in_use) { + dev_err(DEV, "Activity log still in use!\n"); + lc_destroy(n); + return -EBUSY; + } else { + if (t) + lc_destroy(t); + } + drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */ + return 0; +} + +void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local) +{ + struct request_queue * const q = mdev->rq_queue; + struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; + int max_segments = mdev->ldev->dc.max_bio_bvecs; + + if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) + max_seg_s = PAGE_SIZE; + + max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); + + blk_queue_max_sectors(q, max_seg_s >> 9); + blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS); + blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS); + blk_queue_max_segment_size(q, max_seg_s); + blk_queue_logical_block_size(q, 512); + blk_queue_segment_boundary(q, PAGE_SIZE-1); + blk_stack_limits(&q->limits, &b->limits, 0); + + if (b->merge_bvec_fn) + dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", + b->merge_bvec_fn); + dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); + + if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } +} + +/* serialize deconfig (worker exiting, doing cleanup) + * and reconfig (drbdsetup disk, drbdsetup net) + * + * wait for a potentially exiting worker, then restart it, + * or start a new one. + */ +static void drbd_reconfig_start(struct drbd_conf *mdev) +{ + wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); + wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); + drbd_thread_start(&mdev->worker); +} + +/* if still unconfigured, stops worker again. + * if configured now, clears CONFIG_PENDING. + * wakes potential waiters */ +static void drbd_reconfig_done(struct drbd_conf *mdev) +{ + spin_lock_irq(&mdev->req_lock); + if (mdev->state.disk == D_DISKLESS && + mdev->state.conn == C_STANDALONE && + mdev->state.role == R_SECONDARY) { + set_bit(DEVICE_DYING, &mdev->flags); + drbd_thread_stop_nowait(&mdev->worker); + } else + clear_bit(CONFIG_PENDING, &mdev->flags); + spin_unlock_irq(&mdev->req_lock); + wake_up(&mdev->state_wait); +} + +/* does always return 0; + * interesting return code is in reply->ret_code */ +static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + enum drbd_ret_codes retcode; + enum determine_dev_size dd; + sector_t max_possible_sectors; + sector_t min_md_device_sectors; + struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ + struct inode *inode, *inode2; + struct lru_cache *resync_lru = NULL; + union drbd_state ns, os; + int rv; + int cp_discovered = 0; + int logical_block_size; + + drbd_reconfig_start(mdev); + + /* if you want to reconfigure, please tear down first */ + if (mdev->state.disk > D_DISKLESS) { + retcode = ERR_DISK_CONFIGURED; + goto fail; + } + + /* allocation not in the IO path, cqueue thread context */ + nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); + if (!nbc) { + retcode = ERR_NOMEM; + goto fail; + } + + nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; + nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; + nbc->dc.fencing = DRBD_FENCING_DEF; + nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; + + if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } + + if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { + retcode = ERR_MD_IDX_INVALID; + goto fail; + } + + nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); + if (IS_ERR(nbc->lo_file)) { + dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, + PTR_ERR(nbc->lo_file)); + nbc->lo_file = NULL; + retcode = ERR_OPEN_DISK; + goto fail; + } + + inode = nbc->lo_file->f_dentry->d_inode; + + if (!S_ISBLK(inode->i_mode)) { + retcode = ERR_DISK_NOT_BDEV; + goto fail; + } + + nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); + if (IS_ERR(nbc->md_file)) { + dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, + PTR_ERR(nbc->md_file)); + nbc->md_file = NULL; + retcode = ERR_OPEN_MD_DISK; + goto fail; + } + + inode2 = nbc->md_file->f_dentry->d_inode; + + if (!S_ISBLK(inode2->i_mode)) { + retcode = ERR_MD_NOT_BDEV; + goto fail; + } + + nbc->backing_bdev = inode->i_bdev; + if (bd_claim(nbc->backing_bdev, mdev)) { + printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", + nbc->backing_bdev, mdev, + nbc->backing_bdev->bd_holder, + nbc->backing_bdev->bd_contains->bd_holder, + nbc->backing_bdev->bd_holders); + retcode = ERR_BDCLAIM_DISK; + goto fail; + } + + resync_lru = lc_create("resync", drbd_bm_ext_cache, + 61, sizeof(struct bm_extent), + offsetof(struct bm_extent, lce)); + if (!resync_lru) { + retcode = ERR_NOMEM; + goto release_bdev_fail; + } + + /* meta_dev_idx >= 0: external fixed size, + * possibly multiple drbd sharing one meta device. + * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is + * not yet used by some other drbd minor! + * (if you use drbd.conf + drbdadm, + * that should check it for you already; but if you don't, or someone + * fooled it, we need to double check here) */ + nbc->md_bdev = inode2->i_bdev; + if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev + : (void *) drbd_m_holder)) { + retcode = ERR_BDCLAIM_MD_DISK; + goto release_bdev_fail; + } + + if ((nbc->backing_bdev == nbc->md_bdev) != + (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + retcode = ERR_MD_IDX_INVALID; + goto release_bdev2_fail; + } + + /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ + drbd_md_set_sector_offsets(mdev, nbc); + + if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { + dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", + (unsigned long long) drbd_get_max_capacity(nbc), + (unsigned long long) nbc->dc.disk_size); + retcode = ERR_DISK_TO_SMALL; + goto release_bdev2_fail; + } + + if (nbc->dc.meta_dev_idx < 0) { + max_possible_sectors = DRBD_MAX_SECTORS_FLEX; + /* at least one MB, otherwise it does not make sense */ + min_md_device_sectors = (2<<10); + } else { + max_possible_sectors = DRBD_MAX_SECTORS; + min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); + } + + if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { + retcode = ERR_MD_DISK_TO_SMALL; + dev_warn(DEV, "refusing attach: md-device too small, " + "at least %llu sectors needed for this meta-disk type\n", + (unsigned long long) min_md_device_sectors); + goto release_bdev2_fail; + } + + /* Make sure the new disk is big enough + * (we may currently be R_PRIMARY with no local disk...) */ + if (drbd_get_max_capacity(nbc) < + drbd_get_capacity(mdev->this_bdev)) { + retcode = ERR_DISK_TO_SMALL; + goto release_bdev2_fail; + } + + nbc->known_size = drbd_get_capacity(nbc->backing_bdev); + + if (nbc->known_size > max_possible_sectors) { + dev_warn(DEV, "==> truncating very big lower level device " + "to currently maximum possible %llu sectors <==\n", + (unsigned long long) max_possible_sectors); + if (nbc->dc.meta_dev_idx >= 0) + dev_warn(DEV, "==>> using internal or flexible " + "meta data may help <<==\n"); + } + + drbd_suspend_io(mdev); + /* also wait for the last barrier ack. */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); + /* and for any other previously queued work */ + drbd_flush_workqueue(mdev); + + retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); + drbd_resume_io(mdev); + if (retcode < SS_SUCCESS) + goto release_bdev2_fail; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) + goto force_diskless; + + drbd_md_set_sector_offsets(mdev, nbc); + + if (!mdev->bitmap) { + if (drbd_bm_init(mdev)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + } + + retcode = drbd_md_read(mdev, nbc); + if (retcode != NO_ERROR) + goto force_diskless_dec; + + if (mdev->state.conn < C_CONNECTED && + mdev->state.role == R_PRIMARY && + (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { + dev_err(DEV, "Can only attach to data with current UUID=%016llX\n", + (unsigned long long)mdev->ed_uuid); + retcode = ERR_DATA_NOT_CURRENT; + goto force_diskless_dec; + } + + /* Since we are diskless, fix the activity log first... */ + if (drbd_check_al_size(mdev)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + + /* Prevent shrinking of consistent devices ! */ + if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && + drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) { + dev_warn(DEV, "refusing to truncate a consistent device\n"); + retcode = ERR_DISK_TO_SMALL; + goto force_diskless_dec; + } + + if (!drbd_al_read_log(mdev, nbc)) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + + /* allocate a second IO page if logical_block_size != 512 */ + logical_block_size = bdev_logical_block_size(nbc->md_bdev); + if (logical_block_size == 0) + logical_block_size = MD_SECTOR_SIZE; + + if (logical_block_size != MD_SECTOR_SIZE) { + if (!mdev->md_io_tmpp) { + struct page *page = alloc_page(GFP_NOIO); + if (!page) + goto force_diskless_dec; + + dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", + logical_block_size, MD_SECTOR_SIZE); + dev_warn(DEV, "Workaround engaged (has performance impact).\n"); + + mdev->md_io_tmpp = page; + } + } + + /* Reset the "barriers don't work" bits here, then force meta data to + * be written, to ensure we determine if barriers are supported. */ + if (nbc->dc.no_md_flush) + set_bit(MD_NO_BARRIER, &mdev->flags); + else + clear_bit(MD_NO_BARRIER, &mdev->flags); + + /* Point of no return reached. + * Devices and memory are no longer released by error cleanup below. + * now mdev takes over responsibility, and the state engine should + * clean it up somewhere. */ + D_ASSERT(mdev->ldev == NULL); + mdev->ldev = nbc; + mdev->resync = resync_lru; + nbc = NULL; + resync_lru = NULL; + + mdev->write_ordering = WO_bio_barrier; + drbd_bump_write_ordering(mdev, WO_bio_barrier); + + if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) + set_bit(CRASHED_PRIMARY, &mdev->flags); + else + clear_bit(CRASHED_PRIMARY, &mdev->flags); + + if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { + set_bit(CRASHED_PRIMARY, &mdev->flags); + cp_discovered = 1; + } + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + mdev->read_cnt = 0; + mdev->writ_cnt = 0; + + drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); + + /* If I am currently not R_PRIMARY, + * but meta data primary indicator is set, + * I just now recover from a hard crash, + * and have been R_PRIMARY before that crash. + * + * Now, if I had no connection before that crash + * (have been degraded R_PRIMARY), chances are that + * I won't find my peer now either. + * + * In that case, and _only_ in that case, + * we use the degr-wfc-timeout instead of the default, + * so we can automatically recover from a crash of a + * degraded but active "cluster" after a certain timeout. + */ + clear_bit(USE_DEGR_WFC_T, &mdev->flags); + if (mdev->state.role != R_PRIMARY && + drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && + !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) + set_bit(USE_DEGR_WFC_T, &mdev->flags); + + dd = drbd_determin_dev_size(mdev); + if (dd == dev_size_error) { + retcode = ERR_NOMEM_BITMAP; + goto force_diskless_dec; + } else if (dd == grew) + set_bit(RESYNC_AFTER_NEG, &mdev->flags); + + if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { + dev_info(DEV, "Assuming that all blocks are out of sync " + "(aka FullSync)\n"); + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } else { + if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } + + if (cp_discovered) { + drbd_al_apply_to_bm(mdev); + drbd_al_to_on_disk_bm(mdev); + } + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + ns.i = os.i; + /* If MDF_CONSISTENT is not set go into inconsistent state, + otherwise investigate MDF_WasUpToDate... + If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, + otherwise into D_CONSISTENT state. + */ + if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) { + if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE)) + ns.disk = D_CONSISTENT; + else + ns.disk = D_OUTDATED; + } else { + ns.disk = D_INCONSISTENT; + } + + if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) + ns.pdsk = D_OUTDATED; + + if ( ns.disk == D_CONSISTENT && + (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) + ns.disk = D_UP_TO_DATE; + + /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, + MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before + this point, because drbd_request_state() modifies these + flags. */ + + /* In case we are C_CONNECTED postpone any decision on the new disk + state after the negotiation phase. */ + if (mdev->state.conn == C_CONNECTED) { + mdev->new_state_tmp.i = ns.i; + ns.i = os.i; + ns.disk = D_NEGOTIATING; + } + + rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); + + if (rv < SS_SUCCESS) + goto force_diskless_dec; + + if (mdev->state.role == R_PRIMARY) + mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; + else + mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + + drbd_md_mark_dirty(mdev); + drbd_md_sync(mdev); + + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); + put_ldev(mdev); + reply->ret_code = retcode; + drbd_reconfig_done(mdev); + return 0; + + force_diskless_dec: + put_ldev(mdev); + force_diskless: + drbd_force_state(mdev, NS(disk, D_DISKLESS)); + drbd_md_sync(mdev); + release_bdev2_fail: + if (nbc) + bd_release(nbc->md_bdev); + release_bdev_fail: + if (nbc) + bd_release(nbc->backing_bdev); + fail: + if (nbc) { + if (nbc->lo_file) + fput(nbc->lo_file); + if (nbc->md_file) + fput(nbc->md_file); + kfree(nbc); + } + lc_destroy(resync_lru); + + reply->ret_code = retcode; + drbd_reconfig_done(mdev); + return 0; +} + +static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); + return 0; +} + +static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int i, ns; + enum drbd_ret_codes retcode; + struct net_conf *new_conf = NULL; + struct crypto_hash *tfm = NULL; + struct crypto_hash *integrity_w_tfm = NULL; + struct crypto_hash *integrity_r_tfm = NULL; + struct hlist_head *new_tl_hash = NULL; + struct hlist_head *new_ee_hash = NULL; + struct drbd_conf *odev; + char hmac_name[CRYPTO_MAX_ALG_NAME]; + void *int_dig_out = NULL; + void *int_dig_in = NULL; + void *int_dig_vv = NULL; + struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; + + drbd_reconfig_start(mdev); + + if (mdev->state.conn > C_STANDALONE) { + retcode = ERR_NET_CONFIGURED; + goto fail; + } + + /* allocation not in the IO path, cqueue thread context */ + new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_conf) { + retcode = ERR_NOMEM; + goto fail; + } + + memset(new_conf, 0, sizeof(struct net_conf)); + new_conf->timeout = DRBD_TIMEOUT_DEF; + new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; + new_conf->ping_int = DRBD_PING_INT_DEF; + new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; + new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; + new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; + new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; + new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; + new_conf->ko_count = DRBD_KO_COUNT_DEF; + new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; + new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; + new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; + new_conf->want_lose = 0; + new_conf->two_primaries = 0; + new_conf->wire_protocol = DRBD_PROT_C; + new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; + new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; + + if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } + + if (new_conf->two_primaries + && (new_conf->wire_protocol != DRBD_PROT_C)) { + retcode = ERR_NOT_PROTO_C; + goto fail; + }; + + if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { + retcode = ERR_DISCARD; + goto fail; + } + + retcode = NO_ERROR; + + new_my_addr = (struct sockaddr *)&new_conf->my_addr; + new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; + for (i = 0; i < minor_count; i++) { + odev = minor_to_mdev(i); + if (!odev || odev == mdev) + continue; + if (get_net_conf(odev)) { + taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; + if (new_conf->my_addr_len == odev->net_conf->my_addr_len && + !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) + retcode = ERR_LOCAL_ADDR; + + taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; + if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && + !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) + retcode = ERR_PEER_ADDR; + + put_net_conf(odev); + if (retcode != NO_ERROR) + goto fail; + } + } + + if (new_conf->cram_hmac_alg[0] != 0) { + snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", + new_conf->cram_hmac_alg); + tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + tfm = NULL; + retcode = ERR_AUTH_ALG; + goto fail; + } + + if (crypto_tfm_alg_type(crypto_hash_tfm(tfm)) + != CRYPTO_ALG_TYPE_HASH) { + retcode = ERR_AUTH_ALG_ND; + goto fail; + } + } + + if (new_conf->integrity_alg[0]) { + integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(integrity_w_tfm)) { + integrity_w_tfm = NULL; + retcode=ERR_INTEGRITY_ALG; + goto fail; + } + + if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { + retcode=ERR_INTEGRITY_ALG_ND; + goto fail; + } + + integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(integrity_r_tfm)) { + integrity_r_tfm = NULL; + retcode=ERR_INTEGRITY_ALG; + goto fail; + } + } + + ns = new_conf->max_epoch_size/8; + if (mdev->tl_hash_s != ns) { + new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); + if (!new_tl_hash) { + retcode = ERR_NOMEM; + goto fail; + } + } + + ns = new_conf->max_buffers/8; + if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { + new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); + if (!new_ee_hash) { + retcode = ERR_NOMEM; + goto fail; + } + } + + ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; + + if (integrity_w_tfm) { + i = crypto_hash_digestsize(integrity_w_tfm); + int_dig_out = kmalloc(i, GFP_KERNEL); + if (!int_dig_out) { + retcode = ERR_NOMEM; + goto fail; + } + int_dig_in = kmalloc(i, GFP_KERNEL); + if (!int_dig_in) { + retcode = ERR_NOMEM; + goto fail; + } + int_dig_vv = kmalloc(i, GFP_KERNEL); + if (!int_dig_vv) { + retcode = ERR_NOMEM; + goto fail; + } + } + + if (!mdev->bitmap) { + if(drbd_bm_init(mdev)) { + retcode = ERR_NOMEM; + goto fail; + } + } + + spin_lock_irq(&mdev->req_lock); + if (mdev->net_conf != NULL) { + retcode = ERR_NET_CONFIGURED; + spin_unlock_irq(&mdev->req_lock); + goto fail; + } + mdev->net_conf = new_conf; + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + + if (new_tl_hash) { + kfree(mdev->tl_hash); + mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; + mdev->tl_hash = new_tl_hash; + } + + if (new_ee_hash) { + kfree(mdev->ee_hash); + mdev->ee_hash_s = mdev->net_conf->max_buffers/8; + mdev->ee_hash = new_ee_hash; + } + + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = tfm; + + crypto_free_hash(mdev->integrity_w_tfm); + mdev->integrity_w_tfm = integrity_w_tfm; + + crypto_free_hash(mdev->integrity_r_tfm); + mdev->integrity_r_tfm = integrity_r_tfm; + + kfree(mdev->int_dig_out); + kfree(mdev->int_dig_in); + kfree(mdev->int_dig_vv); + mdev->int_dig_out=int_dig_out; + mdev->int_dig_in=int_dig_in; + mdev->int_dig_vv=int_dig_vv; + spin_unlock_irq(&mdev->req_lock); + + retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE); + + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); + reply->ret_code = retcode; + drbd_reconfig_done(mdev); + return 0; + +fail: + kfree(int_dig_out); + kfree(int_dig_in); + kfree(int_dig_vv); + crypto_free_hash(tfm); + crypto_free_hash(integrity_w_tfm); + crypto_free_hash(integrity_r_tfm); + kfree(new_tl_hash); + kfree(new_ee_hash); + kfree(new_conf); + + reply->ret_code = retcode; + drbd_reconfig_done(mdev); + return 0; +} + +static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode; + + retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); + + if (retcode == SS_NOTHING_TO_DO) + goto done; + else if (retcode == SS_ALREADY_STANDALONE) + goto done; + else if (retcode == SS_PRIMARY_NOP) { + /* Our statche checking code wants to see the peer outdated. */ + retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, + pdsk, D_OUTDATED)); + } else if (retcode == SS_CW_FAILED_BY_PEER) { + /* The peer probably wants to see us outdated. */ + retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, + disk, D_OUTDATED), + CS_ORDERED); + if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + retcode = SS_SUCCESS; + } + } + + if (retcode < SS_SUCCESS) + goto fail; + + if (wait_event_interruptible(mdev->state_wait, + mdev->state.conn != C_DISCONNECTING)) { + /* Do not test for mdev->state.conn == C_STANDALONE, since + someone else might connect us in the mean time! */ + retcode = ERR_INTR; + goto fail; + } + + done: + retcode = NO_ERROR; + fail: + drbd_md_sync(mdev); + reply->ret_code = retcode; + return 0; +} + +void resync_after_online_grow(struct drbd_conf *mdev) +{ + int iass; /* I am sync source */ + + dev_info(DEV, "Resync of new storage after online grow\n"); + if (mdev->state.role != mdev->state.peer) + iass = (mdev->state.role == R_PRIMARY); + else + iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); + + if (iass) + drbd_start_resync(mdev, C_SYNC_SOURCE); + else + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); +} + +static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + struct resize rs; + int retcode = NO_ERROR; + int ldsc = 0; /* local disk size changed */ + enum determine_dev_size dd; + + memset(&rs, 0, sizeof(struct resize)); + if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } + + if (mdev->state.conn > C_CONNECTED) { + retcode = ERR_RESIZE_RESYNC; + goto fail; + } + + if (mdev->state.role == R_SECONDARY && + mdev->state.peer == R_SECONDARY) { + retcode = ERR_NO_PRIMARY; + goto fail; + } + + if (!get_ldev(mdev)) { + retcode = ERR_NO_DISK; + goto fail; + } + + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { + mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); + ldsc = 1; + } + + mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; + dd = drbd_determin_dev_size(mdev); + drbd_md_sync(mdev); + put_ldev(mdev); + if (dd == dev_size_error) { + retcode = ERR_NOMEM_BITMAP; + goto fail; + } + + if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { + if (dd == grew) + set_bit(RESIZE_PENDING, &mdev->flags); + + drbd_send_uuids(mdev); + drbd_send_sizes(mdev, 1); + } + + fail: + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode = NO_ERROR; + int err; + int ovr; /* online verify running */ + int rsr; /* re-sync running */ + struct crypto_hash *verify_tfm = NULL; + struct crypto_hash *csums_tfm = NULL; + struct syncer_conf sc; + cpumask_var_t new_cpu_mask; + + if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { + retcode = ERR_NOMEM; + goto fail; + } + + if (nlp->flags & DRBD_NL_SET_DEFAULTS) { + memset(&sc, 0, sizeof(struct syncer_conf)); + sc.rate = DRBD_RATE_DEF; + sc.after = DRBD_AFTER_DEF; + sc.al_extents = DRBD_AL_EXTENTS_DEF; + } else + memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); + + if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } + + /* re-sync running */ + rsr = ( mdev->state.conn == C_SYNC_SOURCE || + mdev->state.conn == C_SYNC_TARGET || + mdev->state.conn == C_PAUSED_SYNC_S || + mdev->state.conn == C_PAUSED_SYNC_T ); + + if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { + retcode = ERR_CSUMS_RESYNC_RUNNING; + goto fail; + } + + if (!rsr && sc.csums_alg[0]) { + csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(csums_tfm)) { + csums_tfm = NULL; + retcode = ERR_CSUMS_ALG; + goto fail; + } + + if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { + retcode = ERR_CSUMS_ALG_ND; + goto fail; + } + } + + /* online verify running */ + ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); + + if (ovr) { + if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { + retcode = ERR_VERIFY_RUNNING; + goto fail; + } + } + + if (!ovr && sc.verify_alg[0]) { + verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(verify_tfm)) { + verify_tfm = NULL; + retcode = ERR_VERIFY_ALG; + goto fail; + } + + if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { + retcode = ERR_VERIFY_ALG_ND; + goto fail; + } + } + + /* silently ignore cpu mask on UP kernel */ + if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { + err = __bitmap_parse(sc.cpu_mask, 32, 0, + cpumask_bits(new_cpu_mask), nr_cpu_ids); + if (err) { + dev_warn(DEV, "__bitmap_parse() failed with %d\n", err); + retcode = ERR_CPU_MASK_PARSE; + goto fail; + } + } + + ERR_IF (sc.rate < 1) sc.rate = 1; + ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ +#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) + if (sc.al_extents > AL_MAX) { + dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); + sc.al_extents = AL_MAX; + } +#undef AL_MAX + + /* most sanity checks done, try to assign the new sync-after + * dependency. need to hold the global lock in there, + * to avoid a race in the dependency loop check. */ + retcode = drbd_alter_sa(mdev, sc.after); + if (retcode != NO_ERROR) + goto fail; + + /* ok, assign the rest of it as well. + * lock against receive_SyncParam() */ + spin_lock(&mdev->peer_seq_lock); + mdev->sync_conf = sc; + + if (!rsr) { + crypto_free_hash(mdev->csums_tfm); + mdev->csums_tfm = csums_tfm; + csums_tfm = NULL; + } + + if (!ovr) { + crypto_free_hash(mdev->verify_tfm); + mdev->verify_tfm = verify_tfm; + verify_tfm = NULL; + } + spin_unlock(&mdev->peer_seq_lock); + + if (get_ldev(mdev)) { + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + drbd_al_shrink(mdev); + err = drbd_check_al_size(mdev); + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + put_ldev(mdev); + drbd_md_sync(mdev); + + if (err) { + retcode = ERR_NOMEM; + goto fail; + } + } + + if (mdev->state.conn >= C_CONNECTED) + drbd_send_sync_param(mdev, &sc); + + if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { + cpumask_copy(mdev->cpu_mask, new_cpu_mask); + drbd_calc_cpu_mask(mdev); + mdev->receiver.reset_cpu_mask = 1; + mdev->asender.reset_cpu_mask = 1; + mdev->worker.reset_cpu_mask = 1; + } + + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); +fail: + free_cpumask_var(new_cpu_mask); + crypto_free_hash(csums_tfm); + crypto_free_hash(verify_tfm); + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode; + + retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); + + if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); + + while (retcode == SS_NEED_CONNECTION) { + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn < C_CONNECTED) + retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + + if (retcode != SS_NEED_CONNECTION) + break; + + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); + } + + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + + reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); + + return 0; +} + +static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode = NO_ERROR; + + if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) + retcode = ERR_PAUSE_IS_SET; + + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode = NO_ERROR; + + if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) + retcode = ERR_PAUSE_IS_CLEAR; + + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); + + return 0; +} + +static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); + return 0; +} + +static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); + return 0; +} + +static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + if (get_ldev(mdev)) { + tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); + put_ldev(mdev); + } + + if (get_net_conf(mdev)) { + tl = net_conf_to_tags(mdev, mdev->net_conf, tl); + put_net_conf(mdev); + } + tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); + + put_unaligned(TT_END, tl++); /* Close the tag list */ + + return (int)((char *)tl - (char *)reply->tag_list); +} + +static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl = reply->tag_list; + union drbd_state s = mdev->state; + unsigned long rs_left; + unsigned int res; + + tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); + + /* no local ref, no bitmap, no syncer progress. */ + if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { + if (get_ldev(mdev)) { + drbd_get_syncer_progress(mdev, &rs_left, &res); + tl = tl_add_int(tl, T_sync_progress, &res); + put_ldev(mdev); + } + } + put_unaligned(TT_END, tl++); /* Close the tag list */ + + return (int)((char *)tl - (char *)reply->tag_list); +} + +static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + if (get_ldev(mdev)) { + tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); + tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); + put_ldev(mdev); + } + put_unaligned(TT_END, tl++); /* Close the tag list */ + + return (int)((char *)tl - (char *)reply->tag_list); +} + +/** + * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use + * @mdev: DRBD device. + * @nlp: Netlink/connector packet from drbdsetup + * @reply: Reply packet for drbdsetup + */ +static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + char rv; + + tl = reply->tag_list; + + rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT; + + tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); + put_unaligned(TT_END, tl++); /* Close the tag list */ + + return (int)((char *)tl - (char *)reply->tag_list); +} + +static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + /* default to resume from last known position, if possible */ + struct start_ov args = + { .start_sector = mdev->ov_start_sector }; + + if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { + reply->ret_code = ERR_MANDATORY_TAG; + return 0; + } + /* w_make_ov_request expects position to be aligned */ + mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; + reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + return 0; +} + + +static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode = NO_ERROR; + int skip_initial_sync = 0; + int err; + + struct new_c_uuid args; + + memset(&args, 0, sizeof(struct new_c_uuid)); + if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { + reply->ret_code = ERR_MANDATORY_TAG; + return 0; + } + + mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ + + if (!get_ldev(mdev)) { + retcode = ERR_NO_DISK; + goto out; + } + + /* this is "skip initial sync", assume to be clean */ + if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && + mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { + dev_info(DEV, "Preparing to skip initial sync\n"); + skip_initial_sync = 1; + } else if (mdev->state.conn != C_STANDALONE) { + retcode = ERR_CONNECTED; + goto out_dec; + } + + drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */ + drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ + + if (args.clear_bm) { + err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid"); + if (err) { + dev_err(DEV, "Writing bitmap failed with %d\n",err); + retcode = ERR_IO_MD_DISK; + } + if (skip_initial_sync) { + drbd_send_uuids_skip_initial_sync(mdev); + _drbd_uuid_set(mdev, UI_BITMAP, 0); + spin_lock_irq(&mdev->req_lock); + _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + } + } + + drbd_md_sync(mdev); +out_dec: + put_ldev(mdev); +out: + mutex_unlock(&mdev->state_mutex); + + reply->ret_code = retcode; + return 0; +} + +static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp) +{ + struct drbd_conf *mdev; + + if (nlp->drbd_minor >= minor_count) + return NULL; + + mdev = minor_to_mdev(nlp->drbd_minor); + + if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { + struct gendisk *disk = NULL; + mdev = drbd_new_device(nlp->drbd_minor); + + spin_lock_irq(&drbd_pp_lock); + if (minor_table[nlp->drbd_minor] == NULL) { + minor_table[nlp->drbd_minor] = mdev; + disk = mdev->vdisk; + mdev = NULL; + } /* else: we lost the race */ + spin_unlock_irq(&drbd_pp_lock); + + if (disk) /* we won the race above */ + /* in case we ever add a drbd_delete_device(), + * don't forget the del_gendisk! */ + add_disk(disk); + else /* we lost the race above */ + drbd_free_mdev(mdev); + + mdev = minor_to_mdev(nlp->drbd_minor); + } + + return mdev; +} + +struct cn_handler_struct { + int (*function)(struct drbd_conf *, + struct drbd_nl_cfg_req *, + struct drbd_nl_cfg_reply *); + int reply_body_size; +}; + +static struct cn_handler_struct cnd_table[] = { + [ P_primary ] = { &drbd_nl_primary, 0 }, + [ P_secondary ] = { &drbd_nl_secondary, 0 }, + [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, + [ P_detach ] = { &drbd_nl_detach, 0 }, + [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, + [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, + [ P_resize ] = { &drbd_nl_resize, 0 }, + [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, + [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, + [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, + [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, + [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, + [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, + [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, + [ P_outdate ] = { &drbd_nl_outdate, 0 }, + [ P_get_config ] = { &drbd_nl_get_config, + sizeof(struct syncer_conf_tag_len_struct) + + sizeof(struct disk_conf_tag_len_struct) + + sizeof(struct net_conf_tag_len_struct) }, + [ P_get_state ] = { &drbd_nl_get_state, + sizeof(struct get_state_tag_len_struct) + + sizeof(struct sync_progress_tag_len_struct) }, + [ P_get_uuids ] = { &drbd_nl_get_uuids, + sizeof(struct get_uuids_tag_len_struct) }, + [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, + sizeof(struct get_timeout_flag_tag_len_struct)}, + [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, + [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, +}; + +static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) +{ + struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; + struct cn_handler_struct *cm; + struct cn_msg *cn_reply; + struct drbd_nl_cfg_reply *reply; + struct drbd_conf *mdev; + int retcode, rr; + int reply_size = sizeof(struct cn_msg) + + sizeof(struct drbd_nl_cfg_reply) + + sizeof(short int); + + if (!try_module_get(THIS_MODULE)) { + printk(KERN_ERR "drbd: try_module_get() failed!\n"); + return; + } + + if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) { + retcode = ERR_PERM; + goto fail; + } + + mdev = ensure_mdev(nlp); + if (!mdev) { + retcode = ERR_MINOR_INVALID; + goto fail; + } + + if (nlp->packet_type >= P_nl_after_last_packet) { + retcode = ERR_PACKET_NR; + goto fail; + } + + cm = cnd_table + nlp->packet_type; + + /* This may happen if packet number is 0: */ + if (cm->function == NULL) { + retcode = ERR_PACKET_NR; + goto fail; + } + + reply_size += cm->reply_body_size; + + /* allocation not in the IO path, cqueue thread context */ + cn_reply = kmalloc(reply_size, GFP_KERNEL); + if (!cn_reply) { + retcode = ERR_NOMEM; + goto fail; + } + reply = (struct drbd_nl_cfg_reply *) cn_reply->data; + + reply->packet_type = + cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; + reply->minor = nlp->drbd_minor; + reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ + /* reply->tag_list; might be modified by cm->function. */ + + rr = cm->function(mdev, nlp, reply); + + cn_reply->id = req->id; + cn_reply->seq = req->seq; + cn_reply->ack = req->ack + 1; + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; + cn_reply->flags = 0; + + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); + if (rr && rr != -ESRCH) + printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); + + kfree(cn_reply); + module_put(THIS_MODULE); + return; + fail: + drbd_nl_send_reply(req, retcode); + module_put(THIS_MODULE); +} + +static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ + +static unsigned short * +__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, + unsigned short len, int nul_terminated) +{ + unsigned short l = tag_descriptions[tag_number(tag)].max_len; + len = (len < l) ? len : l; + put_unaligned(tag, tl++); + put_unaligned(len, tl++); + memcpy(tl, data, len); + tl = (unsigned short*)((char*)tl + len); + if (nul_terminated) + *((char*)tl - 1) = 0; + return tl; +} + +static unsigned short * +tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) +{ + return __tl_add_blob(tl, tag, data, len, 0); +} + +static unsigned short * +tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) +{ + return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); +} + +static unsigned short * +tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) +{ + put_unaligned(tag, tl++); + switch(tag_type(tag)) { + case TT_INTEGER: + put_unaligned(sizeof(int), tl++); + put_unaligned(*(int *)val, (int *)tl); + tl = (unsigned short*)((char*)tl+sizeof(int)); + break; + case TT_INT64: + put_unaligned(sizeof(u64), tl++); + put_unaligned(*(u64 *)val, (u64 *)tl); + tl = (unsigned short*)((char*)tl+sizeof(u64)); + break; + default: + /* someone did something stupid. */ + ; + } + return tl; +} + +void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) +{ + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct get_state_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + unsigned short *tl = reply->tag_list; + + /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ + + tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); + + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); + cn_reply->ack = 0; /* not used here. */ + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char *)tl - (char *)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_get_state; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); +} + +void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) +{ + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct call_helper_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + unsigned short *tl = reply->tag_list; + + /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ + + tl = tl_add_str(tl, T_helper, helper_name); + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); + cn_reply->ack = 0; /* not used here. */ + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char *)tl - (char *)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_call_helper; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); +} + +void drbd_bcast_ee(struct drbd_conf *mdev, + const char *reason, const int dgs, + const char* seen_hash, const char* calc_hash, + const struct drbd_epoch_entry* e) +{ + struct cn_msg *cn_reply; + struct drbd_nl_cfg_reply *reply; + struct bio_vec *bvec; + unsigned short *tl; + int i; + + if (!e) + return; + if (!reason || !reason[0]) + return; + + /* apparently we have to memcpy twice, first to prepare the data for the + * struct cn_msg, then within cn_netlink_send from the cn_msg to the + * netlink skb. */ + /* receiver thread context, which is not in the writeout path (of this node), + * but may be in the writeout path of the _other_ node. + * GFP_NOIO to avoid potential "distributed deadlock". */ + cn_reply = kmalloc( + sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct dump_ee_tag_len_struct)+ + sizeof(short int), + GFP_NOIO); + + if (!cn_reply) { + dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", + (unsigned long long)e->sector, e->size); + return; + } + + reply = (struct drbd_nl_cfg_reply*)cn_reply->data; + tl = reply->tag_list; + + tl = tl_add_str(tl, T_dump_ee_reason, reason); + tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); + tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); + tl = tl_add_int(tl, T_ee_sector, &e->sector); + tl = tl_add_int(tl, T_ee_block_id, &e->block_id); + + put_unaligned(T_ee_data, tl++); + put_unaligned(e->size, tl++); + + __bio_for_each_segment(bvec, e->private_bio, i, 0) { + void *d = kmap(bvec->bv_page); + memcpy(tl, d + bvec->bv_offset, bvec->bv_len); + kunmap(bvec->bv_page); + tl=(unsigned short*)((char*)tl + bvec->bv_len); + } + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); + cn_reply->ack = 0; // not used here. + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char*)tl - (char*)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_dump_ee; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); + kfree(cn_reply); +} + +void drbd_bcast_sync_progress(struct drbd_conf *mdev) +{ + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct sync_progress_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + unsigned short *tl = reply->tag_list; + unsigned long rs_left; + unsigned int res; + + /* no local ref, no bitmap, no syncer progress, no broadcast. */ + if (!get_ldev(mdev)) + return; + drbd_get_syncer_progress(mdev, &rs_left, &res); + put_ldev(mdev); + + tl = tl_add_int(tl, T_sync_progress, &res); + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); + cn_reply->ack = 0; /* not used here. */ + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char *)tl - (char *)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_sync_progress; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); +} + +int __init drbd_nl_init(void) +{ + static struct cb_id cn_id_drbd; + int err, try=10; + + cn_id_drbd.val = CN_VAL_DRBD; + do { + cn_id_drbd.idx = cn_idx; + err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); + if (!err) + break; + cn_idx = (cn_idx + CN_IDX_STEP); + } while (try--); + + if (err) { + printk(KERN_ERR "drbd: cn_drbd failed to register\n"); + return err; + } + + return 0; +} + +void drbd_nl_cleanup(void) +{ + static struct cb_id cn_id_drbd; + + cn_id_drbd.idx = cn_idx; + cn_id_drbd.val = CN_VAL_DRBD; + + cn_del_callback(&cn_id_drbd); +} + +void drbd_nl_send_reply(struct cn_msg *req, int ret_code) +{ + char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + int rr; + + cn_reply->id = req->id; + + cn_reply->seq = req->seq; + cn_reply->ack = req->ack + 1; + cn_reply->len = sizeof(struct drbd_nl_cfg_reply); + cn_reply->flags = 0; + + reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; + reply->ret_code = ret_code; + + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); + if (rr && rr != -ESRCH) + printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); +} + diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c new file mode 100644 index 0000000..bdd0b49 --- /dev/null +++ b/drivers/block/drbd/drbd_proc.c @@ -0,0 +1,265 @@ +/* + drbd_proc.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/drbd.h> +#include "drbd_int.h" + +static int drbd_proc_open(struct inode *inode, struct file *file); + + +struct proc_dir_entry *drbd_proc; +struct file_operations drbd_proc_fops = { + .owner = THIS_MODULE, + .open = drbd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + + +/*lge + * progress bars shamelessly adapted from driver/md/md.c + * output looks like + * [=====>..............] 33.5% (23456/123456) + * finish: 2:20:20 speed: 6,345 (6,456) K/sec + */ +static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) +{ + unsigned long db, dt, dbdt, rt, rs_left; + unsigned int res; + int i, x, y; + + drbd_get_syncer_progress(mdev, &rs_left, &res); + + x = res/50; + y = 20-x; + seq_printf(seq, "\t["); + for (i = 1; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + + seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); + /* if more than 1 GB display in MB */ + if (mdev->rs_total > 0x100000L) + seq_printf(seq, "(%lu/%lu)M\n\t", + (unsigned long) Bit2KB(rs_left >> 10), + (unsigned long) Bit2KB(mdev->rs_total >> 10)); + else + seq_printf(seq, "(%lu/%lu)K\n\t", + (unsigned long) Bit2KB(rs_left), + (unsigned long) Bit2KB(mdev->rs_total)); + + /* see drivers/md/md.c + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = (jiffies - mdev->rs_mark_time) / HZ; + + if (dt > 20) { + /* if we made no update to rs_mark_time for too long, + * we are stalled. show that. */ + seq_printf(seq, "stalled\n"); + return; + } + + if (!dt) + dt++; + db = mdev->rs_mark_left - rs_left; + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ + + seq_printf(seq, "finish: %lu:%02lu:%02lu", + rt / 3600, (rt % 3600) / 60, rt % 60); + + /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " speed: %ld,%03ld", + dbdt/1000, dbdt % 1000); + else + seq_printf(seq, " speed: %ld", dbdt); + + /* mean speed since syncer started + * we do account for PausedSync periods */ + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) + dt = 1; + db = mdev->rs_total - rs_left; + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " (%ld,%03ld)", + dbdt/1000, dbdt % 1000); + else + seq_printf(seq, " (%ld)", dbdt); + + seq_printf(seq, " K/sec\n"); +} + +static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) +{ + struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); + + seq_printf(seq, "%5d %s %s\n", bme->rs_left, + bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", + bme->flags & BME_LOCKED ? "LOCKED" : "------" + ); +} + +static int drbd_seq_show(struct seq_file *seq, void *v) +{ + int i, hole = 0; + const char *sn; + struct drbd_conf *mdev; + + static char write_ordering_chars[] = { + [WO_none] = 'n', + [WO_drain_io] = 'd', + [WO_bdev_flush] = 'f', + [WO_bio_barrier] = 'b', + }; + + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); + + /* + cs .. connection state + ro .. node role (local/remote) + ds .. disk state (local/remote) + protocol + various flags + ns .. network send + nr .. network receive + dw .. disk write + dr .. disk read + al .. activity log write count + bm .. bitmap update write count + pe .. pending (waiting for ack or data reply) + ua .. unack'd (still need to send ack or data reply) + ap .. application requests accepted, but not yet completed + ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending + wo .. write ordering mode currently in use + oos .. known out-of-sync kB + */ + + for (i = 0; i < minor_count; i++) { + mdev = minor_to_mdev(i); + if (!mdev) { + hole = 1; + continue; + } + if (hole) { + hole = 0; + seq_printf(seq, "\n"); + } + + sn = drbd_conn_str(mdev->state.conn); + + if (mdev->state.conn == C_STANDALONE && + mdev->state.disk == D_DISKLESS && + mdev->state.role == R_SECONDARY) { + seq_printf(seq, "%2d: cs:Unconfigured\n", i); + } else { + seq_printf(seq, + "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " + "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", + i, sn, + drbd_role_str(mdev->state.role), + drbd_role_str(mdev->state.peer), + drbd_disk_str(mdev->state.disk), + drbd_disk_str(mdev->state.pdsk), + (mdev->net_conf == NULL ? ' ' : + (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), + mdev->state.susp ? 's' : 'r', + mdev->state.aftr_isp ? 'a' : '-', + mdev->state.peer_isp ? 'p' : '-', + mdev->state.user_isp ? 'u' : '-', + mdev->congestion_reason ?: '-', + mdev->send_cnt/2, + mdev->recv_cnt/2, + mdev->writ_cnt/2, + mdev->read_cnt/2, + mdev->al_writ_cnt, + mdev->bm_writ_cnt, + atomic_read(&mdev->local_cnt), + atomic_read(&mdev->ap_pending_cnt) + + atomic_read(&mdev->rs_pending_cnt), + atomic_read(&mdev->unacked_cnt), + atomic_read(&mdev->ap_bio_cnt), + mdev->epochs, + write_ordering_chars[mdev->write_ordering] + ); + seq_printf(seq, " oos:%lu\n", + Bit2KB(drbd_bm_total_weight(mdev))); + } + if (mdev->state.conn == C_SYNC_SOURCE || + mdev->state.conn == C_SYNC_TARGET) + drbd_syncer_progress(mdev, seq); + + if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) + seq_printf(seq, "\t%3d%% %lu/%lu\n", + (int)((mdev->rs_total-mdev->ov_left) / + (mdev->rs_total/100+1)), + mdev->rs_total - mdev->ov_left, + mdev->rs_total); + + if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) { + lc_seq_printf_stats(seq, mdev->resync); + lc_seq_printf_stats(seq, mdev->act_log); + put_ldev(mdev); + } + + if (proc_details >= 2) { + if (mdev->resync) { + lc_seq_dump_details(seq, mdev->resync, "rs_left", + resync_dump_detail); + } + } + } + + return 0; +} + +static int drbd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, drbd_seq_show, PDE(inode)->data); +} + +/* PROC FS stuff end */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c new file mode 100644 index 0000000..c548f24 --- /dev/null +++ b/drivers/block/drbd/drbd_receiver.c @@ -0,0 +1,4426 @@ +/* + drbd_receiver.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <net/sock.h> + +#include <linux/version.h> +#include <linux/drbd.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/in.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/pkt_sched.h> +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> +#include <linux/vmalloc.h> +#include <linux/random.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/scatterlist.h> +#include "drbd_int.h" +#include "drbd_req.h" + +#include "drbd_vli.h" + +struct flush_work { + struct drbd_work w; + struct drbd_epoch *epoch; +}; + +enum finish_epoch { + FE_STILL_LIVE, + FE_DESTROYED, + FE_RECYCLED, +}; + +static int drbd_do_handshake(struct drbd_conf *mdev); +static int drbd_do_auth(struct drbd_conf *mdev); + +static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); +static int e_end_block(struct drbd_conf *, struct drbd_work *, int); + +static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) +{ + struct drbd_epoch *prev; + spin_lock(&mdev->epoch_lock); + prev = list_entry(epoch->list.prev, struct drbd_epoch, list); + if (prev == epoch || prev == mdev->current_epoch) + prev = NULL; + spin_unlock(&mdev->epoch_lock); + return prev; +} + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) +{ + struct page *page = NULL; + + /* Yes, testing drbd_pp_vacant outside the lock is racy. + * So what. It saves a spin_lock. */ + if (drbd_pp_vacant > 0) { + spin_lock(&drbd_pp_lock); + page = drbd_pp_pool; + if (page) { + drbd_pp_pool = (struct page *)page_private(page); + set_page_private(page, 0); /* just to be polite */ + drbd_pp_vacant--; + } + spin_unlock(&drbd_pp_lock); + } + /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + if (!page) + page = alloc_page(GFP_TRY); + if (page) + atomic_inc(&mdev->pp_in_use); + return page; +} + +/* kick lower level device, if we have more than (arbitrary number) + * reference counts on it, which typically are locally submitted io + * requests. don't use unacked_cnt, so we speed up proto A and B, too. */ +static void maybe_kick_lo(struct drbd_conf *mdev) +{ + if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark) + drbd_kick_lo(mdev); +} + +static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) +{ + struct drbd_epoch_entry *e; + struct list_head *le, *tle; + + /* The EEs are always appended to the end of the list. Since + they are sent in order over the wire, they have to finish + in order. As soon as we see the first not finished we can + stop to examine the list... */ + + list_for_each_safe(le, tle, &mdev->net_ee) { + e = list_entry(le, struct drbd_epoch_entry, w.list); + if (drbd_bio_has_active_page(e->private_bio)) + break; + list_move(le, to_be_freed); + } +} + +static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) +{ + LIST_HEAD(reclaimed); + struct drbd_epoch_entry *e, *t; + + maybe_kick_lo(mdev); + spin_lock_irq(&mdev->req_lock); + reclaim_net_ee(mdev, &reclaimed); + spin_unlock_irq(&mdev->req_lock); + + list_for_each_entry_safe(e, t, &reclaimed, w.list) + drbd_free_ee(mdev, e); +} + +/** + * drbd_pp_alloc() - Returns a page, fails only if a signal comes in + * @mdev: DRBD device. + * @retry: whether or not to retry allocation forever (or until signalled) + * + * Tries to allocate a page, first from our own page pool, then from the + * kernel, unless this allocation would exceed the max_buffers setting. + * If @retry is non-zero, retry until DRBD frees a page somewhere else. + */ +static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) +{ + struct page *page = NULL; + DEFINE_WAIT(wait); + + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { + page = drbd_pp_first_page_or_try_alloc(mdev); + if (page) + return page; + } + + for (;;) { + prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); + + drbd_kick_lo_and_reclaim_net(mdev); + + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { + page = drbd_pp_first_page_or_try_alloc(mdev); + if (page) + break; + } + + if (!retry) + break; + + if (signal_pending(current)) { + dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); + break; + } + + schedule(); + } + finish_wait(&drbd_pp_wait, &wait); + + return page; +} + +/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. + * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ +static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) +{ + int free_it; + + spin_lock(&drbd_pp_lock); + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { + free_it = 1; + } else { + set_page_private(page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + drbd_pp_vacant++; + free_it = 0; + } + spin_unlock(&drbd_pp_lock); + + atomic_dec(&mdev->pp_in_use); + + if (free_it) + __free_page(page); + + wake_up(&drbd_pp_wait); +} + +static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) +{ + struct page *p_to_be_freed = NULL; + struct page *page; + struct bio_vec *bvec; + int i; + + spin_lock(&drbd_pp_lock); + __bio_for_each_segment(bvec, bio, i, 0) { + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { + set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); + p_to_be_freed = bvec->bv_page; + } else { + set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = bvec->bv_page; + drbd_pp_vacant++; + } + } + spin_unlock(&drbd_pp_lock); + atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); + + while (p_to_be_freed) { + page = p_to_be_freed; + p_to_be_freed = (struct page *)page_private(page); + set_page_private(page, 0); /* just to be polite */ + put_page(page); + } + + wake_up(&drbd_pp_wait); +} + +/* +You need to hold the req_lock: + _drbd_wait_ee_list_empty() + +You must not have the req_lock: + drbd_free_ee() + drbd_alloc_ee() + drbd_init_ee() + drbd_release_ee() + drbd_ee_fix_bhs() + drbd_process_done_ee() + drbd_clear_done_ee() + drbd_wait_ee_list_empty() +*/ + +struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + gfp_t gfp_mask) __must_hold(local) +{ + struct request_queue *q; + struct drbd_epoch_entry *e; + struct page *page; + struct bio *bio; + unsigned int ds; + + if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) + return NULL; + + e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); + if (!e) { + if (!(gfp_mask & __GFP_NOWARN)) + dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); + return NULL; + } + + bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); + if (!bio) { + if (!(gfp_mask & __GFP_NOWARN)) + dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); + goto fail1; + } + + bio->bi_bdev = mdev->ldev->backing_bdev; + bio->bi_sector = sector; + + ds = data_size; + while (ds) { + page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); + if (!page) { + if (!(gfp_mask & __GFP_NOWARN)) + dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); + goto fail2; + } + if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { + drbd_pp_free(mdev, page); + dev_err(DEV, "alloc_ee: bio_add_page(s=%llu," + "data_size=%u,ds=%u) failed\n", + (unsigned long long)sector, data_size, ds); + + q = bdev_get_queue(bio->bi_bdev); + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + int l = q->merge_bvec_fn(q, &bvm, + &bio->bi_io_vec[bio->bi_vcnt]); + dev_err(DEV, "merge_bvec_fn() = %d\n", l); + } + + /* dump more of the bio. */ + dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs); + dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt); + dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size); + dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments); + + goto fail2; + break; + } + ds -= min_t(int, ds, PAGE_SIZE); + } + + D_ASSERT(data_size == bio->bi_size); + + bio->bi_private = e; + e->mdev = mdev; + e->sector = sector; + e->size = bio->bi_size; + + e->private_bio = bio; + e->block_id = id; + INIT_HLIST_NODE(&e->colision); + e->epoch = NULL; + e->flags = 0; + + return e; + + fail2: + drbd_pp_free_bio_pages(mdev, bio); + bio_put(bio); + fail1: + mempool_free(e, drbd_ee_mempool); + + return NULL; +} + +void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +{ + struct bio *bio = e->private_bio; + drbd_pp_free_bio_pages(mdev, bio); + bio_put(bio); + D_ASSERT(hlist_unhashed(&e->colision)); + mempool_free(e, drbd_ee_mempool); +} + +int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) +{ + LIST_HEAD(work_list); + struct drbd_epoch_entry *e, *t; + int count = 0; + + spin_lock_irq(&mdev->req_lock); + list_splice_init(list, &work_list); + spin_unlock_irq(&mdev->req_lock); + + list_for_each_entry_safe(e, t, &work_list, w.list) { + drbd_free_ee(mdev, e); + count++; + } + return count; +} + + +/* + * This function is called from _asender only_ + * but see also comments in _req_mod(,barrier_acked) + * and receive_Barrier. + * + * Move entries from net_ee to done_ee, if ready. + * Grab done_ee, call all callbacks, free the entries. + * The callbacks typically send out ACKs. + */ +static int drbd_process_done_ee(struct drbd_conf *mdev) +{ + LIST_HEAD(work_list); + LIST_HEAD(reclaimed); + struct drbd_epoch_entry *e, *t; + int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); + + spin_lock_irq(&mdev->req_lock); + reclaim_net_ee(mdev, &reclaimed); + list_splice_init(&mdev->done_ee, &work_list); + spin_unlock_irq(&mdev->req_lock); + + list_for_each_entry_safe(e, t, &reclaimed, w.list) + drbd_free_ee(mdev, e); + + /* possible callbacks here: + * e_end_block, and e_end_resync_block, e_send_discard_ack. + * all ignore the last argument. + */ + list_for_each_entry_safe(e, t, &work_list, w.list) { + /* list_del not necessary, next/prev members not touched */ + ok = e->w.cb(mdev, &e->w, !ok) && ok; + drbd_free_ee(mdev, e); + } + wake_up(&mdev->ee_wait); + + return ok; +} + +void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) +{ + DEFINE_WAIT(wait); + + /* avoids spin_lock/unlock + * and calling prepare_to_wait in the fast path */ + while (!list_empty(head)) { + prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mdev->req_lock); + drbd_kick_lo(mdev); + schedule(); + finish_wait(&mdev->ee_wait, &wait); + spin_lock_irq(&mdev->req_lock); + } +} + +void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) +{ + spin_lock_irq(&mdev->req_lock); + _drbd_wait_ee_list_empty(mdev, head); + spin_unlock_irq(&mdev->req_lock); +} + +/* see also kernel_accept; which is only present since 2.6.18. + * also we want to log which part of it failed, exactly */ +static int drbd_accept(struct drbd_conf *mdev, const char **what, + struct socket *sock, struct socket **newsock) +{ + struct sock *sk = sock->sk; + int err = 0; + + *what = "listen"; + err = sock->ops->listen(sock, 5); + if (err < 0) + goto out; + + *what = "sock_create_lite"; + err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, + newsock); + if (err < 0) + goto out; + + *what = "accept"; + err = sock->ops->accept(sock, *newsock, 0); + if (err < 0) { + sock_release(*newsock); + *newsock = NULL; + goto out; + } + (*newsock)->ops = sock->ops; + +out: + return err; +} + +static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, + void *buf, size_t size, int flags) +{ + mm_segment_t oldfs; + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&iov, + .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) + }; + int rv; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); + set_fs(oldfs); + + return rv; +} + +static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) +{ + mm_segment_t oldfs; + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&iov, + .msg_flags = MSG_WAITALL | MSG_NOSIGNAL + }; + int rv; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + for (;;) { + rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); + if (rv == size) + break; + + /* Note: + * ECONNRESET other side closed the connection + * ERESTARTSYS (on sock) we got a signal + */ + + if (rv < 0) { + if (rv == -ECONNRESET) + dev_info(DEV, "sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + dev_err(DEV, "sock_recvmsg returned %d\n", rv); + break; + } else if (rv == 0) { + dev_info(DEV, "sock was shut down by peer\n"); + break; + } else { + /* signal came in, or peer/link went down, + * after we read a partial message + */ + /* D_ASSERT(signal_pending(current)); */ + break; + } + }; + + set_fs(oldfs); + + if (rv != size) + drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); + + return rv; +} + +static struct socket *drbd_try_connect(struct drbd_conf *mdev) +{ + const char *what; + struct socket *sock; + struct sockaddr_in6 src_in6; + int err; + int disconnect_on_error = 1; + + if (!get_net_conf(mdev)) + return NULL; + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + sock = NULL; + goto out; + } + + sock->sk->sk_rcvtimeo = + sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + + /* explicitly bind to the configured IP as source IP + * for the outgoing connections. + * This is needed for multihomed hosts and to be + * able to use lo: interfaces for drbd. + * Make sure to use 0 as port number, so linux selects + * a free one dynamically. + */ + memcpy(&src_in6, mdev->net_conf->my_addr, + min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6))); + if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6) + src_in6.sin6_port = 0; + else + ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ + + what = "bind before connect"; + err = sock->ops->bind(sock, + (struct sockaddr *) &src_in6, + mdev->net_conf->my_addr_len); + if (err < 0) + goto out; + + /* connect may fail, peer not yet available. + * stay C_WF_CONNECTION, don't go Disconnecting! */ + disconnect_on_error = 0; + what = "connect"; + err = sock->ops->connect(sock, + (struct sockaddr *)mdev->net_conf->peer_addr, + mdev->net_conf->peer_addr_len, 0); + +out: + if (err < 0) { + if (sock) { + sock_release(sock); + sock = NULL; + } + switch (-err) { + /* timeout, busy, signal pending */ + case ETIMEDOUT: case EAGAIN: case EINPROGRESS: + case EINTR: case ERESTARTSYS: + /* peer not (yet) available, network problem */ + case ECONNREFUSED: case ENETUNREACH: + case EHOSTDOWN: case EHOSTUNREACH: + disconnect_on_error = 0; + break; + default: + dev_err(DEV, "%s failed, err = %d\n", what, err); + } + if (disconnect_on_error) + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + } + put_net_conf(mdev); + return sock; +} + +static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) +{ + int timeo, err; + struct socket *s_estab = NULL, *s_listen; + const char *what; + + if (!get_net_conf(mdev)) + return NULL; + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &s_listen); + if (err) { + s_listen = NULL; + goto out; + } + + timeo = mdev->net_conf->try_connect_int * HZ; + timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ + + s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ + s_listen->sk->sk_rcvtimeo = timeo; + s_listen->sk->sk_sndtimeo = timeo; + + what = "bind before listen"; + err = s_listen->ops->bind(s_listen, + (struct sockaddr *) mdev->net_conf->my_addr, + mdev->net_conf->my_addr_len); + if (err < 0) + goto out; + + err = drbd_accept(mdev, &what, s_listen, &s_estab); + +out: + if (s_listen) + sock_release(s_listen); + if (err < 0) { + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { + dev_err(DEV, "%s failed, err = %d\n", what, err); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + } + } + put_net_conf(mdev); + + return s_estab; +} + +static int drbd_send_fp(struct drbd_conf *mdev, + struct socket *sock, enum drbd_packets cmd) +{ + struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; + + return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); +} + +static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) +{ + struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; + int rr; + + rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); + + if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) + return be16_to_cpu(h->command); + + return 0xffff; +} + +/** + * drbd_socket_okay() - Free the socket if its connection is not okay + * @mdev: DRBD device. + * @sock: pointer to the pointer to the socket. + */ +static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) +{ + int rr; + char tb[4]; + + if (!*sock) + return FALSE; + + rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); + + if (rr > 0 || rr == -EAGAIN) { + return TRUE; + } else { + sock_release(*sock); + *sock = NULL; + return FALSE; + } +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + * -2 We do not have a network config... + */ +static int drbd_connect(struct drbd_conf *mdev) +{ + struct socket *s, *sock, *msock; + int try, h, ok; + + D_ASSERT(!mdev->data.socket); + + if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) + dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n"); + + if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) + return -2; + + clear_bit(DISCARD_CONCURRENT, &mdev->flags); + + sock = NULL; + msock = NULL; + + do { + for (try = 0;;) { + /* 3 tries, this should take less than a second! */ + s = drbd_try_connect(mdev); + if (s || ++try >= 3) + break; + /* give the other side time to call bind() & listen() */ + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + } + + if (s) { + if (!sock) { + drbd_send_fp(mdev, s, P_HAND_SHAKE_S); + sock = s; + s = NULL; + } else if (!msock) { + drbd_send_fp(mdev, s, P_HAND_SHAKE_M); + msock = s; + s = NULL; + } else { + dev_err(DEV, "Logic error in drbd_connect()\n"); + goto out_release_sockets; + } + } + + if (sock && msock) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + ok = drbd_socket_okay(mdev, &sock); + ok = drbd_socket_okay(mdev, &msock) && ok; + if (ok) + break; + } + +retry: + s = drbd_wait_for_connect(mdev); + if (s) { + try = drbd_recv_fp(mdev, s); + drbd_socket_okay(mdev, &sock); + drbd_socket_okay(mdev, &msock); + switch (try) { + case P_HAND_SHAKE_S: + if (sock) { + dev_warn(DEV, "initial packet S crossed\n"); + sock_release(sock); + } + sock = s; + break; + case P_HAND_SHAKE_M: + if (msock) { + dev_warn(DEV, "initial packet M crossed\n"); + sock_release(msock); + } + msock = s; + set_bit(DISCARD_CONCURRENT, &mdev->flags); + break; + default: + dev_warn(DEV, "Error receiving initial packet\n"); + sock_release(s); + if (random32() & 1) + goto retry; + } + } + + if (mdev->state.conn <= C_DISCONNECTING) + goto out_release_sockets; + if (signal_pending(current)) { + flush_signals(current); + smp_rmb(); + if (get_t_state(&mdev->receiver) == Exiting) + goto out_release_sockets; + } + + if (sock && msock) { + ok = drbd_socket_okay(mdev, &sock); + ok = drbd_socket_okay(mdev, &msock) && ok; + if (ok) + break; + } + } while (1); + + msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ + sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ + + sock->sk->sk_allocation = GFP_NOIO; + msock->sk->sk_allocation = GFP_NOIO; + + sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; + msock->sk->sk_priority = TC_PRIO_INTERACTIVE; + + if (mdev->net_conf->sndbuf_size) { + sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } + + if (mdev->net_conf->rcvbuf_size) { + sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size; + sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + + /* NOT YET ... + * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the P_HAND_SHAKE timeout, + * which we set to 4x the configured ping_timeout. */ + sock->sk->sk_sndtimeo = + sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; + + msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + + /* we don't want delays. + * we use TCP_CORK where apropriate, though */ + drbd_tcp_nodelay(sock); + drbd_tcp_nodelay(msock); + + mdev->data.socket = sock; + mdev->meta.socket = msock; + mdev->last_received = jiffies; + + D_ASSERT(mdev->asender.task == NULL); + + h = drbd_do_handshake(mdev); + if (h <= 0) + return h; + + if (mdev->cram_hmac_tfm) { + /* drbd_request_state(mdev, NS(conn, WFAuth)); */ + if (!drbd_do_auth(mdev)) { + dev_err(DEV, "Authentication of peer failed\n"); + return -1; + } + } + + if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) + return 0; + + sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + + atomic_set(&mdev->packet_seq, 0); + mdev->peer_seq = 0; + + drbd_thread_start(&mdev->asender); + + drbd_send_protocol(mdev); + drbd_send_sync_param(mdev, &mdev->sync_conf); + drbd_send_sizes(mdev, 0); + drbd_send_uuids(mdev); + drbd_send_state(mdev); + clear_bit(USE_DEGR_WFC_T, &mdev->flags); + clear_bit(RESIZE_PENDING, &mdev->flags); + + return 1; + +out_release_sockets: + if (sock) + sock_release(sock); + if (msock) + sock_release(msock); + return -1; +} + +static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) +{ + int r; + + r = drbd_recv(mdev, h, sizeof(*h)); + + if (unlikely(r != sizeof(*h))) { + dev_err(DEV, "short read expecting header on sock: r=%d\n", r); + return FALSE; + }; + h->command = be16_to_cpu(h->command); + h->length = be16_to_cpu(h->length); + if (unlikely(h->magic != BE_DRBD_MAGIC)) { + dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + return FALSE; + } + mdev->last_received = jiffies; + + return TRUE; +} + +static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) +{ + int rv; + + if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { + rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); + if (rv) { + dev_err(DEV, "local disk flush failed with status %d\n", rv); + /* would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 + * if (rv == -EOPNOTSUPP) */ + drbd_bump_write_ordering(mdev, WO_drain_io); + } + put_ldev(mdev); + } + + return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); +} + +static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct flush_work *fw = (struct flush_work *)w; + struct drbd_epoch *epoch = fw->epoch; + + kfree(w); + + if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags)) + drbd_flush_after_epoch(mdev, epoch); + + drbd_may_finish_epoch(mdev, epoch, EV_PUT | + (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0)); + + return 1; +} + +/** + * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. + * @mdev: DRBD device. + * @epoch: Epoch object. + * @ev: Epoch event. + */ +static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, + struct drbd_epoch *epoch, + enum epoch_event ev) +{ + int finish, epoch_size; + struct drbd_epoch *next_epoch; + int schedule_flush = 0; + enum finish_epoch rv = FE_STILL_LIVE; + + spin_lock(&mdev->epoch_lock); + do { + next_epoch = NULL; + finish = 0; + + epoch_size = atomic_read(&epoch->epoch_size); + + switch (ev & ~EV_CLEANUP) { + case EV_PUT: + atomic_dec(&epoch->active); + break; + case EV_GOT_BARRIER_NR: + set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); + + /* Special case: If we just switched from WO_bio_barrier to + WO_bdev_flush we should not finish the current epoch */ + if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 && + mdev->write_ordering != WO_bio_barrier && + epoch == mdev->current_epoch) + clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags); + break; + case EV_BARRIER_DONE: + set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags); + break; + case EV_BECAME_LAST: + /* nothing to do*/ + break; + } + + if (epoch_size != 0 && + atomic_read(&epoch->active) == 0 && + test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && + epoch->list.prev == &mdev->current_epoch->list && + !test_bit(DE_IS_FINISHING, &epoch->flags)) { + /* Nearly all conditions are met to finish that epoch... */ + if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) || + mdev->write_ordering == WO_none || + (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) || + ev & EV_CLEANUP) { + finish = 1; + set_bit(DE_IS_FINISHING, &epoch->flags); + } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) && + mdev->write_ordering == WO_bio_barrier) { + atomic_inc(&epoch->active); + schedule_flush = 1; + } + } + if (finish) { + if (!(ev & EV_CLEANUP)) { + spin_unlock(&mdev->epoch_lock); + drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); + spin_lock(&mdev->epoch_lock); + } + dec_unacked(mdev); + + if (mdev->current_epoch != epoch) { + next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); + list_del(&epoch->list); + ev = EV_BECAME_LAST | (ev & EV_CLEANUP); + mdev->epochs--; + kfree(epoch); + + if (rv == FE_STILL_LIVE) + rv = FE_DESTROYED; + } else { + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + /* atomic_set(&epoch->active, 0); is alrady zero */ + if (rv == FE_STILL_LIVE) + rv = FE_RECYCLED; + } + } + + if (!next_epoch) + break; + + epoch = next_epoch; + } while (1); + + spin_unlock(&mdev->epoch_lock); + + if (schedule_flush) { + struct flush_work *fw; + fw = kmalloc(sizeof(*fw), GFP_ATOMIC); + if (fw) { + fw->w.cb = w_flush; + fw->epoch = epoch; + drbd_queue_work(&mdev->data.work, &fw->w); + } else { + dev_warn(DEV, "Could not kmalloc a flush_work obj\n"); + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); + /* That is not a recursion, only one level */ + drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); + drbd_may_finish_epoch(mdev, epoch, EV_PUT); + } + } + + return rv; +} + +/** + * drbd_bump_write_ordering() - Fall back to an other write ordering method + * @mdev: DRBD device. + * @wo: Write ordering method to try. + */ +void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) +{ + enum write_ordering_e pwo; + static char *write_ordering_str[] = { + [WO_none] = "none", + [WO_drain_io] = "drain", + [WO_bdev_flush] = "flush", + [WO_bio_barrier] = "barrier", + }; + + pwo = mdev->write_ordering; + wo = min(pwo, wo); + if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier) + wo = WO_bdev_flush; + if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) + wo = WO_drain_io; + if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) + wo = WO_none; + mdev->write_ordering = wo; + if (pwo != mdev->write_ordering || wo == WO_bio_barrier) + dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); +} + +/** + * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways (unused in this callback) + */ +int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) +{ + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + struct bio *bio = e->private_bio; + + /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, + (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) + so that we can finish that epoch in drbd_may_finish_epoch(). + That is necessary if we already have a long chain of Epochs, before + we realize that BIO_RW_BARRIER is actually not supported */ + + /* As long as the -ENOTSUPP on the barrier is reported immediately + that will never trigger. If it is reported late, we will just + print that warning and continue correctly for all future requests + with WO_bdev_flush */ + if (previous_epoch(mdev, e->epoch)) + dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); + + /* prepare bio for re-submit, + * re-init volatile members */ + /* we still have a local reference, + * get_ldev was done in receive_Data. */ + bio->bi_bdev = mdev->ldev->backing_bdev; + bio->bi_sector = e->sector; + bio->bi_size = e->size; + bio->bi_idx = 0; + + bio->bi_flags &= ~(BIO_POOL_MASK - 1); + bio->bi_flags |= 1 << BIO_UPTODATE; + + /* don't know whether this is necessary: */ + bio->bi_phys_segments = 0; + bio->bi_next = NULL; + + /* these should be unchanged: */ + /* bio->bi_end_io = drbd_endio_write_sec; */ + /* bio->bi_vcnt = whatever; */ + + e->w.cb = e_end_block; + + /* This is no longer a barrier request. */ + bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); + + drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); + + return 1; +} + +static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) +{ + int rv, issue_flush; + struct p_barrier *p = (struct p_barrier *)h; + struct drbd_epoch *epoch; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + rv = drbd_recv(mdev, h->payload, h->length); + ERR_IF(rv != h->length) return FALSE; + + inc_unacked(mdev); + + if (mdev->net_conf->wire_protocol != DRBD_PROT_C) + drbd_kick_lo(mdev); + + mdev->current_epoch->barrier_nr = p->barrier; + rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); + + /* P_BARRIER_ACK may imply that the corresponding extent is dropped from + * the activity log, which means it would not be resynced in case the + * R_PRIMARY crashes now. + * Therefore we must send the barrier_ack after the barrier request was + * completed. */ + switch (mdev->write_ordering) { + case WO_bio_barrier: + case WO_none: + if (rv == FE_RECYCLED) + return TRUE; + break; + + case WO_bdev_flush: + case WO_drain_io: + D_ASSERT(rv == FE_STILL_LIVE); + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); + drbd_wait_ee_list_empty(mdev, &mdev->active_ee); + rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); + if (rv == FE_RECYCLED) + return TRUE; + + /* The asender will send all the ACKs and barrier ACKs out, since + all EEs moved from the active_ee to the done_ee. We need to + provide a new epoch object for the EEs that come in soon */ + break; + } + + /* receiver context, in the writeout path of the other node. + * avoid potential distributed deadlock */ + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (!epoch) { + dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); + issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); + drbd_wait_ee_list_empty(mdev, &mdev->active_ee); + if (issue_flush) { + rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); + if (rv == FE_RECYCLED) + return TRUE; + } + + drbd_wait_ee_list_empty(mdev, &mdev->done_ee); + + return TRUE; + } + + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + atomic_set(&epoch->active, 0); + + spin_lock(&mdev->epoch_lock); + if (atomic_read(&mdev->current_epoch->epoch_size)) { + list_add(&epoch->list, &mdev->current_epoch->list); + mdev->current_epoch = epoch; + mdev->epochs++; + } else { + /* The current_epoch got recycled while we allocated this one... */ + kfree(epoch); + } + spin_unlock(&mdev->epoch_lock); + + return TRUE; +} + +/* used from receive_RSDataReply (recv_resync_read) + * and from receive_Data */ +static struct drbd_epoch_entry * +read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) +{ + struct drbd_epoch_entry *e; + struct bio_vec *bvec; + struct page *page; + struct bio *bio; + int dgs, ds, i, rr; + void *dig_in = mdev->int_dig_in; + void *dig_vv = mdev->int_dig_vv; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; + + if (dgs) { + rr = drbd_recv(mdev, dig_in, dgs); + if (rr != dgs) { + dev_warn(DEV, "short read receiving data digest: read %d expected %d\n", + rr, dgs); + return NULL; + } + } + + data_size -= dgs; + + ERR_IF(data_size & 0x1ff) return NULL; + ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); + if (!e) + return NULL; + bio = e->private_bio; + ds = data_size; + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); + kunmap(page); + if (rr != min_t(int, ds, PAGE_SIZE)) { + drbd_free_ee(mdev, e); + dev_warn(DEV, "short read receiving data: read %d expected %d\n", + rr, min_t(int, ds, PAGE_SIZE)); + return NULL; + } + ds -= rr; + } + + if (dgs) { + drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + if (memcmp(dig_in, dig_vv, dgs)) { + dev_err(DEV, "Digest integrity check FAILED.\n"); + drbd_bcast_ee(mdev, "digest failed", + dgs, dig_in, dig_vv, e); + drbd_free_ee(mdev, e); + return NULL; + } + } + mdev->recv_cnt += data_size>>9; + return e; +} + +/* drbd_drain_block() just takes a data block + * out of the socket input buffer, and discards it. + */ +static int drbd_drain_block(struct drbd_conf *mdev, int data_size) +{ + struct page *page; + int rr, rv = 1; + void *data; + + page = drbd_pp_alloc(mdev, 1); + + data = kmap(page); + while (data_size) { + rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); + if (rr != min_t(int, data_size, PAGE_SIZE)) { + rv = 0; + dev_warn(DEV, "short read receiving data: read %d expected %d\n", + rr, min_t(int, data_size, PAGE_SIZE)); + break; + } + data_size -= rr; + } + kunmap(page); + drbd_pp_free(mdev, page); + return rv; +} + +static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, + sector_t sector, int data_size) +{ + struct bio_vec *bvec; + struct bio *bio; + int dgs, rr, i, expect; + void *dig_in = mdev->int_dig_in; + void *dig_vv = mdev->int_dig_vv; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; + + if (dgs) { + rr = drbd_recv(mdev, dig_in, dgs); + if (rr != dgs) { + dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n", + rr, dgs); + return 0; + } + } + + data_size -= dgs; + + /* optimistically update recv_cnt. if receiving fails below, + * we disconnect anyways, and counters will be reset. */ + mdev->recv_cnt += data_size>>9; + + bio = req->master_bio; + D_ASSERT(sector == bio->bi_sector); + + bio_for_each_segment(bvec, bio, i) { + expect = min_t(int, data_size, bvec->bv_len); + rr = drbd_recv(mdev, + kmap(bvec->bv_page)+bvec->bv_offset, + expect); + kunmap(bvec->bv_page); + if (rr != expect) { + dev_warn(DEV, "short read receiving data reply: " + "read %d expected %d\n", + rr, expect); + return 0; + } + data_size -= rr; + } + + if (dgs) { + drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + if (memcmp(dig_in, dig_vv, dgs)) { + dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); + return 0; + } + } + + D_ASSERT(data_size == 0); + return 1; +} + +/* e_end_resync_block() is called via + * drbd_process_done_ee() by asender only */ +static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + sector_t sector = e->sector; + int ok; + + D_ASSERT(hlist_unhashed(&e->colision)); + + if (likely(drbd_bio_uptodate(e->private_bio))) { + drbd_set_in_sync(mdev, sector, e->size); + ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); + } else { + /* Record failure to sync */ + drbd_rs_failed_io(mdev, sector, e->size); + + ok = drbd_send_ack(mdev, P_NEG_ACK, e); + } + dec_unacked(mdev); + + return ok; +} + +static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) +{ + struct drbd_epoch_entry *e; + + e = read_in_block(mdev, ID_SYNCER, sector, data_size); + if (!e) { + put_ldev(mdev); + return FALSE; + } + + dec_rs_pending(mdev); + + e->private_bio->bi_end_io = drbd_endio_write_sec; + e->private_bio->bi_rw = WRITE; + e->w.cb = e_end_resync_block; + + inc_unacked(mdev); + /* corresponding dec_unacked() in e_end_resync_block() + * respective _drbd_clear_done_ee */ + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list, &mdev->sync_ee); + spin_unlock_irq(&mdev->req_lock); + + drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); + /* accounting done in endio */ + + maybe_kick_lo(mdev); + return TRUE; +} + +static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) +{ + struct drbd_request *req; + sector_t sector; + unsigned int header_size, data_size; + int ok; + struct p_data *p = (struct p_data *)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + ERR_IF(data_size == 0) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + + spin_lock_irq(&mdev->req_lock); + req = _ar_id_to_req(mdev, p->block_id, sector); + spin_unlock_irq(&mdev->req_lock); + if (unlikely(!req)) { + dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); + return FALSE; + } + + /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid + * special casing it there for the various failure cases. + * still no race with drbd_fail_pending_reads */ + ok = recv_dless_read(mdev, req, sector, data_size); + + if (ok) + req_mod(req, data_received); + /* else: nothing. handled from drbd_disconnect... + * I don't think we may complete this just yet + * in case we are "on-disconnect: freeze" */ + + return ok; +} + +static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) +{ + sector_t sector; + unsigned int header_size, data_size; + int ok; + struct p_data *p = (struct p_data *)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + ERR_IF(data_size == 0) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + D_ASSERT(p->block_id == ID_SYNCER); + + if (get_ldev(mdev)) { + /* data is submitted to disk within recv_resync_read. + * corresponding put_ldev done below on error, + * or in drbd_endio_write_sec. */ + ok = recv_resync_read(mdev, sector, data_size); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Can not write resync data to local disk.\n"); + + ok = drbd_drain_block(mdev, data_size); + + drbd_send_ack_dp(mdev, P_NEG_ACK, p); + } + + return ok; +} + +/* e_end_block() is called via drbd_process_done_ee(). + * this means this function only runs in the asender thread + */ +static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + sector_t sector = e->sector; + struct drbd_epoch *epoch; + int ok = 1, pcmd; + + if (e->flags & EE_IS_BARRIER) { + epoch = previous_epoch(mdev, e->epoch); + if (epoch) + drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0)); + } + + if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { + if (likely(drbd_bio_uptodate(e->private_bio))) { + pcmd = (mdev->state.conn >= C_SYNC_SOURCE && + mdev->state.conn <= C_PAUSED_SYNC_T && + e->flags & EE_MAY_SET_IN_SYNC) ? + P_RS_WRITE_ACK : P_WRITE_ACK; + ok &= drbd_send_ack(mdev, pcmd, e); + if (pcmd == P_RS_WRITE_ACK) + drbd_set_in_sync(mdev, sector, e->size); + } else { + ok = drbd_send_ack(mdev, P_NEG_ACK, e); + /* we expect it to be marked out of sync anyways... + * maybe assert this? */ + } + dec_unacked(mdev); + } + /* we delete from the conflict detection hash _after_ we sent out the + * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ + if (mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->colision)); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + } else { + D_ASSERT(hlist_unhashed(&e->colision)); + } + + drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); + + return ok; +} + +static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + int ok = 1; + + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); + + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->colision)); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + + dec_unacked(mdev); + + return ok; +} + +/* Called from receive_Data. + * Synchronize packets on sock with packets on msock. + * + * This is here so even when a P_DATA packet traveling via sock overtook an Ack + * packet traveling on msock, they are still processed in the order they have + * been sent. + * + * Note: we don't care for Ack packets overtaking P_DATA packets. + * + * In case packet_seq is larger than mdev->peer_seq number, there are + * outstanding packets on the msock. We wait for them to arrive. + * In case we are the logically next packet, we update mdev->peer_seq + * ourselves. Correctly handles 32bit wrap around. + * + * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, + * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds + * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have + * 1<<9 == 512 seconds aka ages for the 32bit wrap around... + * + * returns 0 if we may process the packet, + * -ERESTARTSYS if we were interrupted (by disconnect signal). */ +static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) +{ + DEFINE_WAIT(wait); + unsigned int p_seq; + long timeout; + int ret = 0; + spin_lock(&mdev->peer_seq_lock); + for (;;) { + prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); + if (seq_le(packet_seq, mdev->peer_seq+1)) + break; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + p_seq = mdev->peer_seq; + spin_unlock(&mdev->peer_seq_lock); + timeout = schedule_timeout(30*HZ); + spin_lock(&mdev->peer_seq_lock); + if (timeout == 0 && p_seq == mdev->peer_seq) { + ret = -ETIMEDOUT; + dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); + break; + } + } + finish_wait(&mdev->seq_wait, &wait); + if (mdev->peer_seq+1 == packet_seq) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); + return ret; +} + +/* mirrored write */ +static int receive_Data(struct drbd_conf *mdev, struct p_header *h) +{ + sector_t sector; + struct drbd_epoch_entry *e; + struct p_data *p = (struct p_data *)h; + int header_size, data_size; + int rw = WRITE; + u32 dp_flags; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + ERR_IF(data_size == 0) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + if (!get_ldev(mdev)) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Can not write mirrored data block " + "to local disk.\n"); + spin_lock(&mdev->peer_seq_lock); + if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); + + drbd_send_ack_dp(mdev, P_NEG_ACK, p); + atomic_inc(&mdev->current_epoch->epoch_size); + return drbd_drain_block(mdev, data_size); + } + + /* get_ldev(mdev) successful. + * Corresponding put_ldev done either below (on various errors), + * or in drbd_endio_write_sec, if we successfully submit the data at + * the end of this function. */ + + sector = be64_to_cpu(p->sector); + e = read_in_block(mdev, p->block_id, sector, data_size); + if (!e) { + put_ldev(mdev); + return FALSE; + } + + e->private_bio->bi_end_io = drbd_endio_write_sec; + e->w.cb = e_end_block; + + spin_lock(&mdev->epoch_lock); + e->epoch = mdev->current_epoch; + atomic_inc(&e->epoch->epoch_size); + atomic_inc(&e->epoch->active); + + if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) { + struct drbd_epoch *epoch; + /* Issue a barrier if we start a new epoch, and the previous epoch + was not a epoch containing a single request which already was + a Barrier. */ + epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); + if (epoch == e->epoch) { + set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); + rw |= (1<<BIO_RW_BARRIER); + e->flags |= EE_IS_BARRIER; + } else { + if (atomic_read(&epoch->epoch_size) > 1 || + !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); + set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); + rw |= (1<<BIO_RW_BARRIER); + e->flags |= EE_IS_BARRIER; + } + } + } + spin_unlock(&mdev->epoch_lock); + + dp_flags = be32_to_cpu(p->dp_flags); + if (dp_flags & DP_HARDBARRIER) { + dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); + /* rw |= (1<<BIO_RW_BARRIER); */ + } + if (dp_flags & DP_RW_SYNC) + rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); + if (dp_flags & DP_MAY_SET_IN_SYNC) + e->flags |= EE_MAY_SET_IN_SYNC; + + /* I'm the receiver, I do hold a net_cnt reference. */ + if (!mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + } else { + /* don't get the req_lock yet, + * we may sleep in drbd_wait_peer_seq */ + const int size = e->size; + const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); + DEFINE_WAIT(wait); + struct drbd_request *i; + struct hlist_node *n; + struct hlist_head *slot; + int first; + + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + BUG_ON(mdev->ee_hash == NULL); + BUG_ON(mdev->tl_hash == NULL); + + /* conflict detection and handling: + * 1. wait on the sequence number, + * in case this data packet overtook ACK packets. + * 2. check our hash tables for conflicting requests. + * we only need to walk the tl_hash, since an ee can not + * have a conflict with an other ee: on the submitting + * node, the corresponding req had already been conflicting, + * and a conflicting req is never sent. + * + * Note: for two_primaries, we are protocol C, + * so there cannot be any request that is DONE + * but still on the transfer log. + * + * unconditionally add to the ee_hash. + * + * if no conflicting request is found: + * submit. + * + * if any conflicting request is found + * that has not yet been acked, + * AND I have the "discard concurrent writes" flag: + * queue (via done_ee) the P_DISCARD_ACK; OUT. + * + * if any conflicting request is found: + * block the receiver, waiting on misc_wait + * until no more conflicting requests are there, + * or we get interrupted (disconnect). + * + * we do not just write after local io completion of those + * requests, but only after req is done completely, i.e. + * we wait for the P_DISCARD_ACK to arrive! + * + * then proceed normally, i.e. submit. + */ + if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) + goto out_interrupted; + + spin_lock_irq(&mdev->req_lock); + + hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); + +#define OVERLAPS overlaps(i->sector, i->size, sector, size) + slot = tl_hash_slot(mdev, sector); + first = 1; + for (;;) { + int have_unacked = 0; + int have_conflict = 0; + prepare_to_wait(&mdev->misc_wait, &wait, + TASK_INTERRUPTIBLE); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + /* only ALERT on first iteration, + * we may be woken up early... */ + if (first) + dev_alert(DEV, "%s[%u] Concurrent local write detected!" + " new: %llus +%u; pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)i->sector, i->size); + if (i->rq_state & RQ_NET_PENDING) + ++have_unacked; + ++have_conflict; + } + } +#undef OVERLAPS + if (!have_conflict) + break; + + /* Discard Ack only for the _first_ iteration */ + if (first && discard && have_unacked) { + dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", + (unsigned long long)sector); + inc_unacked(mdev); + e->w.cb = e_send_discard_ack; + list_add_tail(&e->w.list, &mdev->done_ee); + + spin_unlock_irq(&mdev->req_lock); + + /* we could probably send that P_DISCARD_ACK ourselves, + * but I don't like the receiver using the msock */ + + put_ldev(mdev); + wake_asender(mdev); + finish_wait(&mdev->misc_wait, &wait); + return TRUE; + } + + if (signal_pending(current)) { + hlist_del_init(&e->colision); + + spin_unlock_irq(&mdev->req_lock); + + finish_wait(&mdev->misc_wait, &wait); + goto out_interrupted; + } + + spin_unlock_irq(&mdev->req_lock); + if (first) { + first = 0; + dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " + "sec=%llus\n", (unsigned long long)sector); + } else if (discard) { + /* we had none on the first iteration. + * there must be none now. */ + D_ASSERT(have_unacked == 0); + } + schedule(); + spin_lock_irq(&mdev->req_lock); + } + finish_wait(&mdev->misc_wait, &wait); + } + + list_add(&e->w.list, &mdev->active_ee); + spin_unlock_irq(&mdev->req_lock); + + switch (mdev->net_conf->wire_protocol) { + case DRBD_PROT_C: + inc_unacked(mdev); + /* corresponding dec_unacked() in e_end_block() + * respective _drbd_clear_done_ee */ + break; + case DRBD_PROT_B: + /* I really don't like it that the receiver thread + * sends on the msock, but anyways */ + drbd_send_ack(mdev, P_RECV_ACK, e); + break; + case DRBD_PROT_A: + /* nothing to do */ + break; + } + + if (mdev->state.pdsk == D_DISKLESS) { + /* In case we have the only disk of the cluster, */ + drbd_set_out_of_sync(mdev, e->sector, e->size); + e->flags |= EE_CALL_AL_COMPLETE_IO; + drbd_al_begin_io(mdev, e->sector); + } + + e->private_bio->bi_rw = rw; + drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); + /* accounting done in endio */ + + maybe_kick_lo(mdev); + return TRUE; + +out_interrupted: + /* yes, the epoch_size now is imbalanced. + * but we drop the connection anyways, so we don't have a chance to + * receive a barrier... atomic_inc(&mdev->epoch_size); */ + put_ldev(mdev); + drbd_free_ee(mdev, e); + return FALSE; +} + +static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) +{ + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct drbd_epoch_entry *e; + struct digest_info *di = NULL; + int size, digest_size; + unsigned int fault_type; + struct p_block_req *p = + (struct p_block_req *)h; + const int brps = sizeof(*p)-sizeof(*h); + + if (drbd_recv(mdev, h->payload, brps) != brps) + return FALSE; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return FALSE; + } + if (sector + (size>>9) > capacity) { + dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return FALSE; + } + + if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Can not satisfy peer's read request, " + "no local data.\n"); + drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : + P_NEG_RS_DREPLY , p); + return TRUE; + } + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); + if (!e) { + put_ldev(mdev); + return FALSE; + } + + e->private_bio->bi_rw = READ; + e->private_bio->bi_end_io = drbd_endio_read_sec; + + switch (h->command) { + case P_DATA_REQUEST: + e->w.cb = w_e_end_data_req; + fault_type = DRBD_FAULT_DT_RD; + break; + case P_RS_DATA_REQUEST: + e->w.cb = w_e_end_rsdata_req; + fault_type = DRBD_FAULT_RS_RD; + /* Eventually this should become asynchronously. Currently it + * blocks the whole receiver just to delay the reading of a + * resync data block. + * the drbd_work_queue mechanism is made for this... + */ + if (!drbd_rs_begin_io(mdev, sector)) { + /* we have been interrupted, + * probably connection lost! */ + D_ASSERT(signal_pending(current)); + goto out_free_e; + } + break; + + case P_OV_REPLY: + case P_CSUM_RS_REQUEST: + fault_type = DRBD_FAULT_RS_RD; + digest_size = h->length - brps ; + di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); + if (!di) + goto out_free_e; + + di->digest_size = digest_size; + di->digest = (((char *)di)+sizeof(struct digest_info)); + + if (drbd_recv(mdev, di->digest, digest_size) != digest_size) + goto out_free_e; + + e->block_id = (u64)(unsigned long)di; + if (h->command == P_CSUM_RS_REQUEST) { + D_ASSERT(mdev->agreed_pro_version >= 89); + e->w.cb = w_e_end_csum_rs_req; + } else if (h->command == P_OV_REPLY) { + e->w.cb = w_e_end_ov_reply; + dec_rs_pending(mdev); + break; + } + + if (!drbd_rs_begin_io(mdev, sector)) { + /* we have been interrupted, probably connection lost! */ + D_ASSERT(signal_pending(current)); + goto out_free_e; + } + break; + + case P_OV_REQUEST: + if (mdev->state.conn >= C_CONNECTED && + mdev->state.conn != C_VERIFY_T) + dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n", + drbd_conn_str(mdev->state.conn)); + if (mdev->ov_start_sector == ~(sector_t)0 && + mdev->agreed_pro_version >= 90) { + mdev->ov_start_sector = sector; + mdev->ov_position = sector; + mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector); + dev_info(DEV, "Online Verify start sector: %llu\n", + (unsigned long long)sector); + } + e->w.cb = w_e_end_ov_req; + fault_type = DRBD_FAULT_RS_RD; + /* Eventually this should become asynchronous. Currently it + * blocks the whole receiver just to delay the reading of a + * resync data block. + * the drbd_work_queue mechanism is made for this... + */ + if (!drbd_rs_begin_io(mdev, sector)) { + /* we have been interrupted, + * probably connection lost! */ + D_ASSERT(signal_pending(current)); + goto out_free_e; + } + break; + + + default: + dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", + cmdname(h->command)); + fault_type = DRBD_FAULT_MAX; + } + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); + + inc_unacked(mdev); + + drbd_generic_make_request(mdev, fault_type, e->private_bio); + maybe_kick_lo(mdev); + + return TRUE; + +out_free_e: + kfree(di); + put_ldev(mdev); + drbd_free_ee(mdev, e); + return FALSE; +} + +static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) +{ + int self, peer, rv = -100; + unsigned long ch_self, ch_peer; + + self = mdev->ldev->md.uuid[UI_BITMAP] & 1; + peer = mdev->p_uuid[UI_BITMAP] & 1; + + ch_peer = mdev->p_uuid[UI_SIZE]; + ch_self = mdev->comm_bm_set; + + switch (mdev->net_conf->after_sb_0p) { + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + case ASB_CALL_HELPER: + dev_err(DEV, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_DISCARD_YOUNGER_PRI: + if (self == 0 && peer == 1) { + rv = -1; + break; + } + if (self == 1 && peer == 0) { + rv = 1; + break; + } + /* Else fall through to one of the other strategies... */ + case ASB_DISCARD_OLDER_PRI: + if (self == 0 && peer == 1) { + rv = 1; + break; + } + if (self == 1 && peer == 0) { + rv = -1; + break; + } + /* Else fall through to one of the other strategies... */ + dev_warn(DEV, "Discard younger/older primary did not find a decision\n" + "Using discard-least-changes instead\n"); + case ASB_DISCARD_ZERO_CHG: + if (ch_peer == 0 && ch_self == 0) { + rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) + ? -1 : 1; + break; + } else { + if (ch_peer == 0) { rv = 1; break; } + if (ch_self == 0) { rv = -1; break; } + } + if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) + break; + case ASB_DISCARD_LEAST_CHG: + if (ch_self < ch_peer) + rv = -1; + else if (ch_self > ch_peer) + rv = 1; + else /* ( ch_self == ch_peer ) */ + /* Well, then use something else. */ + rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) + ? -1 : 1; + break; + case ASB_DISCARD_LOCAL: + rv = -1; + break; + case ASB_DISCARD_REMOTE: + rv = 1; + } + + return rv; +} + +static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) +{ + int self, peer, hg, rv = -100; + + self = mdev->ldev->md.uuid[UI_BITMAP] & 1; + peer = mdev->p_uuid[UI_BITMAP] & 1; + + switch (mdev->net_conf->after_sb_1p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + dev_err(DEV, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_CONSENSUS: + hg = drbd_asb_recover_0p(mdev); + if (hg == -1 && mdev->state.role == R_SECONDARY) + rv = hg; + if (hg == 1 && mdev->state.role == R_PRIMARY) + rv = hg; + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(mdev); + break; + case ASB_DISCARD_SECONDARY: + return mdev->state.role == R_PRIMARY ? 1 : -1; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(mdev); + if (hg == -1 && mdev->state.role == R_PRIMARY) { + self = drbd_set_role(mdev, R_SECONDARY, 0); + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. */ + self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); + if (self != SS_SUCCESS) { + drbd_khelper(mdev, "pri-lost-after-sb"); + } else { + dev_warn(DEV, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) +{ + int self, peer, hg, rv = -100; + + self = mdev->ldev->md.uuid[UI_BITMAP] & 1; + peer = mdev->p_uuid[UI_BITMAP] & 1; + + switch (mdev->net_conf->after_sb_2p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + dev_err(DEV, "Configuration error.\n"); + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(mdev); + break; + case ASB_DISCONNECT: + break; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(mdev); + if (hg == -1) { + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. */ + self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); + if (self != SS_SUCCESS) { + drbd_khelper(mdev, "pri-lost-after-sb"); + } else { + dev_warn(DEV, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid, + u64 bits, u64 flags) +{ + if (!uuid) { + dev_info(DEV, "%s uuid info vanished while I was looking!\n", text); + return; + } + dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", + text, + (unsigned long long)uuid[UI_CURRENT], + (unsigned long long)uuid[UI_BITMAP], + (unsigned long long)uuid[UI_HISTORY_START], + (unsigned long long)uuid[UI_HISTORY_END], + (unsigned long long)bits, + (unsigned long long)flags); +} + +/* + 100 after split brain try auto recover + 2 C_SYNC_SOURCE set BitMap + 1 C_SYNC_SOURCE use BitMap + 0 no Sync + -1 C_SYNC_TARGET use BitMap + -2 C_SYNC_TARGET set BitMap + -100 after split brain, disconnect +-1000 unrelated data + */ +static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local) +{ + u64 self, peer; + int i, j; + + self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); + + *rule_nr = 10; + if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED) + return 0; + + *rule_nr = 20; + if ((self == UUID_JUST_CREATED || self == (u64)0) && + peer != UUID_JUST_CREATED) + return -2; + + *rule_nr = 30; + if (self != UUID_JUST_CREATED && + (peer == UUID_JUST_CREATED || peer == (u64)0)) + return 2; + + if (self == peer) { + int rct, dc; /* roles at crash time */ + + if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { + + if (mdev->agreed_pro_version < 91) + return -1001; + + if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && + (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { + dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); + drbd_uuid_set_bm(mdev, 0UL); + + drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, + mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); + *rule_nr = 34; + } else { + dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n"); + *rule_nr = 36; + } + + return 1; + } + + if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { + + if (mdev->agreed_pro_version < 91) + return -1001; + + if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && + (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) { + dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n"); + + mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START]; + mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP]; + mdev->p_uuid[UI_BITMAP] = 0UL; + + drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); + *rule_nr = 35; + } else { + dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n"); + *rule_nr = 37; + } + + return -1; + } + + /* Common power [off|failure] */ + rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) + + (mdev->p_uuid[UI_FLAGS] & 2); + /* lowest bit is set when we were primary, + * next bit (weight 2) is set when peer was primary */ + *rule_nr = 40; + + switch (rct) { + case 0: /* !self_pri && !peer_pri */ return 0; + case 1: /* self_pri && !peer_pri */ return 1; + case 2: /* !self_pri && peer_pri */ return -1; + case 3: /* self_pri && peer_pri */ + dc = test_bit(DISCARD_CONCURRENT, &mdev->flags); + return dc ? -1 : 1; + } + } + + *rule_nr = 50; + peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer) + return -1; + + *rule_nr = 51; + peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); + peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1); + if (self == peer) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of the peer's UUIDs. */ + + if (mdev->agreed_pro_version < 91) + return -1001; + + mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; + mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; + return -1; + } + } + + *rule_nr = 60; + self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + peer = mdev->p_uuid[i] & ~((u64)1); + if (self == peer) + return -2; + } + + *rule_nr = 70; + self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); + if (self == peer) + return 1; + + *rule_nr = 71; + self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1); + peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of our UUIDs. */ + + if (mdev->agreed_pro_version < 91) + return -1001; + + _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); + _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); + + dev_info(DEV, "Undid last start of resync:\n"); + + drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, + mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); + + return 1; + } + } + + + *rule_nr = 80; + peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = mdev->ldev->md.uuid[i] & ~((u64)1); + if (self == peer) + return 2; + } + + *rule_nr = 90; + self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer && self != ((u64)0)) + return 100; + + *rule_nr = 100; + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = mdev->ldev->md.uuid[i] & ~((u64)1); + for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { + peer = mdev->p_uuid[j] & ~((u64)1); + if (self == peer) + return -100; + } + } + + return -1000; +} + +/* drbd_sync_handshake() returns the new conn state on success, or + CONN_MASK (-1) on failure. + */ +static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, + enum drbd_disk_state peer_disk) __must_hold(local) +{ + int hg, rule_nr; + enum drbd_conns rv = C_MASK; + enum drbd_disk_state mydisk; + + mydisk = mdev->state.disk; + if (mydisk == D_NEGOTIATING) + mydisk = mdev->new_state_tmp.disk; + + dev_info(DEV, "drbd_sync_handshake:\n"); + drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); + drbd_uuid_dump(mdev, "peer", mdev->p_uuid, + mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); + + hg = drbd_uuid_compare(mdev, &rule_nr); + + dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); + + if (hg == -1000) { + dev_alert(DEV, "Unrelated data, aborting!\n"); + return C_MASK; + } + if (hg == -1001) { + dev_alert(DEV, "To resolve this both sides have to support at least protocol\n"); + return C_MASK; + } + + if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || + (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { + int f = (hg == -100) || abs(hg) == 2; + hg = mydisk > D_INCONSISTENT ? 1 : -1; + if (f) + hg = hg*2; + dev_info(DEV, "Becoming sync %s due to disk states.\n", + hg > 0 ? "source" : "target"); + } + + if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { + int pcount = (mdev->state.role == R_PRIMARY) + + (peer_role == R_PRIMARY); + int forced = (hg == -100); + + switch (pcount) { + case 0: + hg = drbd_asb_recover_0p(mdev); + break; + case 1: + hg = drbd_asb_recover_1p(mdev); + break; + case 2: + hg = drbd_asb_recover_2p(mdev); + break; + } + if (abs(hg) < 100) { + dev_warn(DEV, "Split-Brain detected, %d primaries, " + "automatically solved. Sync from %s node\n", + pcount, (hg < 0) ? "peer" : "this"); + if (forced) { + dev_warn(DEV, "Doing a full sync, since" + " UUIDs where ambiguous.\n"); + hg = hg*2; + } + } + } + + if (hg == -100) { + if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) + hg = -1; + if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) + hg = 1; + + if (abs(hg) < 100) + dev_warn(DEV, "Split-Brain detected, manually solved. " + "Sync from %s node\n", + (hg < 0) ? "peer" : "this"); + } + + if (hg == -100) { + dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); + drbd_khelper(mdev, "split-brain"); + return C_MASK; + } + + if (hg > 0 && mydisk <= D_INCONSISTENT) { + dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n"); + return C_MASK; + } + + if (hg < 0 && /* by intention we do not use mydisk here. */ + mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { + switch (mdev->net_conf->rr_conflict) { + case ASB_CALL_HELPER: + drbd_khelper(mdev, "pri-lost"); + /* fall through */ + case ASB_DISCONNECT: + dev_err(DEV, "I shall become SyncTarget, but I am primary!\n"); + return C_MASK; + case ASB_VIOLENTLY: + dev_warn(DEV, "Becoming SyncTarget, violating the stable-data" + "assumption\n"); + } + } + + if (abs(hg) >= 2) { + dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake")) + return C_MASK; + } + + if (hg > 0) { /* become sync source. */ + rv = C_WF_BITMAP_S; + } else if (hg < 0) { /* become sync target */ + rv = C_WF_BITMAP_T; + } else { + rv = C_CONNECTED; + if (drbd_bm_total_weight(mdev)) { + dev_info(DEV, "No resync, but %lu bits in bitmap!\n", + drbd_bm_total_weight(mdev)); + } + } + + return rv; +} + +/* returns 1 if invalid */ +static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) +{ + /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ + if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || + (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) + return 0; + + /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ + if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || + self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) + return 1; + + /* everything else is valid if they are equal on both sides. */ + if (peer == self) + return 0; + + /* everything es is invalid. */ + return 1; +} + +static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_protocol *p = (struct p_protocol *)h; + int header_size, data_size; + int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; + int p_want_lose, p_two_primaries; + char p_integrity_alg[SHARED_SECRET_MAX] = ""; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + p_proto = be32_to_cpu(p->protocol); + p_after_sb_0p = be32_to_cpu(p->after_sb_0p); + p_after_sb_1p = be32_to_cpu(p->after_sb_1p); + p_after_sb_2p = be32_to_cpu(p->after_sb_2p); + p_want_lose = be32_to_cpu(p->want_lose); + p_two_primaries = be32_to_cpu(p->two_primaries); + + if (p_proto != mdev->net_conf->wire_protocol) { + dev_err(DEV, "incompatible communication protocols\n"); + goto disconnect; + } + + if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { + dev_err(DEV, "incompatible after-sb-0pri settings\n"); + goto disconnect; + } + + if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { + dev_err(DEV, "incompatible after-sb-1pri settings\n"); + goto disconnect; + } + + if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { + dev_err(DEV, "incompatible after-sb-2pri settings\n"); + goto disconnect; + } + + if (p_want_lose && mdev->net_conf->want_lose) { + dev_err(DEV, "both sides have the 'want_lose' flag set\n"); + goto disconnect; + } + + if (p_two_primaries != mdev->net_conf->two_primaries) { + dev_err(DEV, "incompatible setting of the two-primaries options\n"); + goto disconnect; + } + + if (mdev->agreed_pro_version >= 87) { + unsigned char *my_alg = mdev->net_conf->integrity_alg; + + if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) + return FALSE; + + p_integrity_alg[SHARED_SECRET_MAX-1] = 0; + if (strcmp(p_integrity_alg, my_alg)) { + dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); + goto disconnect; + } + dev_info(DEV, "data-integrity-alg: %s\n", + my_alg[0] ? my_alg : (unsigned char *)"<not-used>"); + } + + return TRUE; + +disconnect: + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; +} + +/* helper function + * input: alg name, feature name + * return: NULL (alg name was "") + * ERR_PTR(error) if something goes wrong + * or the crypto hash ptr, if it worked out ok. */ +struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, + const char *alg, const char *name) +{ + struct crypto_hash *tfm; + + if (!alg[0]) + return NULL; + + tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n", + alg, name, PTR_ERR(tfm)); + return tfm; + } + if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { + crypto_free_hash(tfm); + dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); + return ERR_PTR(-EINVAL); + } + return tfm; +} + +static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) +{ + int ok = TRUE; + struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; + unsigned int header_size, data_size, exp_max_sz; + struct crypto_hash *verify_tfm = NULL; + struct crypto_hash *csums_tfm = NULL; + const int apv = mdev->agreed_pro_version; + + exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + SHARED_SECRET_MAX + : /* 89 */ sizeof(struct p_rs_param_89); + + if (h->length > exp_max_sz) { + dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", + h->length, exp_max_sz); + return FALSE; + } + + if (apv <= 88) { + header_size = sizeof(struct p_rs_param) - sizeof(*h); + data_size = h->length - header_size; + } else /* apv >= 89 */ { + header_size = sizeof(struct p_rs_param_89) - sizeof(*h); + data_size = h->length - header_size; + D_ASSERT(data_size == 0); + } + + /* initialize verify_alg and csums_alg */ + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + mdev->sync_conf.rate = be32_to_cpu(p->rate); + + if (apv >= 88) { + if (apv == 88) { + if (data_size > SHARED_SECRET_MAX) { + dev_err(DEV, "verify-alg too long, " + "peer wants %u, accepting only %u byte\n", + data_size, SHARED_SECRET_MAX); + return FALSE; + } + + if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) + return FALSE; + + /* we expect NUL terminated string */ + /* but just in case someone tries to be evil */ + D_ASSERT(p->verify_alg[data_size-1] == 0); + p->verify_alg[data_size-1] = 0; + + } else /* apv >= 89 */ { + /* we still expect NUL terminated strings */ + /* but just in case someone tries to be evil */ + D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0); + D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0); + p->verify_alg[SHARED_SECRET_MAX-1] = 0; + p->csums_alg[SHARED_SECRET_MAX-1] = 0; + } + + if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { + if (mdev->state.conn == C_WF_REPORT_PARAMS) { + dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", + mdev->sync_conf.verify_alg, p->verify_alg); + goto disconnect; + } + verify_tfm = drbd_crypto_alloc_digest_safe(mdev, + p->verify_alg, "verify-alg"); + if (IS_ERR(verify_tfm)) { + verify_tfm = NULL; + goto disconnect; + } + } + + if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { + if (mdev->state.conn == C_WF_REPORT_PARAMS) { + dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", + mdev->sync_conf.csums_alg, p->csums_alg); + goto disconnect; + } + csums_tfm = drbd_crypto_alloc_digest_safe(mdev, + p->csums_alg, "csums-alg"); + if (IS_ERR(csums_tfm)) { + csums_tfm = NULL; + goto disconnect; + } + } + + + spin_lock(&mdev->peer_seq_lock); + /* lock against drbd_nl_syncer_conf() */ + if (verify_tfm) { + strcpy(mdev->sync_conf.verify_alg, p->verify_alg); + mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; + crypto_free_hash(mdev->verify_tfm); + mdev->verify_tfm = verify_tfm; + dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); + } + if (csums_tfm) { + strcpy(mdev->sync_conf.csums_alg, p->csums_alg); + mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; + crypto_free_hash(mdev->csums_tfm); + mdev->csums_tfm = csums_tfm; + dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); + } + spin_unlock(&mdev->peer_seq_lock); + } + + return ok; +disconnect: + /* just for completeness: actually not needed, + * as this is not reached if csums_tfm was ok. */ + crypto_free_hash(csums_tfm); + /* but free the verify_tfm again, if csums_tfm did not work out */ + crypto_free_hash(verify_tfm); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; +} + +static void drbd_setup_order_type(struct drbd_conf *mdev, int peer) +{ + /* sorry, we currently have no working implementation + * of distributed TCQ */ +} + +/* warn if the arguments differ by more than 12.5% */ +static void warn_if_differ_considerably(struct drbd_conf *mdev, + const char *s, sector_t a, sector_t b) +{ + sector_t d; + if (a == 0 || b == 0) + return; + d = (a > b) ? (a - b) : (b - a); + if (d > (a>>3) || d > (b>>3)) + dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s, + (unsigned long long)a, (unsigned long long)b); +} + +static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_sizes *p = (struct p_sizes *)h; + enum determine_dev_size dd = unchanged; + unsigned int max_seg_s; + sector_t p_size, p_usize, my_usize; + int ldsc = 0; /* local disk size changed */ + enum drbd_conns nconn; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + p_size = be64_to_cpu(p->d_size); + p_usize = be64_to_cpu(p->u_size); + + if (p_size == 0 && mdev->state.disk == D_DISKLESS) { + dev_err(DEV, "some backing storage is needed\n"); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + + /* just store the peer's disk size for now. + * we still need to figure out whether we accept that. */ + mdev->p_size = p_size; + +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + if (get_ldev(mdev)) { + warn_if_differ_considerably(mdev, "lower level device sizes", + p_size, drbd_get_max_capacity(mdev->ldev)); + warn_if_differ_considerably(mdev, "user requested size", + p_usize, mdev->ldev->dc.disk_size); + + /* if this is the first connect, or an otherwise expected + * param exchange, choose the minimum */ + if (mdev->state.conn == C_WF_REPORT_PARAMS) + p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, + p_usize); + + my_usize = mdev->ldev->dc.disk_size; + + if (mdev->ldev->dc.disk_size != p_usize) { + mdev->ldev->dc.disk_size = p_usize; + dev_info(DEV, "Peer sets u_size to %lu sectors\n", + (unsigned long)mdev->ldev->dc.disk_size); + } + + /* Never shrink a device with usable data during connect. + But allow online shrinking if we are connected. */ + if (drbd_new_dev_size(mdev, mdev->ldev) < + drbd_get_capacity(mdev->this_bdev) && + mdev->state.disk >= D_OUTDATED && + mdev->state.conn < C_CONNECTED) { + dev_err(DEV, "The peer's disk size is too small!\n"); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + mdev->ldev->dc.disk_size = my_usize; + put_ldev(mdev); + return FALSE; + } + put_ldev(mdev); + } +#undef min_not_zero + + if (get_ldev(mdev)) { + dd = drbd_determin_dev_size(mdev); + put_ldev(mdev); + if (dd == dev_size_error) + return FALSE; + drbd_md_sync(mdev); + } else { + /* I am diskless, need to accept the peer's size. */ + drbd_set_my_capacity(mdev, p_size); + } + + if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { + nconn = drbd_sync_handshake(mdev, + mdev->state.peer, mdev->state.pdsk); + put_ldev(mdev); + + if (nconn == C_MASK) { + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + + if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) { + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + } + + if (get_ldev(mdev)) { + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { + mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); + ldsc = 1; + } + + max_seg_s = be32_to_cpu(p->max_segment_size); + if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) + drbd_setup_queue_param(mdev, max_seg_s); + + drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); + put_ldev(mdev); + } + + if (mdev->state.conn > C_WF_REPORT_PARAMS) { + if (be64_to_cpu(p->c_size) != + drbd_get_capacity(mdev->this_bdev) || ldsc) { + /* we have different sizes, probably peer + * needs to know my new size... */ + drbd_send_sizes(mdev, 0); + } + if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || + (dd == grew && mdev->state.conn == C_CONNECTED)) { + if (mdev->state.pdsk >= D_INCONSISTENT && + mdev->state.disk >= D_INCONSISTENT) + resync_after_online_grow(mdev); + else + set_bit(RESYNC_AFTER_NEG, &mdev->flags); + } + } + + return TRUE; +} + +static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_uuids *p = (struct p_uuids *)h; + u64 *p_uuid; + int i; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); + + for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) + p_uuid[i] = be64_to_cpu(p->uuid[i]); + + kfree(mdev->p_uuid); + mdev->p_uuid = p_uuid; + + if (mdev->state.conn < C_CONNECTED && + mdev->state.disk < D_INCONSISTENT && + mdev->state.role == R_PRIMARY && + (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { + dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", + (unsigned long long)mdev->ed_uuid); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + + if (get_ldev(mdev)) { + int skip_initial_sync = + mdev->state.conn == C_CONNECTED && + mdev->agreed_pro_version >= 90 && + mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && + (p_uuid[UI_FLAGS] & 8); + if (skip_initial_sync) { + dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n"); + drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, + "clear_n_write from receive_uuids"); + _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]); + _drbd_uuid_set(mdev, UI_BITMAP, 0); + _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + drbd_md_sync(mdev); + } + put_ldev(mdev); + } + + /* Before we test for the disk state, we should wait until an eventually + ongoing cluster wide state change is finished. That is important if + we are primary and are detaching from our disk. We need to see the + new disk state... */ + wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); + if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) + drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); + + return TRUE; +} + +/** + * convert_state() - Converts the peer's view of the cluster state to our point of view + * @ps: The state as seen by the peer. + */ +static union drbd_state convert_state(union drbd_state ps) +{ + union drbd_state ms; + + static enum drbd_conns c_tab[] = { + [C_CONNECTED] = C_CONNECTED, + + [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, + [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, + [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ + [C_VERIFY_S] = C_VERIFY_T, + [C_MASK] = C_MASK, + }; + + ms.i = ps.i; + + ms.conn = c_tab[ps.conn]; + ms.peer = ps.role; + ms.role = ps.peer; + ms.pdsk = ps.disk; + ms.disk = ps.pdsk; + ms.peer_isp = (ps.aftr_isp | ps.user_isp); + + return ms; +} + +static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_req_state *p = (struct p_req_state *)h; + union drbd_state mask, val; + int rv; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && + test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { + drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); + return TRUE; + } + + mask = convert_state(mask); + val = convert_state(val); + + rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); + + drbd_send_sr_reply(mdev, rv); + drbd_md_sync(mdev); + + return TRUE; +} + +static int receive_state(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_state *p = (struct p_state *)h; + enum drbd_conns nconn, oconn; + union drbd_state ns, peer_state; + enum drbd_disk_state real_peer_disk; + int rv; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) + return FALSE; + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + peer_state.i = be32_to_cpu(p->state); + + real_peer_disk = peer_state.disk; + if (peer_state.disk == D_NEGOTIATING) { + real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT; + dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); + } + + spin_lock_irq(&mdev->req_lock); + retry: + oconn = nconn = mdev->state.conn; + spin_unlock_irq(&mdev->req_lock); + + if (nconn == C_WF_REPORT_PARAMS) + nconn = C_CONNECTED; + + if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && + get_ldev_if_state(mdev, D_NEGOTIATING)) { + int cr; /* consider resync */ + + /* if we established a new connection */ + cr = (oconn < C_CONNECTED); + /* if we had an established connection + * and one of the nodes newly attaches a disk */ + cr |= (oconn == C_CONNECTED && + (peer_state.disk == D_NEGOTIATING || + mdev->state.disk == D_NEGOTIATING)); + /* if we have both been inconsistent, and the peer has been + * forced to be UpToDate with --overwrite-data */ + cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); + /* if we had been plain connected, and the admin requested to + * start a sync by "invalidate" or "invalidate-remote" */ + cr |= (oconn == C_CONNECTED && + (peer_state.conn >= C_STARTING_SYNC_S && + peer_state.conn <= C_WF_BITMAP_T)); + + if (cr) + nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); + + put_ldev(mdev); + if (nconn == C_MASK) { + if (mdev->state.disk == D_NEGOTIATING) { + drbd_force_state(mdev, NS(disk, D_DISKLESS)); + nconn = C_CONNECTED; + } else if (peer_state.disk == D_NEGOTIATING) { + dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); + peer_state.disk = D_DISKLESS; + } else { + D_ASSERT(oconn == C_WF_REPORT_PARAMS); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + } + } + + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn != oconn) + goto retry; + clear_bit(CONSIDER_RESYNC, &mdev->flags); + ns.i = mdev->state.i; + ns.conn = nconn; + ns.peer = peer_state.role; + ns.pdsk = real_peer_disk; + ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); + if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) + ns.disk = mdev->new_state_tmp.disk; + + rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); + ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); + + if (rv < SS_SUCCESS) { + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + + if (oconn > C_WF_REPORT_PARAMS) { + if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED && + peer_state.disk != D_NEGOTIATING ) { + /* we want resync, peer has not yet decided to sync... */ + /* Nowadays only used when forcing a node into primary role and + setting its disk to UpToDate with that */ + drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + } + + mdev->net_conf->want_lose = 0; + + drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ + + return TRUE; +} + +static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_rs_uuid *p = (struct p_rs_uuid *)h; + + wait_event(mdev->misc_wait, + mdev->state.conn == C_WF_SYNC_UUID || + mdev->state.conn < C_CONNECTED || + mdev->state.disk < D_NEGOTIATING); + + /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + /* Here the _drbd_uuid_ functions are right, current should + _not_ be rotated into the history */ + if (get_ldev_if_state(mdev, D_NEGOTIATING)) { + _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid)); + _drbd_uuid_set(mdev, UI_BITMAP, 0UL); + + drbd_start_resync(mdev, C_SYNC_TARGET); + + put_ldev(mdev); + } else + dev_err(DEV, "Ignoring SyncUUID packet!\n"); + + return TRUE; +} + +enum receive_bitmap_ret { OK, DONE, FAILED }; + +static enum receive_bitmap_ret +receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, + unsigned long *buffer, struct bm_xfer_ctx *c) +{ + unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); + unsigned want = num_words * sizeof(long); + + if (want != h->length) { + dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); + return FAILED; + } + if (want == 0) + return DONE; + if (drbd_recv(mdev, buffer, want) != want) + return FAILED; + + drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); + + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + + return OK; +} + +static enum receive_bitmap_ret +recv_bm_rle_bits(struct drbd_conf *mdev, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c) +{ + struct bitstream bs; + u64 look_ahead; + u64 rl; + u64 tmp; + unsigned long s = c->bit_offset; + unsigned long e; + int len = p->head.length - (sizeof(*p) - sizeof(p->head)); + int toggle = DCBP_get_start(p); + int have; + int bits; + + bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p)); + + bits = bitstream_get_bits(&bs, &look_ahead, 64); + if (bits < 0) + return FAILED; + + for (have = bits; have > 0; s += rl, toggle = !toggle) { + bits = vli_decode_bits(&rl, look_ahead); + if (bits <= 0) + return FAILED; + + if (toggle) { + e = s + rl -1; + if (e >= c->bm_bits) { + dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); + return FAILED; + } + _drbd_bm_set_bits(mdev, s, e); + } + + if (have < bits) { + dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n", + have, bits, look_ahead, + (unsigned int)(bs.cur.b - p->code), + (unsigned int)bs.buf_len); + return FAILED; + } + look_ahead >>= bits; + have -= bits; + + bits = bitstream_get_bits(&bs, &tmp, 64 - have); + if (bits < 0) + return FAILED; + look_ahead |= tmp << have; + have += bits; + } + + c->bit_offset = s; + bm_xfer_ctx_bit_to_word_offset(c); + + return (s == c->bm_bits) ? DONE : OK; +} + +static enum receive_bitmap_ret +decode_bitmap_c(struct drbd_conf *mdev, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c) +{ + if (DCBP_get_code(p) == RLE_VLI_Bits) + return recv_bm_rle_bits(mdev, p, c); + + /* other variants had been implemented for evaluation, + * but have been dropped as this one turned out to be "best" + * during all our tests. */ + + dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + return FAILED; +} + +void INFO_bm_xfer_stats(struct drbd_conf *mdev, + const char *direction, struct bm_xfer_ctx *c) +{ + /* what would it take to transfer it "plaintext" */ + unsigned plain = sizeof(struct p_header) * + ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) + + c->bm_words * sizeof(long); + unsigned total = c->bytes[0] + c->bytes[1]; + unsigned r; + + /* total can not be zero. but just in case: */ + if (total == 0) + return; + + /* don't report if not compressed */ + if (total >= plain) + return; + + /* total < plain. check for overflow, still */ + r = (total > UINT_MAX/1000) ? (total / (plain/1000)) + : (1000 * total / plain); + + if (r > 1000) + r = 1000; + + r = 1000 - r; + dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " + "total %u; compression: %u.%u%%\n", + direction, + c->bytes[1], c->packets[1], + c->bytes[0], c->packets[0], + total, r/10, r % 10); +} + +/* Since we are processing the bitfield from lower addresses to higher, + it does not matter if the process it in 32 bit chunks or 64 bit + chunks as long as it is little endian. (Understand it as byte stream, + beginning with the lowest byte...) If we would use big endian + we would need to process it from the highest address to the lowest, + in order to be agnostic to the 32 vs 64 bits issue. + + returns 0 on failure, 1 if we successfully received it. */ +static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) +{ + struct bm_xfer_ctx c; + void *buffer; + enum receive_bitmap_ret ret; + int ok = FALSE; + + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); + + drbd_bm_lock(mdev, "receive bitmap"); + + /* maybe we should use some per thread scratch page, + * and allocate that during initial device creation? */ + buffer = (unsigned long *) __get_free_page(GFP_NOIO); + if (!buffer) { + dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); + goto out; + } + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(mdev), + .bm_words = drbd_bm_words(mdev), + }; + + do { + if (h->command == P_BITMAP) { + ret = receive_bitmap_plain(mdev, h, buffer, &c); + } else if (h->command == P_COMPRESSED_BITMAP) { + /* MAYBE: sanity check that we speak proto >= 90, + * and the feature is enabled! */ + struct p_compressed_bm *p; + + if (h->length > BM_PACKET_PAYLOAD_BYTES) { + dev_err(DEV, "ReportCBitmap packet too large\n"); + goto out; + } + /* use the page buff */ + p = buffer; + memcpy(p, h, sizeof(*h)); + if (drbd_recv(mdev, p->head.payload, h->length) != h->length) + goto out; + if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { + dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); + return FAILED; + } + ret = decode_bitmap_c(mdev, p, &c); + } else { + dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command); + goto out; + } + + c.packets[h->command == P_BITMAP]++; + c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; + + if (ret != OK) + break; + + if (!drbd_recv_header(mdev, h)) + goto out; + } while (ret == OK); + if (ret == FAILED) + goto out; + + INFO_bm_xfer_stats(mdev, "receive", &c); + + if (mdev->state.conn == C_WF_BITMAP_T) { + ok = !drbd_send_bitmap(mdev); + if (!ok) + goto out; + /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ + ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + D_ASSERT(ok == SS_SUCCESS); + } else if (mdev->state.conn != C_WF_BITMAP_S) { + /* admin may have requested C_DISCONNECTING, + * other threads may have noticed network errors */ + dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", + drbd_conn_str(mdev->state.conn)); + } + + ok = TRUE; + out: + drbd_bm_unlock(mdev); + if (ok && mdev->state.conn == C_WF_BITMAP_S) + drbd_start_resync(mdev, C_SYNC_SOURCE); + free_page((unsigned long) buffer); + return ok; +} + +static int receive_skip(struct drbd_conf *mdev, struct p_header *h) +{ + /* TODO zero copy sink :) */ + static char sink[128]; + int size, want, r; + + dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", + h->command, h->length); + + size = h->length; + while (size > 0) { + want = min_t(int, size, sizeof(sink)); + r = drbd_recv(mdev, sink, want); + ERR_IF(r <= 0) break; + size -= r; + } + return size == 0; +} + +static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) +{ + if (mdev->state.disk >= D_INCONSISTENT) + drbd_kick_lo(mdev); + + /* Make sure we've acked all the TCP data associated + * with the data requests being unplugged */ + drbd_tcp_quickack(mdev->data.socket); + + return TRUE; +} + +typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); + +static drbd_cmd_handler_f drbd_default_handler[] = { + [P_DATA] = receive_Data, + [P_DATA_REPLY] = receive_DataReply, + [P_RS_DATA_REPLY] = receive_RSDataReply, + [P_BARRIER] = receive_Barrier, + [P_BITMAP] = receive_bitmap, + [P_COMPRESSED_BITMAP] = receive_bitmap, + [P_UNPLUG_REMOTE] = receive_UnplugRemote, + [P_DATA_REQUEST] = receive_DataRequest, + [P_RS_DATA_REQUEST] = receive_DataRequest, + [P_SYNC_PARAM] = receive_SyncParam, + [P_SYNC_PARAM89] = receive_SyncParam, + [P_PROTOCOL] = receive_protocol, + [P_UUIDS] = receive_uuids, + [P_SIZES] = receive_sizes, + [P_STATE] = receive_state, + [P_STATE_CHG_REQ] = receive_req_state, + [P_SYNC_UUID] = receive_sync_uuid, + [P_OV_REQUEST] = receive_DataRequest, + [P_OV_REPLY] = receive_DataRequest, + [P_CSUM_RS_REQUEST] = receive_DataRequest, + /* anything missing from this table is in + * the asender_tbl, see get_asender_cmd */ + [P_MAX_CMD] = NULL, +}; + +static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; +static drbd_cmd_handler_f *drbd_opt_cmd_handler; + +static void drbdd(struct drbd_conf *mdev) +{ + drbd_cmd_handler_f handler; + struct p_header *header = &mdev->data.rbuf.header; + + while (get_t_state(&mdev->receiver) == Running) { + drbd_thread_current_set_cpu(mdev); + if (!drbd_recv_header(mdev, header)) { + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + break; + } + + if (header->command < P_MAX_CMD) + handler = drbd_cmd_handler[header->command]; + else if (P_MAY_IGNORE < header->command + && header->command < P_MAX_OPT_CMD) + handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE]; + else if (header->command > P_MAX_OPT_CMD) + handler = receive_skip; + else + handler = NULL; + + if (unlikely(!handler)) { + dev_err(DEV, "unknown packet type %d, l: %d!\n", + header->command, header->length); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + break; + } + if (unlikely(!handler(mdev, header))) { + dev_err(DEV, "error receiving %s, l: %d!\n", + cmdname(header->command), header->length); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + break; + } + } +} + +static void drbd_fail_pending_reads(struct drbd_conf *mdev) +{ + struct hlist_head *slot; + struct hlist_node *pos; + struct hlist_node *tmp; + struct drbd_request *req; + int i; + + /* + * Application READ requests + */ + spin_lock_irq(&mdev->req_lock); + for (i = 0; i < APP_R_HSIZE; i++) { + slot = mdev->app_reads_hash+i; + hlist_for_each_entry_safe(req, pos, tmp, slot, colision) { + /* it may (but should not any longer!) + * be on the work queue; if that assert triggers, + * we need to also grab the + * spin_lock_irq(&mdev->data.work.q_lock); + * and list_del_init here. */ + D_ASSERT(list_empty(&req->w.list)); + /* It would be nice to complete outside of spinlock. + * But this is easier for now. */ + _req_mod(req, connection_lost_while_pending); + } + } + for (i = 0; i < APP_R_HSIZE; i++) + if (!hlist_empty(mdev->app_reads_hash+i)) + dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: " + "%p, should be NULL\n", i, mdev->app_reads_hash[i].first); + + memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); + spin_unlock_irq(&mdev->req_lock); +} + +void drbd_flush_workqueue(struct drbd_conf *mdev) +{ + struct drbd_wq_barrier barr; + + barr.w.cb = w_prev_work_done; + init_completion(&barr.done); + drbd_queue_work(&mdev->data.work, &barr.w); + wait_for_completion(&barr.done); +} + +static void drbd_disconnect(struct drbd_conf *mdev) +{ + enum drbd_fencing_p fp; + union drbd_state os, ns; + int rv = SS_UNKNOWN_ERROR; + unsigned int i; + + if (mdev->state.conn == C_STANDALONE) + return; + if (mdev->state.conn >= C_WF_CONNECTION) + dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n", + drbd_conn_str(mdev->state.conn)); + + /* asender does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&mdev->asender); + + mutex_lock(&mdev->data.mutex); + drbd_free_sock(mdev); + mutex_unlock(&mdev->data.mutex); + + spin_lock_irq(&mdev->req_lock); + _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); + _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); + _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); + + /* We do not have data structures that would allow us to + * get the rs_pending_cnt down to 0 again. + * * On C_SYNC_TARGET we do not have any data structures describing + * the pending RSDataRequest's we have sent. + * * On C_SYNC_SOURCE there is no data structure that tracks + * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget. + * And no, it is not the sum of the reference counts in the + * resync_LRU. The resync_LRU tracks the whole operation including + * the disk-IO, while the rs_pending_cnt only tracks the blocks + * on the fly. */ + drbd_rs_cancel_all(mdev); + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + wake_up(&mdev->misc_wait); + + /* make sure syncer is stopped and w_resume_next_sg queued */ + del_timer_sync(&mdev->resync_timer); + set_bit(STOP_SYNC_TIMER, &mdev->flags); + resync_timer_fn((unsigned long)mdev); + + /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, + * w_make_resync_request etc. which may still be on the worker queue + * to be "canceled" */ + drbd_flush_workqueue(mdev); + + /* This also does reclaim_net_ee(). If we do this too early, we might + * miss some resync ee and pages.*/ + drbd_process_done_ee(mdev); + + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + + if (!mdev->state.susp) + tl_clear(mdev); + + drbd_fail_pending_reads(mdev); + + dev_info(DEV, "Connection closed\n"); + + drbd_md_sync(mdev); + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + if (mdev->state.role == R_PRIMARY) { + if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) { + enum drbd_disk_state nps = drbd_try_outdate_peer(mdev); + drbd_request_state(mdev, NS(pdsk, nps)); + } + } + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + if (os.conn >= C_UNCONNECTED) { + /* Do not restart in case we are C_DISCONNECTING */ + ns = os; + ns.conn = C_UNCONNECTED; + rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + } + spin_unlock_irq(&mdev->req_lock); + + if (os.conn == C_DISCONNECTING) { + struct hlist_head *h; + wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0); + + /* we must not free the tl_hash + * while application io is still on the fly */ + wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0); + + spin_lock_irq(&mdev->req_lock); + /* paranoia code */ + for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) + if (h->first) + dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", + (int)(h - mdev->ee_hash), h->first); + kfree(mdev->ee_hash); + mdev->ee_hash = NULL; + mdev->ee_hash_s = 0; + + /* paranoia code */ + for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) + if (h->first) + dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", + (int)(h - mdev->tl_hash), h->first); + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + spin_unlock_irq(&mdev->req_lock); + + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + + kfree(mdev->net_conf); + mdev->net_conf = NULL; + drbd_request_state(mdev, NS(conn, C_STANDALONE)); + } + + /* tcp_close and release of sendpage pages can be deferred. I don't + * want to use SO_LINGER, because apparently it can be deferred for + * more than 20 seconds (longest time I checked). + * + * Actually we don't care for exactly when the network stack does its + * put_page(), but release our reference on these pages right here. + */ + i = drbd_release_ee(mdev, &mdev->net_ee); + if (i) + dev_info(DEV, "net_ee not empty, killed %u entries\n", i); + i = atomic_read(&mdev->pp_in_use); + if (i) + dev_info(DEV, "pp_in_use = %u, expected 0\n", i); + + D_ASSERT(list_empty(&mdev->read_ee)); + D_ASSERT(list_empty(&mdev->active_ee)); + D_ASSERT(list_empty(&mdev->sync_ee)); + D_ASSERT(list_empty(&mdev->done_ee)); + + /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ + atomic_set(&mdev->current_epoch->epoch_size, 0); + D_ASSERT(list_empty(&mdev->current_epoch->list)); +} + +/* + * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version + * we can agree on is stored in agreed_pro_version. + * + * feature flags and the reserved array should be enough room for future + * enhancements of the handshake protocol, and possible plugins... + * + * for now, they are expected to be zero, but ignored. + */ +static int drbd_send_handshake(struct drbd_conf *mdev) +{ + /* ASSERT current == mdev->receiver ... */ + struct p_handshake *p = &mdev->data.sbuf.handshake; + int ok; + + if (mutex_lock_interruptible(&mdev->data.mutex)) { + dev_err(DEV, "interrupted during initial handshake\n"); + return 0; /* interrupted. not ok. */ + } + + if (mdev->data.socket == NULL) { + mutex_unlock(&mdev->data.mutex); + return 0; + } + + memset(p, 0, sizeof(*p)); + p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); + p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); + ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, + (struct p_header *)p, sizeof(*p), 0 ); + mutex_unlock(&mdev->data.mutex); + return ok; +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +static int drbd_do_handshake(struct drbd_conf *mdev) +{ + /* ASSERT current == mdev->receiver ... */ + struct p_handshake *p = &mdev->data.rbuf.handshake; + const int expect = sizeof(struct p_handshake) + -sizeof(struct p_header); + int rv; + + rv = drbd_send_handshake(mdev); + if (!rv) + return 0; + + rv = drbd_recv_header(mdev, &p->head); + if (!rv) + return 0; + + if (p->head.command != P_HAND_SHAKE) { + dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", + cmdname(p->head.command), p->head.command); + return -1; + } + + if (p->head.length != expect) { + dev_err(DEV, "expected HandShake length: %u, received: %u\n", + expect, p->head.length); + return -1; + } + + rv = drbd_recv(mdev, &p->head.payload, expect); + + if (rv != expect) { + dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv); + return 0; + } + + p->protocol_min = be32_to_cpu(p->protocol_min); + p->protocol_max = be32_to_cpu(p->protocol_max); + if (p->protocol_max == 0) + p->protocol_max = p->protocol_min; + + if (PRO_VERSION_MAX < p->protocol_min || + PRO_VERSION_MIN > p->protocol_max) + goto incompat; + + mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + + dev_info(DEV, "Handshake successful: " + "Agreed network protocol version %d\n", mdev->agreed_pro_version); + + return 1; + + incompat: + dev_err(DEV, "incompatible DRBD dialects: " + "I support %d-%d, peer supports %d-%d\n", + PRO_VERSION_MIN, PRO_VERSION_MAX, + p->protocol_min, p->protocol_max); + return -1; +} + +#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) +static int drbd_do_auth(struct drbd_conf *mdev) +{ + dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); + dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); + return 0; +} +#else +#define CHALLENGE_LEN 64 +static int drbd_do_auth(struct drbd_conf *mdev) +{ + char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ + struct scatterlist sg; + char *response = NULL; + char *right_response = NULL; + char *peers_ch = NULL; + struct p_header p; + unsigned int key_len = strlen(mdev->net_conf->shared_secret); + unsigned int resp_size; + struct hash_desc desc; + int rv; + + desc.tfm = mdev->cram_hmac_tfm; + desc.flags = 0; + + rv = crypto_hash_setkey(mdev->cram_hmac_tfm, + (u8 *)mdev->net_conf->shared_secret, key_len); + if (rv) { + dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); + rv = 0; + goto fail; + } + + get_random_bytes(my_challenge, CHALLENGE_LEN); + + rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); + if (!rv) + goto fail; + + rv = drbd_recv_header(mdev, &p); + if (!rv) + goto fail; + + if (p.command != P_AUTH_CHALLENGE) { + dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", + cmdname(p.command), p.command); + rv = 0; + goto fail; + } + + if (p.length > CHALLENGE_LEN*2) { + dev_err(DEV, "expected AuthChallenge payload too big.\n"); + rv = 0; + goto fail; + } + + peers_ch = kmalloc(p.length, GFP_NOIO); + if (peers_ch == NULL) { + dev_err(DEV, "kmalloc of peers_ch failed\n"); + rv = 0; + goto fail; + } + + rv = drbd_recv(mdev, peers_ch, p.length); + + if (rv != p.length) { + dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); + rv = 0; + goto fail; + } + + resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); + response = kmalloc(resp_size, GFP_NOIO); + if (response == NULL) { + dev_err(DEV, "kmalloc of response failed\n"); + rv = 0; + goto fail; + } + + sg_init_table(&sg, 1); + sg_set_buf(&sg, peers_ch, p.length); + + rv = crypto_hash_digest(&desc, &sg, sg.length, response); + if (rv) { + dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); + rv = 0; + goto fail; + } + + rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); + if (!rv) + goto fail; + + rv = drbd_recv_header(mdev, &p); + if (!rv) + goto fail; + + if (p.command != P_AUTH_RESPONSE) { + dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", + cmdname(p.command), p.command); + rv = 0; + goto fail; + } + + if (p.length != resp_size) { + dev_err(DEV, "expected AuthResponse payload of wrong size\n"); + rv = 0; + goto fail; + } + + rv = drbd_recv(mdev, response , resp_size); + + if (rv != resp_size) { + dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv); + rv = 0; + goto fail; + } + + right_response = kmalloc(resp_size, GFP_NOIO); + if (response == NULL) { + dev_err(DEV, "kmalloc of right_response failed\n"); + rv = 0; + goto fail; + } + + sg_set_buf(&sg, my_challenge, CHALLENGE_LEN); + + rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); + if (rv) { + dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); + rv = 0; + goto fail; + } + + rv = !memcmp(response, right_response, resp_size); + + if (rv) + dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", + resp_size, mdev->net_conf->cram_hmac_alg); + + fail: + kfree(peers_ch); + kfree(response); + kfree(right_response); + + return rv; +} +#endif + +int drbdd_init(struct drbd_thread *thi) +{ + struct drbd_conf *mdev = thi->mdev; + unsigned int minor = mdev_to_minor(mdev); + int h; + + sprintf(current->comm, "drbd%d_receiver", minor); + + dev_info(DEV, "receiver (re)started\n"); + + do { + h = drbd_connect(mdev); + if (h == 0) { + drbd_disconnect(mdev); + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + if (h == -1) { + dev_warn(DEV, "Discarding network configuration.\n"); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + } + } while (h == 0); + + if (h > 0) { + if (get_net_conf(mdev)) { + drbdd(mdev); + put_net_conf(mdev); + } + } + + drbd_disconnect(mdev); + + dev_info(DEV, "receiver terminated\n"); + return 0; +} + +/* ********* acknowledge sender ******** */ + +static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_req_state_reply *p = (struct p_req_state_reply *)h; + + int retcode = be32_to_cpu(p->retcode); + + if (retcode >= SS_SUCCESS) { + set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); + } else { + set_bit(CL_ST_CHG_FAIL, &mdev->flags); + dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", + drbd_set_st_err_str(retcode), retcode); + } + wake_up(&mdev->state_wait); + + return TRUE; +} + +static int got_Ping(struct drbd_conf *mdev, struct p_header *h) +{ + return drbd_send_ping_ack(mdev); + +} + +static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) +{ + /* restore idle timeout */ + mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + + return TRUE; +} + +static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + + D_ASSERT(mdev->agreed_pro_version >= 89); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + drbd_rs_complete_io(mdev, sector); + drbd_set_in_sync(mdev, sector, blksize); + /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ + mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + dec_rs_pending(mdev); + + return TRUE; +} + +/* when we receive the ACK for a write request, + * verify that we actually know about it */ +static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, + u64 id, sector_t sector) +{ + struct hlist_head *slot = tl_hash_slot(mdev, sector); + struct hlist_node *n; + struct drbd_request *req; + + hlist_for_each_entry(req, n, slot, colision) { + if ((unsigned long)req == (unsigned long)id) { + if (req->sector != sector) { + dev_err(DEV, "_ack_id_to_req: found req %p but it has " + "wrong sector (%llus versus %llus)\n", req, + (unsigned long long)req->sector, + (unsigned long long)sector); + break; + } + return req; + } + } + dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n", + (void *)(unsigned long)id, (unsigned long long)sector); + return NULL; +} + +typedef struct drbd_request *(req_validator_fn) + (struct drbd_conf *mdev, u64 id, sector_t sector); + +static int validate_req_change_req_state(struct drbd_conf *mdev, + u64 id, sector_t sector, req_validator_fn validator, + const char *func, enum drbd_req_event what) +{ + struct drbd_request *req; + struct bio_and_error m; + + spin_lock_irq(&mdev->req_lock); + req = validator(mdev, id, sector); + if (unlikely(!req)) { + spin_unlock_irq(&mdev->req_lock); + dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func); + return FALSE; + } + __req_mod(req, what, &m); + spin_unlock_irq(&mdev->req_lock); + + if (m.bio) + complete_master_bio(mdev, &m); + return TRUE; +} + +static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + enum drbd_req_event what; + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + if (is_syncer_block_id(p->block_id)) { + drbd_set_in_sync(mdev, sector, blksize); + dec_rs_pending(mdev); + return TRUE; + } + switch (be16_to_cpu(h->command)) { + case P_RS_WRITE_ACK: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + what = write_acked_by_peer_and_sis; + break; + case P_WRITE_ACK: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + what = write_acked_by_peer; + break; + case P_RECV_ACK: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); + what = recv_acked_by_peer; + break; + case P_DISCARD_ACK: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + what = conflict_discarded_by_peer; + break; + default: + D_ASSERT(0); + return FALSE; + } + + return validate_req_change_req_state(mdev, p->block_id, sector, + _ack_id_to_req, __func__ , what); +} + +static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + sector_t sector = be64_to_cpu(p->sector); + + if (__ratelimit(&drbd_ratelimit_state)) + dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n"); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + if (is_syncer_block_id(p->block_id)) { + int size = be32_to_cpu(p->blksize); + dec_rs_pending(mdev); + drbd_rs_failed_io(mdev, sector, size); + return TRUE; + } + return validate_req_change_req_state(mdev, p->block_id, sector, + _ack_id_to_req, __func__ , neg_acked); +} + +static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + sector_t sector = be64_to_cpu(p->sector); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", + (unsigned long long)sector, be32_to_cpu(p->blksize)); + + return validate_req_change_req_state(mdev, p->block_id, sector, + _ar_id_to_req, __func__ , neg_acked); +} + +static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) +{ + sector_t sector; + int size; + struct p_block_ack *p = (struct p_block_ack *)h; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + D_ASSERT(p->block_id == ID_SYNCER); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + dec_rs_pending(mdev); + + if (get_ldev_if_state(mdev, D_FAILED)) { + drbd_rs_complete_io(mdev, sector); + drbd_rs_failed_io(mdev, sector, size); + put_ldev(mdev); + } + + return TRUE; +} + +static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_barrier_ack *p = (struct p_barrier_ack *)h; + + tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); + + return TRUE; +} + +static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + struct drbd_work *w; + sector_t sector; + int size; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) + drbd_ov_oos_found(mdev, sector, size); + else + ov_oos_print(mdev); + + drbd_rs_complete_io(mdev, sector); + dec_rs_pending(mdev); + + if (--mdev->ov_left == 0) { + w = kmalloc(sizeof(*w), GFP_NOIO); + if (w) { + w->cb = w_ov_finished; + drbd_queue_work_front(&mdev->data.work, w); + } else { + dev_err(DEV, "kmalloc(w) failed."); + ov_oos_print(mdev); + drbd_resync_finished(mdev); + } + } + return TRUE; +} + +struct asender_cmd { + size_t pkt_size; + int (*process)(struct drbd_conf *mdev, struct p_header *h); +}; + +static struct asender_cmd *get_asender_cmd(int cmd) +{ + static struct asender_cmd asender_tbl[] = { + /* anything missing from this table is in + * the drbd_cmd_handler (drbd_default_handler) table, + * see the beginning of drbdd() */ + [P_PING] = { sizeof(struct p_header), got_Ping }, + [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, + [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, + [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, + [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, + [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, + [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, + [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, + [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, + [P_MAX_CMD] = { 0, NULL }, + }; + if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) + return NULL; + return &asender_tbl[cmd]; +} + +int drbd_asender(struct drbd_thread *thi) +{ + struct drbd_conf *mdev = thi->mdev; + struct p_header *h = &mdev->meta.rbuf.header; + struct asender_cmd *cmd = NULL; + + int rv, len; + void *buf = h; + int received = 0; + int expect = sizeof(struct p_header); + int empty; + + sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); + + current->policy = SCHED_RR; /* Make this a realtime task! */ + current->rt_priority = 2; /* more important than all other tasks */ + + while (get_t_state(thi) == Running) { + drbd_thread_current_set_cpu(mdev); + if (test_and_clear_bit(SEND_PING, &mdev->flags)) { + ERR_IF(!drbd_send_ping(mdev)) goto reconnect; + mdev->meta.socket->sk->sk_rcvtimeo = + mdev->net_conf->ping_timeo*HZ/10; + } + + /* conditionally cork; + * it may hurt latency if we cork without much to send */ + if (!mdev->net_conf->no_cork && + 3 < atomic_read(&mdev->unacked_cnt)) + drbd_tcp_cork(mdev->meta.socket); + while (1) { + clear_bit(SIGNAL_ASENDER, &mdev->flags); + flush_signals(current); + if (!drbd_process_done_ee(mdev)) { + dev_err(DEV, "process_done_ee() = NOT_OK\n"); + goto reconnect; + } + /* to avoid race with newly queued ACKs */ + set_bit(SIGNAL_ASENDER, &mdev->flags); + spin_lock_irq(&mdev->req_lock); + empty = list_empty(&mdev->done_ee); + spin_unlock_irq(&mdev->req_lock); + /* new ack may have been queued right here, + * but then there is also a signal pending, + * and we start over... */ + if (empty) + break; + } + /* but unconditionally uncork unless disabled */ + if (!mdev->net_conf->no_cork) + drbd_tcp_uncork(mdev->meta.socket); + + /* short circuit, recv_msg would return EINTR anyways. */ + if (signal_pending(current)) + continue; + + rv = drbd_recv_short(mdev, mdev->meta.socket, + buf, expect-received, 0); + clear_bit(SIGNAL_ASENDER, &mdev->flags); + + flush_signals(current); + + /* Note: + * -EINTR (on meta) we got a signal + * -EAGAIN (on meta) rcvtimeo expired + * -ECONNRESET other side closed the connection + * -ERESTARTSYS (on data) we got a signal + * rv < 0 other than above: unexpected error! + * rv == expected: full header or command + * rv < expected: "woken" by signal during receive + * rv == 0 : "connection shut down by peer" + */ + if (likely(rv > 0)) { + received += rv; + buf += rv; + } else if (rv == 0) { + dev_err(DEV, "meta connection shut down by peer.\n"); + goto reconnect; + } else if (rv == -EAGAIN) { + if (mdev->meta.socket->sk->sk_rcvtimeo == + mdev->net_conf->ping_timeo*HZ/10) { + dev_err(DEV, "PingAck did not arrive in time.\n"); + goto reconnect; + } + set_bit(SEND_PING, &mdev->flags); + continue; + } else if (rv == -EINTR) { + continue; + } else { + dev_err(DEV, "sock_recvmsg returned %d\n", rv); + goto reconnect; + } + + if (received == expect && cmd == NULL) { + if (unlikely(h->magic != BE_DRBD_MAGIC)) { + dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + goto reconnect; + } + cmd = get_asender_cmd(be16_to_cpu(h->command)); + len = be16_to_cpu(h->length); + if (unlikely(cmd == NULL)) { + dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + goto disconnect; + } + expect = cmd->pkt_size; + ERR_IF(len != expect-sizeof(struct p_header)) + goto reconnect; + } + if (received == expect) { + D_ASSERT(cmd != NULL); + if (!cmd->process(mdev, h)) + goto reconnect; + + buf = h; + received = 0; + expect = sizeof(struct p_header); + cmd = NULL; + } + } + + if (0) { +reconnect: + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + } + if (0) { +disconnect: + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + } + clear_bit(SIGNAL_ASENDER, &mdev->flags); + + D_ASSERT(mdev->state.conn < C_CONNECTED); + dev_info(DEV, "asender terminated\n"); + + return 0; +} diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c new file mode 100644 index 0000000..de81ab7 --- /dev/null +++ b/drivers/block/drbd/drbd_req.c @@ -0,0 +1,1125 @@ +/* + drbd_req.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/module.h> + +#include <linux/slab.h> +#include <linux/drbd.h> +#include "drbd_int.h" +#include "drbd_req.h" + + +/* Update disk stats at start of I/O request */ +static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) +{ + const int rw = bio_data_dir(bio); + int cpu; + cpu = part_stat_lock(); + part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); + part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); + part_inc_in_flight(&mdev->vdisk->part0, rw); + part_stat_unlock(); +} + +/* Update disk stats when completing request upwards */ +static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) +{ + int rw = bio_data_dir(req->master_bio); + unsigned long duration = jiffies - req->start_time; + int cpu; + cpu = part_stat_lock(); + part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration); + part_round_stats(cpu, &mdev->vdisk->part0); + part_dec_in_flight(&mdev->vdisk->part0, rw); + part_stat_unlock(); +} + +static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) +{ + const unsigned long s = req->rq_state; + /* if it was a write, we may have to set the corresponding + * bit(s) out-of-sync first. If it had a local part, we need to + * release the reference to the activity log. */ + if (rw == WRITE) { + /* remove it from the transfer log. + * well, only if it had been there in the first + * place... if it had not (local only or conflicting + * and never sent), it should still be "empty" as + * initialized in drbd_req_new(), so we can list_del() it + * here unconditionally */ + list_del(&req->tl_requests); + /* Set out-of-sync unless both OK flags are set + * (local only or remote failed). + * Other places where we set out-of-sync: + * READ with local io-error */ + if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) + drbd_set_out_of_sync(mdev, req->sector, req->size); + + if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) + drbd_set_in_sync(mdev, req->sector, req->size); + + /* one might be tempted to move the drbd_al_complete_io + * to the local io completion callback drbd_endio_pri. + * but, if this was a mirror write, we may only + * drbd_al_complete_io after this is RQ_NET_DONE, + * otherwise the extent could be dropped from the al + * before it has actually been written on the peer. + * if we crash before our peer knows about the request, + * but after the extent has been dropped from the al, + * we would forget to resync the corresponding extent. + */ + if (s & RQ_LOCAL_MASK) { + if (get_ldev_if_state(mdev, D_FAILED)) { + drbd_al_complete_io(mdev, req->sector); + put_ldev(mdev); + } else if (__ratelimit(&drbd_ratelimit_state)) { + dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " + "but my Disk seems to have failed :(\n", + (unsigned long long) req->sector); + } + } + } + + /* if it was a local io error, we want to notify our + * peer about that, and see if we need to + * detach the disk and stuff. + * to avoid allocating some special work + * struct, reuse the request. */ + + /* THINK + * why do we do this not when we detect the error, + * but delay it until it is "done", i.e. possibly + * until the next barrier ack? */ + + if (rw == WRITE && + ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { + if (!(req->w.list.next == LIST_POISON1 || + list_empty(&req->w.list))) { + /* DEBUG ASSERT only; if this triggers, we + * probably corrupt the worker list here */ + dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next); + dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev); + } + req->w.cb = w_io_error; + drbd_queue_work(&mdev->data.work, &req->w); + /* drbd_req_free() is done in w_io_error */ + } else { + drbd_req_free(req); + } +} + +static void queue_barrier(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + + /* We are within the req_lock. Once we queued the barrier for sending, + * we set the CREATE_BARRIER bit. It is cleared as soon as a new + * barrier/epoch object is added. This is the only place this bit is + * set. It indicates that the barrier for this epoch is already queued, + * and no new epoch has been created yet. */ + if (test_bit(CREATE_BARRIER, &mdev->flags)) + return; + + b = mdev->newest_tle; + b->w.cb = w_send_barrier; + /* inc_ap_pending done here, so we won't + * get imbalanced on connection loss. + * dec_ap_pending will be done in got_BarrierAck + * or (on connection loss) in tl_clear. */ + inc_ap_pending(mdev); + drbd_queue_work(&mdev->data.work, &b->w); + set_bit(CREATE_BARRIER, &mdev->flags); +} + +static void _about_to_complete_local_write(struct drbd_conf *mdev, + struct drbd_request *req) +{ + const unsigned long s = req->rq_state; + struct drbd_request *i; + struct drbd_epoch_entry *e; + struct hlist_node *n; + struct hlist_head *slot; + + /* before we can signal completion to the upper layers, + * we may need to close the current epoch */ + if (mdev->state.conn >= C_CONNECTED && + req->epoch == mdev->newest_tle->br_number) + queue_barrier(mdev); + + /* we need to do the conflict detection stuff, + * if we have the ee_hash (two_primaries) and + * this has been on the network */ + if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { + const sector_t sector = req->sector; + const int size = req->size; + + /* ASSERT: + * there must be no conflicting requests, since + * they must have been failed on the spot */ +#define OVERLAPS overlaps(sector, size, i->sector, i->size) + slot = tl_hash_slot(mdev, sector); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " + "other: %p %llus +%u\n", + req, (unsigned long long)sector, size, + i, (unsigned long long)i->sector, i->size); + } + } + + /* maybe "wake" those conflicting epoch entries + * that wait for this request to finish. + * + * currently, there can be only _one_ such ee + * (well, or some more, which would be pending + * P_DISCARD_ACK not yet sent by the asender...), + * since we block the receiver thread upon the + * first conflict detection, which will wait on + * misc_wait. maybe we want to assert that? + * + * anyways, if we found one, + * we just have to do a wake_up. */ +#undef OVERLAPS +#define OVERLAPS overlaps(sector, size, e->sector, e->size) + slot = ee_hash_slot(mdev, req->sector); + hlist_for_each_entry(e, n, slot, colision) { + if (OVERLAPS) { + wake_up(&mdev->misc_wait); + break; + } + } + } +#undef OVERLAPS +} + +void complete_master_bio(struct drbd_conf *mdev, + struct bio_and_error *m) +{ + bio_endio(m->bio, m->error); + dec_ap_bio(mdev); +} + +/* Helper for __req_mod(). + * Set m->bio to the master bio, if it is fit to be completed, + * or leave it alone (it is initialized to NULL in __req_mod), + * if it has already been completed, or cannot be completed yet. + * If m->bio is set, the error status to be returned is placed in m->error. + */ +void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) +{ + const unsigned long s = req->rq_state; + struct drbd_conf *mdev = req->mdev; + /* only WRITES may end up here without a master bio (on barrier ack) */ + int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; + + /* we must not complete the master bio, while it is + * still being processed by _drbd_send_zc_bio (drbd_send_dblock) + * not yet acknowledged by the peer + * not yet completed by the local io subsystem + * these flags may get cleared in any order by + * the worker, + * the receiver, + * the bio_endio completion callbacks. + */ + if (s & RQ_NET_QUEUED) + return; + if (s & RQ_NET_PENDING) + return; + if (s & RQ_LOCAL_PENDING) + return; + + if (req->master_bio) { + /* this is data_received (remote read) + * or protocol C P_WRITE_ACK + * or protocol B P_RECV_ACK + * or protocol A "handed_over_to_network" (SendAck) + * or canceled or failed, + * or killed from the transfer log due to connection loss. + */ + + /* + * figure out whether to report success or failure. + * + * report success when at least one of the operations succeeded. + * or, to put the other way, + * only report failure, when both operations failed. + * + * what to do about the failures is handled elsewhere. + * what we need to do here is just: complete the master_bio. + * + * local completion error, if any, has been stored as ERR_PTR + * in private_bio within drbd_endio_pri. + */ + int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); + int error = PTR_ERR(req->private_bio); + + /* remove the request from the conflict detection + * respective block_id verification hash */ + if (!hlist_unhashed(&req->colision)) + hlist_del(&req->colision); + else + D_ASSERT((s & RQ_NET_MASK) == 0); + + /* for writes we need to do some extra housekeeping */ + if (rw == WRITE) + _about_to_complete_local_write(mdev, req); + + /* Update disk stats */ + _drbd_end_io_acct(mdev, req); + + m->error = ok ? 0 : (error ?: -EIO); + m->bio = req->master_bio; + req->master_bio = NULL; + } + + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { + /* this is disconnected (local only) operation, + * or protocol C P_WRITE_ACK, + * or protocol A or B P_BARRIER_ACK, + * or killed from the transfer log due to connection loss. */ + _req_is_done(mdev, req, rw); + } + /* else: network part and not DONE yet. that is + * protocol A or B, barrier ack still pending... */ +} + +/* + * checks whether there was an overlapping request + * or ee already registered. + * + * if so, return 1, in which case this request is completed on the spot, + * without ever being submitted or send. + * + * return 0 if it is ok to submit this request. + * + * NOTE: + * paranoia: assume something above us is broken, and issues different write + * requests for the same block simultaneously... + * + * To ensure these won't be reordered differently on both nodes, resulting in + * diverging data sets, we discard the later one(s). Not that this is supposed + * to happen, but this is the rationale why we also have to check for + * conflicting requests with local origin, and why we have to do so regardless + * of whether we allowed multiple primaries. + * + * BTW, in case we only have one primary, the ee_hash is empty anyways, and the + * second hlist_for_each_entry becomes a noop. This is even simpler than to + * grab a reference on the net_conf, and check for the two_primaries flag... + */ +static int _req_conflicts(struct drbd_request *req) +{ + struct drbd_conf *mdev = req->mdev; + const sector_t sector = req->sector; + const int size = req->size; + struct drbd_request *i; + struct drbd_epoch_entry *e; + struct hlist_node *n; + struct hlist_head *slot; + + D_ASSERT(hlist_unhashed(&req->colision)); + + if (!get_net_conf(mdev)) + return 0; + + /* BUG_ON */ + ERR_IF (mdev->tl_hash_s == 0) + goto out_no_conflict; + BUG_ON(mdev->tl_hash == NULL); + +#define OVERLAPS overlaps(i->sector, i->size, sector, size) + slot = tl_hash_slot(mdev, sector); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + dev_alert(DEV, "%s[%u] Concurrent local write detected! " + "[DISCARD L] new: %llus +%u; " + "pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)i->sector, i->size); + goto out_conflict; + } + } + + if (mdev->ee_hash_s) { + /* now, check for overlapping requests with remote origin */ + BUG_ON(mdev->ee_hash == NULL); +#undef OVERLAPS +#define OVERLAPS overlaps(e->sector, e->size, sector, size) + slot = ee_hash_slot(mdev, sector); + hlist_for_each_entry(e, n, slot, colision) { + if (OVERLAPS) { + dev_alert(DEV, "%s[%u] Concurrent remote write detected!" + " [DISCARD L] new: %llus +%u; " + "pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)e->sector, e->size); + goto out_conflict; + } + } + } +#undef OVERLAPS + +out_no_conflict: + /* this is like it should be, and what we expected. + * our users do behave after all... */ + put_net_conf(mdev); + return 0; + +out_conflict: + put_net_conf(mdev); + return 1; +} + +/* obviously this could be coded as many single functions + * instead of one huge switch, + * or by putting the code directly in the respective locations + * (as it has been before). + * + * but having it this way + * enforces that it is all in this one place, where it is easier to audit, + * it makes it obvious that whatever "event" "happens" to a request should + * happen "atomically" within the req_lock, + * and it enforces that we have to think in a very structured manner + * about the "events" that may happen to a request during its life time ... + */ +void __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m) +{ + struct drbd_conf *mdev = req->mdev; + m->bio = NULL; + + switch (what) { + default: + dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); + break; + + /* does not happen... + * initialization done in drbd_req_new + case created: + break; + */ + + case to_be_send: /* via network */ + /* reached via drbd_make_request_common + * and from w_read_retry_remote */ + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); + break; + + case to_be_submitted: /* locally */ + /* reached via drbd_make_request_common */ + D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); + req->rq_state |= RQ_LOCAL_PENDING; + break; + + case completed_ok: + if (bio_data_dir(req->master_bio) == WRITE) + mdev->writ_cnt += req->size>>9; + else + mdev->read_cnt += req->size>>9; + + req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); + req->rq_state &= ~RQ_LOCAL_PENDING; + + _req_may_be_done(req, m); + put_ldev(mdev); + break; + + case write_completed_with_error: + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + + dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n", + (unsigned long long)req->sector, req->size); + /* and now: check how to handle local io error. */ + __drbd_chk_io_error(mdev, FALSE); + _req_may_be_done(req, m); + put_ldev(mdev); + break; + + case read_ahead_completed_with_error: + /* it is legal to fail READA */ + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + _req_may_be_done(req, m); + put_ldev(mdev); + break; + + case read_completed_with_error: + drbd_set_out_of_sync(mdev, req->sector, req->size); + + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + + dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", + (unsigned long long)req->sector, req->size); + /* _req_mod(req,to_be_send); oops, recursion... */ + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); + + __drbd_chk_io_error(mdev, FALSE); + put_ldev(mdev); + /* NOTE: if we have no connection, + * or know the peer has no good data either, + * then we don't actually need to "queue_for_net_read", + * but we do so anyways, since the drbd_io_error() + * and the potential state change to "Diskless" + * needs to be done from process context */ + + /* fall through: _req_mod(req,queue_for_net_read); */ + + case queue_for_net_read: + /* READ or READA, and + * no local disk, + * or target area marked as invalid, + * or just got an io-error. */ + /* from drbd_make_request_common + * or from bio_endio during read io-error recovery */ + + /* so we can verify the handle in the answer packet + * corresponding hlist_del is in _req_may_be_done() */ + hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); + + set_bit(UNPLUG_REMOTE, &mdev->flags); + + D_ASSERT(req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = (req->rq_state & RQ_LOCAL_MASK) + ? w_read_retry_remote + : w_send_read_req; + drbd_queue_work(&mdev->data.work, &req->w); + break; + + case queue_for_net_write: + /* assert something? */ + /* from drbd_make_request_common only */ + + hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); + /* corresponding hlist_del is in _req_may_be_done() */ + + /* NOTE + * In case the req ended up on the transfer log before being + * queued on the worker, it could lead to this request being + * missed during cleanup after connection loss. + * So we have to do both operations here, + * within the same lock that protects the transfer log. + * + * _req_add_to_epoch(req); this has to be after the + * _maybe_start_new_epoch(req); which happened in + * drbd_make_request_common, because we now may set the bit + * again ourselves to close the current epoch. + * + * Add req to the (now) current epoch (barrier). */ + + /* otherwise we may lose an unplug, which may cause some remote + * io-scheduler timeout to expire, increasing maximum latency, + * hurting performance. */ + set_bit(UNPLUG_REMOTE, &mdev->flags); + + /* see drbd_make_request_common, + * just after it grabs the req_lock */ + D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); + + req->epoch = mdev->newest_tle->br_number; + list_add_tail(&req->tl_requests, + &mdev->newest_tle->requests); + + /* increment size of current epoch */ + mdev->newest_tle->n_req++; + + /* queue work item to send data */ + D_ASSERT(req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = w_send_dblock; + drbd_queue_work(&mdev->data.work, &req->w); + + /* close the epoch, in case it outgrew the limit */ + if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) + queue_barrier(mdev); + + break; + + case send_canceled: + /* treat it the same */ + case send_failed: + /* real cleanup will be done from tl_clear. just update flags + * so it is no longer marked as on the worker queue */ + req->rq_state &= ~RQ_NET_QUEUED; + /* if we did it right, tl_clear should be scheduled only after + * this, so this should not be necessary! */ + _req_may_be_done(req, m); + break; + + case handed_over_to_network: + /* assert something? */ + if (bio_data_dir(req->master_bio) == WRITE && + mdev->net_conf->wire_protocol == DRBD_PROT_A) { + /* this is what is dangerous about protocol A: + * pretend it was successfully written on the peer. */ + if (req->rq_state & RQ_NET_PENDING) { + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + req->rq_state |= RQ_NET_OK; + } /* else: neg-ack was faster... */ + /* it is still not yet RQ_NET_DONE until the + * corresponding epoch barrier got acked as well, + * so we know what to dirty on connection loss */ + } + req->rq_state &= ~RQ_NET_QUEUED; + req->rq_state |= RQ_NET_SENT; + /* because _drbd_send_zc_bio could sleep, and may want to + * dereference the bio even after the "write_acked_by_peer" and + * "completed_ok" events came in, once we return from + * _drbd_send_zc_bio (drbd_send_dblock), we have to check + * whether it is done already, and end it. */ + _req_may_be_done(req, m); + break; + + case connection_lost_while_pending: + /* transfer log cleanup after connection loss */ + /* assert something? */ + if (req->rq_state & RQ_NET_PENDING) + dec_ap_pending(mdev); + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); + req->rq_state |= RQ_NET_DONE; + /* if it is still queued, we may not complete it here. + * it will be canceled soon. */ + if (!(req->rq_state & RQ_NET_QUEUED)) + _req_may_be_done(req, m); + break; + + case write_acked_by_peer_and_sis: + req->rq_state |= RQ_NET_SIS; + case conflict_discarded_by_peer: + /* for discarded conflicting writes of multiple primaries, + * there is no need to keep anything in the tl, potential + * node crashes are covered by the activity log. */ + if (what == conflict_discarded_by_peer) + dev_alert(DEV, "Got DiscardAck packet %llus +%u!" + " DRBD is not a random data generator!\n", + (unsigned long long)req->sector, req->size); + req->rq_state |= RQ_NET_DONE; + /* fall through */ + case write_acked_by_peer: + /* protocol C; successfully written on peer. + * Nothing to do here. + * We want to keep the tl in place for all protocols, to cater + * for volatile write-back caches on lower level devices. + * + * A barrier request is expected to have forced all prior + * requests onto stable storage, so completion of a barrier + * request could set NET_DONE right here, and not wait for the + * P_BARRIER_ACK, but that is an unnecessary optimization. */ + + /* this makes it effectively the same as for: */ + case recv_acked_by_peer: + /* protocol B; pretends to be successfully written on peer. + * see also notes above in handed_over_to_network about + * protocol != C */ + req->rq_state |= RQ_NET_OK; + D_ASSERT(req->rq_state & RQ_NET_PENDING); + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + _req_may_be_done(req, m); + break; + + case neg_acked: + /* assert something? */ + if (req->rq_state & RQ_NET_PENDING) + dec_ap_pending(mdev); + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); + + req->rq_state |= RQ_NET_DONE; + _req_may_be_done(req, m); + /* else: done by handed_over_to_network */ + break; + + case barrier_acked: + if (req->rq_state & RQ_NET_PENDING) { + /* barrier came in before all requests have been acked. + * this is bad, because if the connection is lost now, + * we won't be able to clean them up... */ + dev_err(DEV, "FIXME (barrier_acked but pending)\n"); + list_move(&req->tl_requests, &mdev->out_of_sequence_requests); + } + D_ASSERT(req->rq_state & RQ_NET_SENT); + req->rq_state |= RQ_NET_DONE; + _req_may_be_done(req, m); + break; + + case data_received: + D_ASSERT(req->rq_state & RQ_NET_PENDING); + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); + _req_may_be_done(req, m); + break; + }; +} + +/* we may do a local read if: + * - we are consistent (of course), + * - or we are generally inconsistent, + * BUT we are still/already IN SYNC for this area. + * since size may be bigger than BM_BLOCK_SIZE, + * we may need to check several bits. + */ +static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) +{ + unsigned long sbnr, ebnr; + sector_t esector, nr_sectors; + + if (mdev->state.disk == D_UP_TO_DATE) + return 1; + if (mdev->state.disk >= D_OUTDATED) + return 0; + if (mdev->state.disk < D_INCONSISTENT) + return 0; + /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size >> 9) - 1; + + D_ASSERT(sector < nr_sectors); + D_ASSERT(esector < nr_sectors); + + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); +} + +static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) +{ + const int rw = bio_rw(bio); + const int size = bio->bi_size; + const sector_t sector = bio->bi_sector; + struct drbd_tl_epoch *b = NULL; + struct drbd_request *req; + int local, remote; + int err = -EIO; + + /* allocate outside of all locks; */ + req = drbd_req_new(mdev, bio); + if (!req) { + dec_ap_bio(mdev); + /* only pass the error to the upper layers. + * if user cannot handle io errors, that's not our business. */ + dev_err(DEV, "could not kmalloc() req\n"); + bio_endio(bio, -ENOMEM); + return 0; + } + + local = get_ldev(mdev); + if (!local) { + bio_put(req->private_bio); /* or we get a bio leak */ + req->private_bio = NULL; + } + if (rw == WRITE) { + remote = 1; + } else { + /* READ || READA */ + if (local) { + if (!drbd_may_do_local_read(mdev, sector, size)) { + /* we could kick the syncer to + * sync this extent asap, wait for + * it, then continue locally. + * Or just issue the request remotely. + */ + local = 0; + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(mdev); + } + } + remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; + } + + /* If we have a disk, but a READA request is mapped to remote, + * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. + * Just fail that READA request right here. + * + * THINK: maybe fail all READA when not local? + * or make this configurable... + * if network is slow, READA won't do any good. + */ + if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { + err = -EWOULDBLOCK; + goto fail_and_free_req; + } + + /* For WRITES going to the local disk, grab a reference on the target + * extent. This waits for any resync activity in the corresponding + * resync extent to finish, and, if necessary, pulls in the target + * extent into the activity log, which involves further disk io because + * of transactional on-disk meta data updates. */ + if (rw == WRITE && local) + drbd_al_begin_io(mdev, sector); + + remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || + (mdev->state.pdsk == D_INCONSISTENT && + mdev->state.conn >= C_CONNECTED)); + + if (!(local || remote)) { + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + goto fail_free_complete; + } + + /* For WRITE request, we have to make sure that we have an + * unused_spare_tle, in case we need to start a new epoch. + * I try to be smart and avoid to pre-allocate always "just in case", + * but there is a race between testing the bit and pointer outside the + * spinlock, and grabbing the spinlock. + * if we lost that race, we retry. */ + if (rw == WRITE && remote && + mdev->unused_spare_tle == NULL && + test_bit(CREATE_BARRIER, &mdev->flags)) { +allocate_barrier: + b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); + if (!b) { + dev_err(DEV, "Failed to alloc barrier.\n"); + err = -ENOMEM; + goto fail_free_complete; + } + } + + /* GOOD, everything prepared, grab the spin_lock */ + spin_lock_irq(&mdev->req_lock); + + if (remote) { + remote = (mdev->state.pdsk == D_UP_TO_DATE || + (mdev->state.pdsk == D_INCONSISTENT && + mdev->state.conn >= C_CONNECTED)); + if (!remote) + dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); + if (!(local || remote)) { + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + spin_unlock_irq(&mdev->req_lock); + goto fail_free_complete; + } + } + + if (b && mdev->unused_spare_tle == NULL) { + mdev->unused_spare_tle = b; + b = NULL; + } + if (rw == WRITE && remote && + mdev->unused_spare_tle == NULL && + test_bit(CREATE_BARRIER, &mdev->flags)) { + /* someone closed the current epoch + * while we were grabbing the spinlock */ + spin_unlock_irq(&mdev->req_lock); + goto allocate_barrier; + } + + + /* Update disk stats */ + _drbd_start_io_acct(mdev, req, bio); + + /* _maybe_start_new_epoch(mdev); + * If we need to generate a write barrier packet, we have to add the + * new epoch (barrier) object, and queue the barrier packet for sending, + * and queue the req's data after it _within the same lock_, otherwise + * we have race conditions were the reorder domains could be mixed up. + * + * Even read requests may start a new epoch and queue the corresponding + * barrier packet. To get the write ordering right, we only have to + * make sure that, if this is a write request and it triggered a + * barrier packet, this request is queued within the same spinlock. */ + if (remote && mdev->unused_spare_tle && + test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { + _tl_add_barrier(mdev, mdev->unused_spare_tle); + mdev->unused_spare_tle = NULL; + } else { + D_ASSERT(!(remote && rw == WRITE && + test_bit(CREATE_BARRIER, &mdev->flags))); + } + + /* NOTE + * Actually, 'local' may be wrong here already, since we may have failed + * to write to the meta data, and may become wrong anytime because of + * local io-error for some other request, which would lead to us + * "detaching" the local disk. + * + * 'remote' may become wrong any time because the network could fail. + * + * This is a harmless race condition, though, since it is handled + * correctly at the appropriate places; so it just defers the failure + * of the respective operation. + */ + + /* mark them early for readability. + * this just sets some state flags. */ + if (remote) + _req_mod(req, to_be_send); + if (local) + _req_mod(req, to_be_submitted); + + /* check this request on the collision detection hash tables. + * if we have a conflict, just complete it here. + * THINK do we want to check reads, too? (I don't think so...) */ + if (rw == WRITE && _req_conflicts(req)) { + /* this is a conflicting request. + * even though it may have been only _partially_ + * overlapping with one of the currently pending requests, + * without even submitting or sending it, we will + * pretend that it was successfully served right now. + */ + if (local) { + bio_put(req->private_bio); + req->private_bio = NULL; + drbd_al_complete_io(mdev, req->sector); + put_ldev(mdev); + local = 0; + } + if (remote) + dec_ap_pending(mdev); + _drbd_end_io_acct(mdev, req); + /* THINK: do we want to fail it (-EIO), or pretend success? */ + bio_endio(req->master_bio, 0); + req->master_bio = NULL; + dec_ap_bio(mdev); + drbd_req_free(req); + remote = 0; + } + + /* NOTE remote first: to get the concurrent write detection right, + * we must register the request before start of local IO. */ + if (remote) { + /* either WRITE and C_CONNECTED, + * or READ, and no local disk, + * or READ, but not in sync. + */ + _req_mod(req, (rw == WRITE) + ? queue_for_net_write + : queue_for_net_read); + } + spin_unlock_irq(&mdev->req_lock); + kfree(b); /* if someone else has beaten us to it... */ + + if (local) { + req->private_bio->bi_bdev = mdev->ldev->backing_bdev; + + if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR + : rw == READ ? DRBD_FAULT_DT_RD + : DRBD_FAULT_DT_RA)) + bio_endio(req->private_bio, -EIO); + else + generic_make_request(req->private_bio); + } + + /* we need to plug ALWAYS since we possibly need to kick lo_dev. + * we plug after submit, so we won't miss an unplug event */ + drbd_plug_device(mdev); + + return 0; + +fail_free_complete: + if (rw == WRITE && local) + drbd_al_complete_io(mdev, sector); +fail_and_free_req: + if (local) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(mdev); + } + bio_endio(bio, err); + drbd_req_free(req); + dec_ap_bio(mdev); + kfree(b); + + return 0; +} + +/* helper function for drbd_make_request + * if we can determine just by the mdev (state) that this request will fail, + * return 1 + * otherwise return 0 + */ +static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) +{ + /* Unconfigured */ + if (mdev->state.conn == C_DISCONNECTING && + mdev->state.disk == D_DISKLESS) + return 1; + + if (mdev->state.role != R_PRIMARY && + (!allow_oos || is_write)) { + if (__ratelimit(&drbd_ratelimit_state)) { + dev_err(DEV, "Process %s[%u] tried to %s; " + "since we are not in Primary state, " + "we cannot allow this\n", + current->comm, current->pid, + is_write ? "WRITE" : "READ"); + } + return 1; + } + + /* + * Paranoia: we might have been primary, but sync target, or + * even diskless, then lost the connection. + * This should have been handled (panic? suspend?) somewhere + * else. But maybe it was not, so check again here. + * Caution: as long as we do not have a read/write lock on mdev, + * to serialize state changes, this is racy, since we may lose + * the connection *after* we test for the cstate. + */ + if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sorry, I have no access to good data anymore.\n"); + return 1; + } + + return 0; +} + +int drbd_make_request_26(struct request_queue *q, struct bio *bio) +{ + unsigned int s_enr, e_enr; + struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; + + if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { + bio_endio(bio, -EPERM); + return 0; + } + + /* Reject barrier requests if we know the underlying device does + * not support them. + * XXX: Need to get this info from peer as well some how so we + * XXX: reject if EITHER side/data/metadata area does not support them. + * + * because of those XXX, this is not yet enabled, + * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. + */ + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) { + /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ + bio_endio(bio, -EOPNOTSUPP); + return 0; + } + + /* + * what we "blindly" assume: + */ + D_ASSERT(bio->bi_size > 0); + D_ASSERT((bio->bi_size & 0x1ff) == 0); + D_ASSERT(bio->bi_idx == 0); + + /* to make some things easier, force alignment of requests within the + * granularity of our hash tables */ + s_enr = bio->bi_sector >> HT_SHIFT; + e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; + + if (likely(s_enr == e_enr)) { + inc_ap_bio(mdev, 1); + return drbd_make_request_common(mdev, bio); + } + + /* can this bio be split generically? + * Maybe add our own split-arbitrary-bios function. */ + if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) { + /* rather error out here than BUG in bio_split */ + dev_err(DEV, "bio would need to, but cannot, be split: " + "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", + bio->bi_vcnt, bio->bi_idx, bio->bi_size, + (unsigned long long)bio->bi_sector); + bio_endio(bio, -EINVAL); + } else { + /* This bio crosses some boundary, so we have to split it. */ + struct bio_pair *bp; + /* works for the "do not cross hash slot boundaries" case + * e.g. sector 262269, size 4096 + * s_enr = 262269 >> 6 = 4097 + * e_enr = (262269+8-1) >> 6 = 4098 + * HT_SHIFT = 6 + * sps = 64, mask = 63 + * first_sectors = 64 - (262269 & 63) = 3 + */ + const sector_t sect = bio->bi_sector; + const int sps = 1 << HT_SHIFT; /* sectors per slot */ + const int mask = sps - 1; + const sector_t first_sectors = sps - (sect & mask); + bp = bio_split(bio, +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) + bio_split_pool, +#endif + first_sectors); + + /* we need to get a "reference count" (ap_bio_cnt) + * to avoid races with the disconnect/reconnect/suspend code. + * In case we need to split the bio here, we need to get two references + * atomically, otherwise we might deadlock when trying to submit the + * second one! */ + inc_ap_bio(mdev, 2); + + D_ASSERT(e_enr == s_enr + 1); + + drbd_make_request_common(mdev, &bp->bio1); + drbd_make_request_common(mdev, &bp->bio2); + bio_pair_release(bp); + } + return 0; +} + +/* This is called by bio_add_page(). With this function we reduce + * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs + * units (was AL_EXTENTs). + * + * we do the calculation within the lower 32bit of the byte offsets, + * since we don't care for actual offset, but only check whether it + * would cross "activity log extent" boundaries. + * + * As long as the BIO is empty we have to allow at least one bvec, + * regardless of size and offset. so the resulting bio may still + * cross extent boundaries. those are dealt with (bio_split) in + * drbd_make_request_26. + */ +int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) +{ + struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; + unsigned int bio_offset = + (unsigned int)bvm->bi_sector << 9; /* 32 bit */ + unsigned int bio_size = bvm->bi_size; + int limit, backing_limit; + + limit = DRBD_MAX_SEGMENT_SIZE + - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); + if (limit < 0) + limit = 0; + if (bio_size == 0) { + if (limit <= bvec->bv_len) + limit = bvec->bv_len; + } else if (limit && get_ldev(mdev)) { + struct request_queue * const b = + mdev->ldev->backing_bdev->bd_disk->queue; + if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { + backing_limit = b->merge_bvec_fn(b, bvm, bvec); + limit = min(limit, backing_limit); + } + put_ldev(mdev); + } + return limit; +} diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h new file mode 100644 index 0000000..f22c1bc --- /dev/null +++ b/drivers/block/drbd/drbd_req.h @@ -0,0 +1,326 @@ +/* + drbd_req.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2006-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>. + + DRBD is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + DRBD is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_REQ_H +#define _DRBD_REQ_H + +#include <linux/module.h> + +#include <linux/slab.h> +#include <linux/drbd.h> +#include "drbd_int.h" +#include "drbd_wrappers.h" + +/* The request callbacks will be called in irq context by the IDE drivers, + and in Softirqs/Tasklets/BH context by the SCSI drivers, + and by the receiver and worker in kernel-thread context. + Try to get the locking right :) */ + +/* + * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are + * associated with IO requests originating from the block layer above us. + * + * There are quite a few things that may happen to a drbd request + * during its lifetime. + * + * It will be created. + * It will be marked with the intention to be + * submitted to local disk and/or + * send via the network. + * + * It has to be placed on the transfer log and other housekeeping lists, + * In case we have a network connection. + * + * It may be identified as a concurrent (write) request + * and be handled accordingly. + * + * It may me handed over to the local disk subsystem. + * It may be completed by the local disk subsystem, + * either sucessfully or with io-error. + * In case it is a READ request, and it failed locally, + * it may be retried remotely. + * + * It may be queued for sending. + * It may be handed over to the network stack, + * which may fail. + * It may be acknowledged by the "peer" according to the wire_protocol in use. + * this may be a negative ack. + * It may receive a faked ack when the network connection is lost and the + * transfer log is cleaned up. + * Sending may be canceled due to network connection loss. + * When it finally has outlived its time, + * corresponding dirty bits in the resync-bitmap may be cleared or set, + * it will be destroyed, + * and completion will be signalled to the originator, + * with or without "success". + */ + +enum drbd_req_event { + created, + to_be_send, + to_be_submitted, + + /* XXX yes, now I am inconsistent... + * these two are not "events" but "actions" + * oh, well... */ + queue_for_net_write, + queue_for_net_read, + + send_canceled, + send_failed, + handed_over_to_network, + connection_lost_while_pending, + recv_acked_by_peer, + write_acked_by_peer, + write_acked_by_peer_and_sis, /* and set_in_sync */ + conflict_discarded_by_peer, + neg_acked, + barrier_acked, /* in protocol A and B */ + data_received, /* (remote read) */ + + read_completed_with_error, + read_ahead_completed_with_error, + write_completed_with_error, + completed_ok, + nothing, /* for tracing only */ +}; + +/* encoding of request states for now. we don't actually need that many bits. + * we don't need to do atomic bit operations either, since most of the time we + * need to look at the connection state and/or manipulate some lists at the + * same time, so we should hold the request lock anyways. + */ +enum drbd_req_state_bits { + /* 210 + * 000: no local possible + * 001: to be submitted + * UNUSED, we could map: 011: submitted, completion still pending + * 110: completed ok + * 010: completed with error + */ + __RQ_LOCAL_PENDING, + __RQ_LOCAL_COMPLETED, + __RQ_LOCAL_OK, + + /* 76543 + * 00000: no network possible + * 00001: to be send + * 00011: to be send, on worker queue + * 00101: sent, expecting recv_ack (B) or write_ack (C) + * 11101: sent, + * recv_ack (B) or implicit "ack" (A), + * still waiting for the barrier ack. + * master_bio may already be completed and invalidated. + * 11100: write_acked (C), + * data_received (for remote read, any protocol) + * or finally the barrier ack has arrived (B,A)... + * request can be freed + * 01100: neg-acked (write, protocol C) + * or neg-d-acked (read, any protocol) + * or killed from the transfer log + * during cleanup after connection loss + * request can be freed + * 01000: canceled or send failed... + * request can be freed + */ + + /* if "SENT" is not set, yet, this can still fail or be canceled. + * if "SENT" is set already, we still wait for an Ack packet. + * when cleared, the master_bio may be completed. + * in (B,A) the request object may still linger on the transaction log + * until the corresponding barrier ack comes in */ + __RQ_NET_PENDING, + + /* If it is QUEUED, and it is a WRITE, it is also registered in the + * transfer log. Currently we need this flag to avoid conflicts between + * worker canceling the request and tl_clear_barrier killing it from + * transfer log. We should restructure the code so this conflict does + * no longer occur. */ + __RQ_NET_QUEUED, + + /* well, actually only "handed over to the network stack". + * + * TODO can potentially be dropped because of the similar meaning + * of RQ_NET_SENT and ~RQ_NET_QUEUED. + * however it is not exactly the same. before we drop it + * we must ensure that we can tell a request with network part + * from a request without, regardless of what happens to it. */ + __RQ_NET_SENT, + + /* when set, the request may be freed (if RQ_NET_QUEUED is clear). + * basically this means the corresponding P_BARRIER_ACK was received */ + __RQ_NET_DONE, + + /* whether or not we know (C) or pretend (B,A) that the write + * was successfully written on the peer. + */ + __RQ_NET_OK, + + /* peer called drbd_set_in_sync() for this write */ + __RQ_NET_SIS, + + /* keep this last, its for the RQ_NET_MASK */ + __RQ_NET_MAX, +}; + +#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) +#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) +#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) + +#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ + +#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) +#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) +#define RQ_NET_SENT (1UL << __RQ_NET_SENT) +#define RQ_NET_DONE (1UL << __RQ_NET_DONE) +#define RQ_NET_OK (1UL << __RQ_NET_OK) +#define RQ_NET_SIS (1UL << __RQ_NET_SIS) + +/* 0x1f8 */ +#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) + +/* epoch entries */ +static inline +struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) +{ + BUG_ON(mdev->ee_hash_s == 0); + return mdev->ee_hash + + ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); +} + +/* transfer log (drbd_request objects) */ +static inline +struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) +{ + BUG_ON(mdev->tl_hash_s == 0); + return mdev->tl_hash + + ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); +} + +/* application reads (drbd_request objects) */ +static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) +{ + return mdev->app_reads_hash + + ((unsigned int)(sector) % APP_R_HSIZE); +} + +/* when we receive the answer for a read request, + * verify that we actually know about it */ +static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, + u64 id, sector_t sector) +{ + struct hlist_head *slot = ar_hash_slot(mdev, sector); + struct hlist_node *n; + struct drbd_request *req; + + hlist_for_each_entry(req, n, slot, colision) { + if ((unsigned long)req == (unsigned long)id) { + D_ASSERT(req->sector == sector); + return req; + } + } + return NULL; +} + +static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, + struct bio *bio_src) +{ + struct bio *bio; + struct drbd_request *req = + mempool_alloc(drbd_request_mempool, GFP_NOIO); + if (likely(req)) { + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + + req->rq_state = 0; + req->mdev = mdev; + req->master_bio = bio_src; + req->private_bio = bio; + req->epoch = 0; + req->sector = bio->bi_sector; + req->size = bio->bi_size; + req->start_time = jiffies; + INIT_HLIST_NODE(&req->colision); + INIT_LIST_HEAD(&req->tl_requests); + INIT_LIST_HEAD(&req->w.list); + + bio->bi_private = req; + bio->bi_end_io = drbd_endio_pri; + bio->bi_next = NULL; + } + return req; +} + +static inline void drbd_req_free(struct drbd_request *req) +{ + mempool_free(req, drbd_request_mempool); +} + +static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) +{ + return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); +} + +/* Short lived temporary struct on the stack. + * We could squirrel the error to be returned into + * bio->bi_size, or similar. But that would be too ugly. */ +struct bio_and_error { + struct bio *bio; + int error; +}; + +extern void _req_may_be_done(struct drbd_request *req, + struct bio_and_error *m); +extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m); +extern void complete_master_bio(struct drbd_conf *mdev, + struct bio_and_error *m); + +/* use this if you don't want to deal with calling complete_master_bio() + * outside the spinlock, e.g. when walking some list on cleanup. */ +static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) +{ + struct drbd_conf *mdev = req->mdev; + struct bio_and_error m; + + /* __req_mod possibly frees req, do not touch req after that! */ + __req_mod(req, what, &m); + if (m.bio) + complete_master_bio(mdev, &m); +} + +/* completion of master bio is outside of spinlock. + * If you need it irqsave, do it your self! */ +static inline void req_mod(struct drbd_request *req, + enum drbd_req_event what) +{ + struct drbd_conf *mdev = req->mdev; + struct bio_and_error m; + spin_lock_irq(&mdev->req_lock); + __req_mod(req, what, &m); + spin_unlock_irq(&mdev->req_lock); + + if (m.bio) + complete_master_bio(mdev, &m); +} +#endif diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c new file mode 100644 index 0000000..76863e3 --- /dev/null +++ b/drivers/block/drbd/drbd_strings.c @@ -0,0 +1,113 @@ +/* + drbd.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include <linux/drbd.h> + +static const char *drbd_conn_s_names[] = { + [C_STANDALONE] = "StandAlone", + [C_DISCONNECTING] = "Disconnecting", + [C_UNCONNECTED] = "Unconnected", + [C_TIMEOUT] = "Timeout", + [C_BROKEN_PIPE] = "BrokenPipe", + [C_NETWORK_FAILURE] = "NetworkFailure", + [C_PROTOCOL_ERROR] = "ProtocolError", + [C_WF_CONNECTION] = "WFConnection", + [C_WF_REPORT_PARAMS] = "WFReportParams", + [C_TEAR_DOWN] = "TearDown", + [C_CONNECTED] = "Connected", + [C_STARTING_SYNC_S] = "StartingSyncS", + [C_STARTING_SYNC_T] = "StartingSyncT", + [C_WF_BITMAP_S] = "WFBitMapS", + [C_WF_BITMAP_T] = "WFBitMapT", + [C_WF_SYNC_UUID] = "WFSyncUUID", + [C_SYNC_SOURCE] = "SyncSource", + [C_SYNC_TARGET] = "SyncTarget", + [C_PAUSED_SYNC_S] = "PausedSyncS", + [C_PAUSED_SYNC_T] = "PausedSyncT", + [C_VERIFY_S] = "VerifyS", + [C_VERIFY_T] = "VerifyT", +}; + +static const char *drbd_role_s_names[] = { + [R_PRIMARY] = "Primary", + [R_SECONDARY] = "Secondary", + [R_UNKNOWN] = "Unknown" +}; + +static const char *drbd_disk_s_names[] = { + [D_DISKLESS] = "Diskless", + [D_ATTACHING] = "Attaching", + [D_FAILED] = "Failed", + [D_NEGOTIATING] = "Negotiating", + [D_INCONSISTENT] = "Inconsistent", + [D_OUTDATED] = "Outdated", + [D_UNKNOWN] = "DUnknown", + [D_CONSISTENT] = "Consistent", + [D_UP_TO_DATE] = "UpToDate", +}; + +static const char *drbd_state_sw_errors[] = { + [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", + [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", + [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", + [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", + [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", + [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated", + [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active", + [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device", + [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node", + [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk", + [-SS_DEVICE_IN_USE] = "Device is held open by someone", + [-SS_NO_NET_CONFIG] = "Have no net/connection configuration", + [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify", + [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync", + [-SS_NOT_SUPPORTED] = "Peer does not support protocol", + [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", + [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", + [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", +}; + +const char *drbd_conn_str(enum drbd_conns s) +{ + /* enums are unsigned... */ + return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; +} + +const char *drbd_role_str(enum drbd_role s) +{ + return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s]; +} + +const char *drbd_disk_str(enum drbd_disk_state s) +{ + return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; +} + +const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) +{ + return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : + err > SS_TWO_PRIMARIES ? "TOO_LARGE" + : drbd_state_sw_errors[-err]; +} diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h new file mode 100644 index 0000000..fc82400 --- /dev/null +++ b/drivers/block/drbd/drbd_vli.h @@ -0,0 +1,351 @@ +/* +-*- linux-c -*- + drbd_receiver.c + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_VLI_H +#define _DRBD_VLI_H + +/* + * At a granularity of 4KiB storage represented per bit, + * and stroage sizes of several TiB, + * and possibly small-bandwidth replication, + * the bitmap transfer time can take much too long, + * if transmitted in plain text. + * + * We try to reduce the transfered bitmap information + * by encoding runlengths of bit polarity. + * + * We never actually need to encode a "zero" (runlengths are positive). + * But then we have to store the value of the first bit. + * The first bit of information thus shall encode if the first runlength + * gives the number of set or unset bits. + * + * We assume that large areas are either completely set or unset, + * which gives good compression with any runlength method, + * even when encoding the runlength as fixed size 32bit/64bit integers. + * + * Still, there may be areas where the polarity flips every few bits, + * and encoding the runlength sequence of those areas with fix size + * integers would be much worse than plaintext. + * + * We want to encode small runlength values with minimum code length, + * while still being able to encode a Huge run of all zeros. + * + * Thus we need a Variable Length Integer encoding, VLI. + * + * For some cases, we produce more code bits than plaintext input. + * We need to send incompressible chunks as plaintext, skip over them + * and then see if the next chunk compresses better. + * + * We don't care too much about "excellent" compression ratio for large + * runlengths (all set/all clear): whether we achieve a factor of 100 + * or 1000 is not that much of an issue. + * We do not want to waste too much on short runlengths in the "noisy" + * parts of the bitmap, though. + * + * There are endless variants of VLI, we experimented with: + * * simple byte-based + * * various bit based with different code word length. + * + * To avoid yet an other configuration parameter (choice of bitmap compression + * algorithm) which was difficult to explain and tune, we just chose the one + * variant that turned out best in all test cases. + * Based on real world usage patterns, with device sizes ranging from a few GiB + * to several TiB, file server/mailserver/webserver/mysql/postgress, + * mostly idle to really busy, the all time winner (though sometimes only + * marginally better) is: + */ + +/* + * encoding is "visualised" as + * __little endian__ bitstream, least significant bit first (left most) + * + * this particular encoding is chosen so that the prefix code + * starts as unary encoding the level, then modified so that + * 10 levels can be described in 8bit, with minimal overhead + * for the smaller levels. + * + * Number of data bits follow fibonacci sequence, with the exception of the + * last level (+1 data bit, so it makes 64bit total). The only worse code when + * encoding bit polarity runlength is 1 plain bits => 2 code bits. +prefix data bits max val Nº data bits +0 x 0x2 1 +10 x 0x4 1 +110 xx 0x8 2 +1110 xxx 0x10 3 +11110 xxx xx 0x30 5 +111110 xx xxxxxx 0x130 8 +11111100 xxxxxxxx xxxxx 0x2130 13 +11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 +11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 +11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 + * maximum encodable value: 0x100000400202130 == 2**56 + some */ + +/* compression "table": + transmitted x 0.29 + as plaintext x ........................ + x ........................ + x ........................ + x 0.59 0.21........................ + x ........................................................ + x .. c ................................................... + x 0.44.. o ................................................... + x .......... d ................................................... + x .......... e ................................................... + X............. ................................................... + x.............. b ................................................... +2.0x............... i ................................................... + #X................ t ................................................... + #................. s ........................... plain bits .......... +-+----------------------------------------------------------------------- + 1 16 32 64 +*/ + +/* LEVEL: (total bits, prefix bits, prefix value), + * sorted ascending by number of total bits. + * The rest of the code table is calculated at compiletime from this. */ + +/* fibonacci data 1, 1, ... */ +#define VLI_L_1_1() do { \ + LEVEL( 2, 1, 0x00); \ + LEVEL( 3, 2, 0x01); \ + LEVEL( 5, 3, 0x03); \ + LEVEL( 7, 4, 0x07); \ + LEVEL(10, 5, 0x0f); \ + LEVEL(14, 6, 0x1f); \ + LEVEL(21, 8, 0x3f); \ + LEVEL(29, 8, 0x7f); \ + LEVEL(42, 8, 0xbf); \ + LEVEL(64, 8, 0xff); \ + } while (0) + +/* finds a suitable level to decode the least significant part of in. + * returns number of bits consumed. + * + * BUG() for bad input, as that would mean a buggy code table. */ +static inline int vli_decode_bits(u64 *out, const u64 in) +{ + u64 adj = 1; + +#define LEVEL(t,b,v) \ + do { \ + if ((in & ((1 << b) -1)) == v) { \ + *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ + return t; \ + } \ + adj += 1ULL << (t - b); \ + } while (0) + + VLI_L_1_1(); + + /* NOT REACHED, if VLI_LEVELS code table is defined properly */ + BUG(); +#undef LEVEL +} + +/* return number of code bits needed, + * or negative error number */ +static inline int __vli_encode_bits(u64 *out, const u64 in) +{ + u64 max = 0; + u64 adj = 1; + + if (in == 0) + return -EINVAL; + +#define LEVEL(t,b,v) do { \ + max += 1ULL << (t - b); \ + if (in <= max) { \ + if (out) \ + *out = ((in - adj) << b) | v; \ + return t; \ + } \ + adj = max + 1; \ + } while (0) + + VLI_L_1_1(); + + return -EOVERFLOW; +#undef LEVEL +} + +#undef VLI_L_1_1 + +/* code from here down is independend of actually used bit code */ + +/* + * Code length is determined by some unique (e.g. unary) prefix. + * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, + * not a byte stream. + */ + +/* for the bitstream, we need a cursor */ +struct bitstream_cursor { + /* the current byte */ + u8 *b; + /* the current bit within *b, nomalized: 0..7 */ + unsigned int bit; +}; + +/* initialize cursor to point to first bit of stream */ +static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) +{ + cur->b = s; + cur->bit = 0; +} + +/* advance cursor by that many bits; maximum expected input value: 64, + * but depending on VLI implementation, it may be more. */ +static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) +{ + bits += cur->bit; + cur->b = cur->b + (bits >> 3); + cur->bit = bits & 7; +} + +/* the bitstream itself knows its length */ +struct bitstream { + struct bitstream_cursor cur; + unsigned char *buf; + size_t buf_len; /* in bytes */ + + /* for input stream: + * number of trailing 0 bits for padding + * total number of valid bits in stream: buf_len * 8 - pad_bits */ + unsigned int pad_bits; +}; + +static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) +{ + bs->buf = s; + bs->buf_len = len; + bs->pad_bits = pad_bits; + bitstream_cursor_reset(&bs->cur, bs->buf); +} + +static inline void bitstream_rewind(struct bitstream *bs) +{ + bitstream_cursor_reset(&bs->cur, bs->buf); + memset(bs->buf, 0, bs->buf_len); +} + +/* Put (at most 64) least significant bits of val into bitstream, and advance cursor. + * Ignores "pad_bits". + * Returns zero if bits == 0 (nothing to do). + * Returns number of bits used if successful. + * + * If there is not enough room left in bitstream, + * leaves bitstream unchanged and returns -ENOBUFS. + */ +static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) +{ + unsigned char *b = bs->cur.b; + unsigned int tmp; + + if (bits == 0) + return 0; + + if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) + return -ENOBUFS; + + /* paranoia: strip off hi bits; they should not be set anyways. */ + if (bits < 64) + val &= ~0ULL >> (64 - bits); + + *b++ |= (val & 0xff) << bs->cur.bit; + + for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) + *b++ |= (val >> tmp) & 0xff; + + bitstream_cursor_advance(&bs->cur, bits); + return bits; +} + +/* Fetch (at most 64) bits from bitstream into *out, and advance cursor. + * + * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. + * + * If there are less than the requested number of valid bits left in the + * bitstream, still fetches all available bits. + * + * Returns number of actually fetched bits. + */ +static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) +{ + u64 val; + unsigned int n; + + if (bits > 64) + return -EINVAL; + + if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) + bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) + - bs->cur.bit - bs->pad_bits; + + if (bits == 0) { + *out = 0; + return 0; + } + + /* get the high bits */ + val = 0; + n = (bs->cur.bit + bits + 7) >> 3; + /* n may be at most 9, if cur.bit + bits > 64 */ + /* which means this copies at most 8 byte */ + if (n) { + memcpy(&val, bs->cur.b+1, n - 1); + val = le64_to_cpu(val) << (8 - bs->cur.bit); + } + + /* we still need the low bits */ + val |= bs->cur.b[0] >> bs->cur.bit; + + /* and mask out bits we don't want */ + val &= ~0ULL >> (64 - bits); + + bitstream_cursor_advance(&bs->cur, bits); + *out = val; + + return bits; +} + +/* encodes @in as vli into @bs; + + * return values + * > 0: number of bits successfully stored in bitstream + * -ENOBUFS @bs is full + * -EINVAL input zero (invalid) + * -EOVERFLOW input too large for this vli code (invalid) + */ +static inline int vli_encode_bits(struct bitstream *bs, u64 in) +{ + u64 code = code; + int bits = __vli_encode_bits(&code, in); + + if (bits <= 0) + return bits; + + return bitstream_put_bits(bs, code, bits); +} + +#endif diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c new file mode 100644 index 0000000..ed8796f --- /dev/null +++ b/drivers/block/drbd/drbd_worker.c @@ -0,0 +1,1512 @@ +/* + drbd_worker.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/module.h> +#include <linux/version.h> +#include <linux/drbd.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/wait.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/scatterlist.h> + +#include "drbd_int.h" +#include "drbd_req.h" + +#define SLEEP_TIME (HZ/10) + +static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); + + + +/* defined here: + drbd_md_io_complete + drbd_endio_write_sec + drbd_endio_read_sec + drbd_endio_pri + + * more endio handlers: + atodb_endio in drbd_actlog.c + drbd_bm_async_io_complete in drbd_bitmap.c + + * For all these callbacks, note the following: + * The callbacks will be called in irq context by the IDE drivers, + * and in Softirqs/Tasklets/BH context by the SCSI drivers. + * Try to get the locking right :) + * + */ + + +/* About the global_state_lock + Each state transition on an device holds a read lock. In case we have + to evaluate the sync after dependencies, we grab a write lock, because + we need stable states on all devices for that. */ +rwlock_t global_state_lock; + +/* used for synchronous meta data and bitmap IO + * submitted by drbd_md_sync_page_io() + */ +void drbd_md_io_complete(struct bio *bio, int error) +{ + struct drbd_md_io *md_io; + + md_io = (struct drbd_md_io *)bio->bi_private; + md_io->error = error; + + complete(&md_io->event); +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) +{ + unsigned long flags = 0; + struct drbd_epoch_entry *e = NULL; + struct drbd_conf *mdev; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + e = bio->bi_private; + mdev = e->mdev; + + if (error) + dev_warn(DEV, "read: error=%d s=%llus\n", error, + (unsigned long long)e->sector); + if (!error && !uptodate) { + dev_warn(DEV, "read: setting error to -EIO s=%llus\n", + (unsigned long long)e->sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->req_lock, flags); + mdev->read_cnt += e->size >> 9; + list_del(&e->w.list); + if (list_empty(&mdev->read_ee)) + wake_up(&mdev->ee_wait); + spin_unlock_irqrestore(&mdev->req_lock, flags); + + drbd_chk_io_error(mdev, error, FALSE); + drbd_queue_work(&mdev->data.work, &e->w); + put_ldev(mdev); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) +{ + unsigned long flags = 0; + struct drbd_epoch_entry *e = NULL; + struct drbd_conf *mdev; + sector_t e_sector; + int do_wake; + int is_syncer_req; + int do_al_complete_io; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); + + e = bio->bi_private; + mdev = e->mdev; + + if (error) + dev_warn(DEV, "write: error=%d s=%llus\n", error, + (unsigned long long)e->sector); + if (!error && !uptodate) { + dev_warn(DEV, "write: setting error to -EIO s=%llus\n", + (unsigned long long)e->sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + /* error == -ENOTSUPP would be a better test, + * alas it is not reliable */ + if (error && is_barrier && e->flags & EE_IS_BARRIER) { + drbd_bump_write_ordering(mdev, WO_bdev_flush); + spin_lock_irqsave(&mdev->req_lock, flags); + list_del(&e->w.list); + e->w.cb = w_e_reissue; + /* put_ldev actually happens below, once we come here again. */ + __release(local); + spin_unlock_irqrestore(&mdev->req_lock, flags); + drbd_queue_work(&mdev->data.work, &e->w); + return; + } + + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->req_lock, flags); + mdev->writ_cnt += e->size >> 9; + is_syncer_req = is_syncer_block_id(e->block_id); + + /* after we moved e to done_ee, + * we may no longer access it, + * it may be freed/reused already! + * (as soon as we release the req_lock) */ + e_sector = e->sector; + do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; + + list_del(&e->w.list); /* has been on active_ee or sync_ee */ + list_add_tail(&e->w.list, &mdev->done_ee); + + /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, + * neither did we wake possibly waiting conflicting requests. + * done from "drbd_process_done_ee" within the appropriate w.cb + * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ + + do_wake = is_syncer_req + ? list_empty(&mdev->sync_ee) + : list_empty(&mdev->active_ee); + + if (error) + __drbd_chk_io_error(mdev, FALSE); + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (is_syncer_req) + drbd_rs_complete_io(mdev, e_sector); + + if (do_wake) + wake_up(&mdev->ee_wait); + + if (do_al_complete_io) + drbd_al_complete_io(mdev, e_sector); + + wake_asender(mdev); + put_ldev(mdev); + +} + +/* read, readA or write requests on R_PRIMARY coming from drbd_make_request + */ +void drbd_endio_pri(struct bio *bio, int error) +{ + unsigned long flags; + struct drbd_request *req = bio->bi_private; + struct drbd_conf *mdev = req->mdev; + struct bio_and_error m; + enum drbd_req_event what; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + if (error) + dev_warn(DEV, "p %s: error=%d\n", + bio_data_dir(bio) == WRITE ? "write" : "read", error); + if (!error && !uptodate) { + dev_warn(DEV, "p %s: setting error to -EIO\n", + bio_data_dir(bio) == WRITE ? "write" : "read"); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + /* to avoid recursion in __req_mod */ + if (unlikely(error)) { + what = (bio_data_dir(bio) == WRITE) + ? write_completed_with_error + : (bio_rw(bio) == READA) + ? read_completed_with_error + : read_ahead_completed_with_error; + } else + what = completed_ok; + + bio_put(req->private_bio); + req->private_bio = ERR_PTR(error); + + spin_lock_irqsave(&mdev->req_lock, flags); + __req_mod(req, what, &m); + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (m.bio) + complete_master_bio(mdev, &m); +} + +int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + + /* NOTE: mdev->ldev can be NULL by the time we get here! */ + /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */ + + /* the only way this callback is scheduled is from _req_may_be_done, + * when it is done and had a local write error, see comments there */ + drbd_req_free(req); + + return TRUE; +} + +int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + + /* We should not detach for read io-error, + * but try to WRITE the P_DATA_REPLY to the failed location, + * to give the disk the chance to relocate that block */ + + spin_lock_irq(&mdev->req_lock); + if (cancel || + mdev->state.conn < C_CONNECTED || + mdev->state.pdsk <= D_INCONSISTENT) { + _req_mod(req, send_canceled); + spin_unlock_irq(&mdev->req_lock); + dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); + return 1; + } + spin_unlock_irq(&mdev->req_lock); + + return w_send_read_req(mdev, w, 0); +} + +int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + ERR_IF(cancel) return 1; + dev_err(DEV, "resync inactive, but callback triggered??\n"); + return 1; /* Simply ignore this! */ +} + +void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct bio_vec *bvec; + int i; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + __bio_for_each_segment(bvec, bio, i, 0) { + sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); + crypto_hash_update(&desc, &sg, sg.length); + } + crypto_hash_final(&desc, digest); +} + +static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int digest_size; + void *digest; + int ok; + + D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + return 1; + } + + if (likely(drbd_bio_uptodate(e->private_bio))) { + digest_size = crypto_hash_digestsize(mdev->csums_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + + inc_rs_pending(mdev); + ok = drbd_send_drequest_csum(mdev, + e->sector, + e->size, + digest, + digest_size, + P_CSUM_RS_REQUEST); + kfree(digest); + } else { + dev_err(DEV, "kmalloc() of digest failed.\n"); + ok = 0; + } + } else + ok = 1; + + drbd_free_ee(mdev, e); + + if (unlikely(!ok)) + dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); + return ok; +} + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) +{ + struct drbd_epoch_entry *e; + + if (!get_ldev(mdev)) + return 0; + + /* GFP_TRY, because if there is no memory available right now, this may + * be rescheduled for later. It is "only" background resync, after all. */ + e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); + if (!e) { + put_ldev(mdev); + return 2; + } + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); + + e->private_bio->bi_end_io = drbd_endio_read_sec; + e->private_bio->bi_rw = READ; + e->w.cb = w_e_send_csum; + + mdev->read_cnt += size >> 9; + drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); + + return 1; +} + +void resync_timer_fn(unsigned long data) +{ + unsigned long flags; + struct drbd_conf *mdev = (struct drbd_conf *) data; + int queue; + + spin_lock_irqsave(&mdev->req_lock, flags); + + if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { + queue = 1; + if (mdev->state.conn == C_VERIFY_S) + mdev->resync_work.cb = w_make_ov_request; + else + mdev->resync_work.cb = w_make_resync_request; + } else { + queue = 0; + mdev->resync_work.cb = w_resync_inactive; + } + + spin_unlock_irqrestore(&mdev->req_lock, flags); + + /* harmless race: list_empty outside data.work.q_lock */ + if (list_empty(&mdev->resync_work.list) && queue) + drbd_queue_work(&mdev->data.work, &mdev->resync_work); +} + +int w_make_resync_request(struct drbd_conf *mdev, + struct drbd_work *w, int cancel) +{ + unsigned long bit; + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + int max_segment_size = queue_max_segment_size(mdev->rq_queue); + int number, i, size, pe, mx; + int align, queued, sndbuf; + + if (unlikely(cancel)) + return 1; + + if (unlikely(mdev->state.conn < C_CONNECTED)) { + dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected"); + return 0; + } + + if (mdev->state.conn != C_SYNC_TARGET) + dev_err(DEV, "%s in w_make_resync_request\n", + drbd_conn_str(mdev->state.conn)); + + if (!get_ldev(mdev)) { + /* Since we only need to access mdev->rsync a + get_ldev_if_state(mdev,D_FAILED) would be sufficient, but + to continue resync with a broken disk makes no sense at + all */ + dev_err(DEV, "Disk broke down during resync!\n"); + mdev->resync_work.cb = w_resync_inactive; + return 1; + } + + number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); + pe = atomic_read(&mdev->rs_pending_cnt); + + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket) + mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req); + else + mx = 1; + mutex_unlock(&mdev->data.mutex); + + /* For resync rates >160MB/sec, allow more pending RS requests */ + if (number > mx) + mx = number; + + /* Limit the number of pending RS requests to no more than the peer's receive buffer */ + if ((pe + number) > mx) { + number = mx - pe; + } + + for (i = 0; i < number; i++) { + /* Stop generating RS requests, when half of the send buffer is filled */ + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket) { + queued = mdev->data.socket->sk->sk_wmem_queued; + sndbuf = mdev->data.socket->sk->sk_sndbuf; + } else { + queued = 1; + sndbuf = 0; + } + mutex_unlock(&mdev->data.mutex); + if (queued > sndbuf / 2) + goto requeue; + +next_sector: + size = BM_BLOCK_SIZE; + bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo); + + if (bit == -1UL) { + mdev->bm_resync_fo = drbd_bm_bits(mdev); + mdev->resync_work.cb = w_resync_inactive; + put_ldev(mdev); + return 1; + } + + sector = BM_BIT_TO_SECT(bit); + + if (drbd_try_rs_begin_io(mdev, sector)) { + mdev->bm_resync_fo = bit; + goto requeue; + } + mdev->bm_resync_fo = bit + 1; + + if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) { + drbd_rs_complete_io(mdev, sector); + goto next_sector; + } + +#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE + /* try to find some adjacent bits. + * we stop if we have already the maximum req size. + * + * Additionally always align bigger requests, in order to + * be prepared for all stripe sizes of software RAIDs. + * + * we _do_ care about the agreed-upon q->max_segment_size + * here, as splitting up the requests on the other side is more + * difficult. the consequence is, that on lvm and md and other + * "indirect" devices, this is dead code, since + * q->max_segment_size will be PAGE_SIZE. + */ + align = 1; + for (;;) { + if (size + BM_BLOCK_SIZE > max_segment_size) + break; + + /* Be always aligned */ + if (sector & ((1<<(align+3))-1)) + break; + + /* do not cross extent boundaries */ + if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) + break; + /* now, is it actually dirty, after all? + * caution, drbd_bm_test_bit is tri-state for some + * obscure reason; ( b == 0 ) would get the out-of-band + * only accidentally right because of the "oddly sized" + * adjustment below */ + if (drbd_bm_test_bit(mdev, bit+1) != 1) + break; + bit++; + size += BM_BLOCK_SIZE; + if ((BM_BLOCK_SIZE << align) <= size) + align++; + i++; + } + /* if we merged some, + * reset the offset to start the next drbd_bm_find_next from */ + if (size > BM_BLOCK_SIZE) + mdev->bm_resync_fo = bit + 1; +#endif + + /* adjust very last sectors, in case we are oddly sized */ + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { + switch (read_for_csum(mdev, sector, size)) { + case 0: /* Disk failure*/ + put_ldev(mdev); + return 0; + case 2: /* Allocation failed */ + drbd_rs_complete_io(mdev, sector); + mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); + goto requeue; + /* case 1: everything ok */ + } + } else { + inc_rs_pending(mdev); + if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, + sector, size, ID_SYNCER)) { + dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); + dec_rs_pending(mdev); + put_ldev(mdev); + return 0; + } + } + } + + if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) { + /* last syncer _request_ was sent, + * but the P_RS_DATA_REPLY not yet received. sync will end (and + * next sync group will resume), as soon as we receive the last + * resync data block, and the last bit is cleared. + * until then resync "work" is "inactive" ... + */ + mdev->resync_work.cb = w_resync_inactive; + put_ldev(mdev); + return 1; + } + + requeue: + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); + put_ldev(mdev); + return 1; +} + +static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + int number, i, size; + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + + if (unlikely(cancel)) + return 1; + + if (unlikely(mdev->state.conn < C_CONNECTED)) { + dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected"); + return 0; + } + + number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); + if (atomic_read(&mdev->rs_pending_cnt) > number) + goto requeue; + + number -= atomic_read(&mdev->rs_pending_cnt); + + sector = mdev->ov_position; + for (i = 0; i < number; i++) { + if (sector >= capacity) { + mdev->resync_work.cb = w_resync_inactive; + return 1; + } + + size = BM_BLOCK_SIZE; + + if (drbd_try_rs_begin_io(mdev, sector)) { + mdev->ov_position = sector; + goto requeue; + } + + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + + inc_rs_pending(mdev); + if (!drbd_send_ov_request(mdev, sector, size)) { + dec_rs_pending(mdev); + return 0; + } + sector += BM_SECT_PER_BIT; + } + mdev->ov_position = sector; + + requeue: + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); + return 1; +} + + +int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + kfree(w); + ov_oos_print(mdev); + drbd_resync_finished(mdev); + + return 1; +} + +static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + kfree(w); + + drbd_resync_finished(mdev); + + return 1; +} + +int drbd_resync_finished(struct drbd_conf *mdev) +{ + unsigned long db, dt, dbdt; + unsigned long n_oos; + union drbd_state os, ns; + struct drbd_work *w; + char *khelper_cmd = NULL; + + /* Remove all elements from the resync LRU. Since future actions + * might set bits in the (main) bitmap, then the entries in the + * resync LRU would be wrong. */ + if (drbd_rs_del_all(mdev)) { + /* In case this is not possible now, most probably because + * there are P_RS_DATA_REPLY Packets lingering on the worker's + * queue (or even the read operations for those packets + * is not finished by now). Retry in 100ms. */ + + drbd_kick_lo(mdev); + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); + if (w) { + w->cb = w_resync_finished; + drbd_queue_work(&mdev->data.work, w); + return 1; + } + dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); + } + + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) + dt = 1; + db = mdev->rs_total; + dbdt = Bit2KB(db/dt); + mdev->rs_paused /= HZ; + + if (!get_ldev(mdev)) + goto out; + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + + /* This protects us against multiple calls (that can happen in the presence + of application IO), and against connectivity loss just before we arrive here. */ + if (os.conn <= C_CONNECTED) + goto out_unlock; + + ns = os; + ns.conn = C_CONNECTED; + + dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", + (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ? + "Online verify " : "Resync", + dt + mdev->rs_paused, mdev->rs_paused, dbdt); + + n_oos = drbd_bm_total_weight(mdev); + + if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) { + if (n_oos) { + dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n", + n_oos, Bit2KB(1)); + khelper_cmd = "out-of-sync"; + } + } else { + D_ASSERT((n_oos - mdev->rs_failed) == 0); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) + khelper_cmd = "after-resync-target"; + + if (mdev->csums_tfm && mdev->rs_total) { + const unsigned long s = mdev->rs_same_csum; + const unsigned long t = mdev->rs_total; + const int ratio = + (t == 0) ? 0 : + (t < 100000) ? ((s*100)/t) : (s/(t/100)); + dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " + "transferred %luK total %luK\n", + ratio, + Bit2KB(mdev->rs_same_csum), + Bit2KB(mdev->rs_total - mdev->rs_same_csum), + Bit2KB(mdev->rs_total)); + } + } + + if (mdev->rs_failed) { + dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + ns.disk = D_INCONSISTENT; + ns.pdsk = D_UP_TO_DATE; + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_INCONSISTENT; + } + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_UP_TO_DATE; + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + if (mdev->p_uuid) { + int i; + for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++) + _drbd_uuid_set(mdev, i, mdev->p_uuid[i]); + drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]); + _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]); + } else { + dev_err(DEV, "mdev->p_uuid is NULL! BUG\n"); + } + } + + drbd_uuid_set_bm(mdev, 0UL); + + if (mdev->p_uuid) { + /* Now the two UUID sets are equal, update what we + * know of the peer. */ + int i; + for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) + mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; + } + } + + _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); +out_unlock: + spin_unlock_irq(&mdev->req_lock); + put_ldev(mdev); +out: + mdev->rs_total = 0; + mdev->rs_failed = 0; + mdev->rs_paused = 0; + mdev->ov_start_sector = 0; + + if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { + dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); + drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); + } + + if (khelper_cmd) + drbd_khelper(mdev, khelper_cmd); + + return 1; +} + +/* helper */ +static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +{ + if (drbd_bio_has_active_page(e->private_bio)) { + /* This might happen if sendpage() has not finished */ + spin_lock_irq(&mdev->req_lock); + list_add_tail(&e->w.list, &mdev->net_ee); + spin_unlock_irq(&mdev->req_lock); + } else + drbd_free_ee(mdev, e); +} + +/** + * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int ok; + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + dec_unacked(mdev); + return 1; + } + + if (likely(drbd_bio_uptodate(e->private_bio))) { + ok = drbd_send_block(mdev, P_DATA_REPLY, e); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sending NegDReply. sector=%llus.\n", + (unsigned long long)e->sector); + + ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); + } + + dec_unacked(mdev); + + move_to_net_ee_or_free(mdev, e); + + if (unlikely(!ok)) + dev_err(DEV, "drbd_send_block() failed\n"); + return ok; +} + +/** + * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int ok; + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + dec_unacked(mdev); + return 1; + } + + if (get_ldev_if_state(mdev, D_FAILED)) { + drbd_rs_complete_io(mdev, e->sector); + put_ldev(mdev); + } + + if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { + inc_rs_pending(mdev); + ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Not sending RSDataReply, " + "partner DISKLESS!\n"); + ok = 1; + } + } else { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", + (unsigned long long)e->sector); + + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); + + /* update resync data with failure */ + drbd_rs_failed_io(mdev, e->sector, e->size); + } + + dec_unacked(mdev); + + move_to_net_ee_or_free(mdev, e); + + if (unlikely(!ok)) + dev_err(DEV, "drbd_send_block() failed\n"); + return ok; +} + +int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + struct digest_info *di; + int digest_size; + void *digest = NULL; + int ok, eq = 0; + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + dec_unacked(mdev); + return 1; + } + + drbd_rs_complete_io(mdev, e->sector); + + di = (struct digest_info *)(unsigned long)e->block_id; + + if (likely(drbd_bio_uptodate(e->private_bio))) { + /* quick hack to try to avoid a race against reconfiguration. + * a real fix would be much more involved, + * introducing more locking mechanisms */ + if (mdev->csums_tfm) { + digest_size = crypto_hash_digestsize(mdev->csums_tfm); + D_ASSERT(digest_size == di->digest_size); + digest = kmalloc(digest_size, GFP_NOIO); + } + if (digest) { + drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + + if (eq) { + drbd_set_in_sync(mdev, e->sector, e->size); + mdev->rs_same_csum++; + ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); + } else { + inc_rs_pending(mdev); + e->block_id = ID_SYNCER; + ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); + } + } else { + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); + } + + dec_unacked(mdev); + + kfree(di); + + move_to_net_ee_or_free(mdev, e); + + if (unlikely(!ok)) + dev_err(DEV, "drbd_send_block/ack() failed\n"); + return ok; +} + +int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int digest_size; + void *digest; + int ok = 1; + + if (unlikely(cancel)) + goto out; + + if (unlikely(!drbd_bio_uptodate(e->private_bio))) + goto out; + + digest_size = crypto_hash_digestsize(mdev->verify_tfm); + /* FIXME if this allocation fails, online verify will not terminate! */ + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); + inc_rs_pending(mdev); + ok = drbd_send_drequest_csum(mdev, e->sector, e->size, + digest, digest_size, P_OV_REPLY); + if (!ok) + dec_rs_pending(mdev); + kfree(digest); + } + +out: + drbd_free_ee(mdev, e); + + dec_unacked(mdev); + + return ok; +} + +void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) +{ + if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { + mdev->ov_last_oos_size += size>>9; + } else { + mdev->ov_last_oos_start = sector; + mdev->ov_last_oos_size = size>>9; + } + drbd_set_out_of_sync(mdev, sector, size); + set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); +} + +int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + struct digest_info *di; + int digest_size; + void *digest; + int ok, eq = 0; + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + dec_unacked(mdev); + return 1; + } + + /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all + * the resync lru has been cleaned up already */ + drbd_rs_complete_io(mdev, e->sector); + + di = (struct digest_info *)(unsigned long)e->block_id; + + if (likely(drbd_bio_uptodate(e->private_bio))) { + digest_size = crypto_hash_digestsize(mdev->verify_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); + + D_ASSERT(digest_size == di->digest_size); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + } else { + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); + } + + dec_unacked(mdev); + + kfree(di); + + if (!eq) + drbd_ov_oos_found(mdev, e->sector, e->size); + else + ov_oos_print(mdev); + + ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, + eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); + + drbd_free_ee(mdev, e); + + if (--mdev->ov_left == 0) { + ov_oos_print(mdev); + drbd_resync_finished(mdev); + } + + return ok; +} + +int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); + complete(&b->done); + return 1; +} + +int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); + struct p_barrier *p = &mdev->data.sbuf.barrier; + int ok = 1; + + /* really avoid racing with tl_clear. w.cb may have been referenced + * just before it was reassigned and re-queued, so double check that. + * actually, this race was harmless, since we only try to send the + * barrier packet here, and otherwise do nothing with the object. + * but compare with the head of w_clear_epoch */ + spin_lock_irq(&mdev->req_lock); + if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) + cancel = 1; + spin_unlock_irq(&mdev->req_lock); + if (cancel) + return 1; + + if (!drbd_get_data_sock(mdev)) + return 0; + p->barrier = b->br_number; + /* inc_ap_pending was done where this was queued. + * dec_ap_pending will be done in got_BarrierAck + * or (on connection loss) in w_clear_epoch. */ + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, + (struct p_header *)p, sizeof(*p), 0); + drbd_put_data_sock(mdev); + + return ok; +} + +int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + if (cancel) + return 1; + return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); +} + +/** + * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + int ok; + + if (unlikely(cancel)) { + req_mod(req, send_canceled); + return 1; + } + + ok = drbd_send_dblock(mdev, req); + req_mod(req, ok ? handed_over_to_network : send_failed); + + return ok; +} + +/** + * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + int ok; + + if (unlikely(cancel)) { + req_mod(req, send_canceled); + return 1; + } + + ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, + (unsigned long)req); + + if (!ok) { + /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); + * so this is probably redundant */ + if (mdev->state.conn >= C_CONNECTED) + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + } + req_mod(req, ok ? handed_over_to_network : send_failed); + + return ok; +} + +static int _drbd_may_sync_now(struct drbd_conf *mdev) +{ + struct drbd_conf *odev = mdev; + + while (1) { + if (odev->sync_conf.after == -1) + return 1; + odev = minor_to_mdev(odev->sync_conf.after); + ERR_IF(!odev) return 1; + if ((odev->state.conn >= C_SYNC_SOURCE && + odev->state.conn <= C_PAUSED_SYNC_T) || + odev->state.aftr_isp || odev->state.peer_isp || + odev->state.user_isp) + return 0; + } +} + +/** + * _drbd_pause_after() - Pause resync on all devices that may not resync now + * @mdev: DRBD device. + * + * Called from process context only (admin command and after_state_ch). + */ +static int _drbd_pause_after(struct drbd_conf *mdev) +{ + struct drbd_conf *odev; + int i, rv = 0; + + for (i = 0; i < minor_count; i++) { + odev = minor_to_mdev(i); + if (!odev) + continue; + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (!_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) + != SS_NOTHING_TO_DO); + } + + return rv; +} + +/** + * _drbd_resume_next() - Resume resync on all devices that may resync now + * @mdev: DRBD device. + * + * Called from process context only (admin command and worker). + */ +static int _drbd_resume_next(struct drbd_conf *mdev) +{ + struct drbd_conf *odev; + int i, rv = 0; + + for (i = 0; i < minor_count; i++) { + odev = minor_to_mdev(i); + if (!odev) + continue; + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (odev->state.aftr_isp) { + if (_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) + != SS_NOTHING_TO_DO) ; + } + } + return rv; +} + +void resume_next_sg(struct drbd_conf *mdev) +{ + write_lock_irq(&global_state_lock); + _drbd_resume_next(mdev); + write_unlock_irq(&global_state_lock); +} + +void suspend_other_sg(struct drbd_conf *mdev) +{ + write_lock_irq(&global_state_lock); + _drbd_pause_after(mdev); + write_unlock_irq(&global_state_lock); +} + +static int sync_after_error(struct drbd_conf *mdev, int o_minor) +{ + struct drbd_conf *odev; + + if (o_minor == -1) + return NO_ERROR; + if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) + return ERR_SYNC_AFTER; + + /* check for loops */ + odev = minor_to_mdev(o_minor); + while (1) { + if (odev == mdev) + return ERR_SYNC_AFTER_CYCLE; + + /* dependency chain ends here, no cycles. */ + if (odev->sync_conf.after == -1) + return NO_ERROR; + + /* follow the dependency chain */ + odev = minor_to_mdev(odev->sync_conf.after); + } +} + +int drbd_alter_sa(struct drbd_conf *mdev, int na) +{ + int changes; + int retcode; + + write_lock_irq(&global_state_lock); + retcode = sync_after_error(mdev, na); + if (retcode == NO_ERROR) { + mdev->sync_conf.after = na; + do { + changes = _drbd_pause_after(mdev); + changes |= _drbd_resume_next(mdev); + } while (changes); + } + write_unlock_irq(&global_state_lock); + return retcode; +} + +/** + * drbd_start_resync() - Start the resync process + * @mdev: DRBD device. + * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET + * + * This function might bring you directly into one of the + * C_PAUSED_SYNC_* states. + */ +void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) +{ + union drbd_state ns; + int r; + + if (mdev->state.conn >= C_SYNC_SOURCE) { + dev_err(DEV, "Resync already running!\n"); + return; + } + + /* In case a previous resync run was aborted by an IO error/detach on the peer. */ + drbd_rs_cancel_all(mdev); + + if (side == C_SYNC_TARGET) { + /* Since application IO was locked out during C_WF_BITMAP_T and + C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET + we check that we might make the data inconsistent. */ + r = drbd_khelper(mdev, "before-resync-target"); + r = (r >> 8) & 0xff; + if (r > 0) { + dev_info(DEV, "before-resync-target handler returned %d, " + "dropping connection.\n", r); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return; + } + } + + drbd_state_lock(mdev); + + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { + drbd_state_unlock(mdev); + return; + } + + if (side == C_SYNC_TARGET) { + mdev->bm_resync_fo = 0; + } else /* side == C_SYNC_SOURCE */ { + u64 uuid; + + get_random_bytes(&uuid, sizeof(u64)); + drbd_uuid_set(mdev, UI_BITMAP, uuid); + drbd_send_sync_uuid(mdev, uuid); + + D_ASSERT(mdev->state.disk == D_UP_TO_DATE); + } + + write_lock_irq(&global_state_lock); + ns = mdev->state; + + ns.aftr_isp = !_drbd_may_sync_now(mdev); + + ns.conn = side; + + if (side == C_SYNC_TARGET) + ns.disk = D_INCONSISTENT; + else /* side == C_SYNC_SOURCE */ + ns.pdsk = D_INCONSISTENT; + + r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + ns = mdev->state; + + if (ns.conn < C_CONNECTED) + r = SS_UNKNOWN_ERROR; + + if (r == SS_SUCCESS) { + mdev->rs_total = + mdev->rs_mark_left = drbd_bm_total_weight(mdev); + mdev->rs_failed = 0; + mdev->rs_paused = 0; + mdev->rs_start = + mdev->rs_mark_time = jiffies; + mdev->rs_same_csum = 0; + _drbd_pause_after(mdev); + } + write_unlock_irq(&global_state_lock); + drbd_state_unlock(mdev); + put_ldev(mdev); + + if (r == SS_SUCCESS) { + dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", + drbd_conn_str(ns.conn), + (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), + (unsigned long) mdev->rs_total); + + if (mdev->rs_total == 0) { + /* Peer still reachable? Beware of failing before-resync-target handlers! */ + request_ping(mdev); + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */ + drbd_resync_finished(mdev); + return; + } + + /* ns.conn may already be != mdev->state.conn, + * we may have been paused in between, or become paused until + * the timer triggers. + * No matter, that is handled in resync_timer_fn() */ + if (ns.conn == C_SYNC_TARGET) + mod_timer(&mdev->resync_timer, jiffies); + + drbd_md_sync(mdev); + } +} + +int drbd_worker(struct drbd_thread *thi) +{ + struct drbd_conf *mdev = thi->mdev; + struct drbd_work *w = NULL; + LIST_HEAD(work_list); + int intr = 0, i; + + sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); + + while (get_t_state(thi) == Running) { + drbd_thread_current_set_cpu(mdev); + + if (down_trylock(&mdev->data.work.s)) { + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket && !mdev->net_conf->no_cork) + drbd_tcp_uncork(mdev->data.socket); + mutex_unlock(&mdev->data.mutex); + + intr = down_interruptible(&mdev->data.work.s); + + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket && !mdev->net_conf->no_cork) + drbd_tcp_cork(mdev->data.socket); + mutex_unlock(&mdev->data.mutex); + } + + if (intr) { + D_ASSERT(intr == -EINTR); + flush_signals(current); + ERR_IF (get_t_state(thi) == Running) + continue; + break; + } + + if (get_t_state(thi) != Running) + break; + /* With this break, we have done a down() but not consumed + the entry from the list. The cleanup code takes care of + this... */ + + w = NULL; + spin_lock_irq(&mdev->data.work.q_lock); + ERR_IF(list_empty(&mdev->data.work.q)) { + /* something terribly wrong in our logic. + * we were able to down() the semaphore, + * but the list is empty... doh. + * + * what is the best thing to do now? + * try again from scratch, restarting the receiver, + * asender, whatnot? could break even more ugly, + * e.g. when we are primary, but no good local data. + * + * I'll try to get away just starting over this loop. + */ + spin_unlock_irq(&mdev->data.work.q_lock); + continue; + } + w = list_entry(mdev->data.work.q.next, struct drbd_work, list); + list_del_init(&w->list); + spin_unlock_irq(&mdev->data.work.q_lock); + + if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { + /* dev_warn(DEV, "worker: a callback failed! \n"); */ + if (mdev->state.conn >= C_CONNECTED) + drbd_force_state(mdev, + NS(conn, C_NETWORK_FAILURE)); + } + } + D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); + D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); + + spin_lock_irq(&mdev->data.work.q_lock); + i = 0; + while (!list_empty(&mdev->data.work.q)) { + list_splice_init(&mdev->data.work.q, &work_list); + spin_unlock_irq(&mdev->data.work.q_lock); + + while (!list_empty(&work_list)) { + w = list_entry(work_list.next, struct drbd_work, list); + list_del_init(&w->list); + w->cb(mdev, w, 1); + i++; /* dead debugging code */ + } + + spin_lock_irq(&mdev->data.work.q_lock); + } + sema_init(&mdev->data.work.s, 0); + /* DANGEROUS race: if someone did queue his work within the spinlock, + * but up() ed outside the spinlock, we could get an up() on the + * semaphore without corresponding list entry. + * So don't do that. + */ + spin_unlock_irq(&mdev->data.work.q_lock); + + D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); + /* _drbd_set_state only uses stop_nowait. + * wait here for the Exiting receiver. */ + drbd_thread_stop(&mdev->receiver); + drbd_mdev_cleanup(mdev); + + dev_info(DEV, "worker terminated\n"); + + clear_bit(DEVICE_DYING, &mdev->flags); + clear_bit(CONFIG_PENDING, &mdev->flags); + wake_up(&mdev->state_wait); + + return 0; +} diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h new file mode 100644 index 0000000..f93fa11 --- /dev/null +++ b/drivers/block/drbd/drbd_wrappers.h @@ -0,0 +1,91 @@ +#ifndef _DRBD_WRAPPERS_H +#define _DRBD_WRAPPERS_H + +#include <linux/ctype.h> +#include <linux/mm.h> + +/* see get_sb_bdev and bd_claim */ +extern char *drbd_sec_holder; + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(struct drbd_conf *mdev, + sector_t size) +{ + /* set_capacity(mdev->this_bdev->bd_disk, size); */ + set_capacity(mdev->vdisk, size); + mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) + +static inline int drbd_bio_has_active_page(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + if (page_count(bvec->bv_page) > 1) + return 1; + } + + return 0; +} + +/* bi_end_io handlers */ +extern void drbd_md_io_complete(struct bio *bio, int error); +extern void drbd_endio_read_sec(struct bio *bio, int error); +extern void drbd_endio_write_sec(struct bio *bio, int error); +extern void drbd_endio_pri(struct bio *bio, int error); + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(struct drbd_conf *mdev, + int fault_type, struct bio *bio) +{ + __release(local); + if (!bio->bi_bdev) { + printk(KERN_ERR "drbd%d: drbd_generic_make_request: " + "bio->bi_bdev == NULL\n", + mdev_to_minor(mdev)); + dump_stack(); + bio_endio(bio, -ENODEV); + return; + } + + if (FAULT_ACTIVE(mdev, fault_type)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); +} + +static inline void drbd_plug_device(struct drbd_conf *mdev) +{ + struct request_queue *q; + q = bdev_get_queue(mdev->this_bdev); + + spin_lock_irq(q->queue_lock); + +/* XXX the check on !blk_queue_plugged is redundant, + * implicitly checked in blk_plug_device */ + + if (!blk_queue_plugged(q)) { + blk_plug_device(q); + del_timer(&q->unplug_timer); + /* unplugging should not happen automatically... */ + } + spin_unlock_irq(q->queue_lock); +} + +static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) +{ + return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) + == CRYPTO_ALG_TYPE_HASH; +} + +#ifndef __CHECKER__ +# undef __cond_lock +# define __cond_lock(x,c) (c) +#endif + +#endif diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 3bb7c47..1fb6c31 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -123,7 +123,15 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev, { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); - unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms); + unsigned long timeout; + + for (timeout = 20; timeout; timeout--) { + if (!notify[3]) + return 0; + udelay(10); + } + + timeout = jiffies + msecs_to_jiffies(timeout_ms); do { if (!notify[3]) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 614da5b..e3749d0 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3557,67 +3557,65 @@ static ctl_table cdrom_table[] = { .data = &cdrom_sysctl_settings.info, .maxlen = CDROM_STR_SIZE, .mode = 0444, - .proc_handler = &cdrom_sysctl_info, + .proc_handler = cdrom_sysctl_info, }, { .procname = "autoclose", .data = &cdrom_sysctl_settings.autoclose, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &cdrom_sysctl_handler, + .proc_handler = cdrom_sysctl_handler, }, { .procname = "autoeject", .data = &cdrom_sysctl_settings.autoeject, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &cdrom_sysctl_handler, + .proc_handler = cdrom_sysctl_handler, }, { .procname = "debug", .data = &cdrom_sysctl_settings.debug, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &cdrom_sysctl_handler, + .proc_handler = cdrom_sysctl_handler, }, { .procname = "lock", .data = &cdrom_sysctl_settings.lock, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &cdrom_sysctl_handler, + .proc_handler = cdrom_sysctl_handler, }, { .procname = "check_media", .data = &cdrom_sysctl_settings.check, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &cdrom_sysctl_handler + .proc_handler = cdrom_sysctl_handler }, - { .ctl_name = 0 } + { } }; static ctl_table cdrom_cdrom_table[] = { { - .ctl_name = DEV_CDROM, .procname = "cdrom", .maxlen = 0, .mode = 0555, .child = cdrom_table, }, - { .ctl_name = 0 } + { } }; /* Make sure that /proc/sys/dev is there */ static ctl_table cdrom_root_table[] = { { - .ctl_name = CTL_DEV, .procname = "dev", .maxlen = 0, .mode = 0555, .child = cdrom_cdrom_table, }, - { .ctl_name = 0 } + { } }; static struct ctl_table_header *cdrom_sysctl_header; diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 70a770a..e481c59 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -675,36 +675,33 @@ static int hpet_is_known(struct hpet_data *hdp) static ctl_table hpet_table[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "max-user-freq", .data = &hpet_max_freq, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - {.ctl_name = 0} + {} }; static ctl_table hpet_root[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "hpet", .maxlen = 0, .mode = 0555, .child = hpet_table, }, - {.ctl_name = 0} + {} }; static ctl_table dev_root[] = { { - .ctl_name = CTL_DEV, .procname = "dev", .maxlen = 0, .mode = 0555, .child = hpet_root, }, - {.ctl_name = 0} + {} }; static struct ctl_table_header *sysctl_header; diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c index 2e66b5f..0dec5da 100644 --- a/drivers/char/ipmi/ipmi_poweroff.c +++ b/drivers/char/ipmi/ipmi_poweroff.c @@ -660,26 +660,23 @@ static struct ipmi_smi_watcher smi_watcher = { #include <linux/sysctl.h> static ctl_table ipmi_table[] = { - { .ctl_name = DEV_IPMI_POWEROFF_POWERCYCLE, - .procname = "poweroff_powercycle", + { .procname = "poweroff_powercycle", .data = &poweroff_powercycle, .maxlen = sizeof(poweroff_powercycle), .mode = 0644, - .proc_handler = &proc_dointvec }, + .proc_handler = proc_dointvec }, { } }; static ctl_table ipmi_dir_table[] = { - { .ctl_name = DEV_IPMI, - .procname = "ipmi", + { .procname = "ipmi", .mode = 0555, .child = ipmi_table }, { } }; static ctl_table ipmi_root_table[] = { - { .ctl_name = CTL_DEV, - .procname = "dev", + { .procname = "dev", .mode = 0555, .child = ipmi_dir_table }, { } diff --git a/drivers/char/pty.c b/drivers/char/pty.c index 62f282e..d86c0bc 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -431,30 +431,25 @@ static struct cdev ptmx_cdev; static struct ctl_table pty_table[] = { { - .ctl_name = PTY_MAX, .procname = "max", .maxlen = sizeof(int), .mode = 0644, .data = &pty_limit, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &pty_limit_min, .extra2 = &pty_limit_max, }, { - .ctl_name = PTY_NR, .procname = "nr", .maxlen = sizeof(int), .mode = 0444, .data = &pty_count, - .proc_handler = &proc_dointvec, - }, { - .ctl_name = 0 - } + .proc_handler = proc_dointvec, + }, + {} }; static struct ctl_table pty_kern_table[] = { { - .ctl_name = KERN_PTY, .procname = "pty", .mode = 0555, .child = pty_table, @@ -464,7 +459,6 @@ static struct ctl_table pty_kern_table[] = { static struct ctl_table pty_root_table[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = pty_kern_table, diff --git a/drivers/char/random.c b/drivers/char/random.c index 04b505e..dcd08635 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1257,94 +1257,54 @@ static int proc_do_uuid(ctl_table *table, int write, return proc_dostring(&fake_table, write, buffer, lenp, ppos); } -static int uuid_strategy(ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - unsigned char tmp_uuid[16], *uuid; - unsigned int len; - - if (!oldval || !oldlenp) - return 1; - - uuid = table->data; - if (!uuid) { - uuid = tmp_uuid; - uuid[8] = 0; - } - if (uuid[8] == 0) - generate_random_uuid(uuid); - - if (get_user(len, oldlenp)) - return -EFAULT; - if (len) { - if (len > 16) - len = 16; - if (copy_to_user(oldval, uuid, len) || - put_user(len, oldlenp)) - return -EFAULT; - } - return 1; -} - static int sysctl_poolsize = INPUT_POOL_WORDS * 32; ctl_table random_table[] = { { - .ctl_name = RANDOM_POOLSIZE, .procname = "poolsize", .data = &sysctl_poolsize, .maxlen = sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = RANDOM_ENTROPY_COUNT, .procname = "entropy_avail", .maxlen = sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, .data = &input_pool.entropy_count, }, { - .ctl_name = RANDOM_READ_THRESH, .procname = "read_wakeup_threshold", .data = &random_read_wakeup_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_read_thresh, .extra2 = &max_read_thresh, }, { - .ctl_name = RANDOM_WRITE_THRESH, .procname = "write_wakeup_threshold", .data = &random_write_wakeup_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_write_thresh, .extra2 = &max_write_thresh, }, { - .ctl_name = RANDOM_BOOT_ID, .procname = "boot_id", .data = &sysctl_bootid, .maxlen = 16, .mode = 0444, - .proc_handler = &proc_do_uuid, - .strategy = &uuid_strategy, + .proc_handler = proc_do_uuid, }, { - .ctl_name = RANDOM_UUID, .procname = "uuid", .maxlen = 16, .mode = 0444, - .proc_handler = &proc_do_uuid, - .strategy = &uuid_strategy, + .proc_handler = proc_do_uuid, }, - { .ctl_name = 0 } + { } }; #endif /* CONFIG_SYSCTL */ diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c index bc4ab3e..95acb8c 100644 --- a/drivers/char/rtc.c +++ b/drivers/char/rtc.c @@ -282,34 +282,31 @@ static irqreturn_t rtc_interrupt(int irq, void *dev_id) */ static ctl_table rtc_table[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "max-user-freq", .data = &rtc_max_user_freq, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; static ctl_table rtc_root[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "rtc", .mode = 0555, .child = rtc_table, }, - { .ctl_name = 0 } + { } }; static ctl_table dev_root[] = { { - .ctl_name = CTL_DEV, .procname = "dev", .mode = 0555, .child = rtc_root, }, - { .ctl_name = 0 } + { } }; static struct ctl_table_header *sysctl_header; diff --git a/drivers/dio/dio-driver.c b/drivers/dio/dio-driver.c index 9c0c9af..a7b174e 100644 --- a/drivers/dio/dio-driver.c +++ b/drivers/dio/dio-driver.c @@ -140,5 +140,4 @@ postcore_initcall(dio_driver_init); EXPORT_SYMBOL(dio_match_device); EXPORT_SYMBOL(dio_register_driver); EXPORT_SYMBOL(dio_unregister_driver); -EXPORT_SYMBOL(dio_dev_driver); EXPORT_SYMBOL(dio_bus_type); diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index b401dad..eb140ff 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -54,7 +54,7 @@ config DW_DMAC config AT_HDMAC tristate "Atmel AHB DMA support" - depends on ARCH_AT91SAM9RL + depends on ARCH_AT91SAM9RL || ARCH_AT91SAM9G45 select DMA_ENGINE help Support the Atmel AHB DMA controller. This can be integrated in diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 713ed7d..689cc6a 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -3,7 +3,6 @@ static bool report_gart_errors; static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); -static void (*orig_mce_callback)(struct mce *m); void amd_report_gart_errors(bool v) { @@ -363,8 +362,10 @@ static inline void amd_decode_err_code(unsigned int ec) pr_warning("Huh? Unknown MCE error 0x%x\n", ec); } -static void amd_decode_mce(struct mce *m) +static int amd_decode_mce(struct notifier_block *nb, unsigned long val, + void *data) { + struct mce *m = (struct mce *)data; struct err_regs regs; int node, ecc; @@ -420,20 +421,22 @@ static void amd_decode_mce(struct mce *m) } amd_decode_err_code(m->status & 0xffff); + + return NOTIFY_STOP; } +static struct notifier_block amd_mce_dec_nb = { + .notifier_call = amd_decode_mce, +}; + static int __init mce_amd_init(void) { /* * We can decode MCEs for Opteron and later CPUs: */ if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && - (boot_cpu_data.x86 >= 0xf)) { - /* safe the default decode mce callback */ - orig_mce_callback = x86_mce_decode_callback; - - x86_mce_decode_callback = amd_decode_mce; - } + (boot_cpu_data.x86 >= 0xf)) + atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); return 0; } @@ -442,7 +445,7 @@ early_initcall(mce_amd_init); #ifdef MODULE static void __exit mce_amd_exit(void) { - x86_mce_decode_callback = orig_mce_callback; + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); } MODULE_DESCRIPTION("AMD MCE decoder"); diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index e4864e8..7083bcc 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -38,15 +38,14 @@ #include "core.h" -int fw_compute_block_crc(u32 *block) +int fw_compute_block_crc(__be32 *block) { - __be32 be32_block[256]; - int i, length; + int length; + u16 crc; - length = (*block >> 16) & 0xff; - for (i = 0; i < length; i++) - be32_block[i] = cpu_to_be32(block[i + 1]); - *block |= crc_itu_t(0, (u8 *) be32_block, length * 4); + length = (be32_to_cpu(block[0]) >> 16) & 0xff; + crc = crc_itu_t(0, (u8 *)&block[1], length * 4); + *block |= cpu_to_be32(crc); return length; } @@ -57,6 +56,8 @@ static LIST_HEAD(card_list); static LIST_HEAD(descriptor_list); static int descriptor_count; +static __be32 tmp_config_rom[256]; + #define BIB_CRC(v) ((v) << 0) #define BIB_CRC_LENGTH(v) ((v) << 16) #define BIB_INFO_LENGTH(v) ((v) << 24) @@ -72,11 +73,10 @@ static int descriptor_count; #define BIB_CMC ((1) << 30) #define BIB_IMC ((1) << 31) -static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length) +static size_t generate_config_rom(struct fw_card *card, __be32 *config_rom) { struct fw_descriptor *desc; - static u32 config_rom[256]; - int i, j, length; + int i, j, k, length; /* * Initialize contents of config rom buffer. On the OHCI @@ -87,40 +87,39 @@ static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length) * the version stored in the OHCI registers. */ - memset(config_rom, 0, sizeof(config_rom)); - config_rom[0] = BIB_CRC_LENGTH(4) | BIB_INFO_LENGTH(4) | BIB_CRC(0); - config_rom[1] = 0x31333934; - - config_rom[2] = + config_rom[0] = cpu_to_be32( + BIB_CRC_LENGTH(4) | BIB_INFO_LENGTH(4) | BIB_CRC(0)); + config_rom[1] = cpu_to_be32(0x31333934); + config_rom[2] = cpu_to_be32( BIB_LINK_SPEED(card->link_speed) | BIB_GENERATION(card->config_rom_generation++ % 14 + 2) | BIB_MAX_ROM(2) | BIB_MAX_RECEIVE(card->max_receive) | - BIB_BMC | BIB_ISC | BIB_CMC | BIB_IMC; - config_rom[3] = card->guid >> 32; - config_rom[4] = card->guid; + BIB_BMC | BIB_ISC | BIB_CMC | BIB_IMC); + config_rom[3] = cpu_to_be32(card->guid >> 32); + config_rom[4] = cpu_to_be32(card->guid); /* Generate root directory. */ - i = 5; - config_rom[i++] = 0; - config_rom[i++] = 0x0c0083c0; /* node capabilities */ - j = i + descriptor_count; + config_rom[6] = cpu_to_be32(0x0c0083c0); /* node capabilities */ + i = 7; + j = 7 + descriptor_count; /* Generate root directory entries for descriptors. */ list_for_each_entry (desc, &descriptor_list, link) { if (desc->immediate > 0) - config_rom[i++] = desc->immediate; - config_rom[i] = desc->key | (j - i); + config_rom[i++] = cpu_to_be32(desc->immediate); + config_rom[i] = cpu_to_be32(desc->key | (j - i)); i++; j += desc->length; } /* Update root directory length. */ - config_rom[5] = (i - 5 - 1) << 16; + config_rom[5] = cpu_to_be32((i - 5 - 1) << 16); /* End of root directory, now copy in descriptors. */ list_for_each_entry (desc, &descriptor_list, link) { - memcpy(&config_rom[i], desc->data, desc->length * 4); + for (k = 0; k < desc->length; k++) + config_rom[i + k] = cpu_to_be32(desc->data[k]); i += desc->length; } @@ -131,20 +130,17 @@ static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length) for (i = 0; i < j; i += length + 1) length = fw_compute_block_crc(config_rom + i); - *config_rom_length = j; - - return config_rom; + return j; } static void update_config_roms(void) { struct fw_card *card; - u32 *config_rom; size_t length; list_for_each_entry (card, &card_list, link) { - config_rom = generate_config_rom(card, &length); - card->driver->set_config_rom(card, config_rom, length); + length = generate_config_rom(card, tmp_config_rom); + card->driver->set_config_rom(card, tmp_config_rom, length); } } @@ -211,11 +207,8 @@ static const char gap_count_table[] = { void fw_schedule_bm_work(struct fw_card *card, unsigned long delay) { - int scheduled; - fw_card_get(card); - scheduled = schedule_delayed_work(&card->work, delay); - if (!scheduled) + if (!schedule_delayed_work(&card->work, delay)) fw_card_put(card); } @@ -435,7 +428,6 @@ EXPORT_SYMBOL(fw_card_initialize); int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid) { - u32 *config_rom; size_t length; int ret; @@ -445,8 +437,8 @@ int fw_card_add(struct fw_card *card, mutex_lock(&card_mutex); - config_rom = generate_config_rom(card, &length); - ret = card->driver->enable(card, config_rom, length); + length = generate_config_rom(card, tmp_config_rom); + ret = card->driver->enable(card, tmp_config_rom, length); if (ret == 0) list_add_tail(&card->link, &card_list); @@ -465,7 +457,8 @@ EXPORT_SYMBOL(fw_card_add); * shutdown still need to be provided by the card driver. */ -static int dummy_enable(struct fw_card *card, u32 *config_rom, size_t length) +static int dummy_enable(struct fw_card *card, + const __be32 *config_rom, size_t length) { BUG(); return -1; @@ -478,7 +471,7 @@ static int dummy_update_phy_reg(struct fw_card *card, int address, } static int dummy_set_config_rom(struct fw_card *card, - u32 *config_rom, size_t length) + const __be32 *config_rom, size_t length) { /* * We take the card out of card_list before setting the dummy diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 5089331..231e6ee 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -130,9 +130,22 @@ struct iso_resource { struct iso_resource_event *e_alloc, *e_dealloc; }; -static void schedule_iso_resource(struct iso_resource *); static void release_iso_resource(struct client *, struct client_resource *); +static void schedule_iso_resource(struct iso_resource *r, unsigned long delay) +{ + client_get(r->client); + if (!schedule_delayed_work(&r->work, delay)) + client_put(r->client); +} + +static void schedule_if_iso_resource(struct client_resource *resource) +{ + if (resource->release == release_iso_resource) + schedule_iso_resource(container_of(resource, + struct iso_resource, resource), 0); +} + /* * dequeue_event() just kfree()'s the event, so the event has to be * the first field in a struct XYZ_event. @@ -166,7 +179,7 @@ struct iso_interrupt_event { struct iso_resource_event { struct event event; - struct fw_cdev_event_iso_resource resource; + struct fw_cdev_event_iso_resource iso_resource; }; static inline void __user *u64_to_uptr(__u64 value) @@ -314,11 +327,8 @@ static void for_each_client(struct fw_device *device, static int schedule_reallocations(int id, void *p, void *data) { - struct client_resource *r = p; + schedule_if_iso_resource(p); - if (r->release == release_iso_resource) - schedule_iso_resource(container_of(r, - struct iso_resource, resource)); return 0; } @@ -414,9 +424,7 @@ static int add_client_resource(struct client *client, &resource->handle); if (ret >= 0) { client_get(client); - if (resource->release == release_iso_resource) - schedule_iso_resource(container_of(resource, - struct iso_resource, resource)); + schedule_if_iso_resource(resource); } spin_unlock_irqrestore(&client->lock, flags); @@ -428,26 +436,26 @@ static int add_client_resource(struct client *client, static int release_client_resource(struct client *client, u32 handle, client_resource_release_fn_t release, - struct client_resource **resource) + struct client_resource **return_resource) { - struct client_resource *r; + struct client_resource *resource; spin_lock_irq(&client->lock); if (client->in_shutdown) - r = NULL; + resource = NULL; else - r = idr_find(&client->resource_idr, handle); - if (r && r->release == release) + resource = idr_find(&client->resource_idr, handle); + if (resource && resource->release == release) idr_remove(&client->resource_idr, handle); spin_unlock_irq(&client->lock); - if (!(r && r->release == release)) + if (!(resource && resource->release == release)) return -EINVAL; - if (resource) - *resource = r; + if (return_resource) + *return_resource = resource; else - r->release(client, r); + resource->release(client, resource); client_put(client); @@ -699,6 +707,7 @@ static int ioctl_send_response(struct client *client, void *buffer) struct fw_cdev_send_response *request = buffer; struct client_resource *resource; struct inbound_transaction_resource *r; + int ret = 0; if (release_client_resource(client, request->handle, release_request, &resource) < 0) @@ -708,13 +717,17 @@ static int ioctl_send_response(struct client *client, void *buffer) resource); if (request->length < r->length) r->length = request->length; - if (copy_from_user(r->data, u64_to_uptr(request->data), r->length)) - return -EFAULT; + + if (copy_from_user(r->data, u64_to_uptr(request->data), r->length)) { + ret = -EFAULT; + goto out; + } fw_send_response(client->device->card, r->request, request->rcode); + out: kfree(r); - return 0; + return ret; } static int ioctl_initiate_bus_reset(struct client *client, void *buffer) @@ -1028,8 +1041,7 @@ static void iso_resource_work(struct work_struct *work) /* Allow 1000ms grace period for other reallocations. */ if (todo == ISO_RES_ALLOC && time_is_after_jiffies(client->device->card->reset_jiffies + HZ)) { - if (schedule_delayed_work(&r->work, DIV_ROUND_UP(HZ, 3))) - client_get(client); + schedule_iso_resource(r, DIV_ROUND_UP(HZ, 3)); skip = true; } else { /* We could be called twice within the same generation. */ @@ -1097,12 +1109,12 @@ static void iso_resource_work(struct work_struct *work) e = r->e_dealloc; r->e_dealloc = NULL; } - e->resource.handle = r->resource.handle; - e->resource.channel = channel; - e->resource.bandwidth = bandwidth; + e->iso_resource.handle = r->resource.handle; + e->iso_resource.channel = channel; + e->iso_resource.bandwidth = bandwidth; queue_event(client, &e->event, - &e->resource, sizeof(e->resource), NULL, 0); + &e->iso_resource, sizeof(e->iso_resource), NULL, 0); if (free) { cancel_delayed_work(&r->work); @@ -1114,13 +1126,6 @@ static void iso_resource_work(struct work_struct *work) client_put(client); } -static void schedule_iso_resource(struct iso_resource *r) -{ - client_get(r->client); - if (!schedule_delayed_work(&r->work, 0)) - client_put(r->client); -} - static void release_iso_resource(struct client *client, struct client_resource *resource) { @@ -1129,7 +1134,7 @@ static void release_iso_resource(struct client *client, spin_lock_irq(&client->lock); r->todo = ISO_RES_DEALLOC; - schedule_iso_resource(r); + schedule_iso_resource(r, 0); spin_unlock_irq(&client->lock); } @@ -1162,10 +1167,10 @@ static int init_iso_resource(struct client *client, r->e_alloc = e1; r->e_dealloc = e2; - e1->resource.closure = request->closure; - e1->resource.type = FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED; - e2->resource.closure = request->closure; - e2->resource.type = FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED; + e1->iso_resource.closure = request->closure; + e1->iso_resource.type = FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED; + e2->iso_resource.closure = request->closure; + e2->iso_resource.type = FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED; if (todo == ISO_RES_ALLOC) { r->resource.release = release_iso_resource; @@ -1175,7 +1180,7 @@ static int init_iso_resource(struct client *client, } else { r->resource.release = NULL; r->resource.handle = -1; - schedule_iso_resource(r); + schedule_iso_resource(r, 0); } request->handle = r->resource.handle; @@ -1295,7 +1300,23 @@ static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { static int dispatch_ioctl(struct client *client, unsigned int cmd, void __user *arg) { - char buffer[256]; + char buffer[sizeof(union { + struct fw_cdev_get_info _00; + struct fw_cdev_send_request _01; + struct fw_cdev_allocate _02; + struct fw_cdev_deallocate _03; + struct fw_cdev_send_response _04; + struct fw_cdev_initiate_bus_reset _05; + struct fw_cdev_add_descriptor _06; + struct fw_cdev_remove_descriptor _07; + struct fw_cdev_create_iso_context _08; + struct fw_cdev_queue_iso _09; + struct fw_cdev_start_iso _0a; + struct fw_cdev_stop_iso _0b; + struct fw_cdev_get_cycle_timer _0c; + struct fw_cdev_allocate_iso_resource _0d; + struct fw_cdev_send_stream_packet _13; + })]; int ret; if (_IOC_TYPE(cmd) != '#' || @@ -1390,10 +1411,10 @@ static int fw_device_op_mmap(struct file *file, struct vm_area_struct *vma) static int shutdown_resource(int id, void *p, void *data) { - struct client_resource *r = p; + struct client_resource *resource = p; struct client *client = data; - r->release(client, r); + resource->release(client, resource); client_put(client); return 0; @@ -1402,7 +1423,7 @@ static int shutdown_resource(int id, void *p, void *data) static int fw_device_op_release(struct inode *inode, struct file *file) { struct client *client = file->private_data; - struct event *e, *next_e; + struct event *event, *next_event; mutex_lock(&client->device->client_list_mutex); list_del(&client->link); @@ -1423,8 +1444,8 @@ static int fw_device_op_release(struct inode *inode, struct file *file) idr_remove_all(&client->resource_idr); idr_destroy(&client->resource_idr); - list_for_each_entry_safe(e, next_e, &client->event_list, link) - kfree(e); + list_for_each_entry_safe(event, next_event, &client->event_list, link) + kfree(event); client_put(client); diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index fddf2b3..9a5f38c 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -28,9 +28,9 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/string.h> #include <asm/atomic.h> +#include <asm/byteorder.h> #include <asm/system.h> #include "core.h" @@ -510,13 +510,16 @@ static void update_tree(struct fw_card *card, struct fw_node *root) static void update_topology_map(struct fw_card *card, u32 *self_ids, int self_id_count) { - int node_count; + int node_count = (card->root_node->node_id & 0x3f) + 1; + __be32 *map = card->topology_map; + + *map++ = cpu_to_be32((self_id_count + 2) << 16); + *map++ = cpu_to_be32(be32_to_cpu(card->topology_map[1]) + 1); + *map++ = cpu_to_be32((node_count << 16) | self_id_count); + + while (self_id_count--) + *map++ = cpu_to_be32p(self_ids++); - card->topology_map[1]++; - node_count = (card->root_node->node_id & 0x3f) + 1; - card->topology_map[2] = (node_count << 16) | self_id_count; - card->topology_map[0] = (self_id_count + 2) << 16; - memcpy(&card->topology_map[3], self_ids, self_id_count * 4); fw_compute_block_crc(card->topology_map); } diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index da628c7..842739d 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -218,12 +218,15 @@ static void fw_fill_request(struct fw_packet *packet, int tcode, int tlabel, packet->header_length = 16; packet->payload_length = 0; break; + + default: + WARN(1, KERN_ERR "wrong tcode %d", tcode); } common: packet->speed = speed; packet->generation = generation; packet->ack = 0; - packet->payload_bus = 0; + packet->payload_mapped = false; } /** @@ -595,11 +598,10 @@ void fw_fill_response(struct fw_packet *response, u32 *request_header, break; default: - BUG(); - return; + WARN(1, KERN_ERR "wrong tcode %d", tcode); } - response->payload_bus = 0; + response->payload_mapped = false; } EXPORT_SYMBOL(fw_fill_response); @@ -810,8 +812,7 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request int speed, unsigned long long offset, void *payload, size_t length, void *callback_data) { - int i, start, end; - __be32 *map; + int start; if (!TCODE_IS_READ_REQUEST(tcode)) { fw_send_response(card, request, RCODE_TYPE_ERROR); @@ -824,11 +825,7 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request } start = (offset - topology_map_region.start) / 4; - end = start + length / 4; - map = payload; - - for (i = 0; i < length / 4; i++) - map[i] = cpu_to_be32(card->topology_map[start + i]); + memcpy(payload, &card->topology_map[start], length); fw_send_response(card, request, RCODE_COMPLETE); } diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 7ff6e75..ed3b1a7 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -40,7 +40,8 @@ struct fw_card_driver { * enable the PHY or set the link_on bit and initiate a bus * reset. */ - int (*enable)(struct fw_card *card, u32 *config_rom, size_t length); + int (*enable)(struct fw_card *card, + const __be32 *config_rom, size_t length); int (*update_phy_reg)(struct fw_card *card, int address, int clear_bits, int set_bits); @@ -48,10 +49,10 @@ struct fw_card_driver { /* * Update the config rom for an enabled card. This function * should change the config rom that is presented on the bus - * an initiate a bus reset. + * and initiate a bus reset. */ int (*set_config_rom)(struct fw_card *card, - u32 *config_rom, size_t length); + const __be32 *config_rom, size_t length); void (*send_request)(struct fw_card *card, struct fw_packet *packet); void (*send_response)(struct fw_card *card, struct fw_packet *packet); @@ -93,7 +94,7 @@ int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid); void fw_core_remove_card(struct fw_card *card); int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset); -int fw_compute_block_crc(u32 *block); +int fw_compute_block_crc(__be32 *block); void fw_schedule_bm_work(struct fw_card *card, unsigned long delay); static inline struct fw_card *fw_card_get(struct fw_card *card) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 94260aa..ae4556f 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -205,7 +205,7 @@ struct fw_ohci { dma_addr_t config_rom_bus; __be32 *next_config_rom; dma_addr_t next_config_rom_bus; - u32 next_header; + __be32 next_header; struct ar_context ar_request_ctx; struct ar_context ar_response_ctx; @@ -997,7 +997,8 @@ static int at_context_queue_packet(struct context *ctx, packet->ack = RCODE_SEND_ERROR; return -1; } - packet->payload_bus = payload_bus; + packet->payload_bus = payload_bus; + packet->payload_mapped = true; d[2].req_count = cpu_to_le16(packet->payload_length); d[2].data_address = cpu_to_le32(payload_bus); @@ -1025,7 +1026,7 @@ static int at_context_queue_packet(struct context *ctx, */ if (ohci->generation != packet->generation || reg_read(ohci, OHCI1394_IntEventSet) & OHCI1394_busReset) { - if (packet->payload_length > 0) + if (packet->payload_mapped) dma_unmap_single(ohci->card.device, payload_bus, packet->payload_length, DMA_TO_DEVICE); packet->ack = RCODE_GENERATION; @@ -1061,7 +1062,7 @@ static int handle_at_packet(struct context *context, /* This packet was cancelled, just continue. */ return 1; - if (packet->payload_bus) + if (packet->payload_mapped) dma_unmap_single(ohci->card.device, packet->payload_bus, packet->payload_length, DMA_TO_DEVICE); @@ -1357,8 +1358,9 @@ static void bus_reset_tasklet(unsigned long data) */ reg_write(ohci, OHCI1394_BusOptions, be32_to_cpu(ohci->config_rom[2])); - ohci->config_rom[0] = cpu_to_be32(ohci->next_header); - reg_write(ohci, OHCI1394_ConfigROMhdr, ohci->next_header); + ohci->config_rom[0] = ohci->next_header; + reg_write(ohci, OHCI1394_ConfigROMhdr, + be32_to_cpu(ohci->next_header)); } #ifdef CONFIG_FIREWIRE_OHCI_REMOTE_DMA @@ -1477,7 +1479,17 @@ static int software_reset(struct fw_ohci *ohci) return -EBUSY; } -static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length) +static void copy_config_rom(__be32 *dest, const __be32 *src, size_t length) +{ + size_t size = length * 4; + + memcpy(dest, src, size); + if (size < CONFIG_ROM_SIZE) + memset(&dest[length], 0, CONFIG_ROM_SIZE - size); +} + +static int ohci_enable(struct fw_card *card, + const __be32 *config_rom, size_t length) { struct fw_ohci *ohci = fw_ohci(card); struct pci_dev *dev = to_pci_dev(card->device); @@ -1579,8 +1591,7 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length) if (ohci->next_config_rom == NULL) return -ENOMEM; - memset(ohci->next_config_rom, 0, CONFIG_ROM_SIZE); - fw_memcpy_to_be32(ohci->next_config_rom, config_rom, length * 4); + copy_config_rom(ohci->next_config_rom, config_rom, length); } else { /* * In the suspend case, config_rom is NULL, which @@ -1590,7 +1601,7 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length) ohci->next_config_rom_bus = ohci->config_rom_bus; } - ohci->next_header = be32_to_cpu(ohci->next_config_rom[0]); + ohci->next_header = ohci->next_config_rom[0]; ohci->next_config_rom[0] = 0; reg_write(ohci, OHCI1394_ConfigROMhdr, 0); reg_write(ohci, OHCI1394_BusOptions, @@ -1624,7 +1635,7 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length) } static int ohci_set_config_rom(struct fw_card *card, - u32 *config_rom, size_t length) + const __be32 *config_rom, size_t length) { struct fw_ohci *ohci; unsigned long flags; @@ -1673,9 +1684,7 @@ static int ohci_set_config_rom(struct fw_card *card, ohci->next_config_rom = next_config_rom; ohci->next_config_rom_bus = next_config_rom_bus; - memset(ohci->next_config_rom, 0, CONFIG_ROM_SIZE); - fw_memcpy_to_be32(ohci->next_config_rom, config_rom, - length * 4); + copy_config_rom(ohci->next_config_rom, config_rom, length); ohci->next_header = config_rom[0]; ohci->next_config_rom[0] = 0; @@ -1729,7 +1738,7 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet) if (packet->ack != 0) goto out; - if (packet->payload_bus) + if (packet->payload_mapped) dma_unmap_single(ohci->card.device, packet->payload_bus, packet->payload_length, DMA_TO_DEVICE); diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index 98dbbda..d485cdd 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -820,20 +820,25 @@ static void sbp2_release_target(struct kref *kref) fw_device_put(device); } -static struct workqueue_struct *sbp2_wq; +static void sbp2_target_get(struct sbp2_target *tgt) +{ + kref_get(&tgt->kref); +} static void sbp2_target_put(struct sbp2_target *tgt) { kref_put(&tgt->kref, sbp2_release_target); } +static struct workqueue_struct *sbp2_wq; + /* * Always get the target's kref when scheduling work on one its units. * Each workqueue job is responsible to call sbp2_target_put() upon return. */ static void sbp2_queue_work(struct sbp2_logical_unit *lu, unsigned long delay) { - kref_get(&lu->tgt->kref); + sbp2_target_get(lu->tgt); if (!queue_delayed_work(sbp2_wq, &lu->work, delay)) sbp2_target_put(lu->tgt); } diff --git a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig index d7ece13..8d8a00e 100644 --- a/drivers/i2c/Kconfig +++ b/drivers/i2c/Kconfig @@ -5,6 +5,7 @@ menuconfig I2C tristate "I2C support" depends on HAS_IOMEM + select RT_MUTEXES ---help--- I2C (pronounce: I-square-C) is a slow serial bus protocol used in many micro controller applications and developed by Philips. SMBus, diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index e8fe7f1..5f318ce 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -640,22 +640,6 @@ config I2C_TINY_USB This driver can also be built as a module. If so, the module will be called i2c-tiny-usb. -comment "Graphics adapter I2C/DDC channel drivers" - depends on PCI - -config I2C_VOODOO3 - tristate "Voodoo 3 (DEPRECATED)" - depends on PCI - select I2C_ALGOBIT - help - If you say yes to this option, support will be included for the - Voodoo 3 I2C interface. This driver is deprecated and you should - use the tdfxfb driver instead, which additionally provides - framebuffer support. - - This driver can also be built as a module. If so, the module - will be called i2c-voodoo3. - comment "Other I2C/SMBus bus drivers" config I2C_ACORN diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index ff937ac..302c551 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -61,9 +61,6 @@ obj-$(CONFIG_I2C_PARPORT_LIGHT) += i2c-parport-light.o obj-$(CONFIG_I2C_TAOS_EVM) += i2c-taos-evm.o obj-$(CONFIG_I2C_TINY_USB) += i2c-tiny-usb.o -# Graphics adapter I2C/DDC channel drivers -obj-$(CONFIG_I2C_VOODOO3) += i2c-voodoo3.o - # Other I2C/SMBus bus drivers obj-$(CONFIG_I2C_ACORN) += i2c-acorn.o obj-$(CONFIG_I2C_ELEKTOR) += i2c-elektor.o diff --git a/drivers/i2c/busses/i2c-ali1535.c b/drivers/i2c/busses/i2c-ali1535.c index d108450..8de7d7b 100644 --- a/drivers/i2c/busses/i2c-ali1535.c +++ b/drivers/i2c/busses/i2c-ali1535.c @@ -138,7 +138,7 @@ static unsigned short ali1535_smba; Note the differences between kernels with the old PCI BIOS interface and newer kernels with the real PCI interface. In compat.h some things are defined to make the transition easier. */ -static int ali1535_setup(struct pci_dev *dev) +static int __devinit ali1535_setup(struct pci_dev *dev) { int retval = -ENODEV; unsigned char temp; diff --git a/drivers/i2c/busses/i2c-ali15x3.c b/drivers/i2c/busses/i2c-ali15x3.c index d627fce..e7e3205 100644 --- a/drivers/i2c/busses/i2c-ali15x3.c +++ b/drivers/i2c/busses/i2c-ali15x3.c @@ -131,7 +131,7 @@ MODULE_PARM_DESC(force_addr, static struct pci_driver ali15x3_driver; static unsigned short ali15x3_smba; -static int ali15x3_setup(struct pci_dev *ALI15X3_dev) +static int __devinit ali15x3_setup(struct pci_dev *ALI15X3_dev) { u16 a; unsigned char temp; diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 55edcfe..df6ab553 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -767,6 +767,9 @@ static int __devinit i801_probe(struct pci_dev *dev, const struct pci_device_id /* set up the sysfs linkage to our parent device */ i801_adapter.dev.parent = &dev->dev; + /* Retry up to 3 times on lost arbitration */ + i801_adapter.retries = 3; + snprintf(i801_adapter.name, sizeof(i801_adapter.name), "SMBus I801 adapter at %04lx", i801_smba); err = i2c_add_adapter(&i801_adapter); diff --git a/drivers/i2c/busses/i2c-iop3xx.c b/drivers/i2c/busses/i2c-iop3xx.c index a75c75e..5901707 100644 --- a/drivers/i2c/busses/i2c-iop3xx.c +++ b/drivers/i2c/busses/i2c-iop3xx.c @@ -56,12 +56,6 @@ iic_cook_addr(struct i2c_msg *msg) if (msg->flags & I2C_M_RD) addr |= 1; - /* - * Read or Write? - */ - if (msg->flags & I2C_M_REV_DIR_ADDR) - addr ^= 1; - return addr; } diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c index bbab0e1..ed387ff 100644 --- a/drivers/i2c/busses/i2c-mv64xxx.c +++ b/drivers/i2c/busses/i2c-mv64xxx.c @@ -338,9 +338,6 @@ mv64xxx_i2c_prepare_for_io(struct mv64xxx_i2c_data *drv_data, if (msg->flags & I2C_M_RD) dir = 1; - if (msg->flags & I2C_M_REV_DIR_ADDR) - dir ^= 1; - if (msg->flags & I2C_M_TEN) { drv_data->addr1 = 0xf0 | (((u32)msg->addr & 0x300) >> 7) | dir; drv_data->addr2 = (u32)msg->addr & 0xff; diff --git a/drivers/i2c/busses/i2c-powermac.c b/drivers/i2c/busses/i2c-powermac.c index 3c9d71f..1c440a7 100644 --- a/drivers/i2c/busses/i2c-powermac.c +++ b/drivers/i2c/busses/i2c-powermac.c @@ -49,48 +49,38 @@ static s32 i2c_powermac_smbus_xfer( struct i2c_adapter* adap, int rc = 0; int read = (read_write == I2C_SMBUS_READ); int addrdir = (addr << 1) | read; + int mode, subsize, len; + u32 subaddr; + u8 *buf; u8 local[2]; - rc = pmac_i2c_open(bus, 0); - if (rc) - return rc; + if (size == I2C_SMBUS_QUICK || size == I2C_SMBUS_BYTE) { + mode = pmac_i2c_mode_std; + subsize = 0; + subaddr = 0; + } else { + mode = read ? pmac_i2c_mode_combined : pmac_i2c_mode_stdsub; + subsize = 1; + subaddr = command; + } switch (size) { case I2C_SMBUS_QUICK: - rc = pmac_i2c_setmode(bus, pmac_i2c_mode_std); - if (rc) - goto bail; - rc = pmac_i2c_xfer(bus, addrdir, 0, 0, NULL, 0); + buf = NULL; + len = 0; break; case I2C_SMBUS_BYTE: - rc = pmac_i2c_setmode(bus, pmac_i2c_mode_std); - if (rc) - goto bail; - rc = pmac_i2c_xfer(bus, addrdir, 0, 0, &data->byte, 1); - break; case I2C_SMBUS_BYTE_DATA: - rc = pmac_i2c_setmode(bus, read ? - pmac_i2c_mode_combined : - pmac_i2c_mode_stdsub); - if (rc) - goto bail; - rc = pmac_i2c_xfer(bus, addrdir, 1, command, &data->byte, 1); + buf = &data->byte; + len = 1; break; case I2C_SMBUS_WORD_DATA: - rc = pmac_i2c_setmode(bus, read ? - pmac_i2c_mode_combined : - pmac_i2c_mode_stdsub); - if (rc) - goto bail; if (!read) { local[0] = data->word & 0xff; local[1] = (data->word >> 8) & 0xff; } - rc = pmac_i2c_xfer(bus, addrdir, 1, command, local, 2); - if (rc == 0 && read) { - data->word = ((u16)local[1]) << 8; - data->word |= local[0]; - } + buf = local; + len = 2; break; /* Note that these are broken vs. the expected smbus API where @@ -105,28 +95,44 @@ static s32 i2c_powermac_smbus_xfer( struct i2c_adapter* adap, * a repeat start/addr phase (but not stop in between) */ case I2C_SMBUS_BLOCK_DATA: - rc = pmac_i2c_setmode(bus, read ? - pmac_i2c_mode_combined : - pmac_i2c_mode_stdsub); - if (rc) - goto bail; - rc = pmac_i2c_xfer(bus, addrdir, 1, command, data->block, - data->block[0] + 1); - + buf = data->block; + len = data->block[0] + 1; break; case I2C_SMBUS_I2C_BLOCK_DATA: - rc = pmac_i2c_setmode(bus, read ? - pmac_i2c_mode_combined : - pmac_i2c_mode_stdsub); - if (rc) - goto bail; - rc = pmac_i2c_xfer(bus, addrdir, 1, command, - &data->block[1], data->block[0]); + buf = &data->block[1]; + len = data->block[0]; break; default: - rc = -EINVAL; + return -EINVAL; + } + + rc = pmac_i2c_open(bus, 0); + if (rc) { + dev_err(&adap->dev, "Failed to open I2C, err %d\n", rc); + return rc; + } + + rc = pmac_i2c_setmode(bus, mode); + if (rc) { + dev_err(&adap->dev, "Failed to set I2C mode %d, err %d\n", + mode, rc); + goto bail; } + + rc = pmac_i2c_xfer(bus, addrdir, subsize, subaddr, buf, len); + if (rc) { + dev_err(&adap->dev, + "I2C transfer at 0x%02x failed, size %d, err %d\n", + addrdir >> 1, size, rc); + goto bail; + } + + if (size == I2C_SMBUS_WORD_DATA && read) { + data->word = ((u16)local[1]) << 8; + data->word |= local[0]; + } + bail: pmac_i2c_close(bus); return rc; @@ -146,20 +152,33 @@ static int i2c_powermac_master_xfer( struct i2c_adapter *adap, int read; int addrdir; + if (num != 1) { + dev_err(&adap->dev, + "Multi-message I2C transactions not supported\n"); + return -EOPNOTSUPP; + } + if (msgs->flags & I2C_M_TEN) return -EINVAL; read = (msgs->flags & I2C_M_RD) != 0; addrdir = (msgs->addr << 1) | read; - if (msgs->flags & I2C_M_REV_DIR_ADDR) - addrdir ^= 1; rc = pmac_i2c_open(bus, 0); - if (rc) + if (rc) { + dev_err(&adap->dev, "Failed to open I2C, err %d\n", rc); return rc; + } rc = pmac_i2c_setmode(bus, pmac_i2c_mode_std); - if (rc) + if (rc) { + dev_err(&adap->dev, "Failed to set I2C mode %d, err %d\n", + pmac_i2c_mode_std, rc); goto bail; + } rc = pmac_i2c_xfer(bus, addrdir, 0, 0, msgs->buf, msgs->len); + if (rc < 0) + dev_err(&adap->dev, "I2C %s 0x%02x failed, err %d\n", + addrdir & 1 ? "read from" : "write to", addrdir >> 1, + rc); bail: pmac_i2c_close(bus); return rc < 0 ? rc : 1; @@ -183,19 +202,16 @@ static const struct i2c_algorithm i2c_powermac_algorithm = { static int __devexit i2c_powermac_remove(struct platform_device *dev) { struct i2c_adapter *adapter = platform_get_drvdata(dev); - struct pmac_i2c_bus *bus = i2c_get_adapdata(adapter); int rc; rc = i2c_del_adapter(adapter); - pmac_i2c_detach_adapter(bus, adapter); - i2c_set_adapdata(adapter, NULL); /* We aren't that prepared to deal with this... */ if (rc) printk(KERN_WARNING "i2c-powermac.c: Failed to remove bus %s !\n", adapter->name); platform_set_drvdata(dev, NULL); - kfree(adapter); + memset(adapter, 0, sizeof(*adapter)); return 0; } @@ -206,12 +222,12 @@ static int __devinit i2c_powermac_probe(struct platform_device *dev) struct pmac_i2c_bus *bus = dev->dev.platform_data; struct device_node *parent = NULL; struct i2c_adapter *adapter; - char name[32]; const char *basename; int rc; if (bus == NULL) return -EINVAL; + adapter = pmac_i2c_get_adapter(bus); /* Ok, now we need to make up a name for the interface that will * match what we used to do in the past, that is basically the @@ -237,29 +253,22 @@ static int __devinit i2c_powermac_probe(struct platform_device *dev) default: return -EINVAL; } - snprintf(name, 32, "%s %d", basename, pmac_i2c_get_channel(bus)); + snprintf(adapter->name, sizeof(adapter->name), "%s %d", basename, + pmac_i2c_get_channel(bus)); of_node_put(parent); - adapter = kzalloc(sizeof(struct i2c_adapter), GFP_KERNEL); - if (adapter == NULL) { - printk(KERN_ERR "i2c-powermac: can't allocate inteface !\n"); - return -ENOMEM; - } platform_set_drvdata(dev, adapter); - strcpy(adapter->name, name); adapter->algo = &i2c_powermac_algorithm; i2c_set_adapdata(adapter, bus); adapter->dev.parent = &dev->dev; - pmac_i2c_attach_adapter(bus, adapter); rc = i2c_add_adapter(adapter); if (rc) { printk(KERN_ERR "i2c-powermac: Adapter %s registration " - "failed\n", name); - i2c_set_adapdata(adapter, NULL); - pmac_i2c_detach_adapter(bus, adapter); + "failed\n", adapter->name); + memset(adapter, 0, sizeof(*adapter)); } - printk(KERN_INFO "PowerMac i2c bus %s registered\n", name); + printk(KERN_INFO "PowerMac i2c bus %s registered\n", adapter->name); if (!strncmp(basename, "uni-n", 5)) { struct device_node *np; diff --git a/drivers/i2c/busses/i2c-sis5595.c b/drivers/i2c/busses/i2c-sis5595.c index 139f0c7..844569f 100644 --- a/drivers/i2c/busses/i2c-sis5595.c +++ b/drivers/i2c/busses/i2c-sis5595.c @@ -142,7 +142,7 @@ static void sis5595_write(u8 reg, u8 data) outb(data, sis5595_base + SMB_DAT); } -static int sis5595_setup(struct pci_dev *SIS5595_dev) +static int __devinit sis5595_setup(struct pci_dev *SIS5595_dev) { u16 a; u8 val; diff --git a/drivers/i2c/busses/i2c-sis630.c b/drivers/i2c/busses/i2c-sis630.c index 70ca41e..68cff7a 100644 --- a/drivers/i2c/busses/i2c-sis630.c +++ b/drivers/i2c/busses/i2c-sis630.c @@ -389,7 +389,7 @@ static u32 sis630_func(struct i2c_adapter *adapter) I2C_FUNC_SMBUS_BLOCK_DATA; } -static int sis630_setup(struct pci_dev *sis630_dev) +static int __devinit sis630_setup(struct pci_dev *sis630_dev) { unsigned char b; struct pci_dev *dummy = NULL; diff --git a/drivers/i2c/busses/i2c-stub.c b/drivers/i2c/busses/i2c-stub.c index 1b7b2af..0c770ea 100644 --- a/drivers/i2c/busses/i2c-stub.c +++ b/drivers/i2c/busses/i2c-stub.c @@ -35,6 +35,10 @@ module_param_array(chip_addr, ushort, NULL, S_IRUGO); MODULE_PARM_DESC(chip_addr, "Chip addresses (up to 10, between 0x03 and 0x77)"); +static unsigned long functionality = ~0UL; +module_param(functionality, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(functionality, "Override functionality bitfield"); + struct stub_chip { u8 pointer; u16 words[256]; /* Byte operations use the LSB as per SMBus @@ -48,7 +52,7 @@ static s32 stub_xfer(struct i2c_adapter * adap, u16 addr, unsigned short flags, char read_write, u8 command, int size, union i2c_smbus_data * data) { s32 ret; - int i; + int i, len; struct stub_chip *chip = NULL; /* Search for the right chip */ @@ -118,6 +122,29 @@ static s32 stub_xfer(struct i2c_adapter * adap, u16 addr, unsigned short flags, ret = 0; break; + case I2C_SMBUS_I2C_BLOCK_DATA: + len = data->block[0]; + if (read_write == I2C_SMBUS_WRITE) { + for (i = 0; i < len; i++) { + chip->words[command + i] &= 0xff00; + chip->words[command + i] |= data->block[1 + i]; + } + dev_dbg(&adap->dev, "i2c block data - addr 0x%02x, " + "wrote %d bytes at 0x%02x.\n", + addr, len, command); + } else { + for (i = 0; i < len; i++) { + data->block[1 + i] = + chip->words[command + i] & 0xff; + } + dev_dbg(&adap->dev, "i2c block data - addr 0x%02x, " + "read %d bytes at 0x%02x.\n", + addr, len, command); + } + + ret = 0; + break; + default: dev_dbg(&adap->dev, "Unsupported I2C/SMBus command\n"); ret = -EOPNOTSUPP; @@ -129,8 +156,9 @@ static s32 stub_xfer(struct i2c_adapter * adap, u16 addr, unsigned short flags, static u32 stub_func(struct i2c_adapter *adapter) { - return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | - I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA; + return (I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | + I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | + I2C_FUNC_SMBUS_I2C_BLOCK) & functionality; } static const struct i2c_algorithm smbus_algorithm = { diff --git a/drivers/i2c/busses/i2c-voodoo3.c b/drivers/i2c/busses/i2c-voodoo3.c deleted file mode 100644 index 7663d57..0000000 --- a/drivers/i2c/busses/i2c-voodoo3.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - Copyright (c) 1998, 1999 Frodo Looijaard <frodol@dds.nl>, - Philip Edelbrock <phil@netroedge.com>, - Ralph Metzler <rjkm@thp.uni-koeln.de>, and - Mark D. Studebaker <mdsxyz123@yahoo.com> - - Based on code written by Ralph Metzler <rjkm@thp.uni-koeln.de> and - Simon Vogl - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -/* This interfaces to the I2C bus of the Voodoo3 to gain access to - the BT869 and possibly other I2C devices. */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/i2c.h> -#include <linux/i2c-algo-bit.h> -#include <asm/io.h> - -/* the only registers we use */ -#define REG 0x78 -#define REG2 0x70 - -/* bit locations in the register */ -#define DDC_ENAB 0x00040000 -#define DDC_SCL_OUT 0x00080000 -#define DDC_SDA_OUT 0x00100000 -#define DDC_SCL_IN 0x00200000 -#define DDC_SDA_IN 0x00400000 -#define I2C_ENAB 0x00800000 -#define I2C_SCL_OUT 0x01000000 -#define I2C_SDA_OUT 0x02000000 -#define I2C_SCL_IN 0x04000000 -#define I2C_SDA_IN 0x08000000 - -/* initialization states */ -#define INIT2 0x2 -#define INIT3 0x4 - -/* delays */ -#define CYCLE_DELAY 10 -#define TIMEOUT (HZ / 2) - - -static void __iomem *ioaddr; - -/* The voo GPIO registers don't have individual masks for each bit - so we always have to read before writing. */ - -static void bit_vooi2c_setscl(void *data, int val) -{ - unsigned int r; - r = readl(ioaddr + REG); - if (val) - r |= I2C_SCL_OUT; - else - r &= ~I2C_SCL_OUT; - writel(r, ioaddr + REG); - readl(ioaddr + REG); /* flush posted write */ -} - -static void bit_vooi2c_setsda(void *data, int val) -{ - unsigned int r; - r = readl(ioaddr + REG); - if (val) - r |= I2C_SDA_OUT; - else - r &= ~I2C_SDA_OUT; - writel(r, ioaddr + REG); - readl(ioaddr + REG); /* flush posted write */ -} - -/* The GPIO pins are open drain, so the pins always remain outputs. - We rely on the i2c-algo-bit routines to set the pins high before - reading the input from other chips. */ - -static int bit_vooi2c_getscl(void *data) -{ - return (0 != (readl(ioaddr + REG) & I2C_SCL_IN)); -} - -static int bit_vooi2c_getsda(void *data) -{ - return (0 != (readl(ioaddr + REG) & I2C_SDA_IN)); -} - -static void bit_vooddc_setscl(void *data, int val) -{ - unsigned int r; - r = readl(ioaddr + REG); - if (val) - r |= DDC_SCL_OUT; - else - r &= ~DDC_SCL_OUT; - writel(r, ioaddr + REG); - readl(ioaddr + REG); /* flush posted write */ -} - -static void bit_vooddc_setsda(void *data, int val) -{ - unsigned int r; - r = readl(ioaddr + REG); - if (val) - r |= DDC_SDA_OUT; - else - r &= ~DDC_SDA_OUT; - writel(r, ioaddr + REG); - readl(ioaddr + REG); /* flush posted write */ -} - -static int bit_vooddc_getscl(void *data) -{ - return (0 != (readl(ioaddr + REG) & DDC_SCL_IN)); -} - -static int bit_vooddc_getsda(void *data) -{ - return (0 != (readl(ioaddr + REG) & DDC_SDA_IN)); -} - -static int config_v3(struct pci_dev *dev) -{ - unsigned long cadr; - - /* map Voodoo3 memory */ - cadr = dev->resource[0].start; - cadr &= PCI_BASE_ADDRESS_MEM_MASK; - ioaddr = ioremap_nocache(cadr, 0x1000); - if (ioaddr) { - writel(0x8160, ioaddr + REG2); - writel(0xcffc0020, ioaddr + REG); - dev_info(&dev->dev, "Using Banshee/Voodoo3 I2C device at %p\n", ioaddr); - return 0; - } - return -ENODEV; -} - -static struct i2c_algo_bit_data voo_i2c_bit_data = { - .setsda = bit_vooi2c_setsda, - .setscl = bit_vooi2c_setscl, - .getsda = bit_vooi2c_getsda, - .getscl = bit_vooi2c_getscl, - .udelay = CYCLE_DELAY, - .timeout = TIMEOUT -}; - -static struct i2c_adapter voodoo3_i2c_adapter = { - .owner = THIS_MODULE, - .name = "I2C Voodoo3/Banshee adapter", - .algo_data = &voo_i2c_bit_data, -}; - -static struct i2c_algo_bit_data voo_ddc_bit_data = { - .setsda = bit_vooddc_setsda, - .setscl = bit_vooddc_setscl, - .getsda = bit_vooddc_getsda, - .getscl = bit_vooddc_getscl, - .udelay = CYCLE_DELAY, - .timeout = TIMEOUT -}; - -static struct i2c_adapter voodoo3_ddc_adapter = { - .owner = THIS_MODULE, - .class = I2C_CLASS_DDC, - .name = "DDC Voodoo3/Banshee adapter", - .algo_data = &voo_ddc_bit_data, -}; - -static struct pci_device_id voodoo3_ids[] __devinitdata = { - { PCI_DEVICE(PCI_VENDOR_ID_3DFX, PCI_DEVICE_ID_3DFX_VOODOO3) }, - { PCI_DEVICE(PCI_VENDOR_ID_3DFX, PCI_DEVICE_ID_3DFX_BANSHEE) }, - { 0, } -}; - -MODULE_DEVICE_TABLE (pci, voodoo3_ids); - -static int __devinit voodoo3_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - int retval; - - retval = config_v3(dev); - if (retval) - return retval; - - /* set up the sysfs linkage to our parent device */ - voodoo3_i2c_adapter.dev.parent = &dev->dev; - voodoo3_ddc_adapter.dev.parent = &dev->dev; - - retval = i2c_bit_add_bus(&voodoo3_i2c_adapter); - if (retval) - return retval; - retval = i2c_bit_add_bus(&voodoo3_ddc_adapter); - if (retval) - i2c_del_adapter(&voodoo3_i2c_adapter); - return retval; -} - -static void __devexit voodoo3_remove(struct pci_dev *dev) -{ - i2c_del_adapter(&voodoo3_i2c_adapter); - i2c_del_adapter(&voodoo3_ddc_adapter); - iounmap(ioaddr); -} - -static struct pci_driver voodoo3_driver = { - .name = "voodoo3_smbus", - .id_table = voodoo3_ids, - .probe = voodoo3_probe, - .remove = __devexit_p(voodoo3_remove), -}; - -static int __init i2c_voodoo3_init(void) -{ - return pci_register_driver(&voodoo3_driver); -} - -static void __exit i2c_voodoo3_exit(void) -{ - pci_unregister_driver(&voodoo3_driver); -} - - -MODULE_AUTHOR("Frodo Looijaard <frodol@dds.nl>, " - "Philip Edelbrock <phil@netroedge.com>, " - "Ralph Metzler <rjkm@thp.uni-koeln.de>, " - "and Mark D. Studebaker <mdsxyz123@yahoo.com>"); -MODULE_DESCRIPTION("Voodoo3 I2C/SMBus driver"); -MODULE_LICENSE("GPL"); - -module_init(i2c_voodoo3_init); -module_exit(i2c_voodoo3_exit); diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig index f9618f4..ae4539d 100644 --- a/drivers/i2c/chips/Kconfig +++ b/drivers/i2c/chips/Kconfig @@ -6,16 +6,6 @@ menu "Miscellaneous I2C Chip support" -config DS1682 - tristate "Dallas DS1682 Total Elapsed Time Recorder with Alarm" - depends on EXPERIMENTAL - help - If you say yes here you get support for Dallas Semiconductor - DS1682 Total Elapsed Time Recorder. - - This driver can also be built as a module. If so, the module - will be called ds1682. - config SENSORS_TSL2550 tristate "Taos TSL2550 ambient light sensor" depends on EXPERIMENTAL diff --git a/drivers/i2c/chips/Makefile b/drivers/i2c/chips/Makefile index 749cf36..fe0af0f 100644 --- a/drivers/i2c/chips/Makefile +++ b/drivers/i2c/chips/Makefile @@ -10,7 +10,6 @@ # * I/O expander drivers go to drivers/gpio # -obj-$(CONFIG_DS1682) += ds1682.o obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o ifeq ($(CONFIG_I2C_DEBUG_CHIP),y) diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 2965043..4f34823 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -558,11 +558,9 @@ static void i2c_scan_static_board_info(struct i2c_adapter *adapter) up_read(&__i2c_board_lock); } -static int i2c_do_add_adapter(struct device_driver *d, void *data) +static int i2c_do_add_adapter(struct i2c_driver *driver, + struct i2c_adapter *adap) { - struct i2c_driver *driver = to_i2c_driver(d); - struct i2c_adapter *adap = data; - /* Detect supported devices on that bus, and instantiate them */ i2c_detect(adap, driver); @@ -574,6 +572,11 @@ static int i2c_do_add_adapter(struct device_driver *d, void *data) return 0; } +static int __process_new_adapter(struct device_driver *d, void *data) +{ + return i2c_do_add_adapter(to_i2c_driver(d), data); +} + static int i2c_register_adapter(struct i2c_adapter *adap) { int res = 0, dummy; @@ -584,7 +587,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap) goto out_list; } - mutex_init(&adap->bus_lock); + rt_mutex_init(&adap->bus_lock); /* Set default timeout to 1 second if not already set */ if (adap->timeout == 0) @@ -614,7 +617,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap) /* Notify drivers */ mutex_lock(&core_lock); dummy = bus_for_each_drv(&i2c_bus_type, NULL, adap, - i2c_do_add_adapter); + __process_new_adapter); mutex_unlock(&core_lock); return 0; @@ -715,10 +718,9 @@ retry: } EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter); -static int i2c_do_del_adapter(struct device_driver *d, void *data) +static int i2c_do_del_adapter(struct i2c_driver *driver, + struct i2c_adapter *adapter) { - struct i2c_driver *driver = to_i2c_driver(d); - struct i2c_adapter *adapter = data; struct i2c_client *client, *_n; int res; @@ -750,6 +752,11 @@ static int __unregister_client(struct device *dev, void *dummy) return 0; } +static int __process_removed_adapter(struct device_driver *d, void *data) +{ + return i2c_do_del_adapter(to_i2c_driver(d), data); +} + /** * i2c_del_adapter - unregister I2C adapter * @adap: the adapter being unregistered @@ -777,7 +784,7 @@ int i2c_del_adapter(struct i2c_adapter *adap) /* Tell drivers about this removal */ mutex_lock(&core_lock); res = bus_for_each_drv(&i2c_bus_type, NULL, adap, - i2c_do_del_adapter); + __process_removed_adapter); mutex_unlock(&core_lock); if (res) return res; @@ -826,22 +833,11 @@ EXPORT_SYMBOL(i2c_del_adapter); /* ------------------------------------------------------------------------- */ -static int __attach_adapter(struct device *dev, void *data) +static int __process_new_driver(struct device *dev, void *data) { - struct i2c_adapter *adapter; - struct i2c_driver *driver = data; - if (dev->type != &i2c_adapter_type) return 0; - adapter = to_i2c_adapter(dev); - - i2c_detect(adapter, driver); - - /* Legacy drivers scan i2c busses directly */ - if (driver->attach_adapter) - driver->attach_adapter(adapter); - - return 0; + return i2c_do_add_adapter(data, to_i2c_adapter(dev)); } /* @@ -873,40 +869,18 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) INIT_LIST_HEAD(&driver->clients); /* Walk the adapters that are already present */ mutex_lock(&core_lock); - bus_for_each_dev(&i2c_bus_type, NULL, driver, __attach_adapter); + bus_for_each_dev(&i2c_bus_type, NULL, driver, __process_new_driver); mutex_unlock(&core_lock); return 0; } EXPORT_SYMBOL(i2c_register_driver); -static int __detach_adapter(struct device *dev, void *data) +static int __process_removed_driver(struct device *dev, void *data) { - struct i2c_adapter *adapter; - struct i2c_driver *driver = data; - struct i2c_client *client, *_n; - if (dev->type != &i2c_adapter_type) return 0; - adapter = to_i2c_adapter(dev); - - /* Remove the devices we created ourselves as the result of hardware - * probing (using a driver's detect method) */ - list_for_each_entry_safe(client, _n, &driver->clients, detected) { - dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", - client->name, client->addr); - list_del(&client->detected); - i2c_unregister_device(client); - } - - if (driver->detach_adapter) { - if (driver->detach_adapter(adapter)) - dev_err(&adapter->dev, - "detach_adapter failed for driver [%s]\n", - driver->driver.name); - } - - return 0; + return i2c_do_del_adapter(data, to_i2c_adapter(dev)); } /** @@ -917,7 +891,7 @@ static int __detach_adapter(struct device *dev, void *data) void i2c_del_driver(struct i2c_driver *driver) { mutex_lock(&core_lock); - bus_for_each_dev(&i2c_bus_type, NULL, driver, __detach_adapter); + bus_for_each_dev(&i2c_bus_type, NULL, driver, __process_removed_driver); mutex_unlock(&core_lock); driver_unregister(&driver->driver); @@ -1092,12 +1066,12 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) #endif if (in_atomic() || irqs_disabled()) { - ret = mutex_trylock(&adap->bus_lock); + ret = rt_mutex_trylock(&adap->bus_lock); if (!ret) /* I2C activity is ongoing. */ return -EAGAIN; } else { - mutex_lock_nested(&adap->bus_lock, adap->level); + rt_mutex_lock(&adap->bus_lock); } /* Retry automatically on arbitration loss */ @@ -1109,7 +1083,7 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) if (time_after(jiffies, orig_jiffies + adap->timeout)) break; } - mutex_unlock(&adap->bus_lock); + rt_mutex_unlock(&adap->bus_lock); return ret; } else { @@ -1180,7 +1154,7 @@ EXPORT_SYMBOL(i2c_master_recv); * ---------------------------------------------------- */ -static int i2c_detect_address(struct i2c_client *temp_client, int kind, +static int i2c_detect_address(struct i2c_client *temp_client, struct i2c_driver *driver) { struct i2c_board_info info; @@ -1199,22 +1173,18 @@ static int i2c_detect_address(struct i2c_client *temp_client, int kind, if (i2c_check_addr(adapter, addr)) return 0; - /* Make sure there is something at this address, unless forced */ - if (kind < 0) { - if (i2c_smbus_xfer(adapter, addr, 0, 0, 0, - I2C_SMBUS_QUICK, NULL) < 0) - return 0; + /* Make sure there is something at this address */ + if (i2c_smbus_xfer(adapter, addr, 0, 0, 0, I2C_SMBUS_QUICK, NULL) < 0) + return 0; - /* prevent 24RF08 corruption */ - if ((addr & ~0x0f) == 0x50) - i2c_smbus_xfer(adapter, addr, 0, 0, 0, - I2C_SMBUS_QUICK, NULL); - } + /* Prevent 24RF08 corruption */ + if ((addr & ~0x0f) == 0x50) + i2c_smbus_xfer(adapter, addr, 0, 0, 0, I2C_SMBUS_QUICK, NULL); /* Finally call the custom detection function */ memset(&info, 0, sizeof(struct i2c_board_info)); info.addr = addr; - err = driver->detect(temp_client, kind, &info); + err = driver->detect(temp_client, -1, &info); if (err) { /* -ENODEV is returned if the detection fails. We catch it here as this isn't an error. */ @@ -1259,40 +1229,13 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) return -ENOMEM; temp_client->adapter = adapter; - /* Force entries are done first, and are not affected by ignore - entries */ - if (address_data->forces) { - const unsigned short * const *forces = address_data->forces; - int kind; - - for (kind = 0; forces[kind]; kind++) { - for (i = 0; forces[kind][i] != I2C_CLIENT_END; - i += 2) { - if (forces[kind][i] == adap_id - || forces[kind][i] == ANY_I2C_BUS) { - dev_dbg(&adapter->dev, "found force " - "parameter for adapter %d, " - "addr 0x%02x, kind %d\n", - adap_id, forces[kind][i + 1], - kind); - temp_client->addr = forces[kind][i + 1]; - err = i2c_detect_address(temp_client, - kind, driver); - if (err) - goto exit_free; - } - } - } - } - /* Stop here if the classes do not match */ if (!(adapter->class & driver->class)) goto exit_free; /* Stop here if we can't use SMBUS_QUICK */ if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_QUICK)) { - if (address_data->probe[0] == I2C_CLIENT_END - && address_data->normal_i2c[0] == I2C_CLIENT_END) + if (address_data->normal_i2c[0] == I2C_CLIENT_END) goto exit_free; dev_warn(&adapter->dev, "SMBus Quick command not supported, " @@ -1301,48 +1244,12 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) goto exit_free; } - /* Probe entries are done second, and are not affected by ignore - entries either */ - for (i = 0; address_data->probe[i] != I2C_CLIENT_END; i += 2) { - if (address_data->probe[i] == adap_id - || address_data->probe[i] == ANY_I2C_BUS) { - dev_dbg(&adapter->dev, "found probe parameter for " - "adapter %d, addr 0x%02x\n", adap_id, - address_data->probe[i + 1]); - temp_client->addr = address_data->probe[i + 1]; - err = i2c_detect_address(temp_client, -1, driver); - if (err) - goto exit_free; - } - } - - /* Normal entries are done last, unless shadowed by an ignore entry */ for (i = 0; address_data->normal_i2c[i] != I2C_CLIENT_END; i += 1) { - int j, ignore; - - ignore = 0; - for (j = 0; address_data->ignore[j] != I2C_CLIENT_END; - j += 2) { - if ((address_data->ignore[j] == adap_id || - address_data->ignore[j] == ANY_I2C_BUS) - && address_data->ignore[j + 1] - == address_data->normal_i2c[i]) { - dev_dbg(&adapter->dev, "found ignore " - "parameter for adapter %d, " - "addr 0x%02x\n", adap_id, - address_data->ignore[j + 1]); - ignore = 1; - break; - } - } - if (ignore) - continue; - dev_dbg(&adapter->dev, "found normal entry for adapter %d, " "addr 0x%02x\n", adap_id, address_data->normal_i2c[i]); temp_client->addr = address_data->normal_i2c[i]; - err = i2c_detect_address(temp_client, -1, driver); + err = i2c_detect_address(temp_client, driver); if (err) goto exit_free; } @@ -1913,7 +1820,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags, flags &= I2C_M_TEN | I2C_CLIENT_PEC; if (adapter->algo->smbus_xfer) { - mutex_lock(&adapter->bus_lock); + rt_mutex_lock(&adapter->bus_lock); /* Retry automatically on arbitration loss */ orig_jiffies = jiffies; @@ -1927,7 +1834,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags, orig_jiffies + adapter->timeout)) break; } - mutex_unlock(&adapter->bus_lock); + rt_mutex_unlock(&adapter->bus_lock); } else res = i2c_smbus_xfer_emulated(adapter,addr,flags,read_write, command, protocol, data); diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c index 7e13d2d..f4110aa 100644 --- a/drivers/i2c/i2c-dev.c +++ b/drivers/i2c/i2c-dev.c @@ -34,7 +34,6 @@ #include <linux/list.h> #include <linux/i2c.h> #include <linux/i2c-dev.h> -#include <linux/smp_lock.h> #include <linux/jiffies.h> #include <asm/uaccess.h> @@ -445,20 +444,14 @@ static int i2cdev_open(struct inode *inode, struct file *file) struct i2c_client *client; struct i2c_adapter *adap; struct i2c_dev *i2c_dev; - int ret = 0; - lock_kernel(); i2c_dev = i2c_dev_get_by_minor(minor); - if (!i2c_dev) { - ret = -ENODEV; - goto out; - } + if (!i2c_dev) + return -ENODEV; adap = i2c_get_adapter(i2c_dev->adap->nr); - if (!adap) { - ret = -ENODEV; - goto out; - } + if (!adap) + return -ENODEV; /* This creates an anonymous i2c_client, which may later be * pointed to some address using I2C_SLAVE or I2C_SLAVE_FORCE. @@ -470,8 +463,7 @@ static int i2cdev_open(struct inode *inode, struct file *file) client = kzalloc(sizeof(*client), GFP_KERNEL); if (!client) { i2c_put_adapter(adap); - ret = -ENOMEM; - goto out; + return -ENOMEM; } snprintf(client->name, I2C_NAME_SIZE, "i2c-dev %d", adap->nr); client->driver = &i2cdev_driver; @@ -479,9 +471,7 @@ static int i2cdev_open(struct inode *inode, struct file *file) client->adapter = adap; file->private_data = client; -out: - unlock_kernel(); - return ret; + return 0; } static int i2cdev_release(struct inode *inode, struct file *file) diff --git a/drivers/ide/ide-pci-generic.c b/drivers/ide/ide-pci-generic.c index 39d4e01..a743e68 100644 --- a/drivers/ide/ide-pci-generic.c +++ b/drivers/ide/ide-pci-generic.c @@ -162,9 +162,10 @@ static const struct pci_device_id generic_pci_tbl[] = { #ifdef CONFIG_BLK_DEV_IDE_SATA { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_8237_SATA), 5 }, #endif - { PCI_VDEVICE(TOSHIBA, PCI_DEVICE_ID_TOSHIBA_PICCOLO), 4 }, { PCI_VDEVICE(TOSHIBA, PCI_DEVICE_ID_TOSHIBA_PICCOLO_1), 4 }, { PCI_VDEVICE(TOSHIBA, PCI_DEVICE_ID_TOSHIBA_PICCOLO_2), 4 }, + { PCI_VDEVICE(TOSHIBA, PCI_DEVICE_ID_TOSHIBA_PICCOLO_3), 4 }, + { PCI_VDEVICE(TOSHIBA, PCI_DEVICE_ID_TOSHIBA_PICCOLO_5), 4 }, { PCI_VDEVICE(NETCELL, PCI_DEVICE_ID_REVOLUTION), 6 }, /* * Must come last. If you add entries adjust diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c index 65c1429..d0dc1db 100644 --- a/drivers/ieee1394/ohci1394.c +++ b/drivers/ieee1394/ohci1394.c @@ -82,6 +82,7 @@ * */ +#include <linux/bitops.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/slab.h> @@ -434,7 +435,6 @@ static void initialize_dma_trm_ctx(struct dma_trm_ctx *d) /* Count the number of available iso contexts */ static int get_nb_iso_ctx(struct ti_ohci *ohci, int reg) { - int i,ctx=0; u32 tmp; reg_write(ohci, reg, 0xffffffff); @@ -443,11 +443,7 @@ static int get_nb_iso_ctx(struct ti_ohci *ohci, int reg) DBGMSG("Iso contexts reg: %08x implemented: %08x", reg, tmp); /* Count the number of contexts */ - for (i=0; i<32; i++) { - if (tmp & 1) ctx++; - tmp >>= 1; - } - return ctx; + return hweight32(tmp); } /* Global initialization */ diff --git a/drivers/input/keyboard/omap-keypad.c b/drivers/input/keyboard/omap-keypad.c index bba85ad..1a494d5 100644 --- a/drivers/input/keyboard/omap-keypad.c +++ b/drivers/input/keyboard/omap-keypad.c @@ -35,12 +35,12 @@ #include <linux/mutex.h> #include <linux/errno.h> #include <mach/gpio.h> -#include <mach/keypad.h> -#include <mach/menelaus.h> +#include <plat/keypad.h> +#include <plat/menelaus.h> #include <asm/irq.h> #include <mach/hardware.h> #include <asm/io.h> -#include <mach/mux.h> +#include <plat/mux.h> #undef NEW_BOARD_LEARNING_MODE diff --git a/drivers/leds/leds-ams-delta.c b/drivers/leds/leds-ams-delta.c index 4460507..b982603 100644 --- a/drivers/leds/leds-ams-delta.c +++ b/drivers/leds/leds-ams-delta.c @@ -12,7 +12,7 @@ #include <linux/init.h> #include <linux/platform_device.h> #include <linux/leds.h> -#include <mach/board-ams-delta.h> +#include <plat/board-ams-delta.h> /* * Our context diff --git a/drivers/leds/leds-locomo.c b/drivers/leds/leds-locomo.c index 5d91362..1f7c10f 100644 --- a/drivers/leds/leds-locomo.c +++ b/drivers/leds/leds-locomo.c @@ -44,7 +44,7 @@ static void locomoled_brightness_set1(struct led_classdev *led_cdev, static struct led_classdev locomo_led0 = { .name = "locomo:amber:charge", - .default_trigger = "sharpsl-charge", + .default_trigger = "main-battery-charging", .brightness_set = locomoled_brightness_set0, }; diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index cc9f275..7b4ef5b 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -27,54 +27,49 @@ static int mouse_last_keycode; /* file(s) in /proc/sys/dev/mac_hid */ static ctl_table mac_hid_files[] = { { - .ctl_name = DEV_MAC_HID_MOUSE_BUTTON_EMULATION, .procname = "mouse_button_emulation", .data = &mouse_emulate_buttons, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, .procname = "mouse_button2_keycode", .data = &mouse_button2_keycode, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, .procname = "mouse_button3_keycode", .data = &mouse_button3_keycode, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; /* dir in /proc/sys/dev */ static ctl_table mac_hid_dir[] = { { - .ctl_name = DEV_MAC_HID, .procname = "mac_hid", .maxlen = 0, .mode = 0555, .child = mac_hid_files, }, - { .ctl_name = 0 } + { } }; /* /proc/sys/dev itself, in case that is not there yet */ static ctl_table mac_hid_root_dir[] = { { - .ctl_name = CTL_DEV, .procname = "dev", .maxlen = 0, .mode = 0555, .child = mac_hid_dir, }, - { .ctl_name = 0 } + { } }; static struct ctl_table_header *mac_hid_sysctl_header; diff --git a/drivers/md/md.c b/drivers/md/md.c index b182f86..5f154ef 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -98,44 +98,40 @@ static struct ctl_table_header *raid_table_header; static ctl_table raid_table[] = { { - .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; static ctl_table raid_dir_table[] = { { - .ctl_name = DEV_RAID, .procname = "raid", .maxlen = 0, .mode = S_IRUGO|S_IXUGO, .child = raid_table, }, - { .ctl_name = 0 } + { } }; static ctl_table raid_root_table[] = { { - .ctl_name = CTL_DEV, .procname = "dev", .maxlen = 0, .mode = 0555, .child = raid_dir_table, }, - { .ctl_name = 0 } + { } }; static const struct block_device_operations md_fops; diff --git a/drivers/media/radio/Kconfig b/drivers/media/radio/Kconfig index a87a477..b134553 100644 --- a/drivers/media/radio/Kconfig +++ b/drivers/media/radio/Kconfig @@ -195,6 +195,24 @@ config RADIO_MAESTRO To compile this driver as a module, choose M here: the module will be called radio-maestro. +config RADIO_MIROPCM20 + tristate "miroSOUND PCM20 radio" + depends on ISA && VIDEO_V4L2 + select SND_MIRO + ---help--- + Choose Y here if you have this FM radio card. You also need to enable + the ALSA sound system. This choice automatically selects the ALSA + sound card driver "Miro miroSOUND PCM1pro/PCM12/PCM20radio" as this + is required for the radio-miropcm20. + + In order to control your radio card, you will need to use programs + that are compatible with the Video For Linux API. Information on + this API and pointers to "v4l" programs may be found at + <file:Documentation/video4linux/API.html>. + + To compile this driver as a module, choose M here: the + module will be called radio-miropcm20. + config RADIO_SF16FMI tristate "SF16FMI Radio" depends on ISA && VIDEO_V4L2 diff --git a/drivers/media/radio/Makefile b/drivers/media/radio/Makefile index 2a1be3b..8a63d54 100644 --- a/drivers/media/radio/Makefile +++ b/drivers/media/radio/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_RADIO_TRUST) += radio-trust.o obj-$(CONFIG_I2C_SI4713) += si4713-i2c.o obj-$(CONFIG_RADIO_SI4713) += radio-si4713.o obj-$(CONFIG_RADIO_MAESTRO) += radio-maestro.o +obj-$(CONFIG_RADIO_MIROPCM20) += radio-miropcm20.o obj-$(CONFIG_USB_DSBR) += dsbr100.o obj-$(CONFIG_RADIO_SI470X) += si470x/ obj-$(CONFIG_USB_MR800) += radio-mr800.o diff --git a/drivers/media/radio/radio-miropcm20.c b/drivers/media/radio/radio-miropcm20.c new file mode 100644 index 0000000..4ff8854 --- /dev/null +++ b/drivers/media/radio/radio-miropcm20.c @@ -0,0 +1,270 @@ +/* Miro PCM20 radio driver for Linux radio support + * (c) 1998 Ruurd Reitsma <R.A.Reitsma@wbmt.tudelft.nl> + * Thanks to Norberto Pellici for the ACI device interface specification + * The API part is based on the radiotrack driver by M. Kirkwood + * This driver relies on the aci mixer provided by the snd-miro + * ALSA driver. + * Look there for further info... + */ + +/* What ever you think about the ACI, version 0x07 is not very well! + * I can't get frequency, 'tuner status', 'tuner flags' or mute/mono + * conditions... Robert + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/videodev2.h> +#include <media/v4l2-device.h> +#include <media/v4l2-ioctl.h> +#include <sound/aci.h> + +static int radio_nr = -1; +module_param(radio_nr, int, 0); +MODULE_PARM_DESC(radio_nr, "Set radio device number (/dev/radioX). Default: -1 (autodetect)"); + +static int mono; +module_param(mono, bool, 0); +MODULE_PARM_DESC(mono, "Force tuner into mono mode."); + +struct pcm20 { + struct v4l2_device v4l2_dev; + struct video_device vdev; + unsigned long freq; + int muted; + struct snd_miro_aci *aci; +}; + +static struct pcm20 pcm20_card = { + .freq = 87*16000, + .muted = 1, +}; + +static int pcm20_mute(struct pcm20 *dev, unsigned char mute) +{ + dev->muted = mute; + return snd_aci_cmd(dev->aci, ACI_SET_TUNERMUTE, mute, -1); +} + +static int pcm20_stereo(struct pcm20 *dev, unsigned char stereo) +{ + return snd_aci_cmd(dev->aci, ACI_SET_TUNERMONO, !stereo, -1); +} + +static int pcm20_setfreq(struct pcm20 *dev, unsigned long freq) +{ + unsigned char freql; + unsigned char freqh; + struct snd_miro_aci *aci = dev->aci; + + dev->freq = freq; + + freq /= 160; + if (!(aci->aci_version == 0x07 || aci->aci_version >= 0xb0)) + freq /= 10; /* I don't know exactly which version + * needs this hack */ + freql = freq & 0xff; + freqh = freq >> 8; + + pcm20_stereo(dev, !mono); + return snd_aci_cmd(aci, ACI_WRITE_TUNE, freql, freqh); +} + +static const struct v4l2_file_operations pcm20_fops = { + .owner = THIS_MODULE, + .ioctl = video_ioctl2, +}; + +static int vidioc_querycap(struct file *file, void *priv, + struct v4l2_capability *v) +{ + strlcpy(v->driver, "Miro PCM20", sizeof(v->driver)); + strlcpy(v->card, "Miro PCM20", sizeof(v->card)); + strlcpy(v->bus_info, "ISA", sizeof(v->bus_info)); + v->version = 0x1; + v->capabilities = V4L2_CAP_TUNER | V4L2_CAP_RADIO; + return 0; +} + +static int vidioc_g_tuner(struct file *file, void *priv, + struct v4l2_tuner *v) +{ + if (v->index) /* Only 1 tuner */ + return -EINVAL; + strlcpy(v->name, "FM", sizeof(v->name)); + v->type = V4L2_TUNER_RADIO; + v->rangelow = 87*16000; + v->rangehigh = 108*16000; + v->signal = 0xffff; + v->rxsubchans = V4L2_TUNER_SUB_MONO | V4L2_TUNER_SUB_STEREO; + v->capability = V4L2_TUNER_CAP_LOW; + v->audmode = V4L2_TUNER_MODE_MONO; + return 0; +} + +static int vidioc_s_tuner(struct file *file, void *priv, + struct v4l2_tuner *v) +{ + return v->index ? -EINVAL : 0; +} + +static int vidioc_g_frequency(struct file *file, void *priv, + struct v4l2_frequency *f) +{ + struct pcm20 *dev = video_drvdata(file); + + if (f->tuner != 0) + return -EINVAL; + + f->type = V4L2_TUNER_RADIO; + f->frequency = dev->freq; + return 0; +} + + +static int vidioc_s_frequency(struct file *file, void *priv, + struct v4l2_frequency *f) +{ + struct pcm20 *dev = video_drvdata(file); + + if (f->tuner != 0 || f->type != V4L2_TUNER_RADIO) + return -EINVAL; + + dev->freq = f->frequency; + pcm20_setfreq(dev, f->frequency); + return 0; +} + +static int vidioc_queryctrl(struct file *file, void *priv, + struct v4l2_queryctrl *qc) +{ + switch (qc->id) { + case V4L2_CID_AUDIO_MUTE: + return v4l2_ctrl_query_fill(qc, 0, 1, 1, 1); + } + return -EINVAL; +} + +static int vidioc_g_ctrl(struct file *file, void *priv, + struct v4l2_control *ctrl) +{ + struct pcm20 *dev = video_drvdata(file); + + switch (ctrl->id) { + case V4L2_CID_AUDIO_MUTE: + ctrl->value = dev->muted; + break; + default: + return -EINVAL; + } + return 0; +} + +static int vidioc_s_ctrl(struct file *file, void *priv, + struct v4l2_control *ctrl) +{ + struct pcm20 *dev = video_drvdata(file); + + switch (ctrl->id) { + case V4L2_CID_AUDIO_MUTE: + pcm20_mute(dev, ctrl->value); + break; + default: + return -EINVAL; + } + return 0; +} + +static int vidioc_g_input(struct file *filp, void *priv, unsigned int *i) +{ + *i = 0; + return 0; +} + +static int vidioc_s_input(struct file *filp, void *priv, unsigned int i) +{ + return i ? -EINVAL : 0; +} + +static int vidioc_g_audio(struct file *file, void *priv, + struct v4l2_audio *a) +{ + a->index = 0; + strlcpy(a->name, "Radio", sizeof(a->name)); + a->capability = V4L2_AUDCAP_STEREO; + return 0; +} + +static int vidioc_s_audio(struct file *file, void *priv, + struct v4l2_audio *a) +{ + return a->index ? -EINVAL : 0; +} + +static const struct v4l2_ioctl_ops pcm20_ioctl_ops = { + .vidioc_querycap = vidioc_querycap, + .vidioc_g_tuner = vidioc_g_tuner, + .vidioc_s_tuner = vidioc_s_tuner, + .vidioc_g_frequency = vidioc_g_frequency, + .vidioc_s_frequency = vidioc_s_frequency, + .vidioc_queryctrl = vidioc_queryctrl, + .vidioc_g_ctrl = vidioc_g_ctrl, + .vidioc_s_ctrl = vidioc_s_ctrl, + .vidioc_g_audio = vidioc_g_audio, + .vidioc_s_audio = vidioc_s_audio, + .vidioc_g_input = vidioc_g_input, + .vidioc_s_input = vidioc_s_input, +}; + +static int __init pcm20_init(void) +{ + struct pcm20 *dev = &pcm20_card; + struct v4l2_device *v4l2_dev = &dev->v4l2_dev; + int res; + + dev->aci = snd_aci_get_aci(); + if (dev->aci == NULL) { + v4l2_err(v4l2_dev, + "you must load the snd-miro driver first!\n"); + return -ENODEV; + } + strlcpy(v4l2_dev->name, "miropcm20", sizeof(v4l2_dev->name)); + + + res = v4l2_device_register(NULL, v4l2_dev); + if (res < 0) { + v4l2_err(v4l2_dev, "could not register v4l2_device\n"); + return -EINVAL; + } + + strlcpy(dev->vdev.name, v4l2_dev->name, sizeof(dev->vdev.name)); + dev->vdev.v4l2_dev = v4l2_dev; + dev->vdev.fops = &pcm20_fops; + dev->vdev.ioctl_ops = &pcm20_ioctl_ops; + dev->vdev.release = video_device_release_empty; + video_set_drvdata(&dev->vdev, dev); + + if (video_register_device(&dev->vdev, VFL_TYPE_RADIO, radio_nr) < 0) + goto fail; + + v4l2_info(v4l2_dev, "Mirosound PCM20 Radio tuner\n"); + return 0; +fail: + v4l2_device_unregister(v4l2_dev); + return -EINVAL; +} + +MODULE_AUTHOR("Ruurd Reitsma, Krzysztof Helt"); +MODULE_DESCRIPTION("A driver for the Miro PCM20 radio card."); +MODULE_LICENSE("GPL"); + +static void __exit pcm20_cleanup(void) +{ + struct pcm20 *dev = &pcm20_card; + + video_unregister_device(&dev->vdev); + v4l2_device_unregister(&dev->v4l2_dev); +} + +module_init(pcm20_init); +module_exit(pcm20_cleanup); diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 570be13..08f2d07 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -121,6 +121,12 @@ config TWL4030_POWER and load scripts controling which resources are switched off/on or reset when a sleep, wakeup or warm reset event occurs. +config TWL4030_CODEC + bool + depends on TWL4030_CORE + select MFD_CORE + default n + config MFD_TMIO bool default n diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f3b277b..af0fc90 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_MENELAUS) += menelaus.o obj-$(CONFIG_TWL4030_CORE) += twl4030-core.o twl4030-irq.o obj-$(CONFIG_TWL4030_POWER) += twl4030-power.o +obj-$(CONFIG_TWL4030_CODEC) += twl4030-codec.o obj-$(CONFIG_MFD_MC13783) += mc13783-core.o diff --git a/drivers/mfd/mcp-core.c b/drivers/mfd/mcp-core.c index 57271cb..84815f9 100644 --- a/drivers/mfd/mcp-core.c +++ b/drivers/mfd/mcp-core.c @@ -17,11 +17,11 @@ #include <linux/device.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/mfd/mcp.h> #include <mach/dma.h> #include <asm/system.h> -#include "mcp.h" #define to_mcp(d) container_of(d, struct mcp, attached_device) #define to_mcp_driver(d) container_of(d, struct mcp_driver, drv) diff --git a/drivers/mfd/mcp-sa11x0.c b/drivers/mfd/mcp-sa11x0.c index 62b32da..2584272 100644 --- a/drivers/mfd/mcp-sa11x0.c +++ b/drivers/mfd/mcp-sa11x0.c @@ -19,6 +19,7 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/platform_device.h> +#include <linux/mfd/mcp.h> #include <mach/dma.h> #include <mach/hardware.h> @@ -28,7 +29,6 @@ #include <mach/assabet.h> -#include "mcp.h" struct mcp_sa11x0 { u32 mccr0; @@ -163,6 +163,7 @@ static int mcp_sa11x0_probe(struct platform_device *pdev) mcp->dma_audio_wr = DMA_Ser4MCP0Wr; mcp->dma_telco_rd = DMA_Ser4MCP1Rd; mcp->dma_telco_wr = DMA_Ser4MCP1Wr; + mcp->gpio_base = data->gpio_base; platform_set_drvdata(pdev, mcp); diff --git a/drivers/mfd/mcp.h b/drivers/mfd/mcp.h deleted file mode 100644 index c093a93..0000000 --- a/drivers/mfd/mcp.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * linux/drivers/mfd/mcp.h - * - * Copyright (C) 2001 Russell King, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - */ -#ifndef MCP_H -#define MCP_H - -struct mcp_ops; - -struct mcp { - struct module *owner; - struct mcp_ops *ops; - spinlock_t lock; - int use_count; - unsigned int sclk_rate; - unsigned int rw_timeout; - dma_device_t dma_audio_rd; - dma_device_t dma_audio_wr; - dma_device_t dma_telco_rd; - dma_device_t dma_telco_wr; - struct device attached_device; -}; - -struct mcp_ops { - void (*set_telecom_divisor)(struct mcp *, unsigned int); - void (*set_audio_divisor)(struct mcp *, unsigned int); - void (*reg_write)(struct mcp *, unsigned int, unsigned int); - unsigned int (*reg_read)(struct mcp *, unsigned int); - void (*enable)(struct mcp *); - void (*disable)(struct mcp *); -}; - -void mcp_set_telecom_divisor(struct mcp *, unsigned int); -void mcp_set_audio_divisor(struct mcp *, unsigned int); -void mcp_reg_write(struct mcp *, unsigned int, unsigned int); -unsigned int mcp_reg_read(struct mcp *, unsigned int); -void mcp_enable(struct mcp *); -void mcp_disable(struct mcp *); -#define mcp_get_sclk_rate(mcp) ((mcp)->sclk_rate) - -struct mcp *mcp_host_alloc(struct device *, size_t); -int mcp_host_register(struct mcp *); -void mcp_host_unregister(struct mcp *); - -struct mcp_driver { - struct device_driver drv; - int (*probe)(struct mcp *); - void (*remove)(struct mcp *); - int (*suspend)(struct mcp *, pm_message_t); - int (*resume)(struct mcp *); -}; - -int mcp_driver_register(struct mcp_driver *); -void mcp_driver_unregister(struct mcp_driver *); - -#define mcp_get_drvdata(mcp) dev_get_drvdata(&(mcp)->attached_device) -#define mcp_set_drvdata(mcp,d) dev_set_drvdata(&(mcp)->attached_device, d) - -#define mcp_priv(mcp) ((void *)((mcp)+1)) - -#endif diff --git a/drivers/mfd/menelaus.c b/drivers/mfd/menelaus.c index 4b364ba..970afa1 100644 --- a/drivers/mfd/menelaus.c +++ b/drivers/mfd/menelaus.c @@ -44,7 +44,7 @@ #include <asm/mach/irq.h> #include <mach/gpio.h> -#include <mach/menelaus.h> +#include <plat/menelaus.h> #define DRIVER_NAME "menelaus" diff --git a/drivers/mfd/twl4030-codec.c b/drivers/mfd/twl4030-codec.c new file mode 100644 index 0000000..77b9149 --- /dev/null +++ b/drivers/mfd/twl4030-codec.c @@ -0,0 +1,276 @@ +/* + * MFD driver for twl4030 codec submodule + * + * Author: Peter Ujfalusi <peter.ujfalusi@nokia.com> + * + * Copyright: (C) 2009 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/platform_device.h> +#include <linux/i2c/twl4030.h> +#include <linux/mfd/core.h> +#include <linux/mfd/twl4030-codec.h> + +#define TWL4030_CODEC_CELLS 2 + +static struct platform_device *twl4030_codec_dev; + +struct twl4030_codec_resource { + int request_count; + u8 reg; + u8 mask; +}; + +struct twl4030_codec { + unsigned int audio_mclk; + struct mutex mutex; + struct twl4030_codec_resource resource[TWL4030_CODEC_RES_MAX]; + struct mfd_cell cells[TWL4030_CODEC_CELLS]; +}; + +/* + * Modify the resource, the function returns the content of the register + * after the modification. + */ +static int twl4030_codec_set_resource(enum twl4030_codec_res id, int enable) +{ + struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev); + u8 val; + + twl4030_i2c_read_u8(TWL4030_MODULE_AUDIO_VOICE, &val, + codec->resource[id].reg); + + if (enable) + val |= codec->resource[id].mask; + else + val &= ~codec->resource[id].mask; + + twl4030_i2c_write_u8(TWL4030_MODULE_AUDIO_VOICE, + val, codec->resource[id].reg); + + return val; +} + +static inline int twl4030_codec_get_resource(enum twl4030_codec_res id) +{ + struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev); + u8 val; + + twl4030_i2c_read_u8(TWL4030_MODULE_AUDIO_VOICE, &val, + codec->resource[id].reg); + + return val; +} + +/* + * Enable the resource. + * The function returns with error or the content of the register + */ +int twl4030_codec_enable_resource(enum twl4030_codec_res id) +{ + struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev); + int val; + + if (id >= TWL4030_CODEC_RES_MAX) { + dev_err(&twl4030_codec_dev->dev, + "Invalid resource ID (%u)\n", id); + return -EINVAL; + } + + mutex_lock(&codec->mutex); + if (!codec->resource[id].request_count) + /* Resource was disabled, enable it */ + val = twl4030_codec_set_resource(id, 1); + else + val = twl4030_codec_get_resource(id); + + codec->resource[id].request_count++; + mutex_unlock(&codec->mutex); + + return val; +} +EXPORT_SYMBOL_GPL(twl4030_codec_enable_resource); + +/* + * Disable the resource. + * The function returns with error or the content of the register + */ +int twl4030_codec_disable_resource(unsigned id) +{ + struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev); + int val; + + if (id >= TWL4030_CODEC_RES_MAX) { + dev_err(&twl4030_codec_dev->dev, + "Invalid resource ID (%u)\n", id); + return -EINVAL; + } + + mutex_lock(&codec->mutex); + if (!codec->resource[id].request_count) { + dev_err(&twl4030_codec_dev->dev, + "Resource has been disabled already (%u)\n", id); + mutex_unlock(&codec->mutex); + return -EPERM; + } + codec->resource[id].request_count--; + + if (!codec->resource[id].request_count) + /* Resource can be disabled now */ + val = twl4030_codec_set_resource(id, 0); + else + val = twl4030_codec_get_resource(id); + + mutex_unlock(&codec->mutex); + + return val; +} +EXPORT_SYMBOL_GPL(twl4030_codec_disable_resource); + +unsigned int twl4030_codec_get_mclk(void) +{ + struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev); + + return codec->audio_mclk; +} +EXPORT_SYMBOL_GPL(twl4030_codec_get_mclk); + +static int __devinit twl4030_codec_probe(struct platform_device *pdev) +{ + struct twl4030_codec *codec; + struct twl4030_codec_data *pdata = pdev->dev.platform_data; + struct mfd_cell *cell = NULL; + int ret, childs = 0; + u8 val; + + if (!pdata) { + dev_err(&pdev->dev, "Platform data is missing\n"); + return -EINVAL; + } + + /* Configure APLL_INFREQ and disable APLL if enabled */ + val = 0; + switch (pdata->audio_mclk) { + case 19200000: + val |= TWL4030_APLL_INFREQ_19200KHZ; + break; + case 26000000: + val |= TWL4030_APLL_INFREQ_26000KHZ; + break; + case 38400000: + val |= TWL4030_APLL_INFREQ_38400KHZ; + break; + default: + dev_err(&pdev->dev, "Invalid audio_mclk\n"); + return -EINVAL; + } + twl4030_i2c_write_u8(TWL4030_MODULE_AUDIO_VOICE, + val, TWL4030_REG_APLL_CTL); + + codec = kzalloc(sizeof(struct twl4030_codec), GFP_KERNEL); + if (!codec) + return -ENOMEM; + + platform_set_drvdata(pdev, codec); + + twl4030_codec_dev = pdev; + mutex_init(&codec->mutex); + codec->audio_mclk = pdata->audio_mclk; + + /* Codec power */ + codec->resource[TWL4030_CODEC_RES_POWER].reg = TWL4030_REG_CODEC_MODE; + codec->resource[TWL4030_CODEC_RES_POWER].mask = TWL4030_CODECPDZ; + + /* PLL */ + codec->resource[TWL4030_CODEC_RES_APLL].reg = TWL4030_REG_APLL_CTL; + codec->resource[TWL4030_CODEC_RES_APLL].mask = TWL4030_APLL_EN; + + if (pdata->audio) { + cell = &codec->cells[childs]; + cell->name = "twl4030_codec_audio"; + cell->platform_data = pdata->audio; + cell->data_size = sizeof(*pdata->audio); + childs++; + } + if (pdata->vibra) { + cell = &codec->cells[childs]; + cell->name = "twl4030_codec_vibra"; + cell->platform_data = pdata->vibra; + cell->data_size = sizeof(*pdata->vibra); + childs++; + } + + if (childs) + ret = mfd_add_devices(&pdev->dev, pdev->id, codec->cells, + childs, NULL, 0); + else { + dev_err(&pdev->dev, "No platform data found for childs\n"); + ret = -ENODEV; + } + + if (!ret) + return 0; + + platform_set_drvdata(pdev, NULL); + kfree(codec); + twl4030_codec_dev = NULL; + return ret; +} + +static int __devexit twl4030_codec_remove(struct platform_device *pdev) +{ + struct twl4030_codec *codec = platform_get_drvdata(pdev); + + mfd_remove_devices(&pdev->dev); + platform_set_drvdata(pdev, NULL); + kfree(codec); + twl4030_codec_dev = NULL; + + return 0; +} + +MODULE_ALIAS("platform:twl4030_codec"); + +static struct platform_driver twl4030_codec_driver = { + .probe = twl4030_codec_probe, + .remove = __devexit_p(twl4030_codec_remove), + .driver = { + .owner = THIS_MODULE, + .name = "twl4030_codec", + }, +}; + +static int __devinit twl4030_codec_init(void) +{ + return platform_driver_register(&twl4030_codec_driver); +} +module_init(twl4030_codec_init); + +static void __devexit twl4030_codec_exit(void) +{ + platform_driver_unregister(&twl4030_codec_driver); +} +module_exit(twl4030_codec_exit); + +MODULE_AUTHOR("Peter Ujfalusi <peter.ujfalusi@nokia.com>"); +MODULE_LICENSE("GPL"); + diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c index a1c47ee..40449cd 100644 --- a/drivers/mfd/twl4030-core.c +++ b/drivers/mfd/twl4030-core.c @@ -39,7 +39,7 @@ #include <linux/i2c/twl4030.h> #if defined(CONFIG_ARCH_OMAP2) || defined(CONFIG_ARCH_OMAP3) -#include <mach/cpu.h> +#include <plat/cpu.h> #endif /* @@ -114,6 +114,12 @@ #define twl_has_watchdog() false #endif +#if defined(CONFIG_TWL4030_CODEC) || defined(CONFIG_TWL4030_CODEC_MODULE) +#define twl_has_codec() true +#else +#define twl_has_codec() false +#endif + /* Triton Core internal information (BEGIN) */ /* Last - for index max*/ @@ -601,6 +607,14 @@ add_children(struct twl4030_platform_data *pdata, unsigned long features) return PTR_ERR(child); } + if (twl_has_codec() && pdata->codec) { + child = add_child(1, "twl4030_codec", + pdata->codec, sizeof(*pdata->codec), + false, 0, 0); + if (IS_ERR(child)) + return PTR_ERR(child); + } + if (twl_has_regulator()) { /* child = add_regulator(TWL4030_REG_VPLL1, pdata->vpll1); @@ -763,7 +777,7 @@ static int twl4030_remove(struct i2c_client *client) } /* NOTE: this driver only handles a single twl4030/tps659x0 chip */ -static int +static int __init twl4030_probe(struct i2c_client *client, const struct i2c_device_id *id) { int status; diff --git a/drivers/mfd/ucb1x00-assabet.c b/drivers/mfd/ucb1x00-assabet.c index 86fed48..cea9da6 100644 --- a/drivers/mfd/ucb1x00-assabet.c +++ b/drivers/mfd/ucb1x00-assabet.c @@ -14,10 +14,10 @@ #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/device.h> +#include <linux/mfd/ucb1x00.h> #include <mach/dma.h> -#include "ucb1x00.h" #define UCB1X00_ATTR(name,input)\ static ssize_t name##_show(struct device *dev, struct device_attribute *attr, \ diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c index 60c3988..252b741 100644 --- a/drivers/mfd/ucb1x00-core.c +++ b/drivers/mfd/ucb1x00-core.c @@ -25,12 +25,12 @@ #include <linux/interrupt.h> #include <linux/device.h> #include <linux/mutex.h> +#include <linux/mfd/ucb1x00.h> +#include <linux/gpio.h> #include <mach/dma.h> #include <mach/hardware.h> -#include "ucb1x00.h" - static DEFINE_MUTEX(ucb1x00_mutex); static LIST_HEAD(ucb1x00_drivers); static LIST_HEAD(ucb1x00_devices); @@ -108,6 +108,60 @@ unsigned int ucb1x00_io_read(struct ucb1x00 *ucb) return ucb1x00_reg_read(ucb, UCB_IO_DATA); } +static void ucb1x00_gpio_set(struct gpio_chip *chip, unsigned offset, int value) +{ + struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio); + unsigned long flags; + + spin_lock_irqsave(&ucb->io_lock, flags); + if (value) + ucb->io_out |= 1 << offset; + else + ucb->io_out &= ~(1 << offset); + + ucb1x00_reg_write(ucb, UCB_IO_DATA, ucb->io_out); + spin_unlock_irqrestore(&ucb->io_lock, flags); +} + +static int ucb1x00_gpio_get(struct gpio_chip *chip, unsigned offset) +{ + struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio); + return ucb1x00_reg_read(ucb, UCB_IO_DATA) & (1 << offset); +} + +static int ucb1x00_gpio_direction_input(struct gpio_chip *chip, unsigned offset) +{ + struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio); + unsigned long flags; + + spin_lock_irqsave(&ucb->io_lock, flags); + ucb->io_dir &= ~(1 << offset); + ucb1x00_reg_write(ucb, UCB_IO_DIR, ucb->io_dir); + spin_unlock_irqrestore(&ucb->io_lock, flags); + + return 0; +} + +static int ucb1x00_gpio_direction_output(struct gpio_chip *chip, unsigned offset + , int value) +{ + struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio); + unsigned long flags; + + spin_lock_irqsave(&ucb->io_lock, flags); + ucb->io_dir |= (1 << offset); + ucb1x00_reg_write(ucb, UCB_IO_DIR, ucb->io_dir); + + if (value) + ucb->io_out |= 1 << offset; + else + ucb->io_out &= ~(1 << offset); + ucb1x00_reg_write(ucb, UCB_IO_DATA, ucb->io_out); + spin_unlock_irqrestore(&ucb->io_lock, flags); + + return 0; +} + /* * UCB1300 data sheet says we must: * 1. enable ADC => 5us (including reference startup time) @@ -476,6 +530,7 @@ static int ucb1x00_probe(struct mcp *mcp) struct ucb1x00_driver *drv; unsigned int id; int ret = -ENODEV; + int temp; mcp_enable(mcp); id = mcp_reg_read(mcp, UCB_ID); @@ -508,12 +563,27 @@ static int ucb1x00_probe(struct mcp *mcp) goto err_free; } + ucb->gpio.base = -1; + if (mcp->gpio_base != 0) { + ucb->gpio.label = dev_name(&ucb->dev); + ucb->gpio.base = mcp->gpio_base; + ucb->gpio.ngpio = 10; + ucb->gpio.set = ucb1x00_gpio_set; + ucb->gpio.get = ucb1x00_gpio_get; + ucb->gpio.direction_input = ucb1x00_gpio_direction_input; + ucb->gpio.direction_output = ucb1x00_gpio_direction_output; + ret = gpiochip_add(&ucb->gpio); + if (ret) + goto err_free; + } else + dev_info(&ucb->dev, "gpio_base not set so no gpiolib support"); + ret = request_irq(ucb->irq, ucb1x00_irq, IRQF_TRIGGER_RISING, "UCB1x00", ucb); if (ret) { printk(KERN_ERR "ucb1x00: unable to grab irq%d: %d\n", ucb->irq, ret); - goto err_free; + goto err_gpio; } mcp_set_drvdata(mcp, ucb); @@ -522,6 +592,7 @@ static int ucb1x00_probe(struct mcp *mcp) if (ret) goto err_irq; + INIT_LIST_HEAD(&ucb->devs); mutex_lock(&ucb1x00_mutex); list_add(&ucb->node, &ucb1x00_devices); @@ -529,10 +600,14 @@ static int ucb1x00_probe(struct mcp *mcp) ucb1x00_add_dev(ucb, drv); } mutex_unlock(&ucb1x00_mutex); + goto out; err_irq: free_irq(ucb->irq, ucb); + err_gpio: + if (ucb->gpio.base != -1) + temp = gpiochip_remove(&ucb->gpio); err_free: kfree(ucb); err_disable: @@ -545,6 +620,7 @@ static void ucb1x00_remove(struct mcp *mcp) { struct ucb1x00 *ucb = mcp_get_drvdata(mcp); struct list_head *l, *n; + int ret; mutex_lock(&ucb1x00_mutex); list_del(&ucb->node); @@ -554,6 +630,12 @@ static void ucb1x00_remove(struct mcp *mcp) } mutex_unlock(&ucb1x00_mutex); + if (ucb->gpio.base != -1) { + ret = gpiochip_remove(&ucb->gpio); + if (ret) + dev_err(&ucb->dev, "Can't remove gpio chip: %d\n", ret); + } + free_irq(ucb->irq, ucb); device_unregister(&ucb->dev); } @@ -604,6 +686,7 @@ static int ucb1x00_resume(struct mcp *mcp) struct ucb1x00 *ucb = mcp_get_drvdata(mcp); struct ucb1x00_dev *dev; + ucb1x00_reg_write(ucb, UCB_IO_DIR, ucb->io_dir); mutex_lock(&ucb1x00_mutex); list_for_each_entry(dev, &ucb->devs, dev_node) { if (dev->drv->resume) diff --git a/drivers/mfd/ucb1x00-ts.c b/drivers/mfd/ucb1x00-ts.c index 61b7d3e..000cb41 100644 --- a/drivers/mfd/ucb1x00-ts.c +++ b/drivers/mfd/ucb1x00-ts.c @@ -30,12 +30,12 @@ #include <linux/freezer.h> #include <linux/slab.h> #include <linux/kthread.h> +#include <linux/mfd/ucb1x00.h> #include <mach/dma.h> #include <mach/collie.h> #include <asm/mach-types.h> -#include "ucb1x00.h" struct ucb1x00_ts { diff --git a/drivers/mfd/ucb1x00.h b/drivers/mfd/ucb1x00.h deleted file mode 100644 index a8ad8a0..0000000 --- a/drivers/mfd/ucb1x00.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * linux/drivers/mfd/ucb1x00.h - * - * Copyright (C) 2001 Russell King, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - */ -#ifndef UCB1200_H -#define UCB1200_H - -#define UCB_IO_DATA 0x00 -#define UCB_IO_DIR 0x01 - -#define UCB_IO_0 (1 << 0) -#define UCB_IO_1 (1 << 1) -#define UCB_IO_2 (1 << 2) -#define UCB_IO_3 (1 << 3) -#define UCB_IO_4 (1 << 4) -#define UCB_IO_5 (1 << 5) -#define UCB_IO_6 (1 << 6) -#define UCB_IO_7 (1 << 7) -#define UCB_IO_8 (1 << 8) -#define UCB_IO_9 (1 << 9) - -#define UCB_IE_RIS 0x02 -#define UCB_IE_FAL 0x03 -#define UCB_IE_STATUS 0x04 -#define UCB_IE_CLEAR 0x04 -#define UCB_IE_ADC (1 << 11) -#define UCB_IE_TSPX (1 << 12) -#define UCB_IE_TSMX (1 << 13) -#define UCB_IE_TCLIP (1 << 14) -#define UCB_IE_ACLIP (1 << 15) - -#define UCB_IRQ_TSPX 12 - -#define UCB_TC_A 0x05 -#define UCB_TC_A_LOOP (1 << 7) /* UCB1200 */ -#define UCB_TC_A_AMPL (1 << 7) /* UCB1300 */ - -#define UCB_TC_B 0x06 -#define UCB_TC_B_VOICE_ENA (1 << 3) -#define UCB_TC_B_CLIP (1 << 4) -#define UCB_TC_B_ATT (1 << 6) -#define UCB_TC_B_SIDE_ENA (1 << 11) -#define UCB_TC_B_MUTE (1 << 13) -#define UCB_TC_B_IN_ENA (1 << 14) -#define UCB_TC_B_OUT_ENA (1 << 15) - -#define UCB_AC_A 0x07 -#define UCB_AC_B 0x08 -#define UCB_AC_B_LOOP (1 << 8) -#define UCB_AC_B_MUTE (1 << 13) -#define UCB_AC_B_IN_ENA (1 << 14) -#define UCB_AC_B_OUT_ENA (1 << 15) - -#define UCB_TS_CR 0x09 -#define UCB_TS_CR_TSMX_POW (1 << 0) -#define UCB_TS_CR_TSPX_POW (1 << 1) -#define UCB_TS_CR_TSMY_POW (1 << 2) -#define UCB_TS_CR_TSPY_POW (1 << 3) -#define UCB_TS_CR_TSMX_GND (1 << 4) -#define UCB_TS_CR_TSPX_GND (1 << 5) -#define UCB_TS_CR_TSMY_GND (1 << 6) -#define UCB_TS_CR_TSPY_GND (1 << 7) -#define UCB_TS_CR_MODE_INT (0 << 8) -#define UCB_TS_CR_MODE_PRES (1 << 8) -#define UCB_TS_CR_MODE_POS (2 << 8) -#define UCB_TS_CR_BIAS_ENA (1 << 11) -#define UCB_TS_CR_TSPX_LOW (1 << 12) -#define UCB_TS_CR_TSMX_LOW (1 << 13) - -#define UCB_ADC_CR 0x0a -#define UCB_ADC_SYNC_ENA (1 << 0) -#define UCB_ADC_VREFBYP_CON (1 << 1) -#define UCB_ADC_INP_TSPX (0 << 2) -#define UCB_ADC_INP_TSMX (1 << 2) -#define UCB_ADC_INP_TSPY (2 << 2) -#define UCB_ADC_INP_TSMY (3 << 2) -#define UCB_ADC_INP_AD0 (4 << 2) -#define UCB_ADC_INP_AD1 (5 << 2) -#define UCB_ADC_INP_AD2 (6 << 2) -#define UCB_ADC_INP_AD3 (7 << 2) -#define UCB_ADC_EXT_REF (1 << 5) -#define UCB_ADC_START (1 << 7) -#define UCB_ADC_ENA (1 << 15) - -#define UCB_ADC_DATA 0x0b -#define UCB_ADC_DAT_VAL (1 << 15) -#define UCB_ADC_DAT(x) (((x) & 0x7fe0) >> 5) - -#define UCB_ID 0x0c -#define UCB_ID_1200 0x1004 -#define UCB_ID_1300 0x1005 -#define UCB_ID_TC35143 0x9712 - -#define UCB_MODE 0x0d -#define UCB_MODE_DYN_VFLAG_ENA (1 << 12) -#define UCB_MODE_AUD_OFF_CAN (1 << 13) - -#include "mcp.h" - -struct ucb1x00_irq { - void *devid; - void (*fn)(int, void *); -}; - -struct ucb1x00 { - spinlock_t lock; - struct mcp *mcp; - unsigned int irq; - struct semaphore adc_sem; - spinlock_t io_lock; - u16 id; - u16 io_dir; - u16 io_out; - u16 adc_cr; - u16 irq_fal_enbl; - u16 irq_ris_enbl; - struct ucb1x00_irq irq_handler[16]; - struct device dev; - struct list_head node; - struct list_head devs; -}; - -struct ucb1x00_driver; - -struct ucb1x00_dev { - struct list_head dev_node; - struct list_head drv_node; - struct ucb1x00 *ucb; - struct ucb1x00_driver *drv; - void *priv; -}; - -struct ucb1x00_driver { - struct list_head node; - struct list_head devs; - int (*add)(struct ucb1x00_dev *dev); - void (*remove)(struct ucb1x00_dev *dev); - int (*suspend)(struct ucb1x00_dev *dev, pm_message_t state); - int (*resume)(struct ucb1x00_dev *dev); -}; - -#define classdev_to_ucb1x00(cd) container_of(cd, struct ucb1x00, dev) - -int ucb1x00_register_driver(struct ucb1x00_driver *); -void ucb1x00_unregister_driver(struct ucb1x00_driver *); - -/** - * ucb1x00_clkrate - return the UCB1x00 SIB clock rate - * @ucb: UCB1x00 structure describing chip - * - * Return the SIB clock rate in Hz. - */ -static inline unsigned int ucb1x00_clkrate(struct ucb1x00 *ucb) -{ - return mcp_get_sclk_rate(ucb->mcp); -} - -/** - * ucb1x00_enable - enable the UCB1x00 SIB clock - * @ucb: UCB1x00 structure describing chip - * - * Enable the SIB clock. This can be called multiple times. - */ -static inline void ucb1x00_enable(struct ucb1x00 *ucb) -{ - mcp_enable(ucb->mcp); -} - -/** - * ucb1x00_disable - disable the UCB1x00 SIB clock - * @ucb: UCB1x00 structure describing chip - * - * Disable the SIB clock. The SIB clock will only be disabled - * when the number of ucb1x00_enable calls match the number of - * ucb1x00_disable calls. - */ -static inline void ucb1x00_disable(struct ucb1x00 *ucb) -{ - mcp_disable(ucb->mcp); -} - -/** - * ucb1x00_reg_write - write a UCB1x00 register - * @ucb: UCB1x00 structure describing chip - * @reg: UCB1x00 4-bit register index to write - * @val: UCB1x00 16-bit value to write - * - * Write the UCB1x00 register @reg with value @val. The SIB - * clock must be running for this function to return. - */ -static inline void ucb1x00_reg_write(struct ucb1x00 *ucb, unsigned int reg, unsigned int val) -{ - mcp_reg_write(ucb->mcp, reg, val); -} - -/** - * ucb1x00_reg_read - read a UCB1x00 register - * @ucb: UCB1x00 structure describing chip - * @reg: UCB1x00 4-bit register index to write - * - * Read the UCB1x00 register @reg and return its value. The SIB - * clock must be running for this function to return. - */ -static inline unsigned int ucb1x00_reg_read(struct ucb1x00 *ucb, unsigned int reg) -{ - return mcp_reg_read(ucb->mcp, reg); -} -/** - * ucb1x00_set_audio_divisor - - * @ucb: UCB1x00 structure describing chip - * @div: SIB clock divisor - */ -static inline void ucb1x00_set_audio_divisor(struct ucb1x00 *ucb, unsigned int div) -{ - mcp_set_audio_divisor(ucb->mcp, div); -} - -/** - * ucb1x00_set_telecom_divisor - - * @ucb: UCB1x00 structure describing chip - * @div: SIB clock divisor - */ -static inline void ucb1x00_set_telecom_divisor(struct ucb1x00 *ucb, unsigned int div) -{ - mcp_set_telecom_divisor(ucb->mcp, div); -} - -void ucb1x00_io_set_dir(struct ucb1x00 *ucb, unsigned int, unsigned int); -void ucb1x00_io_write(struct ucb1x00 *ucb, unsigned int, unsigned int); -unsigned int ucb1x00_io_read(struct ucb1x00 *ucb); - -#define UCB_NOSYNC (0) -#define UCB_SYNC (1) - -unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync); -void ucb1x00_adc_enable(struct ucb1x00 *ucb); -void ucb1x00_adc_disable(struct ucb1x00 *ucb); - -/* - * Which edges of the IRQ do you want to control today? - */ -#define UCB_RISING (1 << 0) -#define UCB_FALLING (1 << 1) - -int ucb1x00_hook_irq(struct ucb1x00 *ucb, unsigned int idx, void (*fn)(int, void *), void *devid); -void ucb1x00_enable_irq(struct ucb1x00 *ucb, unsigned int idx, int edges); -void ucb1x00_disable_irq(struct ucb1x00 *ucb, unsigned int idx, int edges); -int ucb1x00_free_irq(struct ucb1x00 *ucb, unsigned int idx, void *devid); - -#endif diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index a2ea383..2c16ca6 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -246,6 +246,16 @@ config EP93XX_PWM To compile this driver as a module, choose M here: the module will be called ep93xx_pwm. +config DS1682 + tristate "Dallas DS1682 Total Elapsed Time Recorder with Alarm" + depends on I2C && EXPERIMENTAL + help + If you say yes here you get support for Dallas Semiconductor + DS1682 Total Elapsed Time Recorder. + + This driver can also be built as a module. If so, the module + will be called ds1682. + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index e311267..906a0ed 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SGI_GRU) += sgi-gru/ obj-$(CONFIG_HP_ILO) += hpilo.o obj-$(CONFIG_ISL29003) += isl29003.o obj-$(CONFIG_EP93XX_PWM) += ep93xx_pwm.o +obj-$(CONFIG_DS1682) += ds1682.o obj-$(CONFIG_C2PORT) += c2port/ obj-$(CONFIG_IWMC3200TOP) += iwmc3200top/ obj-y += eeprom/ diff --git a/drivers/i2c/chips/ds1682.c b/drivers/misc/ds1682.c index f3ee4a1..f3ee4a1 100644 --- a/drivers/i2c/chips/ds1682.c +++ b/drivers/misc/ds1682.c diff --git a/drivers/misc/ics932s401.c b/drivers/misc/ics932s401.c index 6e43ab4..4bb7a3a 100644 --- a/drivers/misc/ics932s401.c +++ b/drivers/misc/ics932s401.c @@ -417,32 +417,25 @@ static int ics932s401_detect(struct i2c_client *client, int kind, struct i2c_board_info *info) { struct i2c_adapter *adapter = client->adapter; + int vendor, device, revision; if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)) return -ENODEV; - if (kind <= 0) { - int vendor, device, revision; - - vendor = i2c_smbus_read_word_data(client, - ICS932S401_REG_VENDOR_REV); - vendor >>= 8; - revision = vendor >> ICS932S401_REV_SHIFT; - vendor &= ICS932S401_VENDOR_MASK; - if (vendor != ICS932S401_VENDOR) - return -ENODEV; - - device = i2c_smbus_read_word_data(client, - ICS932S401_REG_DEVICE); - device >>= 8; - if (device != ICS932S401_DEVICE) - return -ENODEV; - - if (revision != ICS932S401_REV) - dev_info(&adapter->dev, "Unknown revision %d\n", - revision); - } else - dev_dbg(&adapter->dev, "detection forced\n"); + vendor = i2c_smbus_read_word_data(client, ICS932S401_REG_VENDOR_REV); + vendor >>= 8; + revision = vendor >> ICS932S401_REV_SHIFT; + vendor &= ICS932S401_VENDOR_MASK; + if (vendor != ICS932S401_VENDOR) + return -ENODEV; + + device = i2c_smbus_read_word_data(client, ICS932S401_REG_DEVICE); + device >>= 8; + if (device != ICS932S401_DEVICE) + return -ENODEV; + + if (revision != ICS932S401_REV) + dev_info(&adapter->dev, "Unknown revision %d\n", revision); strlcpy(info->type, "ics932s401", I2C_NAME_SIZE); diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c index fd3688a..832ed4c 100644 --- a/drivers/misc/sgi-xp/xpc_main.c +++ b/drivers/misc/sgi-xp/xpc_main.c @@ -89,48 +89,40 @@ static int xpc_disengage_max_timelimit = 120; static ctl_table xpc_sys_xpc_hb_dir[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "hb_interval", .data = &xpc_hb_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &xpc_hb_min_interval, .extra2 = &xpc_hb_max_interval}, { - .ctl_name = CTL_UNNUMBERED, .procname = "hb_check_interval", .data = &xpc_hb_check_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &xpc_hb_check_min_interval, .extra2 = &xpc_hb_check_max_interval}, {} }; static ctl_table xpc_sys_xpc_dir[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "hb", .mode = 0555, .child = xpc_sys_xpc_hb_dir}, { - .ctl_name = CTL_UNNUMBERED, .procname = "disengage_timelimit", .data = &xpc_disengage_timelimit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &xpc_disengage_min_timelimit, .extra2 = &xpc_disengage_max_timelimit}, {} }; static ctl_table xpc_sys_dir[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "xpc", .mode = 0555, .child = xpc_sys_xpc_dir}, diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index c76677a..b5bbe59 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -106,7 +106,8 @@ xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name) int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade); #if defined CONFIG_X86_64 - mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset); + mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset, + UV_AFFINITY_CPU); if (mq->irq < 0) { dev_err(xpc_part, "uv_setup_irq() returned error=%d\n", -mq->irq); @@ -136,7 +137,7 @@ static void xpc_release_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq) { #if defined CONFIG_X86_64 - uv_teardown_irq(mq->irq, mq->mmr_blade, mq->mmr_offset); + uv_teardown_irq(mq->irq); #elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV int mmr_pnode; diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 705a589..90d168a 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -56,7 +56,7 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired) clk = 255; host->cclk = host->mclk / (2 * (clk + 1)); } - if (host->hw_designer == 0x80) + if (host->hw_designer == AMBA_VENDOR_ST) clk |= MCI_FCEN; /* Bug fix in ST IP block */ clk |= MCI_CLK_ENABLE; /* This hasn't proven to be worthwhile */ diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index b8fd7af..5f970e2 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -30,12 +30,12 @@ #include <asm/io.h> #include <asm/irq.h> -#include <mach/board.h> -#include <mach/mmc.h> +#include <plat/board.h> +#include <plat/mmc.h> #include <mach/gpio.h> -#include <mach/dma.h> -#include <mach/mux.h> -#include <mach/fpga.h> +#include <plat/dma.h> +#include <plat/mux.h> +#include <plat/fpga.h> #define OMAP_MMC_REG_CMD 0x00 #define OMAP_MMC_REG_ARGL 0x04 diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 0aecaae..4b23225 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -30,11 +30,11 @@ #include <linux/mmc/core.h> #include <linux/io.h> #include <linux/semaphore.h> -#include <mach/dma.h> +#include <plat/dma.h> #include <mach/hardware.h> -#include <mach/board.h> -#include <mach/mmc.h> -#include <mach/cpu.h> +#include <plat/board.h> +#include <plat/mmc.h> +#include <plat/cpu.h> /* OMAP HSMMC Host Controller Registers */ #define OMAP_HSMMC_SYSCONFIG 0x0010 diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index 9fb480b..bb47ff4 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -43,6 +43,9 @@ #define NR_SG 1 #define CLKRT_OFF (~0) +#define mmc_has_26MHz() (cpu_is_pxa300() || cpu_is_pxa310() \ + || cpu_is_pxa935()) + struct pxamci_host { struct mmc_host *mmc; spinlock_t lock; @@ -457,7 +460,7 @@ static void pxamci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) clk_enable(host->clk); if (ios->clock == 26000000) { - /* to support 26MHz on pxa300/pxa310 */ + /* to support 26MHz */ host->clkrt = 7; } else { /* to handle (19.5MHz, 26MHz) */ @@ -608,8 +611,7 @@ static int pxamci_probe(struct platform_device *pdev) * Calculate minimum clock rate, rounding up. */ mmc->f_min = (host->clkrate + 63) / 64; - mmc->f_max = (cpu_is_pxa300() || cpu_is_pxa310()) ? 26000000 - : host->clkrate; + mmc->f_max = (mmc_has_26MHz()) ? 26000000 : host->clkrate; pxamci_init_ocr(host); @@ -618,7 +620,7 @@ static int pxamci_probe(struct platform_device *pdev) if (!cpu_is_pxa25x()) { mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_SDIO_IRQ; host->cmdat |= CMDAT_SDIO_INT_EN; - if (cpu_is_pxa300() || cpu_is_pxa310()) + if (mmc_has_26MHz()) mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED; } diff --git a/drivers/mtd/maps/omap_nor.c b/drivers/mtd/maps/omap_nor.c index a244781..ead0b2f 100644 --- a/drivers/mtd/maps/omap_nor.c +++ b/drivers/mtd/maps/omap_nor.c @@ -45,7 +45,7 @@ #include <asm/io.h> #include <mach/hardware.h> #include <asm/mach/flash.h> -#include <mach/tc.h> +#include <plat/tc.h> #ifdef CONFIG_MTD_PARTITIONS static const char *part_probes[] = { /* "RedBoot", */ "cmdlinepart", NULL }; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 8ca17a3..64e2b37 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -59,12 +59,14 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->readsect(dev, block, buf)) return -EIO; + rq_flush_dcache_pages(req); return 0; case WRITE: if (!tr->writesect) return -EIO; + rq_flush_dcache_pages(req); for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->writesect(dev, block, buf)) return -EIO; diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 2fda0b6..8f8e87b 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -358,7 +358,7 @@ endchoice config MTD_NAND_PXA3xx tristate "Support for NAND flash devices on PXA3xx" - depends on MTD_NAND && PXA3xx + depends on MTD_NAND && (PXA3xx || ARCH_MMP) help This enables the driver for the NAND flash device found on PXA3xx processors diff --git a/drivers/mtd/nand/ams-delta.c b/drivers/mtd/nand/ams-delta.c index 005b91f..2548e10 100644 --- a/drivers/mtd/nand/ams-delta.c +++ b/drivers/mtd/nand/ams-delta.c @@ -25,7 +25,7 @@ #include <mach/hardware.h> #include <asm/sizes.h> #include <mach/gpio.h> -#include <mach/board-ams-delta.h> +#include <plat/board-ams-delta.h> /* * MTD structure for E3 (Delta) diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c index 090ab87..1bb799f 100644 --- a/drivers/mtd/nand/omap2.c +++ b/drivers/mtd/nand/omap2.c @@ -18,9 +18,9 @@ #include <linux/mtd/partitions.h> #include <linux/io.h> -#include <mach/dma.h> -#include <mach/gpmc.h> -#include <mach/nand.h> +#include <plat/dma.h> +#include <plat/gpmc.h> +#include <plat/nand.h> #define GPMC_IRQ_STATUS 0x18 #define GPMC_ECC_CONFIG 0x1F4 diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index 6ea520a..1a5a036 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -9,6 +9,7 @@ * published by the Free Software Foundation. */ +#include <linux/kernel.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/platform_device.h> @@ -22,7 +23,7 @@ #include <linux/irq.h> #include <mach/dma.h> -#include <mach/pxa3xx_nand.h> +#include <plat/pxa3xx_nand.h> #define CHIP_DELAY_TIMEOUT (2 * HZ/10) @@ -84,10 +85,6 @@ #define NDCB0_CMD1_MASK (0xff) #define NDCB0_ADDR_CYC_SHIFT (16) -/* dma-able I/O address for the NAND data and commands */ -#define NDCB0_DMA_ADDR (0x43100048) -#define NDDB_DMA_ADDR (0x43100040) - /* macros for registers read/write */ #define nand_writel(info, off, val) \ __raw_writel((val), (info)->mmio_base + (off)) @@ -123,6 +120,7 @@ struct pxa3xx_nand_info { struct clk *clk; void __iomem *mmio_base; + unsigned long mmio_phys; unsigned int buf_start; unsigned int buf_count; @@ -228,13 +226,35 @@ static struct pxa3xx_nand_flash samsung512MbX16 = { .chip_id = 0x46ec, }; +static struct pxa3xx_nand_flash samsung2GbX8 = { + .timing = &samsung512MbX16_timing, + .cmdset = &smallpage_cmdset, + .page_per_block = 64, + .page_size = 2048, + .flash_width = 8, + .dfc_width = 8, + .num_blocks = 2048, + .chip_id = 0xdaec, +}; + +static struct pxa3xx_nand_flash samsung32GbX8 = { + .timing = &samsung512MbX16_timing, + .cmdset = &smallpage_cmdset, + .page_per_block = 128, + .page_size = 4096, + .flash_width = 8, + .dfc_width = 8, + .num_blocks = 8192, + .chip_id = 0xd7ec, +}; + static struct pxa3xx_nand_timing micron_timing = { .tCH = 10, .tCS = 25, .tWH = 15, .tWP = 25, .tRH = 15, - .tRP = 25, + .tRP = 30, .tR = 25000, .tWHR = 60, .tAR = 10, @@ -262,6 +282,28 @@ static struct pxa3xx_nand_flash micron1GbX16 = { .chip_id = 0xb12c, }; +static struct pxa3xx_nand_flash micron4GbX8 = { + .timing = µn_timing, + .cmdset = &largepage_cmdset, + .page_per_block = 64, + .page_size = 2048, + .flash_width = 8, + .dfc_width = 8, + .num_blocks = 4096, + .chip_id = 0xdc2c, +}; + +static struct pxa3xx_nand_flash micron4GbX16 = { + .timing = µn_timing, + .cmdset = &largepage_cmdset, + .page_per_block = 64, + .page_size = 2048, + .flash_width = 16, + .dfc_width = 16, + .num_blocks = 4096, + .chip_id = 0xcc2c, +}; + static struct pxa3xx_nand_timing stm2GbX16_timing = { .tCH = 10, .tCS = 35, @@ -287,8 +329,12 @@ static struct pxa3xx_nand_flash stm2GbX16 = { static struct pxa3xx_nand_flash *builtin_flash_types[] = { &samsung512MbX16, + &samsung2GbX8, + &samsung32GbX8, µn1GbX8, µn1GbX16, + µn4GbX8, + µn4GbX16, &stm2GbX16, }; #endif /* CONFIG_MTD_NAND_PXA3xx_BUILTIN */ @@ -489,7 +535,7 @@ static int handle_data_pio(struct pxa3xx_nand_info *info) switch (info->state) { case STATE_PIO_WRITING: __raw_writesl(info->mmio_base + NDDB, info->data_buff, - info->data_size << 2); + DIV_ROUND_UP(info->data_size, 4)); enable_int(info, NDSR_CS0_BBD | NDSR_CS0_CMDD); @@ -501,7 +547,7 @@ static int handle_data_pio(struct pxa3xx_nand_info *info) break; case STATE_PIO_READING: __raw_readsl(info->mmio_base + NDDB, info->data_buff, - info->data_size << 2); + DIV_ROUND_UP(info->data_size, 4)); break; default: printk(KERN_ERR "%s: invalid state %d\n", __func__, @@ -523,11 +569,11 @@ static void start_data_dma(struct pxa3xx_nand_info *info, int dir_out) if (dir_out) { desc->dsadr = info->data_buff_phys; - desc->dtadr = NDDB_DMA_ADDR; + desc->dtadr = info->mmio_phys + NDDB; desc->dcmd |= DCMD_INCSRCADDR | DCMD_FLOWTRG; } else { desc->dtadr = info->data_buff_phys; - desc->dsadr = NDDB_DMA_ADDR; + desc->dsadr = info->mmio_phys + NDDB; desc->dcmd |= DCMD_INCTRGADDR | DCMD_FLOWSRC; } @@ -669,6 +715,7 @@ static void pxa3xx_nand_cmdfunc(struct mtd_info *mtd, unsigned command, /* disable HW ECC to get all the OOB data */ info->buf_count = mtd->writesize + mtd->oobsize; info->buf_start = mtd->writesize + column; + memset(info->data_buff, 0xFF, info->buf_count); if (prepare_read_prog_cmd(info, cmdset->read1, column, page_addr)) break; @@ -1239,13 +1286,17 @@ static int pxa3xx_nand_probe(struct platform_device *pdev) ret = -ENODEV; goto fail_free_res; } + info->mmio_phys = r->start; ret = pxa3xx_nand_init_buff(info); if (ret) goto fail_free_io; - ret = request_irq(IRQ_NAND, pxa3xx_nand_irq, IRQF_DISABLED, - pdev->name, info); + /* initialize all interrupts to be disabled */ + disable_int(info, NDSR_MASK); + + ret = request_irq(irq, pxa3xx_nand_irq, IRQF_DISABLED, + pdev->name, info); if (ret < 0) { dev_err(&pdev->dev, "failed to request IRQ\n"); goto fail_free_buf; @@ -1271,7 +1322,7 @@ static int pxa3xx_nand_probe(struct platform_device *pdev) return add_mtd_partitions(mtd, pdata->parts, pdata->nr_parts); fail_free_irq: - free_irq(IRQ_NAND, info); + free_irq(irq, info); fail_free_buf: if (use_dma) { pxa_free_dma(info->data_dma_ch); @@ -1296,12 +1347,15 @@ static int pxa3xx_nand_remove(struct platform_device *pdev) struct mtd_info *mtd = platform_get_drvdata(pdev); struct pxa3xx_nand_info *info = mtd->priv; struct resource *r; + int irq; platform_set_drvdata(pdev, NULL); del_mtd_device(mtd); del_mtd_partitions(mtd); - free_irq(IRQ_NAND, info); + irq = platform_get_irq(pdev, 0); + if (irq >= 0) + free_irq(irq, info); if (use_dma) { pxa_free_dma(info->data_dma_ch); dma_free_writecombine(&pdev->dev, info->data_buff_size, diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c index 0108ed4..86c4f6dc 100644 --- a/drivers/mtd/onenand/omap2.c +++ b/drivers/mtd/onenand/omap2.c @@ -36,13 +36,13 @@ #include <linux/io.h> #include <asm/mach/flash.h> -#include <mach/gpmc.h> -#include <mach/onenand.h> +#include <plat/gpmc.h> +#include <plat/onenand.h> #include <mach/gpio.h> -#include <mach/dma.h> +#include <plat/dma.h> -#include <mach/board.h> +#include <plat/board.h> #define DRIVER_NAME "omap2-onenand" diff --git a/drivers/of/base.c b/drivers/of/base.c index ddf224d..e6627b2 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -9,7 +9,8 @@ * * Adapted for sparc and sparc64 by David S. Miller davem@davemloft.net * - * Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell. + * Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell and + * Grant Likely. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -82,6 +83,29 @@ struct property *of_find_property(const struct device_node *np, } EXPORT_SYMBOL(of_find_property); +/** + * of_find_all_nodes - Get next node in global list + * @prev: Previous node or NULL to start iteration + * of_node_put() will be called on it + * + * Returns a node pointer with refcount incremented, use + * of_node_put() on it when done. + */ +struct device_node *of_find_all_nodes(struct device_node *prev) +{ + struct device_node *np; + + read_lock(&devtree_lock); + np = prev ? prev->allnext : allnodes; + for (; np != NULL; np = np->allnext) + if (of_node_get(np)) + break; + of_node_put(prev); + read_unlock(&devtree_lock); + return np; +} +EXPORT_SYMBOL(of_find_all_nodes); + /* * Find a property with a given name for a given node * and return the value. diff --git a/drivers/parport/parport_mfc3.c b/drivers/parport/parport_mfc3.c index 6dec9ba..362db31 100644 --- a/drivers/parport/parport_mfc3.c +++ b/drivers/parport/parport_mfc3.c @@ -386,7 +386,7 @@ static void __exit parport_mfc3_exit(void) if (!this_port[i]) continue; parport_remove_port(this_port[i]); - if (!this_port[i]->irq != PARPORT_IRQ_NONE) { + if (this_port[i]->irq != PARPORT_IRQ_NONE) { if (--use_cnt == 0) free_irq(IRQ_AMIGA_PORTS, &pp_mfc3_ops); } diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 8eefe56..3f56bc0 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -233,10 +233,10 @@ static int do_hardware_modes (ctl_table *table, int write, return copy_to_user(result, buffer, len) ? -EFAULT : 0; } -#define PARPORT_PORT_DIR(CHILD) { .ctl_name = 0, .procname = NULL, .mode = 0555, .child = CHILD } -#define PARPORT_PARPORT_DIR(CHILD) { .ctl_name = DEV_PARPORT, .procname = "parport", \ +#define PARPORT_PORT_DIR(CHILD) { .procname = NULL, .mode = 0555, .child = CHILD } +#define PARPORT_PARPORT_DIR(CHILD) { .procname = "parport", \ .mode = 0555, .child = CHILD } -#define PARPORT_DEV_DIR(CHILD) { .ctl_name = CTL_DEV, .procname = "dev", .mode = 0555, .child = CHILD } +#define PARPORT_DEV_DIR(CHILD) { .procname = "dev", .mode = 0555, .child = CHILD } #define PARPORT_DEVICES_ROOT_DIR { .procname = "devices", \ .mode = 0555, .child = NULL } @@ -270,7 +270,7 @@ static const struct parport_sysctl_table parport_sysctl_template = { .data = NULL, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = (void*) &parport_min_spintime_value, .extra2 = (void*) &parport_max_spintime_value }, @@ -279,28 +279,28 @@ static const struct parport_sysctl_table parport_sysctl_template = { .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_hardware_base_addr + .proc_handler = do_hardware_base_addr }, { .procname = "irq", .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_hardware_irq + .proc_handler = do_hardware_irq }, { .procname = "dma", .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_hardware_dma + .proc_handler = do_hardware_dma }, { .procname = "modes", .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_hardware_modes + .proc_handler = do_hardware_modes }, PARPORT_DEVICES_ROOT_DIR, #ifdef CONFIG_PARPORT_1284 @@ -309,35 +309,35 @@ static const struct parport_sysctl_table parport_sysctl_template = { .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_autoprobe + .proc_handler = do_autoprobe }, { .procname = "autoprobe0", .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_autoprobe + .proc_handler = do_autoprobe }, { .procname = "autoprobe1", .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_autoprobe + .proc_handler = do_autoprobe }, { .procname = "autoprobe2", .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_autoprobe + .proc_handler = do_autoprobe }, { .procname = "autoprobe3", .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_autoprobe + .proc_handler = do_autoprobe }, #endif /* IEEE 1284 support */ {} @@ -348,7 +348,7 @@ static const struct parport_sysctl_table parport_sysctl_template = { .data = NULL, .maxlen = 0, .mode = 0444, - .proc_handler = &do_active_device + .proc_handler = do_active_device }, {} }, @@ -386,14 +386,13 @@ parport_device_sysctl_template = { .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, .extra1 = (void*) &parport_min_timeslice_value, .extra2 = (void*) &parport_max_timeslice_value }, }, { { - .ctl_name = 0, .procname = NULL, .data = NULL, .maxlen = 0, @@ -438,7 +437,7 @@ parport_default_sysctl_table = { .data = &parport_default_timeslice, .maxlen = sizeof(parport_default_timeslice), .mode = 0644, - .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, .extra1 = (void*) &parport_min_timeslice_value, .extra2 = (void*) &parport_max_timeslice_value }, @@ -447,7 +446,7 @@ parport_default_sysctl_table = { .data = &parport_default_spintime, .maxlen = sizeof(parport_default_spintime), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = (void*) &parport_min_spintime_value, .extra2 = (void*) &parport_max_spintime_value }, @@ -455,7 +454,6 @@ parport_default_sysctl_table = { }, { { - .ctl_name = DEV_PARPORT_DEFAULT, .procname = "default", .mode = 0555, .child = parport_default_sysctl_table.vars @@ -495,7 +493,6 @@ int parport_proc_register(struct parport *port) t->vars[6 + i].extra2 = &port->probe_info[i]; t->port_dir[0].procname = port->name; - t->port_dir[0].ctl_name = 0; t->port_dir[0].child = t->vars; t->parport_dir[0].child = t->port_dir; @@ -534,11 +531,9 @@ int parport_device_proc_register(struct pardevice *device) t->dev_dir[0].child = t->parport_dir; t->parport_dir[0].child = t->port_dir; t->port_dir[0].procname = port->name; - t->port_dir[0].ctl_name = 0; t->port_dir[0].child = t->devices_root_dir; t->devices_root_dir[0].child = t->device_dir; - t->device_dir[0].ctl_name = 0; t->device_dir[0].procname = device->name; t->device_dir[0].child = t->vars; t->vars[0].data = &device->timeslice; diff --git a/drivers/pcmcia/Kconfig b/drivers/pcmcia/Kconfig index f3ccbcc..cd5082d 100644 --- a/drivers/pcmcia/Kconfig +++ b/drivers/pcmcia/Kconfig @@ -179,7 +179,7 @@ config PCMCIA_BCM63XX depends on BCM63XX && PCMCIA config PCMCIA_SOC_COMMON - bool + tristate config PCMCIA_SA1100 tristate "SA1100 support" diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c index 68570bc..663781d 100644 --- a/drivers/pcmcia/omap_cf.c +++ b/drivers/pcmcia/omap_cf.c @@ -23,8 +23,8 @@ #include <asm/io.h> #include <asm/sizes.h> -#include <mach/mux.h> -#include <mach/tc.h> +#include <plat/mux.h> +#include <plat/tc.h> /* NOTE: don't expect this to support many I/O cards. The 16xx chips have diff --git a/drivers/pcmcia/sa1100_generic.c b/drivers/pcmcia/sa1100_generic.c index 11cc3ba..8db86b9 100644 --- a/drivers/pcmcia/sa1100_generic.c +++ b/drivers/pcmcia/sa1100_generic.c @@ -51,7 +51,7 @@ static int (*sa11x0_pcmcia_hw_init[])(struct device *dev) = { #ifdef CONFIG_SA1100_CERF pcmcia_cerf_init, #endif -#ifdef CONFIG_SA1100_H3600 +#if defined(CONFIG_SA1100_H3100) || defined(CONFIG_SA1100_H3600) pcmcia_h3600_init, #endif #ifdef CONFIG_SA1100_SHANNON diff --git a/drivers/pcmcia/sa1100_h3600.c b/drivers/pcmcia/sa1100_h3600.c index 3a121ac69..56329ad 100644 --- a/drivers/pcmcia/sa1100_h3600.c +++ b/drivers/pcmcia/sa1100_h3600.c @@ -10,47 +10,139 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/delay.h> +#include <linux/gpio.h> #include <mach/hardware.h> #include <asm/irq.h> #include <asm/mach-types.h> -#include <mach/h3600.h> +#include <mach/h3xxx.h> #include "sa1100_generic.h" static struct pcmcia_irqs irqs[] = { - { 0, IRQ_GPIO_H3600_PCMCIA_CD0, "PCMCIA CD0" }, - { 1, IRQ_GPIO_H3600_PCMCIA_CD1, "PCMCIA CD1" } + { .sock = 0, .str = "PCMCIA CD0" }, /* .irq will be filled later */ + { .sock = 1, .str = "PCMCIA CD1" } }; static int h3600_pcmcia_hw_init(struct soc_pcmcia_socket *skt) { - skt->socket.pci_irq = skt->nr ? IRQ_GPIO_H3600_PCMCIA_IRQ1 - : IRQ_GPIO_H3600_PCMCIA_IRQ0; + int err; + switch (skt->nr) { + case 0: + err = gpio_request(H3XXX_GPIO_PCMCIA_IRQ0, "PCMCIA IRQ0"); + if (err) + goto err00; + err = gpio_direction_input(H3XXX_GPIO_PCMCIA_IRQ0); + if (err) + goto err01; + skt->socket.pci_irq = gpio_to_irq(H3XXX_GPIO_PCMCIA_IRQ0); + + err = gpio_request(H3XXX_GPIO_PCMCIA_CD0, "PCMCIA CD0"); + if (err) + goto err01; + err = gpio_direction_input(H3XXX_GPIO_PCMCIA_CD0); + if (err) + goto err02; + irqs[0].irq = gpio_to_irq(H3XXX_GPIO_PCMCIA_CD0); + + err = gpio_request(H3XXX_EGPIO_OPT_NVRAM_ON, "OPT NVRAM ON"); + if (err) + goto err02; + err = gpio_direction_output(H3XXX_EGPIO_OPT_NVRAM_ON, 0); + if (err) + goto err03; + err = gpio_request(H3XXX_EGPIO_OPT_ON, "OPT ON"); + if (err) + goto err03; + err = gpio_direction_output(H3XXX_EGPIO_OPT_ON, 0); + if (err) + goto err04; + err = gpio_request(H3XXX_EGPIO_OPT_RESET, "OPT RESET"); + if (err) + goto err04; + err = gpio_direction_output(H3XXX_EGPIO_OPT_RESET, 0); + if (err) + goto err05; + err = gpio_request(H3XXX_EGPIO_CARD_RESET, "PCMCIA CARD RESET"); + if (err) + goto err05; + err = gpio_direction_output(H3XXX_EGPIO_CARD_RESET, 0); + if (err) + goto err06; + err = soc_pcmcia_request_irqs(skt, irqs, ARRAY_SIZE(irqs)); + if (err) + goto err06; + break; + case 1: + err = gpio_request(H3XXX_GPIO_PCMCIA_IRQ1, "PCMCIA IRQ1"); + if (err) + goto err10; + err = gpio_direction_input(H3XXX_GPIO_PCMCIA_IRQ1); + if (err) + goto err11; + skt->socket.pci_irq = gpio_to_irq(H3XXX_GPIO_PCMCIA_IRQ1); + + err = gpio_request(H3XXX_GPIO_PCMCIA_CD1, "PCMCIA CD1"); + if (err) + goto err11; + err = gpio_direction_input(H3XXX_GPIO_PCMCIA_CD1); + if (err) + goto err12; + irqs[1].irq = gpio_to_irq(H3XXX_GPIO_PCMCIA_CD1); + + err = soc_pcmcia_request_irqs(skt, irqs, ARRAY_SIZE(irqs)); + if (err) + goto err12; + break; + } + return 0; - return soc_pcmcia_request_irqs(skt, irqs, ARRAY_SIZE(irqs)); +err06: gpio_free(H3XXX_EGPIO_CARD_RESET); +err05: gpio_free(H3XXX_EGPIO_OPT_RESET); +err04: gpio_free(H3XXX_EGPIO_OPT_ON); +err03: gpio_free(H3XXX_EGPIO_OPT_NVRAM_ON); +err02: gpio_free(H3XXX_GPIO_PCMCIA_CD0); +err01: gpio_free(H3XXX_GPIO_PCMCIA_IRQ0); +err00: return err; + +err12: gpio_free(H3XXX_GPIO_PCMCIA_CD0); +err11: gpio_free(H3XXX_GPIO_PCMCIA_IRQ0); +err10: return err; } static void h3600_pcmcia_hw_shutdown(struct soc_pcmcia_socket *skt) { soc_pcmcia_free_irqs(skt, irqs, ARRAY_SIZE(irqs)); - /* Disable CF bus: */ - assign_h3600_egpio(IPAQ_EGPIO_OPT_NVRAM_ON, 0); - assign_h3600_egpio(IPAQ_EGPIO_OPT_ON, 0); - assign_h3600_egpio(IPAQ_EGPIO_OPT_RESET, 1); + switch (skt->nr) { + case 0: + /* Disable CF bus: */ + gpio_set_value(H3XXX_EGPIO_OPT_NVRAM_ON, 0); + gpio_set_value(H3XXX_EGPIO_OPT_ON, 0); + gpio_set_value(H3XXX_EGPIO_OPT_RESET, 1); + + gpio_free(H3XXX_EGPIO_CARD_RESET); + gpio_free(H3XXX_EGPIO_OPT_RESET); + gpio_free(H3XXX_EGPIO_OPT_ON); + gpio_free(H3XXX_EGPIO_OPT_NVRAM_ON); + gpio_free(H3XXX_GPIO_PCMCIA_CD0); + gpio_free(H3XXX_GPIO_PCMCIA_IRQ0); + break; + case 1: + gpio_free(H3XXX_GPIO_PCMCIA_CD1); + gpio_free(H3XXX_GPIO_PCMCIA_IRQ1); + break; + } } static void h3600_pcmcia_socket_state(struct soc_pcmcia_socket *skt, struct pcmcia_state *state) { - unsigned long levels = GPLR; - switch (skt->nr) { case 0: - state->detect = levels & GPIO_H3600_PCMCIA_CD0 ? 0 : 1; - state->ready = levels & GPIO_H3600_PCMCIA_IRQ0 ? 1 : 0; + state->detect = !gpio_get_value(H3XXX_GPIO_PCMCIA_CD0); + state->ready = !!gpio_get_value(H3XXX_GPIO_PCMCIA_IRQ0); state->bvd1 = 0; state->bvd2 = 0; state->wrprot = 0; /* Not available on H3600. */ @@ -59,8 +151,8 @@ h3600_pcmcia_socket_state(struct soc_pcmcia_socket *skt, struct pcmcia_state *st break; case 1: - state->detect = levels & GPIO_H3600_PCMCIA_CD1 ? 0 : 1; - state->ready = levels & GPIO_H3600_PCMCIA_IRQ1 ? 1 : 0; + state->detect = !gpio_get_value(H3XXX_GPIO_PCMCIA_CD1); + state->ready = !!gpio_get_value(H3XXX_GPIO_PCMCIA_IRQ1); state->bvd1 = 0; state->bvd2 = 0; state->wrprot = 0; /* Not available on H3600. */ @@ -79,7 +171,7 @@ h3600_pcmcia_configure_socket(struct soc_pcmcia_socket *skt, const socket_state_ return -1; } - assign_h3600_egpio(IPAQ_EGPIO_CARD_RESET, !!(state->flags & SS_RESET)); + gpio_set_value(H3XXX_EGPIO_CARD_RESET, !!(state->flags & SS_RESET)); /* Silently ignore Vpp, output enable, speaker enable. */ @@ -89,9 +181,9 @@ h3600_pcmcia_configure_socket(struct soc_pcmcia_socket *skt, const socket_state_ static void h3600_pcmcia_socket_init(struct soc_pcmcia_socket *skt) { /* Enable CF bus: */ - assign_h3600_egpio(IPAQ_EGPIO_OPT_NVRAM_ON, 1); - assign_h3600_egpio(IPAQ_EGPIO_OPT_ON, 1); - assign_h3600_egpio(IPAQ_EGPIO_OPT_RESET, 0); + gpio_set_value(H3XXX_EGPIO_OPT_NVRAM_ON, 1); + gpio_set_value(H3XXX_EGPIO_OPT_ON, 1); + gpio_set_value(H3XXX_EGPIO_OPT_RESET, 0); msleep(10); @@ -109,10 +201,10 @@ static void h3600_pcmcia_socket_suspend(struct soc_pcmcia_socket *skt) * socket 0 then socket 1. */ if (skt->nr == 1) { - assign_h3600_egpio(IPAQ_EGPIO_OPT_ON, 0); - assign_h3600_egpio(IPAQ_EGPIO_OPT_NVRAM_ON, 0); + gpio_set_value(H3XXX_EGPIO_OPT_ON, 0); + gpio_set_value(H3XXX_EGPIO_OPT_NVRAM_ON, 0); /* hmm, does this suck power? */ - assign_h3600_egpio(IPAQ_EGPIO_OPT_RESET, 1); + gpio_set_value(H3XXX_EGPIO_OPT_RESET, 1); } } @@ -131,7 +223,7 @@ int __init pcmcia_h3600_init(struct device *dev) { int ret = -ENODEV; - if (machine_is_h3600()) + if (machine_is_h3600() || machine_is_h3100()) ret = sa11xx_drv_pcmcia_probe(dev, &h3600_pcmcia_ops, 0, 2); return ret; diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index cea6cef..1186749 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -77,6 +77,13 @@ config BATTERY_TOSA Say Y to enable support for the battery on the Sharp Zaurus SL-6000 (tosa) models. +config BATTERY_COLLIE + tristate "Sharp SL-5500 (collie) battery" + depends on SA1100_COLLIE && MCP_UCB1200 + help + Say Y to enable support for the battery on the Sharp Zaurus + SL-5500 (collie) models. + config BATTERY_WM97XX bool "WM97xx generic battery driver" depends on TOUCHSCREEN_WM97XX=y diff --git a/drivers/power/Makefile b/drivers/power/Makefile index b96f29d..356cdfd 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_BATTERY_DS2782) += ds2782_battery.o obj-$(CONFIG_BATTERY_PMU) += pmu_battery.o obj-$(CONFIG_BATTERY_OLPC) += olpc_battery.o obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o +obj-$(CONFIG_BATTERY_COLLIE) += collie_battery.o obj-$(CONFIG_BATTERY_WM97XX) += wm97xx_battery.o obj-$(CONFIG_BATTERY_BQ27x00) += bq27x00_battery.o obj-$(CONFIG_BATTERY_DA9030) += da9030_battery.o diff --git a/drivers/power/collie_battery.c b/drivers/power/collie_battery.c new file mode 100644 index 0000000..039f41a --- /dev/null +++ b/drivers/power/collie_battery.c @@ -0,0 +1,418 @@ +/* + * Battery and Power Management code for the Sharp SL-5x00 + * + * Copyright (C) 2009 Thomas Kunze + * + * based on tosa_battery.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/power_supply.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/gpio.h> +#include <linux/mfd/ucb1x00.h> + +#include <asm/mach/sharpsl_param.h> +#include <asm/mach-types.h> +#include <mach/collie.h> + +static DEFINE_MUTEX(bat_lock); /* protects gpio pins */ +static struct work_struct bat_work; +static struct ucb1x00 *ucb; + +struct collie_bat { + int status; + struct power_supply psy; + int full_chrg; + + struct mutex work_lock; /* protects data */ + + bool (*is_present)(struct collie_bat *bat); + int gpio_full; + int gpio_charge_on; + + int technology; + + int gpio_bat; + int adc_bat; + int adc_bat_divider; + int bat_max; + int bat_min; + + int gpio_temp; + int adc_temp; + int adc_temp_divider; +}; + +static struct collie_bat collie_bat_main; + +static unsigned long collie_read_bat(struct collie_bat *bat) +{ + unsigned long value = 0; + + if (bat->gpio_bat < 0 || bat->adc_bat < 0) + return 0; + mutex_lock(&bat_lock); + gpio_set_value(bat->gpio_bat, 1); + msleep(5); + ucb1x00_adc_enable(ucb); + value = ucb1x00_adc_read(ucb, bat->adc_bat, UCB_SYNC); + ucb1x00_adc_disable(ucb); + gpio_set_value(bat->gpio_bat, 0); + mutex_unlock(&bat_lock); + value = value * 1000000 / bat->adc_bat_divider; + + return value; +} + +static unsigned long collie_read_temp(struct collie_bat *bat) +{ + unsigned long value = 0; + if (bat->gpio_temp < 0 || bat->adc_temp < 0) + return 0; + + mutex_lock(&bat_lock); + gpio_set_value(bat->gpio_temp, 1); + msleep(5); + ucb1x00_adc_enable(ucb); + value = ucb1x00_adc_read(ucb, bat->adc_temp, UCB_SYNC); + ucb1x00_adc_disable(ucb); + gpio_set_value(bat->gpio_temp, 0); + mutex_unlock(&bat_lock); + + value = value * 10000 / bat->adc_temp_divider; + + return value; +} + +static int collie_bat_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + int ret = 0; + struct collie_bat *bat = container_of(psy, struct collie_bat, psy); + + if (bat->is_present && !bat->is_present(bat) + && psp != POWER_SUPPLY_PROP_PRESENT) { + return -ENODEV; + } + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + val->intval = bat->status; + break; + case POWER_SUPPLY_PROP_TECHNOLOGY: + val->intval = bat->technology; + break; + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + val->intval = collie_read_bat(bat); + break; + case POWER_SUPPLY_PROP_VOLTAGE_MAX: + if (bat->full_chrg == -1) + val->intval = bat->bat_max; + else + val->intval = bat->full_chrg; + break; + case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: + val->intval = bat->bat_max; + break; + case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN: + val->intval = bat->bat_min; + break; + case POWER_SUPPLY_PROP_TEMP: + val->intval = collie_read_temp(bat); + break; + case POWER_SUPPLY_PROP_PRESENT: + val->intval = bat->is_present ? bat->is_present(bat) : 1; + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static void collie_bat_external_power_changed(struct power_supply *psy) +{ + schedule_work(&bat_work); +} + +static irqreturn_t collie_bat_gpio_isr(int irq, void *data) +{ + pr_info("collie_bat_gpio irq: %d\n", gpio_get_value(irq_to_gpio(irq))); + schedule_work(&bat_work); + return IRQ_HANDLED; +} + +static void collie_bat_update(struct collie_bat *bat) +{ + int old; + struct power_supply *psy = &bat->psy; + + mutex_lock(&bat->work_lock); + + old = bat->status; + + if (bat->is_present && !bat->is_present(bat)) { + printk(KERN_NOTICE "%s not present\n", psy->name); + bat->status = POWER_SUPPLY_STATUS_UNKNOWN; + bat->full_chrg = -1; + } else if (power_supply_am_i_supplied(psy)) { + if (bat->status == POWER_SUPPLY_STATUS_DISCHARGING) { + gpio_set_value(bat->gpio_charge_on, 1); + mdelay(15); + } + + if (gpio_get_value(bat->gpio_full)) { + if (old == POWER_SUPPLY_STATUS_CHARGING || + bat->full_chrg == -1) + bat->full_chrg = collie_read_bat(bat); + + gpio_set_value(bat->gpio_charge_on, 0); + bat->status = POWER_SUPPLY_STATUS_FULL; + } else { + gpio_set_value(bat->gpio_charge_on, 1); + bat->status = POWER_SUPPLY_STATUS_CHARGING; + } + } else { + gpio_set_value(bat->gpio_charge_on, 0); + bat->status = POWER_SUPPLY_STATUS_DISCHARGING; + } + + if (old != bat->status) + power_supply_changed(psy); + + mutex_unlock(&bat->work_lock); +} + +static void collie_bat_work(struct work_struct *work) +{ + collie_bat_update(&collie_bat_main); +} + + +static enum power_supply_property collie_bat_main_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN, + POWER_SUPPLY_PROP_VOLTAGE_MAX, + POWER_SUPPLY_PROP_PRESENT, + POWER_SUPPLY_PROP_TEMP, +}; + +static enum power_supply_property collie_bat_bu_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN, + POWER_SUPPLY_PROP_VOLTAGE_MAX, + POWER_SUPPLY_PROP_PRESENT, +}; + +static struct collie_bat collie_bat_main = { + .status = POWER_SUPPLY_STATUS_DISCHARGING, + .full_chrg = -1, + .psy = { + .name = "main-battery", + .type = POWER_SUPPLY_TYPE_BATTERY, + .properties = collie_bat_main_props, + .num_properties = ARRAY_SIZE(collie_bat_main_props), + .get_property = collie_bat_get_property, + .external_power_changed = collie_bat_external_power_changed, + .use_for_apm = 1, + }, + + .gpio_full = COLLIE_GPIO_CO, + .gpio_charge_on = COLLIE_GPIO_CHARGE_ON, + + .technology = POWER_SUPPLY_TECHNOLOGY_LIPO, + + .gpio_bat = COLLIE_GPIO_MBAT_ON, + .adc_bat = UCB_ADC_INP_AD1, + .adc_bat_divider = 155, + .bat_max = 4310000, + .bat_min = 1551 * 1000000 / 414, + + .gpio_temp = COLLIE_GPIO_TMP_ON, + .adc_temp = UCB_ADC_INP_AD0, + .adc_temp_divider = 10000, +}; + +static struct collie_bat collie_bat_bu = { + .status = POWER_SUPPLY_STATUS_UNKNOWN, + .full_chrg = -1, + + .psy = { + .name = "backup-battery", + .type = POWER_SUPPLY_TYPE_BATTERY, + .properties = collie_bat_bu_props, + .num_properties = ARRAY_SIZE(collie_bat_bu_props), + .get_property = collie_bat_get_property, + .external_power_changed = collie_bat_external_power_changed, + }, + + .gpio_full = -1, + .gpio_charge_on = -1, + + .technology = POWER_SUPPLY_TECHNOLOGY_LiMn, + + .gpio_bat = COLLIE_GPIO_BBAT_ON, + .adc_bat = UCB_ADC_INP_AD1, + .adc_bat_divider = 155, + .bat_max = 3000000, + .bat_min = 1900000, + + .gpio_temp = -1, + .adc_temp = -1, + .adc_temp_divider = -1, +}; + +static struct { + int gpio; + char *name; + bool output; + int value; +} gpios[] = { + { COLLIE_GPIO_CO, "main battery full", 0, 0 }, + { COLLIE_GPIO_MAIN_BAT_LOW, "main battery low", 0, 0 }, + { COLLIE_GPIO_CHARGE_ON, "main charge on", 1, 0 }, + { COLLIE_GPIO_MBAT_ON, "main battery", 1, 0 }, + { COLLIE_GPIO_TMP_ON, "main battery temp", 1, 0 }, + { COLLIE_GPIO_BBAT_ON, "backup battery", 1, 0 }, +}; + +#ifdef CONFIG_PM +static int collie_bat_suspend(struct ucb1x00_dev *dev, pm_message_t state) +{ + /* flush all pending status updates */ + flush_scheduled_work(); + return 0; +} + +static int collie_bat_resume(struct ucb1x00_dev *dev) +{ + /* things may have changed while we were away */ + schedule_work(&bat_work); + return 0; +} +#else +#define collie_bat_suspend NULL +#define collie_bat_resume NULL +#endif + +static int __devinit collie_bat_probe(struct ucb1x00_dev *dev) +{ + int ret; + int i; + + if (!machine_is_collie()) + return -ENODEV; + + ucb = dev->ucb; + + for (i = 0; i < ARRAY_SIZE(gpios); i++) { + ret = gpio_request(gpios[i].gpio, gpios[i].name); + if (ret) { + i--; + goto err_gpio; + } + + if (gpios[i].output) + ret = gpio_direction_output(gpios[i].gpio, + gpios[i].value); + else + ret = gpio_direction_input(gpios[i].gpio); + + if (ret) + goto err_gpio; + } + + mutex_init(&collie_bat_main.work_lock); + + INIT_WORK(&bat_work, collie_bat_work); + + ret = power_supply_register(&dev->ucb->dev, &collie_bat_main.psy); + if (ret) + goto err_psy_reg_main; + ret = power_supply_register(&dev->ucb->dev, &collie_bat_bu.psy); + if (ret) + goto err_psy_reg_bu; + + ret = request_irq(gpio_to_irq(COLLIE_GPIO_CO), + collie_bat_gpio_isr, + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, + "main full", &collie_bat_main); + if (!ret) { + schedule_work(&bat_work); + return 0; + } + power_supply_unregister(&collie_bat_bu.psy); +err_psy_reg_bu: + power_supply_unregister(&collie_bat_main.psy); +err_psy_reg_main: + + /* see comment in collie_bat_remove */ + flush_scheduled_work(); + + i--; +err_gpio: + for (; i >= 0; i--) + gpio_free(gpios[i].gpio); + + return ret; +} + +static void __devexit collie_bat_remove(struct ucb1x00_dev *dev) +{ + int i; + + free_irq(gpio_to_irq(COLLIE_GPIO_CO), &collie_bat_main); + + power_supply_unregister(&collie_bat_bu.psy); + power_supply_unregister(&collie_bat_main.psy); + + /* + * now flush all pending work. + * we won't get any more schedules, since all + * sources (isr and external_power_changed) + * are unregistered now. + */ + flush_scheduled_work(); + + for (i = ARRAY_SIZE(gpios) - 1; i >= 0; i--) + gpio_free(gpios[i].gpio); +} + +static struct ucb1x00_driver collie_bat_driver = { + .add = collie_bat_probe, + .remove = __devexit_p(collie_bat_remove), + .suspend = collie_bat_suspend, + .resume = collie_bat_resume, +}; + +static int __init collie_bat_init(void) +{ + return ucb1x00_register_driver(&collie_bat_driver); +} + +static void __exit collie_bat_exit(void) +{ + ucb1x00_unregister_driver(&collie_bat_driver); +} + +module_init(collie_bat_init); +module_exit(collie_bat_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Thomas Kunze"); +MODULE_DESCRIPTION("Collie battery driver"); diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 3c20dae..f2e1004 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -509,6 +509,15 @@ config RTC_DRV_M48T59 This driver can also be built as a module, if so, the module will be called "rtc-m48t59". +config RTC_DRV_MSM6242 + tristate "Oki MSM6242" + help + If you say yes here you get support for the Oki MSM6242 + timekeeping chip. It is used in some Amiga models (e.g. A2000). + + This driver can also be built as a module. If so, the module + will be called rtc-msm6242. + config RTC_MXC tristate "Freescale MXC Real Time Clock" depends on ARCH_MXC @@ -529,6 +538,16 @@ config RTC_DRV_BQ4802 This driver can also be built as a module. If so, the module will be called rtc-bq4802. +config RTC_DRV_RP5C01 + tristate "Ricoh RP5C01" + help + If you say yes here you get support for the Ricoh RP5C01 + timekeeping chip. It is used in some Amiga models (e.g. A3000 + and A4000). + + This driver can also be built as a module. If so, the module + will be called rtc-rp5c01. + config RTC_DRV_V3020 tristate "EM Microelectronic V3020" help @@ -780,7 +799,7 @@ config RTC_DRV_TX4939 config RTC_DRV_MV tristate "Marvell SoC RTC" - depends on ARCH_KIRKWOOD + depends on ARCH_KIRKWOOD || ARCH_DOVE help If you say yes here you will get support for the in-chip RTC that can be found in some of Marvell's SoC devices, such as diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index aa3fbd5..af1ba7a 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_RTC_DRV_M48T86) += rtc-m48t86.o obj-$(CONFIG_RTC_MXC) += rtc-mxc.o obj-$(CONFIG_RTC_DRV_MAX6900) += rtc-max6900.o obj-$(CONFIG_RTC_DRV_MAX6902) += rtc-max6902.o +obj-$(CONFIG_RTC_DRV_MSM6242) += rtc-msm6242.o obj-$(CONFIG_RTC_DRV_MV) += rtc-mv.o obj-$(CONFIG_RTC_DRV_OMAP) += rtc-omap.o obj-$(CONFIG_RTC_DRV_PCAP) += rtc-pcap.o @@ -64,6 +65,7 @@ obj-$(CONFIG_RTC_DRV_PL031) += rtc-pl031.o obj-$(CONFIG_RTC_DRV_PS3) += rtc-ps3.o obj-$(CONFIG_RTC_DRV_PXA) += rtc-pxa.o obj-$(CONFIG_RTC_DRV_R9701) += rtc-r9701.o +obj-$(CONFIG_RTC_DRV_RP5C01) += rtc-rp5c01.o obj-$(CONFIG_RTC_DRV_RS5C313) += rtc-rs5c313.o obj-$(CONFIG_RTC_DRV_RS5C348) += rtc-rs5c348.o obj-$(CONFIG_RTC_DRV_RS5C372) += rtc-rs5c372.o diff --git a/drivers/rtc/rtc-msm6242.c b/drivers/rtc/rtc-msm6242.c new file mode 100644 index 0000000..5f5968a --- /dev/null +++ b/drivers/rtc/rtc-msm6242.c @@ -0,0 +1,269 @@ +/* + * Oki MSM6242 RTC Driver + * + * Copyright 2009 Geert Uytterhoeven + * + * Based on the A2000 TOD code in arch/m68k/amiga/config.c + * Copyright (C) 1993 Hamish Macdonald + */ + +#include <linux/delay.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/rtc.h> + + +enum { + MSM6242_SECOND1 = 0x0, /* 1-second digit register */ + MSM6242_SECOND10 = 0x1, /* 10-second digit register */ + MSM6242_MINUTE1 = 0x2, /* 1-minute digit register */ + MSM6242_MINUTE10 = 0x3, /* 10-minute digit register */ + MSM6242_HOUR1 = 0x4, /* 1-hour digit register */ + MSM6242_HOUR10 = 0x5, /* PM/AM, 10-hour digit register */ + MSM6242_DAY1 = 0x6, /* 1-day digit register */ + MSM6242_DAY10 = 0x7, /* 10-day digit register */ + MSM6242_MONTH1 = 0x8, /* 1-month digit register */ + MSM6242_MONTH10 = 0x9, /* 10-month digit register */ + MSM6242_YEAR1 = 0xa, /* 1-year digit register */ + MSM6242_YEAR10 = 0xb, /* 10-year digit register */ + MSM6242_WEEK = 0xc, /* Week register */ + MSM6242_CD = 0xd, /* Control Register D */ + MSM6242_CE = 0xe, /* Control Register E */ + MSM6242_CF = 0xf, /* Control Register F */ +}; + +#define MSM6242_HOUR10_AM (0 << 2) +#define MSM6242_HOUR10_PM (1 << 2) +#define MSM6242_HOUR10_HR_MASK (3 << 0) + +#define MSM6242_WEEK_SUNDAY 0 +#define MSM6242_WEEK_MONDAY 1 +#define MSM6242_WEEK_TUESDAY 2 +#define MSM6242_WEEK_WEDNESDAY 3 +#define MSM6242_WEEK_THURSDAY 4 +#define MSM6242_WEEK_FRIDAY 5 +#define MSM6242_WEEK_SATURDAY 6 + +#define MSM6242_CD_30_S_ADJ (1 << 3) /* 30-second adjustment */ +#define MSM6242_CD_IRQ_FLAG (1 << 2) +#define MSM6242_CD_BUSY (1 << 1) +#define MSM6242_CD_HOLD (1 << 0) + +#define MSM6242_CE_T_MASK (3 << 2) +#define MSM6242_CE_T_64HZ (0 << 2) /* period 1/64 second */ +#define MSM6242_CE_T_1HZ (1 << 2) /* period 1 second */ +#define MSM6242_CE_T_1MINUTE (2 << 2) /* period 1 minute */ +#define MSM6242_CE_T_1HOUR (3 << 2) /* period 1 hour */ + +#define MSM6242_CE_ITRPT_STND (1 << 1) +#define MSM6242_CE_MASK (1 << 0) /* STD.P output control */ + +#define MSM6242_CF_TEST (1 << 3) +#define MSM6242_CF_12H (0 << 2) +#define MSM6242_CF_24H (1 << 2) +#define MSM6242_CF_STOP (1 << 1) +#define MSM6242_CF_REST (1 << 0) /* reset */ + + +struct msm6242_priv { + u32 __iomem *regs; + struct rtc_device *rtc; +}; + +static inline unsigned int msm6242_read(struct msm6242_priv *priv, + unsigned int reg) +{ + return __raw_readl(&priv->regs[reg]) & 0xf; +} + +static inline void msm6242_write(struct msm6242_priv *priv, unsigned int val, + unsigned int reg) +{ + return __raw_writel(val, &priv->regs[reg]); +} + +static inline void msm6242_set(struct msm6242_priv *priv, unsigned int val, + unsigned int reg) +{ + msm6242_write(priv, msm6242_read(priv, reg) | val, reg); +} + +static inline void msm6242_clear(struct msm6242_priv *priv, unsigned int val, + unsigned int reg) +{ + msm6242_write(priv, msm6242_read(priv, reg) & ~val, reg); +} + +static void msm6242_lock(struct msm6242_priv *priv) +{ + int cnt = 5; + + msm6242_set(priv, MSM6242_CD_HOLD, MSM6242_CD); + + while ((msm6242_read(priv, MSM6242_CD) & MSM6242_CD_BUSY) && cnt) { + msm6242_clear(priv, MSM6242_CD_HOLD, MSM6242_CD); + udelay(70); + msm6242_set(priv, MSM6242_CD_HOLD, MSM6242_CD); + cnt--; + } + + if (!cnt) + pr_warning("msm6242: timed out waiting for RTC (0x%x)\n", + msm6242_read(priv, MSM6242_CD)); +} + +static void msm6242_unlock(struct msm6242_priv *priv) +{ + msm6242_clear(priv, MSM6242_CD_HOLD, MSM6242_CD); +} + +static int msm6242_read_time(struct device *dev, struct rtc_time *tm) +{ + struct msm6242_priv *priv = dev_get_drvdata(dev); + + msm6242_lock(priv); + + tm->tm_sec = msm6242_read(priv, MSM6242_SECOND10) * 10 + + msm6242_read(priv, MSM6242_SECOND1); + tm->tm_min = msm6242_read(priv, MSM6242_MINUTE10) * 10 + + msm6242_read(priv, MSM6242_MINUTE1); + tm->tm_hour = (msm6242_read(priv, MSM6242_HOUR10 & 3)) * 10 + + msm6242_read(priv, MSM6242_HOUR1); + tm->tm_mday = msm6242_read(priv, MSM6242_DAY10) * 10 + + msm6242_read(priv, MSM6242_DAY1); + tm->tm_wday = msm6242_read(priv, MSM6242_WEEK); + tm->tm_mon = msm6242_read(priv, MSM6242_MONTH10) * 10 + + msm6242_read(priv, MSM6242_MONTH1) - 1; + tm->tm_year = msm6242_read(priv, MSM6242_YEAR10) * 10 + + msm6242_read(priv, MSM6242_YEAR1); + if (tm->tm_year <= 69) + tm->tm_year += 100; + + if (!(msm6242_read(priv, MSM6242_CF) & MSM6242_CF_24H)) { + unsigned int pm = msm6242_read(priv, MSM6242_HOUR10) & + MSM6242_HOUR10_PM; + if (!pm && tm->tm_hour == 12) + tm->tm_hour = 0; + else if (pm && tm->tm_hour != 12) + tm->tm_hour += 12; + } + + msm6242_unlock(priv); + + return rtc_valid_tm(tm); +} + +static int msm6242_set_time(struct device *dev, struct rtc_time *tm) +{ + struct msm6242_priv *priv = dev_get_drvdata(dev); + + msm6242_lock(priv); + + msm6242_write(priv, tm->tm_sec / 10, MSM6242_SECOND10); + msm6242_write(priv, tm->tm_sec % 10, MSM6242_SECOND1); + msm6242_write(priv, tm->tm_min / 10, MSM6242_MINUTE10); + msm6242_write(priv, tm->tm_min % 10, MSM6242_MINUTE1); + if (msm6242_read(priv, MSM6242_CF) & MSM6242_CF_24H) + msm6242_write(priv, tm->tm_hour / 10, MSM6242_HOUR10); + else if (tm->tm_hour >= 12) + msm6242_write(priv, MSM6242_HOUR10_PM + (tm->tm_hour - 12) / 10, + MSM6242_HOUR10); + else + msm6242_write(priv, tm->tm_hour / 10, MSM6242_HOUR10); + msm6242_write(priv, tm->tm_hour % 10, MSM6242_HOUR1); + msm6242_write(priv, tm->tm_mday / 10, MSM6242_DAY10); + msm6242_write(priv, tm->tm_mday % 10, MSM6242_DAY1); + if (tm->tm_wday != -1) + msm6242_write(priv, tm->tm_wday, MSM6242_WEEK); + msm6242_write(priv, (tm->tm_mon + 1) / 10, MSM6242_MONTH10); + msm6242_write(priv, (tm->tm_mon + 1) % 10, MSM6242_MONTH1); + if (tm->tm_year >= 100) + tm->tm_year -= 100; + msm6242_write(priv, tm->tm_year / 10, MSM6242_YEAR10); + msm6242_write(priv, tm->tm_year % 10, MSM6242_YEAR1); + + msm6242_unlock(priv); + return 0; +} + +static const struct rtc_class_ops msm6242_rtc_ops = { + .read_time = msm6242_read_time, + .set_time = msm6242_set_time, +}; + +static int __init msm6242_rtc_probe(struct platform_device *dev) +{ + struct resource *res; + struct msm6242_priv *priv; + struct rtc_device *rtc; + int error; + + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->regs = ioremap(res->start, resource_size(res)); + if (!priv->regs) { + error = -ENOMEM; + goto out_free_priv; + } + + rtc = rtc_device_register("rtc-msm6242", &dev->dev, &msm6242_rtc_ops, + THIS_MODULE); + if (IS_ERR(rtc)) { + error = PTR_ERR(rtc); + goto out_unmap; + } + + priv->rtc = rtc; + platform_set_drvdata(dev, priv); + return 0; + +out_unmap: + iounmap(priv->regs); +out_free_priv: + kfree(priv); + return error; +} + +static int __exit msm6242_rtc_remove(struct platform_device *dev) +{ + struct msm6242_priv *priv = platform_get_drvdata(dev); + + rtc_device_unregister(priv->rtc); + iounmap(priv->regs); + kfree(priv); + return 0; +} + +static struct platform_driver msm6242_rtc_driver = { + .driver = { + .name = "rtc-msm6242", + .owner = THIS_MODULE, + }, + .remove = __exit_p(msm6242_rtc_remove), +}; + +static int __init msm6242_rtc_init(void) +{ + return platform_driver_probe(&msm6242_rtc_driver, msm6242_rtc_probe); +} + +static void __exit msm6242_rtc_fini(void) +{ + platform_driver_unregister(&msm6242_rtc_driver); +} + +module_init(msm6242_rtc_init); +module_exit(msm6242_rtc_fini); + +MODULE_AUTHOR("Geert Uytterhoeven <geert@linux-m68k.org>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Oki MSM6242 RTC driver"); +MODULE_ALIAS("platform:rtc-msm6242"); diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c new file mode 100644 index 0000000..e1313fe --- /dev/null +++ b/drivers/rtc/rtc-rp5c01.c @@ -0,0 +1,222 @@ +/* + * Ricoh RP5C01 RTC Driver + * + * Copyright 2009 Geert Uytterhoeven + * + * Based on the A3000 TOD code in arch/m68k/amiga/config.c + * Copyright (C) 1993 Hamish Macdonald + */ + +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/rtc.h> + + +enum { + RP5C01_1_SECOND = 0x0, /* MODE 00 */ + RP5C01_10_SECOND = 0x1, /* MODE 00 */ + RP5C01_1_MINUTE = 0x2, /* MODE 00 and MODE 01 */ + RP5C01_10_MINUTE = 0x3, /* MODE 00 and MODE 01 */ + RP5C01_1_HOUR = 0x4, /* MODE 00 and MODE 01 */ + RP5C01_10_HOUR = 0x5, /* MODE 00 and MODE 01 */ + RP5C01_DAY_OF_WEEK = 0x6, /* MODE 00 and MODE 01 */ + RP5C01_1_DAY = 0x7, /* MODE 00 and MODE 01 */ + RP5C01_10_DAY = 0x8, /* MODE 00 and MODE 01 */ + RP5C01_1_MONTH = 0x9, /* MODE 00 */ + RP5C01_10_MONTH = 0xa, /* MODE 00 */ + RP5C01_1_YEAR = 0xb, /* MODE 00 */ + RP5C01_10_YEAR = 0xc, /* MODE 00 */ + + RP5C01_12_24_SELECT = 0xa, /* MODE 01 */ + RP5C01_LEAP_YEAR = 0xb, /* MODE 01 */ + + RP5C01_MODE = 0xd, /* all modes */ + RP5C01_TEST = 0xe, /* all modes */ + RP5C01_RESET = 0xf, /* all modes */ +}; + +#define RP5C01_12_24_SELECT_12 (0 << 0) +#define RP5C01_12_24_SELECT_24 (1 << 0) + +#define RP5C01_10_HOUR_AM (0 << 1) +#define RP5C01_10_HOUR_PM (1 << 1) + +#define RP5C01_MODE_TIMER_EN (1 << 3) /* timer enable */ +#define RP5C01_MODE_ALARM_EN (1 << 2) /* alarm enable */ + +#define RP5C01_MODE_MODE_MASK (3 << 0) +#define RP5C01_MODE_MODE00 (0 << 0) /* time */ +#define RP5C01_MODE_MODE01 (1 << 0) /* alarm, 12h/24h, leap year */ +#define RP5C01_MODE_RAM_BLOCK10 (2 << 0) /* RAM 4 bits x 13 */ +#define RP5C01_MODE_RAM_BLOCK11 (3 << 0) /* RAM 4 bits x 13 */ + +#define RP5C01_RESET_1HZ_PULSE (1 << 3) +#define RP5C01_RESET_16HZ_PULSE (1 << 2) +#define RP5C01_RESET_SECOND (1 << 1) /* reset divider stages for */ + /* seconds or smaller units */ +#define RP5C01_RESET_ALARM (1 << 0) /* reset all alarm registers */ + + +struct rp5c01_priv { + u32 __iomem *regs; + struct rtc_device *rtc; +}; + +static inline unsigned int rp5c01_read(struct rp5c01_priv *priv, + unsigned int reg) +{ + return __raw_readl(&priv->regs[reg]) & 0xf; +} + +static inline void rp5c01_write(struct rp5c01_priv *priv, unsigned int val, + unsigned int reg) +{ + return __raw_writel(val, &priv->regs[reg]); +} + +static void rp5c01_lock(struct rp5c01_priv *priv) +{ + rp5c01_write(priv, RP5C01_MODE_MODE00, RP5C01_MODE); +} + +static void rp5c01_unlock(struct rp5c01_priv *priv) +{ + rp5c01_write(priv, RP5C01_MODE_TIMER_EN | RP5C01_MODE_MODE01, + RP5C01_MODE); +} + +static int rp5c01_read_time(struct device *dev, struct rtc_time *tm) +{ + struct rp5c01_priv *priv = dev_get_drvdata(dev); + + rp5c01_lock(priv); + + tm->tm_sec = rp5c01_read(priv, RP5C01_10_SECOND) * 10 + + rp5c01_read(priv, RP5C01_1_SECOND); + tm->tm_min = rp5c01_read(priv, RP5C01_10_MINUTE) * 10 + + rp5c01_read(priv, RP5C01_1_MINUTE); + tm->tm_hour = rp5c01_read(priv, RP5C01_10_HOUR) * 10 + + rp5c01_read(priv, RP5C01_1_HOUR); + tm->tm_mday = rp5c01_read(priv, RP5C01_10_DAY) * 10 + + rp5c01_read(priv, RP5C01_1_DAY); + tm->tm_wday = rp5c01_read(priv, RP5C01_DAY_OF_WEEK); + tm->tm_mon = rp5c01_read(priv, RP5C01_10_MONTH) * 10 + + rp5c01_read(priv, RP5C01_1_MONTH) - 1; + tm->tm_year = rp5c01_read(priv, RP5C01_10_YEAR) * 10 + + rp5c01_read(priv, RP5C01_1_YEAR); + if (tm->tm_year <= 69) + tm->tm_year += 100; + + rp5c01_unlock(priv); + + return rtc_valid_tm(tm); +} + +static int rp5c01_set_time(struct device *dev, struct rtc_time *tm) +{ + struct rp5c01_priv *priv = dev_get_drvdata(dev); + + rp5c01_lock(priv); + + rp5c01_write(priv, tm->tm_sec / 10, RP5C01_10_SECOND); + rp5c01_write(priv, tm->tm_sec % 10, RP5C01_1_SECOND); + rp5c01_write(priv, tm->tm_min / 10, RP5C01_10_MINUTE); + rp5c01_write(priv, tm->tm_min % 10, RP5C01_1_MINUTE); + rp5c01_write(priv, tm->tm_hour / 10, RP5C01_10_HOUR); + rp5c01_write(priv, tm->tm_hour % 10, RP5C01_1_HOUR); + rp5c01_write(priv, tm->tm_mday / 10, RP5C01_10_DAY); + rp5c01_write(priv, tm->tm_mday % 10, RP5C01_1_DAY); + if (tm->tm_wday != -1) + rp5c01_write(priv, tm->tm_wday, RP5C01_DAY_OF_WEEK); + rp5c01_write(priv, (tm->tm_mon + 1) / 10, RP5C01_10_MONTH); + rp5c01_write(priv, (tm->tm_mon + 1) % 10, RP5C01_1_MONTH); + if (tm->tm_year >= 100) + tm->tm_year -= 100; + rp5c01_write(priv, tm->tm_year / 10, RP5C01_10_YEAR); + rp5c01_write(priv, tm->tm_year % 10, RP5C01_1_YEAR); + + rp5c01_unlock(priv); + return 0; +} + +static const struct rtc_class_ops rp5c01_rtc_ops = { + .read_time = rp5c01_read_time, + .set_time = rp5c01_set_time, +}; + +static int __init rp5c01_rtc_probe(struct platform_device *dev) +{ + struct resource *res; + struct rp5c01_priv *priv; + struct rtc_device *rtc; + int error; + + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->regs = ioremap(res->start, resource_size(res)); + if (!priv->regs) { + error = -ENOMEM; + goto out_free_priv; + } + + rtc = rtc_device_register("rtc-rp5c01", &dev->dev, &rp5c01_rtc_ops, + THIS_MODULE); + if (IS_ERR(rtc)) { + error = PTR_ERR(rtc); + goto out_unmap; + } + + priv->rtc = rtc; + platform_set_drvdata(dev, priv); + return 0; + +out_unmap: + iounmap(priv->regs); +out_free_priv: + kfree(priv); + return error; +} + +static int __exit rp5c01_rtc_remove(struct platform_device *dev) +{ + struct rp5c01_priv *priv = platform_get_drvdata(dev); + + rtc_device_unregister(priv->rtc); + iounmap(priv->regs); + kfree(priv); + return 0; +} + +static struct platform_driver rp5c01_rtc_driver = { + .driver = { + .name = "rtc-rp5c01", + .owner = THIS_MODULE, + }, + .remove = __exit_p(rp5c01_rtc_remove), +}; + +static int __init rp5c01_rtc_init(void) +{ + return platform_driver_probe(&rp5c01_rtc_driver, rp5c01_rtc_probe); +} + +static void __exit rp5c01_rtc_fini(void) +{ + platform_driver_unregister(&rp5c01_rtc_driver); +} + +module_init(rp5c01_rtc_init); +module_exit(rp5c01_rtc_fini); + +MODULE_AUTHOR("Geert Uytterhoeven <geert@linux-m68k.org>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Ricoh RP5C01 RTC driver"); +MODULE_ALIAS("platform:rtc-rp5c01"); diff --git a/drivers/s390/char/sclp_async.c b/drivers/s390/char/sclp_async.c index b44462a..740fe40 100644 --- a/drivers/s390/char/sclp_async.c +++ b/drivers/s390/char/sclp_async.c @@ -101,18 +101,17 @@ static struct ctl_table callhome_table[] = { .mode = 0644, .proc_handler = proc_handler_callhome, }, - { .ctl_name = 0 } + {} }; static struct ctl_table kern_dir_table[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .maxlen = 0, .mode = 0555, .child = callhome_table, }, - { .ctl_name = 0 } + {} }; /* diff --git a/drivers/scsi/scsi_sysctl.c b/drivers/scsi/scsi_sysctl.c index 63a30f5..2b6b93f 100644 --- a/drivers/scsi/scsi_sysctl.c +++ b/drivers/scsi/scsi_sysctl.c @@ -13,26 +13,23 @@ static ctl_table scsi_table[] = { - { .ctl_name = DEV_SCSI_LOGGING_LEVEL, - .procname = "logging_level", + { .procname = "logging_level", .data = &scsi_logging_level, .maxlen = sizeof(scsi_logging_level), .mode = 0644, - .proc_handler = &proc_dointvec }, + .proc_handler = proc_dointvec }, { } }; static ctl_table scsi_dir_table[] = { - { .ctl_name = DEV_SCSI, - .procname = "scsi", + { .procname = "scsi", .mode = 0555, .child = scsi_table }, { } }; static ctl_table scsi_root_table[] = { - { .ctl_name = CTL_DEV, - .procname = "dev", + { .procname = "dev", .mode = 0555, .child = scsi_dir_table }, { } diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index e522572..50943ff 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -1477,4 +1477,17 @@ config SERIAL_BCM63XX_CONSOLE If you have enabled the serial port on the bcm63xx CPU you can make it the console by answering Y to this option. +config SERIAL_GRLIB_GAISLER_APBUART + tristate "GRLIB APBUART serial support" + depends on OF + ---help--- + Add support for the GRLIB APBUART serial port. + +config SERIAL_GRLIB_GAISLER_APBUART_CONSOLE + bool "Console on GRLIB APBUART serial port" + depends on SERIAL_GRLIB_GAISLER_APBUART=y + select SERIAL_CORE_CONSOLE + help + Support for running a console on the GRLIB APBUART + endmenu diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile index d21d5dd..5548fe7 100644 --- a/drivers/serial/Makefile +++ b/drivers/serial/Makefile @@ -81,3 +81,4 @@ obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o obj-$(CONFIG_SERIAL_QE) += ucc_uart.o obj-$(CONFIG_SERIAL_TIMBERDALE) += timbuart.o +obj-$(CONFIG_SERIAL_GRLIB_GAISLER_APBUART) += apbuart.o diff --git a/drivers/serial/apbuart.c b/drivers/serial/apbuart.c new file mode 100644 index 0000000..fe91319 --- /dev/null +++ b/drivers/serial/apbuart.c @@ -0,0 +1,710 @@ +/* + * Driver for GRLIB serial ports (APBUART) + * + * Based on linux/drivers/serial/amba.c + * + * Copyright (C) 2000 Deep Blue Solutions Ltd. + * Copyright (C) 2003 Konrad Eisele <eiselekd@web.de> + * Copyright (C) 2006 Daniel Hellstrom <daniel@gaisler.com>, Aeroflex Gaisler AB + * Copyright (C) 2008 Gilead Kutnick <kutnickg@zin-tech.com> + * Copyright (C) 2009 Kristoffer Glembo <kristoffer@gaisler.com>, Aeroflex Gaisler AB + */ + +#if defined(CONFIG_SERIAL_GRLIB_GAISLER_APBUART_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) +#define SUPPORT_SYSRQ +#endif + +#include <linux/module.h> +#include <linux/tty.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/serial.h> +#include <linux/console.h> +#include <linux/sysrq.h> +#include <linux/kthread.h> +#include <linux/device.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/of_platform.h> +#include <linux/platform_device.h> +#include <linux/io.h> +#include <linux/serial_core.h> +#include <asm/irq.h> + +#include "apbuart.h" + +#define SERIAL_APBUART_MAJOR TTY_MAJOR +#define SERIAL_APBUART_MINOR 64 +#define UART_DUMMY_RSR_RX 0x8000 /* for ignore all read */ + +static void apbuart_tx_chars(struct uart_port *port); + +static void apbuart_stop_tx(struct uart_port *port) +{ + unsigned int cr; + + cr = UART_GET_CTRL(port); + cr &= ~UART_CTRL_TI; + UART_PUT_CTRL(port, cr); +} + +static void apbuart_start_tx(struct uart_port *port) +{ + unsigned int cr; + + cr = UART_GET_CTRL(port); + cr |= UART_CTRL_TI; + UART_PUT_CTRL(port, cr); + + if (UART_GET_STATUS(port) & UART_STATUS_THE) + apbuart_tx_chars(port); +} + +static void apbuart_stop_rx(struct uart_port *port) +{ + unsigned int cr; + + cr = UART_GET_CTRL(port); + cr &= ~(UART_CTRL_RI); + UART_PUT_CTRL(port, cr); +} + +static void apbuart_enable_ms(struct uart_port *port) +{ + /* No modem status change interrupts for APBUART */ +} + +static void apbuart_rx_chars(struct uart_port *port) +{ + struct tty_struct *tty = port->state->port.tty; + unsigned int status, ch, rsr, flag; + unsigned int max_chars = port->fifosize; + + status = UART_GET_STATUS(port); + + while (UART_RX_DATA(status) && (max_chars--)) { + + ch = UART_GET_CHAR(port); + flag = TTY_NORMAL; + + port->icount.rx++; + + rsr = UART_GET_STATUS(port) | UART_DUMMY_RSR_RX; + UART_PUT_STATUS(port, 0); + if (rsr & UART_STATUS_ERR) { + + if (rsr & UART_STATUS_BR) { + rsr &= ~(UART_STATUS_FE | UART_STATUS_PE); + port->icount.brk++; + if (uart_handle_break(port)) + goto ignore_char; + } else if (rsr & UART_STATUS_PE) { + port->icount.parity++; + } else if (rsr & UART_STATUS_FE) { + port->icount.frame++; + } + if (rsr & UART_STATUS_OE) + port->icount.overrun++; + + rsr &= port->read_status_mask; + + if (rsr & UART_STATUS_PE) + flag = TTY_PARITY; + else if (rsr & UART_STATUS_FE) + flag = TTY_FRAME; + } + + if (uart_handle_sysrq_char(port, ch)) + goto ignore_char; + + uart_insert_char(port, rsr, UART_STATUS_OE, ch, flag); + + + ignore_char: + status = UART_GET_STATUS(port); + } + + tty_flip_buffer_push(tty); +} + +static void apbuart_tx_chars(struct uart_port *port) +{ + struct circ_buf *xmit = &port->state->xmit; + int count; + + if (port->x_char) { + UART_PUT_CHAR(port, port->x_char); + port->icount.tx++; + port->x_char = 0; + return; + } + + if (uart_circ_empty(xmit) || uart_tx_stopped(port)) { + apbuart_stop_tx(port); + return; + } + + /* amba: fill FIFO */ + count = port->fifosize >> 1; + do { + UART_PUT_CHAR(port, xmit->buf[xmit->tail]); + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); + port->icount.tx++; + if (uart_circ_empty(xmit)) + break; + } while (--count > 0); + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); + + if (uart_circ_empty(xmit)) + apbuart_stop_tx(port); +} + +static irqreturn_t apbuart_int(int irq, void *dev_id) +{ + struct uart_port *port = dev_id; + unsigned int status; + + spin_lock(&port->lock); + + status = UART_GET_STATUS(port); + if (status & UART_STATUS_DR) + apbuart_rx_chars(port); + if (status & UART_STATUS_THE) + apbuart_tx_chars(port); + + spin_unlock(&port->lock); + + return IRQ_HANDLED; +} + +static unsigned int apbuart_tx_empty(struct uart_port *port) +{ + unsigned int status = UART_GET_STATUS(port); + return status & UART_STATUS_THE ? TIOCSER_TEMT : 0; +} + +static unsigned int apbuart_get_mctrl(struct uart_port *port) +{ + /* The GRLIB APBUART handles flow control in hardware */ + return TIOCM_CAR | TIOCM_DSR | TIOCM_CTS; +} + +static void apbuart_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + /* The GRLIB APBUART handles flow control in hardware */ +} + +static void apbuart_break_ctl(struct uart_port *port, int break_state) +{ + /* We don't support sending break */ +} + +static int apbuart_startup(struct uart_port *port) +{ + int retval; + unsigned int cr; + + /* Allocate the IRQ */ + retval = request_irq(port->irq, apbuart_int, 0, "apbuart", port); + if (retval) + return retval; + + /* Finally, enable interrupts */ + cr = UART_GET_CTRL(port); + UART_PUT_CTRL(port, + cr | UART_CTRL_RE | UART_CTRL_TE | + UART_CTRL_RI | UART_CTRL_TI); + + return 0; +} + +static void apbuart_shutdown(struct uart_port *port) +{ + unsigned int cr; + + /* disable all interrupts, disable the port */ + cr = UART_GET_CTRL(port); + UART_PUT_CTRL(port, + cr & ~(UART_CTRL_RE | UART_CTRL_TE | + UART_CTRL_RI | UART_CTRL_TI)); + + /* Free the interrupt */ + free_irq(port->irq, port); +} + +static void apbuart_set_termios(struct uart_port *port, + struct ktermios *termios, struct ktermios *old) +{ + unsigned int cr; + unsigned long flags; + unsigned int baud, quot; + + /* Ask the core to calculate the divisor for us. */ + baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16); + if (baud == 0) + panic("invalid baudrate %i\n", port->uartclk / 16); + + /* uart_get_divisor calc a *16 uart freq, apbuart is *8 */ + quot = (uart_get_divisor(port, baud)) * 2; + cr = UART_GET_CTRL(port); + cr &= ~(UART_CTRL_PE | UART_CTRL_PS); + + if (termios->c_cflag & PARENB) { + cr |= UART_CTRL_PE; + if ((termios->c_cflag & PARODD)) + cr |= UART_CTRL_PS; + } + + /* Enable flow control. */ + if (termios->c_cflag & CRTSCTS) + cr |= UART_CTRL_FL; + + spin_lock_irqsave(&port->lock, flags); + + /* Update the per-port timeout. */ + uart_update_timeout(port, termios->c_cflag, baud); + + port->read_status_mask = UART_STATUS_OE; + if (termios->c_iflag & INPCK) + port->read_status_mask |= UART_STATUS_FE | UART_STATUS_PE; + + /* Characters to ignore */ + port->ignore_status_mask = 0; + if (termios->c_iflag & IGNPAR) + port->ignore_status_mask |= UART_STATUS_FE | UART_STATUS_PE; + + /* Ignore all characters if CREAD is not set. */ + if ((termios->c_cflag & CREAD) == 0) + port->ignore_status_mask |= UART_DUMMY_RSR_RX; + + /* Set baud rate */ + quot -= 1; + UART_PUT_SCAL(port, quot); + UART_PUT_CTRL(port, cr); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static const char *apbuart_type(struct uart_port *port) +{ + return port->type == PORT_APBUART ? "GRLIB/APBUART" : NULL; +} + +static void apbuart_release_port(struct uart_port *port) +{ + release_mem_region(port->mapbase, 0x100); +} + +static int apbuart_request_port(struct uart_port *port) +{ + return request_mem_region(port->mapbase, 0x100, "grlib-apbuart") + != NULL ? 0 : -EBUSY; + return 0; +} + +/* Configure/autoconfigure the port */ +static void apbuart_config_port(struct uart_port *port, int flags) +{ + if (flags & UART_CONFIG_TYPE) { + port->type = PORT_APBUART; + apbuart_request_port(port); + } +} + +/* Verify the new serial_struct (for TIOCSSERIAL) */ +static int apbuart_verify_port(struct uart_port *port, + struct serial_struct *ser) +{ + int ret = 0; + if (ser->type != PORT_UNKNOWN && ser->type != PORT_APBUART) + ret = -EINVAL; + if (ser->irq < 0 || ser->irq >= NR_IRQS) + ret = -EINVAL; + if (ser->baud_base < 9600) + ret = -EINVAL; + return ret; +} + +static struct uart_ops grlib_apbuart_ops = { + .tx_empty = apbuart_tx_empty, + .set_mctrl = apbuart_set_mctrl, + .get_mctrl = apbuart_get_mctrl, + .stop_tx = apbuart_stop_tx, + .start_tx = apbuart_start_tx, + .stop_rx = apbuart_stop_rx, + .enable_ms = apbuart_enable_ms, + .break_ctl = apbuart_break_ctl, + .startup = apbuart_startup, + .shutdown = apbuart_shutdown, + .set_termios = apbuart_set_termios, + .type = apbuart_type, + .release_port = apbuart_release_port, + .request_port = apbuart_request_port, + .config_port = apbuart_config_port, + .verify_port = apbuart_verify_port, +}; + +static struct uart_port grlib_apbuart_ports[UART_NR]; +static struct device_node *grlib_apbuart_nodes[UART_NR]; + +static int apbuart_scan_fifo_size(struct uart_port *port, int portnumber) +{ + int ctrl, loop = 0; + int status; + int fifosize; + unsigned long flags; + + ctrl = UART_GET_CTRL(port); + + /* + * Enable the transceiver and wait for it to be ready to send data. + * Clear interrupts so that this process will not be externally + * interrupted in the middle (which can cause the transceiver to + * drain prematurely). + */ + + local_irq_save(flags); + + UART_PUT_CTRL(port, ctrl | UART_CTRL_TE); + + while (!UART_TX_READY(UART_GET_STATUS(port))) + loop++; + + /* + * Disable the transceiver so data isn't actually sent during the + * actual test. + */ + + UART_PUT_CTRL(port, ctrl & ~(UART_CTRL_TE)); + + fifosize = 1; + UART_PUT_CHAR(port, 0); + + /* + * So long as transmitting a character increments the tranceivier FIFO + * length the FIFO must be at least that big. These bytes will + * automatically drain off of the FIFO. + */ + + status = UART_GET_STATUS(port); + while (((status >> 20) & 0x3F) == fifosize) { + fifosize++; + UART_PUT_CHAR(port, 0); + status = UART_GET_STATUS(port); + } + + fifosize--; + + UART_PUT_CTRL(port, ctrl); + local_irq_restore(flags); + + if (fifosize == 0) + fifosize = 1; + + return fifosize; +} + +static void apbuart_flush_fifo(struct uart_port *port) +{ + int i; + + for (i = 0; i < port->fifosize; i++) + UART_GET_CHAR(port); +} + + +/* ======================================================================== */ +/* Console driver, if enabled */ +/* ======================================================================== */ + +#ifdef CONFIG_SERIAL_GRLIB_GAISLER_APBUART_CONSOLE + +static void apbuart_console_putchar(struct uart_port *port, int ch) +{ + unsigned int status; + do { + status = UART_GET_STATUS(port); + } while (!UART_TX_READY(status)); + UART_PUT_CHAR(port, ch); +} + +static void +apbuart_console_write(struct console *co, const char *s, unsigned int count) +{ + struct uart_port *port = &grlib_apbuart_ports[co->index]; + unsigned int status, old_cr, new_cr; + + /* First save the CR then disable the interrupts */ + old_cr = UART_GET_CTRL(port); + new_cr = old_cr & ~(UART_CTRL_RI | UART_CTRL_TI); + UART_PUT_CTRL(port, new_cr); + + uart_console_write(port, s, count, apbuart_console_putchar); + + /* + * Finally, wait for transmitter to become empty + * and restore the TCR + */ + do { + status = UART_GET_STATUS(port); + } while (!UART_TX_READY(status)); + UART_PUT_CTRL(port, old_cr); +} + +static void __init +apbuart_console_get_options(struct uart_port *port, int *baud, + int *parity, int *bits) +{ + if (UART_GET_CTRL(port) & (UART_CTRL_RE | UART_CTRL_TE)) { + + unsigned int quot, status; + status = UART_GET_STATUS(port); + + *parity = 'n'; + if (status & UART_CTRL_PE) { + if ((status & UART_CTRL_PS) == 0) + *parity = 'e'; + else + *parity = 'o'; + } + + *bits = 8; + quot = UART_GET_SCAL(port) / 8; + *baud = port->uartclk / (16 * (quot + 1)); + } +} + +static int __init apbuart_console_setup(struct console *co, char *options) +{ + struct uart_port *port; + int baud = 38400; + int bits = 8; + int parity = 'n'; + int flow = 'n'; + + pr_debug("apbuart_console_setup co=%p, co->index=%i, options=%s\n", + co, co->index, options); + + /* + * Check whether an invalid uart number has been specified, and + * if so, search for the first available port that does have + * console support. + */ + if (co->index >= grlib_apbuart_port_nr) + co->index = 0; + + port = &grlib_apbuart_ports[co->index]; + + spin_lock_init(&port->lock); + + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + else + apbuart_console_get_options(port, &baud, &parity, &bits); + + return uart_set_options(port, co, baud, parity, bits, flow); +} + +static struct uart_driver grlib_apbuart_driver; + +static struct console grlib_apbuart_console = { + .name = "ttyS", + .write = apbuart_console_write, + .device = uart_console_device, + .setup = apbuart_console_setup, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &grlib_apbuart_driver, +}; + + +static void grlib_apbuart_configure(void); + +static int __init apbuart_console_init(void) +{ + grlib_apbuart_configure(); + register_console(&grlib_apbuart_console); + return 0; +} + +console_initcall(apbuart_console_init); + +#define APBUART_CONSOLE (&grlib_apbuart_console) +#else +#define APBUART_CONSOLE NULL +#endif + +static struct uart_driver grlib_apbuart_driver = { + .owner = THIS_MODULE, + .driver_name = "serial", + .dev_name = "ttyS", + .major = SERIAL_APBUART_MAJOR, + .minor = SERIAL_APBUART_MINOR, + .nr = UART_NR, + .cons = APBUART_CONSOLE, +}; + + +/* ======================================================================== */ +/* OF Platform Driver */ +/* ======================================================================== */ + +static int __devinit apbuart_probe(struct of_device *op, + const struct of_device_id *match) +{ + int i = -1; + struct uart_port *port = NULL; + + i = 0; + for (i = 0; i < grlib_apbuart_port_nr; i++) { + if (op->node == grlib_apbuart_nodes[i]) + break; + } + + port = &grlib_apbuart_ports[i]; + port->dev = &op->dev; + + uart_add_one_port(&grlib_apbuart_driver, (struct uart_port *) port); + + apbuart_flush_fifo((struct uart_port *) port); + + printk(KERN_INFO "grlib-apbuart at 0x%llx, irq %d\n", + (unsigned long long) port->mapbase, port->irq); + return 0; + +} + +static struct of_device_id __initdata apbuart_match[] = { + { + .name = "GAISLER_APBUART", + }, + {}, +}; + +static struct of_platform_driver grlib_apbuart_of_driver = { + .match_table = apbuart_match, + .probe = apbuart_probe, + .driver = { + .owner = THIS_MODULE, + .name = "grlib-apbuart", + }, +}; + + +static void grlib_apbuart_configure(void) +{ + static int enum_done; + struct device_node *np, *rp; + struct uart_port *port = NULL; + const u32 *prop; + int freq_khz; + int v = 0, d = 0; + unsigned int addr; + int irq, line; + struct amba_prom_registers *regs; + + if (enum_done) + return; + + /* Get bus frequency */ + rp = of_find_node_by_path("/"); + rp = of_get_next_child(rp, NULL); + prop = of_get_property(rp, "clock-frequency", NULL); + freq_khz = *prop; + + line = 0; + for_each_matching_node(np, apbuart_match) { + + int *vendor = (int *) of_get_property(np, "vendor", NULL); + int *device = (int *) of_get_property(np, "device", NULL); + int *irqs = (int *) of_get_property(np, "interrupts", NULL); + regs = (struct amba_prom_registers *) + of_get_property(np, "reg", NULL); + + if (vendor) + v = *vendor; + if (device) + d = *device; + + if (!irqs || !regs) + return; + + grlib_apbuart_nodes[line] = np; + + addr = regs->phys_addr; + irq = *irqs; + + port = &grlib_apbuart_ports[line]; + + port->mapbase = addr; + port->membase = ioremap(addr, sizeof(struct grlib_apbuart_regs_map)); + port->irq = irq; + port->iotype = UPIO_MEM; + port->ops = &grlib_apbuart_ops; + port->flags = UPF_BOOT_AUTOCONF; + port->line = line; + port->uartclk = freq_khz * 1000; + port->fifosize = apbuart_scan_fifo_size((struct uart_port *) port, line); + line++; + + /* We support maximum UART_NR uarts ... */ + if (line == UART_NR) + break; + + } + + enum_done = 1; + + grlib_apbuart_driver.nr = grlib_apbuart_port_nr = line; +} + +static int __init grlib_apbuart_init(void) +{ + int ret; + + /* Find all APBUARTS in device the tree and initialize their ports */ + grlib_apbuart_configure(); + + printk(KERN_INFO "Serial: GRLIB APBUART driver\n"); + + ret = uart_register_driver(&grlib_apbuart_driver); + + if (ret) { + printk(KERN_ERR "%s: uart_register_driver failed (%i)\n", + __FILE__, ret); + return ret; + } + + ret = of_register_platform_driver(&grlib_apbuart_of_driver); + if (ret) { + printk(KERN_ERR + "%s: of_register_platform_driver failed (%i)\n", + __FILE__, ret); + uart_unregister_driver(&grlib_apbuart_driver); + return ret; + } + + return ret; +} + +static void __exit grlib_apbuart_exit(void) +{ + int i; + + for (i = 0; i < grlib_apbuart_port_nr; i++) + uart_remove_one_port(&grlib_apbuart_driver, + &grlib_apbuart_ports[i]); + + uart_unregister_driver(&grlib_apbuart_driver); + of_unregister_platform_driver(&grlib_apbuart_of_driver); +} + +module_init(grlib_apbuart_init); +module_exit(grlib_apbuart_exit); + +MODULE_AUTHOR("Aeroflex Gaisler AB"); +MODULE_DESCRIPTION("GRLIB APBUART serial driver"); +MODULE_VERSION("2.1"); +MODULE_LICENSE("GPL"); diff --git a/drivers/serial/apbuart.h b/drivers/serial/apbuart.h new file mode 100644 index 0000000..5faf87c --- /dev/null +++ b/drivers/serial/apbuart.h @@ -0,0 +1,64 @@ +#ifndef __GRLIB_APBUART_H__ +#define __GRLIB_APBUART_H__ + +#include <asm/io.h> + +#define UART_NR 8 +static int grlib_apbuart_port_nr; + +struct grlib_apbuart_regs_map { + u32 data; + u32 status; + u32 ctrl; + u32 scaler; +}; + +struct amba_prom_registers { + unsigned int phys_addr; + unsigned int reg_size; +}; + +/* + * The following defines the bits in the APBUART Status Registers. + */ +#define UART_STATUS_DR 0x00000001 /* Data Ready */ +#define UART_STATUS_TSE 0x00000002 /* TX Send Register Empty */ +#define UART_STATUS_THE 0x00000004 /* TX Hold Register Empty */ +#define UART_STATUS_BR 0x00000008 /* Break Error */ +#define UART_STATUS_OE 0x00000010 /* RX Overrun Error */ +#define UART_STATUS_PE 0x00000020 /* RX Parity Error */ +#define UART_STATUS_FE 0x00000040 /* RX Framing Error */ +#define UART_STATUS_ERR 0x00000078 /* Error Mask */ + +/* + * The following defines the bits in the APBUART Ctrl Registers. + */ +#define UART_CTRL_RE 0x00000001 /* Receiver enable */ +#define UART_CTRL_TE 0x00000002 /* Transmitter enable */ +#define UART_CTRL_RI 0x00000004 /* Receiver interrupt enable */ +#define UART_CTRL_TI 0x00000008 /* Transmitter irq */ +#define UART_CTRL_PS 0x00000010 /* Parity select */ +#define UART_CTRL_PE 0x00000020 /* Parity enable */ +#define UART_CTRL_FL 0x00000040 /* Flow control enable */ +#define UART_CTRL_LB 0x00000080 /* Loopback enable */ + +#define APBBASE(port) ((struct grlib_apbuart_regs_map *)((port)->membase)) + +#define APBBASE_DATA_P(port) (&(APBBASE(port)->data)) +#define APBBASE_STATUS_P(port) (&(APBBASE(port)->status)) +#define APBBASE_CTRL_P(port) (&(APBBASE(port)->ctrl)) +#define APBBASE_SCALAR_P(port) (&(APBBASE(port)->scaler)) + +#define UART_GET_CHAR(port) (__raw_readl(APBBASE_DATA_P(port))) +#define UART_PUT_CHAR(port, v) (__raw_writel(v, APBBASE_DATA_P(port))) +#define UART_GET_STATUS(port) (__raw_readl(APBBASE_STATUS_P(port))) +#define UART_PUT_STATUS(port, v)(__raw_writel(v, APBBASE_STATUS_P(port))) +#define UART_GET_CTRL(port) (__raw_readl(APBBASE_CTRL_P(port))) +#define UART_PUT_CTRL(port, v) (__raw_writel(v, APBBASE_CTRL_P(port))) +#define UART_GET_SCAL(port) (__raw_readl(APBBASE_SCALAR_P(port))) +#define UART_PUT_SCAL(port, v) (__raw_writel(v, APBBASE_SCALAR_P(port))) + +#define UART_RX_DATA(s) (((s) & UART_STATUS_DR) != 0) +#define UART_TX_READY(s) (((s) & UART_STATUS_THE) != 0) + +#endif /* __GRLIB_APBUART_H__ */ diff --git a/drivers/serial/s3c2410.c b/drivers/serial/s3c2410.c index c99f082..73f089d 100644 --- a/drivers/serial/s3c2410.c +++ b/drivers/serial/s3c2410.c @@ -2,7 +2,7 @@ * * Driver for Samsung S3C2410 SoC onboard UARTs. * - * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics + * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics * http://armlinux.simtec.co.uk/ * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/serial/s3c2412.c b/drivers/serial/s3c2412.c index 6e057d8..ce75e28 100644 --- a/drivers/serial/s3c2412.c +++ b/drivers/serial/s3c2412.c @@ -2,7 +2,7 @@ * * Driver for Samsung S3C2412 and S3C2413 SoC onboard UARTs. * - * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics + * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics * http://armlinux.simtec.co.uk/ * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/serial/s3c2440.c b/drivers/serial/s3c2440.c index 69ff5d3..094cc39 100644 --- a/drivers/serial/s3c2440.c +++ b/drivers/serial/s3c2440.c @@ -2,7 +2,7 @@ * * Driver for Samsung S3C2440 and S3C2442 SoC onboard UARTs. * - * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics + * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics * http://armlinux.simtec.co.uk/ * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/serial/s3c24a0.c b/drivers/serial/s3c24a0.c index 26c49e1..fad6083 100644 --- a/drivers/serial/s3c24a0.c +++ b/drivers/serial/s3c24a0.c @@ -6,7 +6,7 @@ * * Author: Sandeep Patil <sandeep.patil@azingo.com> * - * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics + * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics * http://armlinux.simtec.co.uk/ * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/serial/samsung.c b/drivers/serial/samsung.c index 1523e8d..52e3df1 100644 --- a/drivers/serial/samsung.c +++ b/drivers/serial/samsung.c @@ -2,7 +2,7 @@ * * Driver core for Samsung SoC onboard UARTs. * - * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics + * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics * http://armlinux.simtec.co.uk/ * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/serial/samsung.h b/drivers/serial/samsung.h index d3fe315..1fb2234 100644 --- a/drivers/serial/samsung.h +++ b/drivers/serial/samsung.h @@ -2,7 +2,7 @@ * * Driver for Samsung SoC onboard UARTs. * - * Ben Dooks, Copyright (c) 2003-2005,2008 Simtec Electronics + * Ben Dooks, Copyright (c) 2003-2008 Simtec Electronics * http://armlinux.simtec.co.uk/ * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/spi/omap2_mcspi.c b/drivers/spi/omap2_mcspi.c index ba1a872..bf5f95a 100644 --- a/drivers/spi/omap2_mcspi.c +++ b/drivers/spi/omap2_mcspi.c @@ -35,8 +35,8 @@ #include <linux/spi/spi.h> -#include <mach/dma.h> -#include <mach/clock.h> +#include <plat/dma.h> +#include <plat/clock.h> #define OMAP2_MCSPI_MAX_FREQ 48000000 diff --git a/drivers/spi/omap_uwire.c b/drivers/spi/omap_uwire.c index e75ba9b..6c3a855 100644 --- a/drivers/spi/omap_uwire.c +++ b/drivers/spi/omap_uwire.c @@ -51,8 +51,8 @@ #include <asm/io.h> #include <asm/mach-types.h> -#include <mach/mux.h> -#include <mach/omap730.h> /* OMAP730_IO_CONF registers */ +#include <plat/mux.h> +#include <plat/omap7xx.h> /* OMAP7XX_IO_CONF registers */ /* FIXME address is now a platform device resource, @@ -504,7 +504,7 @@ static int __init uwire_probe(struct platform_device *pdev) } clk_enable(uwire->ck); - if (cpu_is_omap730()) + if (cpu_is_omap7xx()) uwire_idx_shift = 1; else uwire_idx_shift = 2; @@ -573,8 +573,8 @@ static int __init omap_uwire_init(void) } if (machine_is_omap_perseus2()) { /* configure pins: MPU_UW_nSCS1, MPU_UW_SDO, MPU_UW_SCLK */ - int val = omap_readl(OMAP730_IO_CONF_9) & ~0x00EEE000; - omap_writel(val | 0x00AAA000, OMAP730_IO_CONF_9); + int val = omap_readl(OMAP7XX_IO_CONF_9) & ~0x00EEE000; + omap_writel(val | 0x00AAA000, OMAP7XX_IO_CONF_9); } return platform_driver_probe(&uwire_driver, uwire_probe); diff --git a/drivers/staging/arlan/arlan-proc.c b/drivers/staging/arlan/arlan-proc.c index a8b6896..b22983e 100644 --- a/drivers/staging/arlan/arlan-proc.c +++ b/drivers/staging/arlan/arlan-proc.c @@ -816,84 +816,83 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write, /* Place files in /proc/sys/dev/arlan */ -#define CTBLN(num,card,nam) \ - { .ctl_name = num,\ - .procname = #nam,\ +#define CTBLN(card,nam) \ + { .procname = #nam,\ .data = &(arlan_conf[card].nam),\ - .maxlen = sizeof(int), .mode = 0600, .proc_handler = &proc_dointvec} + .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec} #ifdef ARLAN_DEBUGGING #define ARLAN_PROC_DEBUG_ENTRIES \ - { .ctl_name = 48, .procname = "entry_exit_debug",\ + { .procname = "entry_exit_debug",\ .data = &arlan_entry_and_exit_debug,\ - .maxlen = sizeof(int), .mode = 0600, .proc_handler = &proc_dointvec},\ - { .ctl_name = 49, .procname = "debug", .data = &arlan_debug,\ - .maxlen = sizeof(int), .mode = 0600, .proc_handler = &proc_dointvec}, + .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec},\ + { .procname = "debug", .data = &arlan_debug,\ + .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec}, #else #define ARLAN_PROC_DEBUG_ENTRIES #endif #define ARLAN_SYSCTL_TABLE_TOTAL(cardNo)\ - CTBLN(1,cardNo,spreadingCode),\ - CTBLN(2,cardNo, channelNumber),\ - CTBLN(3,cardNo, scramblingDisable),\ - CTBLN(4,cardNo, txAttenuation),\ - CTBLN(5,cardNo, systemId), \ - CTBLN(6,cardNo, maxDatagramSize),\ - CTBLN(7,cardNo, maxFrameSize),\ - CTBLN(8,cardNo, maxRetries),\ - CTBLN(9,cardNo, receiveMode),\ - CTBLN(10,cardNo, priority),\ - CTBLN(11,cardNo, rootOrRepeater),\ - CTBLN(12,cardNo, SID),\ - CTBLN(13,cardNo, registrationMode),\ - CTBLN(14,cardNo, registrationFill),\ - CTBLN(15,cardNo, localTalkAddress),\ - CTBLN(16,cardNo, codeFormat),\ - CTBLN(17,cardNo, numChannels),\ - CTBLN(18,cardNo, channel1),\ - CTBLN(19,cardNo, channel2),\ - CTBLN(20,cardNo, channel3),\ - CTBLN(21,cardNo, channel4),\ - CTBLN(22,cardNo, txClear),\ - CTBLN(23,cardNo, txRetries),\ - CTBLN(24,cardNo, txRouting),\ - CTBLN(25,cardNo, txScrambled),\ - CTBLN(26,cardNo, rxParameter),\ - CTBLN(27,cardNo, txTimeoutMs),\ - CTBLN(28,cardNo, waitCardTimeout),\ - CTBLN(29,cardNo, channelSet), \ - {.ctl_name = 30, .procname = "name",\ + CTBLN(cardNo,spreadingCode),\ + CTBLN(cardNo, channelNumber),\ + CTBLN(cardNo, scramblingDisable),\ + CTBLN(cardNo, txAttenuation),\ + CTBLN(cardNo, systemId), \ + CTBLN(cardNo, maxDatagramSize),\ + CTBLN(cardNo, maxFrameSize),\ + CTBLN(cardNo, maxRetries),\ + CTBLN(cardNo, receiveMode),\ + CTBLN(cardNo, priority),\ + CTBLN(cardNo, rootOrRepeater),\ + CTBLN(cardNo, SID),\ + CTBLN(cardNo, registrationMode),\ + CTBLN(cardNo, registrationFill),\ + CTBLN(cardNo, localTalkAddress),\ + CTBLN(cardNo, codeFormat),\ + CTBLN(cardNo, numChannels),\ + CTBLN(cardNo, channel1),\ + CTBLN(cardNo, channel2),\ + CTBLN(cardNo, channel3),\ + CTBLN(cardNo, channel4),\ + CTBLN(cardNo, txClear),\ + CTBLN(cardNo, txRetries),\ + CTBLN(cardNo, txRouting),\ + CTBLN(cardNo, txScrambled),\ + CTBLN(cardNo, rxParameter),\ + CTBLN(cardNo, txTimeoutMs),\ + CTBLN(cardNo, waitCardTimeout),\ + CTBLN(cardNo, channelSet), \ + { .procname = "name",\ .data = arlan_conf[cardNo].siteName,\ - .maxlen = 16, .mode = 0600, .proc_handler = &proc_dostring},\ - CTBLN(31,cardNo,waitTime),\ - CTBLN(32,cardNo,lParameter),\ - CTBLN(33,cardNo,_15),\ - CTBLN(34,cardNo,headerSize),\ - CTBLN(36,cardNo,tx_delay_ms),\ - CTBLN(37,cardNo,retries),\ - CTBLN(38,cardNo,ReTransmitPacketMaxSize),\ - CTBLN(39,cardNo,waitReTransmitPacketMaxSize),\ - CTBLN(40,cardNo,fastReTransCount),\ - CTBLN(41,cardNo,driverRetransmissions),\ - CTBLN(42,cardNo,txAckTimeoutMs),\ - CTBLN(43,cardNo,registrationInterrupts),\ - CTBLN(44,cardNo,hardwareType),\ - CTBLN(45,cardNo,radioType),\ - CTBLN(46,cardNo,writeEEPROM),\ - CTBLN(47,cardNo,writeRadioType),\ + .maxlen = 16, .mode = 0600, .proc_handler = proc_dostring},\ + CTBLN(cardNo,waitTime),\ + CTBLN(cardNo,lParameter),\ + CTBLN(cardNo,_15),\ + CTBLN(cardNo,headerSize),\ + CTBLN(cardNo,tx_delay_ms),\ + CTBLN(cardNo,retries),\ + CTBLN(cardNo,ReTransmitPacketMaxSize),\ + CTBLN(cardNo,waitReTransmitPacketMaxSize),\ + CTBLN(cardNo,fastReTransCount),\ + CTBLN(cardNo,driverRetransmissions),\ + CTBLN(cardNo,txAckTimeoutMs),\ + CTBLN(cardNo,registrationInterrupts),\ + CTBLN(cardNo,hardwareType),\ + CTBLN(cardNo,radioType),\ + CTBLN(cardNo,writeEEPROM),\ + CTBLN(cardNo,writeRadioType),\ ARLAN_PROC_DEBUG_ENTRIES\ - CTBLN(50,cardNo,in_speed),\ - CTBLN(51,cardNo,out_speed),\ - CTBLN(52,cardNo,in_speed10),\ - CTBLN(53,cardNo,out_speed10),\ - CTBLN(54,cardNo,in_speed_max),\ - CTBLN(55,cardNo,out_speed_max),\ - CTBLN(56,cardNo,measure_rate),\ - CTBLN(57,cardNo,pre_Command_Wait),\ - CTBLN(58,cardNo,rx_tweak1),\ - CTBLN(59,cardNo,rx_tweak2),\ - CTBLN(60,cardNo,tx_queue_len),\ + CTBLN(cardNo,in_speed),\ + CTBLN(cardNo,out_speed),\ + CTBLN(cardNo,in_speed10),\ + CTBLN(cardNo,out_speed10),\ + CTBLN(cardNo,in_speed_max),\ + CTBLN(cardNo,out_speed_max),\ + CTBLN(cardNo,measure_rate),\ + CTBLN(cardNo,pre_Command_Wait),\ + CTBLN(cardNo,rx_tweak1),\ + CTBLN(cardNo,rx_tweak2),\ + CTBLN(cardNo,tx_queue_len),\ @@ -903,63 +902,56 @@ static ctl_table arlan_conf_table0[] = #ifdef ARLAN_PROC_SHM_DUMP { - .ctl_name = 150, .procname = "arlan0-txRing", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_infotxRing, + .proc_handler = arlan_sysctl_infotxRing, }, { - .ctl_name = 151, .procname = "arlan0-rxRing", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_inforxRing, + .proc_handler = arlan_sysctl_inforxRing, }, { - .ctl_name = 152, .procname = "arlan0-18", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info18, + .proc_handler = arlan_sysctl_info18, }, { - .ctl_name = 153, .procname = "arlan0-ring", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info161719, + .proc_handler = arlan_sysctl_info161719, }, { - .ctl_name = 154, .procname = "arlan0-shm-cpy", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info, + .proc_handler = arlan_sysctl_info, }, #endif { - .ctl_name = 155, .procname = "config0", .data = &conf_reset_result, .maxlen = 100, .mode = 0400, - .proc_handler = &arlan_configure + .proc_handler = arlan_configure }, { - .ctl_name = 156, .procname = "reset0", .data = &conf_reset_result, .maxlen = 100, .mode = 0400, - .proc_handler = &arlan_sysctl_reset, + .proc_handler = arlan_sysctl_reset, }, - { .ctl_name = 0 } + { } }; static ctl_table arlan_conf_table1[] = @@ -969,63 +961,56 @@ static ctl_table arlan_conf_table1[] = #ifdef ARLAN_PROC_SHM_DUMP { - .ctl_name = 150, .procname = "arlan1-txRing", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_infotxRing, + .proc_handler = arlan_sysctl_infotxRing, }, { - .ctl_name = 151, .procname = "arlan1-rxRing", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_inforxRing, + .proc_handler = arlan_sysctl_inforxRing, }, { - .ctl_name = 152, .procname = "arlan1-18", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info18, + .proc_handler = arlan_sysctl_info18, }, { - .ctl_name = 153, .procname = "arlan1-ring", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info161719, + .proc_handler = arlan_sysctl_info161719, }, { - .ctl_name = 154, .procname = "arlan1-shm-cpy", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info, + .proc_handler = arlan_sysctl_info, }, #endif { - .ctl_name = 155, .procname = "config1", .data = &conf_reset_result, .maxlen = 100, .mode = 0400, - .proc_handler = &arlan_configure, + .proc_handler = arlan_configure, }, { - .ctl_name = 156, .procname = "reset1", .data = &conf_reset_result, .maxlen = 100, .mode = 0400, - .proc_handler = &arlan_sysctl_reset, + .proc_handler = arlan_sysctl_reset, }, - { .ctl_name = 0 } + { } }; static ctl_table arlan_conf_table2[] = @@ -1035,63 +1020,56 @@ static ctl_table arlan_conf_table2[] = #ifdef ARLAN_PROC_SHM_DUMP { - .ctl_name = 150, .procname = "arlan2-txRing", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_infotxRing, + .proc_handler = arlan_sysctl_infotxRing, }, { - .ctl_name = 151, .procname = "arlan2-rxRing", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_inforxRing, + .proc_handler = arlan_sysctl_inforxRing, }, { - .ctl_name = 152, .procname = "arlan2-18", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info18, + .proc_handler = arlan_sysctl_info18, }, { - .ctl_name = 153, .procname = "arlan2-ring", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info161719, + .proc_handler = arlan_sysctl_info161719, }, { - .ctl_name = 154, .procname = "arlan2-shm-cpy", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info, + .proc_handler = arlan_sysctl_info, }, #endif { - .ctl_name = 155, .procname = "config2", .data = &conf_reset_result, .maxlen = 100, .mode = 0400, - .proc_handler = &arlan_configure, + .proc_handler = arlan_configure, }, { - .ctl_name = 156, .procname = "reset2", .data = &conf_reset_result, .maxlen = 100, .mode = 0400, - .proc_handler = &arlan_sysctl_reset, + .proc_handler = arlan_sysctl_reset, }, - { .ctl_name = 0 } + { } }; static ctl_table arlan_conf_table3[] = @@ -1101,63 +1079,56 @@ static ctl_table arlan_conf_table3[] = #ifdef ARLAN_PROC_SHM_DUMP { - .ctl_name = 150, .procname = "arlan3-txRing", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_infotxRing, + .proc_handler = arlan_sysctl_infotxRing, }, { - .ctl_name = 151, .procname = "arlan3-rxRing", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_inforxRing, + .proc_handler = arlan_sysctl_inforxRing, }, { - .ctl_name = 152, .procname = "arlan3-18", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info18, + .proc_handler = arlan_sysctl_info18, }, { - .ctl_name = 153, .procname = "arlan3-ring", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info161719, + .proc_handler = arlan_sysctl_info161719, }, { - .ctl_name = 154, .procname = "arlan3-shm-cpy", .data = &arlan_drive_info, .maxlen = ARLAN_STR_SIZE, .mode = 0400, - .proc_handler = &arlan_sysctl_info, + .proc_handler = arlan_sysctl_info, }, #endif { - .ctl_name = 155, .procname = "config3", .data = &conf_reset_result, .maxlen = 100, .mode = 0400, - .proc_handler = &arlan_configure, + .proc_handler = arlan_configure, }, { - .ctl_name = 156, .procname = "reset3", .data = &conf_reset_result, .maxlen = 100, .mode = 0400, - .proc_handler = &arlan_sysctl_reset, + .proc_handler = arlan_sysctl_reset, }, - { .ctl_name = 0 } + { } }; @@ -1165,41 +1136,37 @@ static ctl_table arlan_conf_table3[] = static ctl_table arlan_table[] = { { - .ctl_name = 0, .procname = "arlan0", .maxlen = 0, .mode = 0600, .child = arlan_conf_table0, }, { - .ctl_name = 0, .procname = "arlan1", .maxlen = 0, .mode = 0600, .child = arlan_conf_table1, }, { - .ctl_name = 0, .procname = "arlan2", .maxlen = 0, .mode = 0600, .child = arlan_conf_table2, }, { - .ctl_name = 0, .procname = "arlan3", .maxlen = 0, .mode = 0600, .child = arlan_conf_table3, }, - { .ctl_name = 0 } + { } }; #else -static ctl_table arlan_table[MAX_ARLANS + 1] = +static ctl_table arlan_table[] = { - { .ctl_name = 0 } + { } }; #endif @@ -1209,22 +1176,14 @@ static ctl_table arlan_table[MAX_ARLANS + 1] = static ctl_table arlan_root_table[] = { { - .ctl_name = CTL_ARLAN, .procname = "arlan", .maxlen = 0, .mode = 0555, .child = arlan_table, }, - { .ctl_name = 0 } + { } }; -/* Make sure that /proc/sys/dev is there */ -//static ctl_table arlan_device_root_table[] = -//{ -// {CTL_DEV, "dev", NULL, 0, 0555, arlan_root_table}, -// {0} -//}; - static struct ctl_table_header *arlan_device_sysctl_header; @@ -1234,8 +1193,6 @@ int __init init_arlan_proc(void) int i = 0; if (arlan_device_sysctl_header) return 0; - for (i = 0; i < MAX_ARLANS && arlan_device[i]; i++) - arlan_table[i].ctl_name = i + 1; arlan_device_sysctl_header = register_sysctl_table(arlan_root_table); if (!arlan_device_sysctl_header) return -1; diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index c94de31..f69b778 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -143,7 +143,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c struct inode *inode = mapping->host; struct pohmelfs_inode *pi = POHMELFS_I(inode); struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb); - struct backing_dev_info *bdi = mapping->backing_dev_info; int err = 0; int done = 0; int nr_pages; @@ -152,11 +151,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c int scanned = 0; int range_whole = 0; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -248,10 +242,6 @@ retry: if (wbc->nr_to_write <= 0) done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - } continue; out_continue: diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c index a2db0e1..f81e4f0 100644 --- a/drivers/usb/gadget/omap_udc.c +++ b/drivers/usb/gadget/omap_udc.c @@ -52,9 +52,9 @@ #include <asm/unaligned.h> #include <asm/mach-types.h> -#include <mach/dma.h> -#include <mach/usb.h> -#include <mach/control.h> +#include <plat/dma.h> +#include <plat/usb.h> +#include <plat/control.h> #include "omap_udc.h" @@ -2098,6 +2098,7 @@ static inline int machine_without_vbus_sense(void) || machine_is_omap_h4() #endif || machine_is_sx1() + || cpu_is_omap7xx() /* No known omap7xx boards with vbus sense */ ); } @@ -2838,6 +2839,16 @@ static int __init omap_udc_probe(struct platform_device *pdev) udelay(100); } + if (cpu_is_omap7xx()) { + dc_clk = clk_get(&pdev->dev, "usb_dc_ck"); + hhc_clk = clk_get(&pdev->dev, "l3_ocpi_ck"); + BUG_ON(IS_ERR(dc_clk) || IS_ERR(hhc_clk)); + /* can't use omap_udc_enable_clock yet */ + clk_enable(dc_clk); + clk_enable(hhc_clk); + udelay(100); + } + INFO("OMAP UDC rev %d.%d%s\n", omap_readw(UDC_REV) >> 4, omap_readw(UDC_REV) & 0xf, config->otg ? ", Mini-AB" : ""); @@ -2970,7 +2981,7 @@ known: goto cleanup3; } #endif - if (cpu_is_omap16xx()) { + if (cpu_is_omap16xx() || cpu_is_omap7xx()) { udc->dc_clk = dc_clk; udc->hhc_clk = hhc_clk; clk_disable(hhc_clk); @@ -3008,7 +3019,7 @@ cleanup0: if (xceiv) otg_put_transceiver(xceiv); - if (cpu_is_omap16xx() || cpu_is_omap24xx()) { + if (cpu_is_omap16xx() || cpu_is_omap24xx() || cpu_is_omap7xx()) { clk_disable(hhc_clk); clk_disable(dc_clk); clk_put(hhc_clk); @@ -3115,6 +3126,10 @@ static struct platform_driver udc_driver = { static int __init udc_init(void) { + /* Disable DMA for omap7xx -- it doesn't work right. */ + if (cpu_is_omap7xx()) + use_dma = 0; + INFO("%s, version: " DRIVER_VERSION #ifdef USE_ISO " (iso)" diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c index 83cbecd..5645f70 100644 --- a/drivers/usb/host/ohci-omap.c +++ b/drivers/usb/host/ohci-omap.c @@ -24,10 +24,10 @@ #include <asm/io.h> #include <asm/mach-types.h> -#include <mach/mux.h> +#include <plat/mux.h> #include <mach/irqs.h> -#include <mach/fpga.h> -#include <mach/usb.h> +#include <plat/fpga.h> +#include <plat/usb.h> /* OMAP-1510 OHCI has its own MMU for DMA */ diff --git a/drivers/usb/musb/omap2430.c b/drivers/usb/musb/omap2430.c index 3487520..6761d20 100644 --- a/drivers/usb/musb/omap2430.c +++ b/drivers/usb/musb/omap2430.c @@ -35,7 +35,7 @@ #include <asm/mach-types.h> #include <mach/hardware.h> -#include <mach/mux.h> +#include <plat/mux.h> #include "musb_core.h" #include "omap2430.h" diff --git a/drivers/usb/musb/omap2430.h b/drivers/usb/musb/omap2430.h index dc76707..fbede77 100644 --- a/drivers/usb/musb/omap2430.h +++ b/drivers/usb/musb/omap2430.h @@ -12,7 +12,7 @@ #if defined(CONFIG_ARCH_OMAP2430) || defined(CONFIG_ARCH_OMAP3430) #include <mach/hardware.h> -#include <mach/usb.h> +#include <plat/usb.h> /* * OMAP2430-specific definitions diff --git a/drivers/usb/musb/tusb6010_omap.c b/drivers/usb/musb/tusb6010_omap.c index 7e073a0..e13c770 100644 --- a/drivers/usb/musb/tusb6010_omap.c +++ b/drivers/usb/musb/tusb6010_omap.c @@ -15,8 +15,8 @@ #include <linux/usb.h> #include <linux/platform_device.h> #include <linux/dma-mapping.h> -#include <mach/dma.h> -#include <mach/mux.h> +#include <plat/dma.h> +#include <plat/mux.h> #include "musb_core.h" diff --git a/drivers/usb/otg/isp1301_omap.c b/drivers/usb/otg/isp1301_omap.c index 77a5f41..d54460a 100644 --- a/drivers/usb/otg/isp1301_omap.c +++ b/drivers/usb/otg/isp1301_omap.c @@ -36,8 +36,8 @@ #include <asm/irq.h> #include <asm/mach-types.h> -#include <mach/usb.h> -#include <mach/mux.h> +#include <plat/usb.h> +#include <plat/mux.h> #ifndef DEBUG diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index 188e1ba..6b89eb5 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -5,6 +5,9 @@ menu "Graphics support" depends on HAS_IOMEM +config HAVE_FB_ATMEL + bool + source "drivers/char/agp/Kconfig" source "drivers/gpu/vga/Kconfig" @@ -937,7 +940,7 @@ config FB_S1D13XXX config FB_ATMEL tristate "AT91/AT32 LCD Controller support" - depends on FB && (ARCH_AT91SAM9261 || ARCH_AT91SAM9G10 || ARCH_AT91SAM9263 || ARCH_AT91SAM9RL || ARCH_AT91SAM9G45 || ARCH_AT91CAP9 || AVR32) + depends on FB && HAVE_FB_ATMEL select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT diff --git a/drivers/video/atafb.c b/drivers/video/atafb.c index 37624f7..b7687c5 100644 --- a/drivers/video/atafb.c +++ b/drivers/video/atafb.c @@ -2242,6 +2242,9 @@ static int ext_setcolreg(unsigned int regno, unsigned int red, if (!external_vgaiobase) return 1; + if (regno > 255) + return 1; + switch (external_card_type) { case IS_VGA: OUTB(0x3c8, regno); diff --git a/drivers/video/backlight/da903x_bl.c b/drivers/video/backlight/da903x_bl.c index 701a108..7fcb0eb 100644 --- a/drivers/video/backlight/da903x_bl.c +++ b/drivers/video/backlight/da903x_bl.c @@ -25,6 +25,7 @@ #define DA9034_WLED_CONTROL1 0x3C #define DA9034_WLED_CONTROL2 0x3D +#define DA9034_WLED_ISET(x) ((x) & 0x1f) #define DA9034_WLED_BOOST_EN (1 << 5) @@ -101,6 +102,7 @@ static struct backlight_ops da903x_backlight_ops = { static int da903x_backlight_probe(struct platform_device *pdev) { + struct da9034_backlight_pdata *pdata = pdev->dev.platform_data; struct da903x_backlight_data *data; struct backlight_device *bl; int max_brightness; @@ -127,6 +129,11 @@ static int da903x_backlight_probe(struct platform_device *pdev) data->da903x_dev = pdev->dev.parent; data->current_brightness = 0; + /* adjust the WLED output current */ + if (pdata) + da903x_write(data->da903x_dev, DA9034_WLED_CONTROL2, + DA9034_WLED_ISET(pdata->output_current)); + bl = backlight_device_register(pdev->name, data->da903x_dev, data, &da903x_backlight_ops); if (IS_ERR(bl)) { diff --git a/drivers/video/backlight/omap1_bl.c b/drivers/video/backlight/omap1_bl.c index cbad67e..8693e5f 100644 --- a/drivers/video/backlight/omap1_bl.c +++ b/drivers/video/backlight/omap1_bl.c @@ -26,8 +26,8 @@ #include <linux/backlight.h> #include <mach/hardware.h> -#include <mach/board.h> -#include <mach/mux.h> +#include <plat/board.h> +#include <plat/mux.h> #define OMAPBL_MAX_INTENSITY 0xff diff --git a/drivers/video/backlight/tdo24m.c b/drivers/video/backlight/tdo24m.c index bbfb502..4a3d46e 100644 --- a/drivers/video/backlight/tdo24m.c +++ b/drivers/video/backlight/tdo24m.c @@ -367,6 +367,7 @@ static int __devinit tdo24m_probe(struct spi_device *spi) spi_message_init(m); + x->cs_change = 1; x->tx_buf = &lcd->buf[0]; spi_message_add_tail(x, m); diff --git a/drivers/video/omap/Makefile b/drivers/video/omap/Makefile index b63b198..49226a1 100644 --- a/drivers/video/omap/Makefile +++ b/drivers/video/omap/Makefile @@ -35,6 +35,7 @@ objs-y$(CONFIG_MACH_OMAP3EVM) += lcd_omap3evm.o objs-y$(CONFIG_MACH_OMAP3_BEAGLE) += lcd_omap3beagle.o objs-y$(CONFIG_FB_OMAP_LCD_MIPID) += lcd_mipid.o objs-y$(CONFIG_MACH_OVERO) += lcd_overo.o +objs-y$(CONFIG_MACH_HERALD) += lcd_htcherald.o omapfb-objs := $(objs-yy) diff --git a/drivers/video/omap/blizzard.c b/drivers/video/omap/blizzard.c index 70dadf9..f5d75f2 100644 --- a/drivers/video/omap/blizzard.c +++ b/drivers/video/omap/blizzard.c @@ -26,9 +26,9 @@ #include <linux/delay.h> #include <linux/clk.h> -#include <mach/dma.h> -#include <mach/omapfb.h> -#include <mach/blizzard.h> +#include <plat/dma.h> +#include <plat/omapfb.h> +#include <plat/blizzard.h> #include "dispc.h" diff --git a/drivers/video/omap/dispc.c b/drivers/video/omap/dispc.c index f16e421..7c833db 100644 --- a/drivers/video/omap/dispc.c +++ b/drivers/video/omap/dispc.c @@ -25,9 +25,9 @@ #include <linux/clk.h> #include <linux/io.h> -#include <mach/sram.h> -#include <mach/omapfb.h> -#include <mach/board.h> +#include <plat/sram.h> +#include <plat/omapfb.h> +#include <plat/board.h> #include "dispc.h" @@ -204,6 +204,7 @@ static u32 inline dispc_read_reg(int idx) /* Select RFBI or bypass mode */ static void enable_rfbi_mode(int enable) { + void __iomem *rfbi_control; u32 l; l = dispc_read_reg(DISPC_CONTROL); @@ -216,9 +217,15 @@ static void enable_rfbi_mode(int enable) dispc_write_reg(DISPC_CONTROL, l); /* Set bypass mode in RFBI module */ - l = __raw_readl(OMAP2_IO_ADDRESS(RFBI_CONTROL)); + rfbi_control = ioremap(RFBI_CONTROL, SZ_1K); + if (!rfbi_control) { + pr_err("Unable to ioremap rfbi_control\n"); + return; + } + l = __raw_readl(rfbi_control); l |= enable ? 0 : (1 << 1); - __raw_writel(l, OMAP2_IO_ADDRESS(RFBI_CONTROL)); + __raw_writel(l, rfbi_control); + iounmap(rfbi_control); } static void set_lcd_data_lines(int data_lines) @@ -1367,6 +1374,7 @@ static int omap_dispc_init(struct omapfb_device *fbdev, int ext_mode, int r; u32 l; struct lcd_panel *panel = fbdev->panel; + void __iomem *ram_fw_base; int tmo = 10000; int skip_init = 0; int i; @@ -1441,7 +1449,13 @@ static int omap_dispc_init(struct omapfb_device *fbdev, int ext_mode, } /* L3 firewall setting: enable access to OCM RAM */ - __raw_writel(0x402000b0, OMAP2_IO_ADDRESS(0x680050a0)); + ram_fw_base = ioremap(0x68005000, SZ_1K); + if (!ram_fw_base) { + dev_err(dispc.fbdev->dev, "Cannot ioremap to enable OCM RAM\n"); + goto fail1; + } + __raw_writel(0x402000b0, ram_fw_base + 0xa0); + iounmap(ram_fw_base); if ((r = alloc_palette_ram()) < 0) goto fail2; diff --git a/drivers/video/omap/hwa742.c b/drivers/video/omap/hwa742.c index ca51583..17a975e 100644 --- a/drivers/video/omap/hwa742.c +++ b/drivers/video/omap/hwa742.c @@ -26,9 +26,9 @@ #include <linux/delay.h> #include <linux/clk.h> -#include <mach/dma.h> -#include <mach/omapfb.h> -#include <mach/hwa742.h> +#include <plat/dma.h> +#include <plat/omapfb.h> +#include <plat/hwa742.h> #define HWA742_REV_CODE_REG 0x0 #define HWA742_CONFIG_REG 0x2 diff --git a/drivers/video/omap/lcd_2430sdp.c b/drivers/video/omap/lcd_2430sdp.c index 393712b..fea7fee 100644 --- a/drivers/video/omap/lcd_2430sdp.c +++ b/drivers/video/omap/lcd_2430sdp.c @@ -27,8 +27,8 @@ #include <linux/gpio.h> #include <linux/i2c/twl4030.h> -#include <mach/mux.h> -#include <mach/omapfb.h> +#include <plat/mux.h> +#include <plat/omapfb.h> #include <asm/mach-types.h> #define SDP2430_LCD_PANEL_BACKLIGHT_GPIO 91 diff --git a/drivers/video/omap/lcd_ams_delta.c b/drivers/video/omap/lcd_ams_delta.c index 1f74399..3d52772 100644 --- a/drivers/video/omap/lcd_ams_delta.c +++ b/drivers/video/omap/lcd_ams_delta.c @@ -25,9 +25,9 @@ #include <linux/io.h> #include <linux/delay.h> -#include <mach/board-ams-delta.h> +#include <plat/board-ams-delta.h> #include <mach/hardware.h> -#include <mach/omapfb.h> +#include <plat/omapfb.h> #define AMS_DELTA_DEFAULT_CONTRAST 112 diff --git a/drivers/video/omap/lcd_apollon.c b/drivers/video/omap/lcd_apollon.c index 626ae3a..4c5cefc 100644 --- a/drivers/video/omap/lcd_apollon.c +++ b/drivers/video/omap/lcd_apollon.c @@ -25,8 +25,8 @@ #include <linux/platform_device.h> #include <mach/gpio.h> -#include <mach/mux.h> -#include <mach/omapfb.h> +#include <plat/mux.h> +#include <plat/omapfb.h> /* #define USE_35INCH_LCD 1 */ diff --git a/drivers/video/omap/lcd_h3.c b/drivers/video/omap/lcd_h3.c index 417ae5e..240b4fb 100644 --- a/drivers/video/omap/lcd_h3.c +++ b/drivers/video/omap/lcd_h3.c @@ -24,7 +24,7 @@ #include <linux/i2c/tps65010.h> #include <mach/gpio.h> -#include <mach/omapfb.h> +#include <plat/omapfb.h> #define MODULE_NAME "omapfb-lcd_h3" diff --git a/drivers/video/omap/lcd_h4.c b/drivers/video/omap/lcd_h4.c index 0c398bd..720625d 100644 --- a/drivers/video/omap/lcd_h4.c +++ b/drivers/video/omap/lcd_h4.c @@ -22,7 +22,7 @@ #include <linux/module.h> #include <linux/platform_device.h> -#include <mach/omapfb.h> +#include <plat/omapfb.h> static int h4_panel_init(struct lcd_panel *panel, struct omapfb_device *fbdev) { diff --git a/drivers/video/omap/lcd_htcherald.c b/drivers/video/omap/lcd_htcherald.c new file mode 100644 index 0000000..2e0c81e --- /dev/null +++ b/drivers/video/omap/lcd_htcherald.c @@ -0,0 +1,130 @@ +/* + * File: drivers/video/omap/lcd-htcherald.c + * + * LCD panel support for the HTC Herald + * + * Copyright (C) 2009 Cory Maccarrone <darkstar6262@gmail.com> + * Copyright (C) 2009 Wing Linux + * + * Based on the lcd_htcwizard.c file from the linwizard project: + * Copyright (C) linwizard.sourceforge.net + * Author: Angelo Arrifano <miknix@gmail.com> + * Based on lcd_h4 by Imre Deak <imre.deak@nokia.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include <linux/module.h> +#include <linux/platform_device.h> + +#include <plat/omapfb.h> + +static int htcherald_panel_init(struct lcd_panel *panel, + struct omapfb_device *fbdev) +{ + return 0; +} + +static void htcherald_panel_cleanup(struct lcd_panel *panel) +{ +} + +static int htcherald_panel_enable(struct lcd_panel *panel) +{ + return 0; +} + +static void htcherald_panel_disable(struct lcd_panel *panel) +{ +} + +static unsigned long htcherald_panel_get_caps(struct lcd_panel *panel) +{ + return 0; +} + +/* Found on WIZ200 (miknix) and some HERA110 models (darkstar62) */ +struct lcd_panel htcherald_panel_1 = { + .name = "lcd_herald", + .config = OMAP_LCDC_PANEL_TFT | + OMAP_LCDC_INV_HSYNC | + OMAP_LCDC_INV_VSYNC | + OMAP_LCDC_INV_PIX_CLOCK, + .bpp = 16, + .data_lines = 16, + .x_res = 240, + .y_res = 320, + .pixel_clock = 6093, + .pcd = 0, /* 15 */ + .hsw = 10, + .hfp = 10, + .hbp = 20, + .vsw = 3, + .vfp = 2, + .vbp = 2, + + .init = htcherald_panel_init, + .cleanup = htcherald_panel_cleanup, + .enable = htcherald_panel_enable, + .disable = htcherald_panel_disable, + .get_caps = htcherald_panel_get_caps, +}; + +static int htcherald_panel_probe(struct platform_device *pdev) +{ + omapfb_register_panel(&htcherald_panel_1); + return 0; +} + +static int htcherald_panel_remove(struct platform_device *pdev) +{ + return 0; +} + +static int htcherald_panel_suspend(struct platform_device *pdev, + pm_message_t mesg) +{ + return 0; +} + +static int htcherald_panel_resume(struct platform_device *pdev) +{ + return 0; +} + +struct platform_driver htcherald_panel_driver = { + .probe = htcherald_panel_probe, + .remove = htcherald_panel_remove, + .suspend = htcherald_panel_suspend, + .resume = htcherald_panel_resume, + .driver = { + .name = "lcd_htcherald", + .owner = THIS_MODULE, + }, +}; + +static int htcherald_panel_drv_init(void) +{ + return platform_driver_register(&htcherald_panel_driver); +} + +static void htcherald_panel_drv_cleanup(void) +{ + platform_driver_unregister(&htcherald_panel_driver); +} + +module_init(htcherald_panel_drv_init); +module_exit(htcherald_panel_drv_cleanup); + diff --git a/drivers/video/omap/lcd_inn1510.c b/drivers/video/omap/lcd_inn1510.c index cdbd8bb..aafe9b4 100644 --- a/drivers/video/omap/lcd_inn1510.c +++ b/drivers/video/omap/lcd_inn1510.c @@ -23,8 +23,8 @@ #include <linux/platform_device.h> #include <linux/io.h> -#include <mach/fpga.h> -#include <mach/omapfb.h> +#include <plat/fpga.h> +#include <plat/omapfb.h> static int innovator1510_panel_init(struct lcd_panel *panel, struct omapfb_device *fbdev) diff --git a/drivers/video/omap/lcd_inn1610.c b/drivers/video/omap/lcd_inn1610.c index 268f7f8..0de3382 100644 --- a/drivers/video/omap/lcd_inn1610.c +++ b/drivers/video/omap/lcd_inn1610.c @@ -23,7 +23,7 @@ #include <linux/platform_device.h> #include <mach/gpio.h> -#include <mach/omapfb.h> +#include <plat/omapfb.h> #define MODULE_NAME "omapfb-lcd_h3" diff --git a/drivers/video/omap/lcd_ldp.c b/drivers/video/omap/lcd_ldp.c index dbfe897..6a260df 100644 --- a/drivers/video/omap/lcd_ldp.c +++ b/drivers/video/omap/lcd_ldp.c @@ -27,8 +27,8 @@ #include <linux/i2c/twl4030.h> #include <mach/gpio.h> -#include <mach/mux.h> -#include <mach/omapfb.h> +#include <plat/mux.h> +#include <plat/omapfb.h> #include <asm/mach-types.h> #define LCD_PANEL_BACKLIGHT_GPIO (15 + OMAP_MAX_GPIO_LINES) diff --git a/drivers/video/omap/lcd_mipid.c b/drivers/video/omap/lcd_mipid.c index 918ee89..2162eb0 100644 --- a/drivers/video/omap/lcd_mipid.c +++ b/drivers/video/omap/lcd_mipid.c @@ -23,8 +23,8 @@ #include <linux/workqueue.h> #include <linux/spi/spi.h> -#include <mach/omapfb.h> -#include <mach/lcd_mipid.h> +#include <plat/omapfb.h> +#include <plat/lcd_mipid.h> #define MIPID_MODULE_NAME "lcd_mipid" diff --git a/drivers/video/omap/lcd_omap2evm.c b/drivers/video/omap/lcd_omap2evm.c index 7a2bbe2..e1a38ab 100644 --- a/drivers/video/omap/lcd_omap2evm.c +++ b/drivers/video/omap/lcd_omap2evm.c @@ -26,8 +26,8 @@ #include <linux/gpio.h> #include <linux/i2c/twl4030.h> -#include <mach/mux.h> -#include <mach/omapfb.h> +#include <plat/mux.h> +#include <plat/omapfb.h> #include <asm/mach-types.h> #define LCD_PANEL_ENABLE_GPIO 154 diff --git a/drivers/video/omap/lcd_omap3beagle.c b/drivers/video/omap/lcd_omap3beagle.c index 4011910..ccec084 100644 --- a/drivers/video/omap/lcd_omap3beagle.c +++ b/drivers/video/omap/lcd_omap3beagle.c @@ -25,8 +25,8 @@ #include <linux/gpio.h> #include <linux/i2c/twl4030.h> -#include <mach/mux.h> -#include <mach/omapfb.h> +#include <plat/mux.h> +#include <plat/omapfb.h> #include <asm/mach-types.h> #define LCD_PANEL_ENABLE_GPIO 170 diff --git a/drivers/video/omap/lcd_omap3evm.c b/drivers/video/omap/lcd_omap3evm.c index b6a4c2c..556eb31 100644 --- a/drivers/video/omap/lcd_omap3evm.c +++ b/drivers/video/omap/lcd_omap3evm.c @@ -25,8 +25,8 @@ #include <linux/gpio.h> #include <linux/i2c/twl4030.h> -#include <mach/mux.h> -#include <mach/omapfb.h> +#include <plat/mux.h> +#include <plat/omapfb.h> #include <asm/mach-types.h> #define LCD_PANEL_ENABLE_GPIO 153 diff --git a/drivers/video/omap/lcd_osk.c b/drivers/video/omap/lcd_osk.c index b3fa88b..bb21d7d 100644 --- a/drivers/video/omap/lcd_osk.c +++ b/drivers/video/omap/lcd_osk.c @@ -24,8 +24,8 @@ #include <linux/platform_device.h> #include <mach/gpio.h> -#include <mach/mux.h> -#include <mach/omapfb.h> +#include <plat/mux.h> +#include <plat/omapfb.h> static int osk_panel_init(struct lcd_panel *panel, struct omapfb_device *fbdev) { diff --git a/drivers/video/omap/lcd_overo.c b/drivers/video/omap/lcd_overo.c index 2bc5c92..b0f86e5 100644 --- a/drivers/video/omap/lcd_overo.c +++ b/drivers/video/omap/lcd_overo.c @@ -24,8 +24,8 @@ #include <linux/i2c/twl4030.h> #include <mach/gpio.h> -#include <mach/mux.h> -#include <mach/omapfb.h> +#include <plat/mux.h> +#include <plat/omapfb.h> #include <asm/mach-types.h> #define LCD_ENABLE 144 diff --git a/drivers/video/omap/lcd_palmte.c b/drivers/video/omap/lcd_palmte.c index 4bf3c79..d302896 100644 --- a/drivers/video/omap/lcd_palmte.c +++ b/drivers/video/omap/lcd_palmte.c @@ -23,8 +23,8 @@ #include <linux/platform_device.h> #include <linux/io.h> -#include <mach/fpga.h> -#include <mach/omapfb.h> +#include <plat/fpga.h> +#include <plat/omapfb.h> static int palmte_panel_init(struct lcd_panel *panel, struct omapfb_device *fbdev) diff --git a/drivers/video/omap/lcd_palmtt.c b/drivers/video/omap/lcd_palmtt.c index 48ea1f9..557424f 100644 --- a/drivers/video/omap/lcd_palmtt.c +++ b/drivers/video/omap/lcd_palmtt.c @@ -30,7 +30,7 @@ GPIO13 - screen blanking #include <linux/io.h> #include <mach/gpio.h> -#include <mach/omapfb.h> +#include <plat/omapfb.h> static int palmtt_panel_init(struct lcd_panel *panel, struct omapfb_device *fbdev) diff --git a/drivers/video/omap/lcd_palmz71.c b/drivers/video/omap/lcd_palmz71.c index 0697d29..5f4b5b2 100644 --- a/drivers/video/omap/lcd_palmz71.c +++ b/drivers/video/omap/lcd_palmz71.c @@ -24,7 +24,7 @@ #include <linux/platform_device.h> #include <linux/io.h> -#include <mach/omapfb.h> +#include <plat/omapfb.h> static int palmz71_panel_init(struct lcd_panel *panel, struct omapfb_device *fbdev) diff --git a/drivers/video/omap/lcdc.c b/drivers/video/omap/lcdc.c index ab39492..5f32caf 100644 --- a/drivers/video/omap/lcdc.c +++ b/drivers/video/omap/lcdc.c @@ -29,8 +29,8 @@ #include <linux/vmalloc.h> #include <linux/clk.h> -#include <mach/dma.h> -#include <mach/omapfb.h> +#include <plat/dma.h> +#include <plat/omapfb.h> #include <asm/mach-types.h> diff --git a/drivers/video/omap/omapfb_main.c b/drivers/video/omap/omapfb_main.c index 0d0c8c8..f900a43 100644 --- a/drivers/video/omap/omapfb_main.c +++ b/drivers/video/omap/omapfb_main.c @@ -28,8 +28,8 @@ #include <linux/mm.h> #include <linux/uaccess.h> -#include <mach/dma.h> -#include <mach/omapfb.h> +#include <plat/dma.h> +#include <plat/omapfb.h> #include "lcdc.h" #include "dispc.h" diff --git a/drivers/video/omap/rfbi.c b/drivers/video/omap/rfbi.c index ee01e84..c90fa39 100644 --- a/drivers/video/omap/rfbi.c +++ b/drivers/video/omap/rfbi.c @@ -27,7 +27,7 @@ #include <linux/clk.h> #include <linux/io.h> -#include <mach/omapfb.h> +#include <plat/omapfb.h> #include "dispc.h" diff --git a/drivers/video/omap/sossi.c b/drivers/video/omap/sossi.c index a769462..79dc84f 100644 --- a/drivers/video/omap/sossi.c +++ b/drivers/video/omap/sossi.c @@ -24,8 +24,8 @@ #include <linux/irq.h> #include <linux/io.h> -#include <mach/dma.h> -#include <mach/omapfb.h> +#include <plat/dma.h> +#include <plat/omapfb.h> #include "lcdc.h" diff --git a/drivers/video/pxa168fb.c b/drivers/video/pxa168fb.c index 84d8327..75285d3 100644 --- a/drivers/video/pxa168fb.c +++ b/drivers/video/pxa168fb.c @@ -687,6 +687,7 @@ static int __init pxa168fb_probe(struct platform_device *pdev) } info->fix.smem_start = (unsigned long)fbi->fb_start_dma; + set_graphics_start(info, 0, 0); /* * Set video mode according to platform data. diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c index 1820c4a..f58a3aa 100644 --- a/drivers/video/pxafb.c +++ b/drivers/video/pxafb.c @@ -80,7 +80,8 @@ static int pxafb_activate_var(struct fb_var_screeninfo *var, struct pxafb_info *); static void set_ctrlr_state(struct pxafb_info *fbi, u_int state); -static void setup_base_frame(struct pxafb_info *fbi, int branch); +static void setup_base_frame(struct pxafb_info *fbi, + struct fb_var_screeninfo *var, int branch); static int setup_frame_dma(struct pxafb_info *fbi, int dma, int pal, unsigned long offset, size_t size); @@ -397,6 +398,7 @@ static void pxafb_setmode(struct fb_var_screeninfo *var, var->lower_margin = mode->lower_margin; var->sync = mode->sync; var->grayscale = mode->cmap_greyscale; + var->transp.length = mode->transparency; /* set the initial RGBA bitfields */ pxafb_set_pixfmt(var, mode->depth); @@ -531,12 +533,22 @@ static int pxafb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info) { struct pxafb_info *fbi = (struct pxafb_info *)info; + struct fb_var_screeninfo newvar; int dma = DMA_MAX + DMA_BASE; if (fbi->state != C_ENABLE) return 0; - setup_base_frame(fbi, 1); + /* Only take .xoffset, .yoffset and .vmode & FB_VMODE_YWRAP from what + * was passed in and copy the rest from the old screeninfo. + */ + memcpy(&newvar, &fbi->fb.var, sizeof(newvar)); + newvar.xoffset = var->xoffset; + newvar.yoffset = var->yoffset; + newvar.vmode &= ~FB_VMODE_YWRAP; + newvar.vmode |= var->vmode & FB_VMODE_YWRAP; + + setup_base_frame(fbi, &newvar, 1); if (fbi->lccr0 & LCCR0_SDS) lcd_writel(fbi, FBR1, fbi->fdadr[dma + 1] | 0x1); @@ -1052,9 +1064,10 @@ static int setup_frame_dma(struct pxafb_info *fbi, int dma, int pal, return 0; } -static void setup_base_frame(struct pxafb_info *fbi, int branch) +static void setup_base_frame(struct pxafb_info *fbi, + struct fb_var_screeninfo *var, + int branch) { - struct fb_var_screeninfo *var = &fbi->fb.var; struct fb_fix_screeninfo *fix = &fbi->fb.fix; int nbytes, dma, pal, bpp = var->bits_per_pixel; unsigned long offset; @@ -1332,7 +1345,7 @@ static int pxafb_activate_var(struct fb_var_screeninfo *var, #endif setup_parallel_timing(fbi, var); - setup_base_frame(fbi, 0); + setup_base_frame(fbi, var, 0); fbi->reg_lccr0 = fbi->lccr0 | (LCCR0_LDM | LCCR0_SFM | LCCR0_IUM | LCCR0_EFM | diff --git a/drivers/watchdog/omap_wdt.c b/drivers/watchdog/omap_wdt.c index 3ed571a..429ea99 100644 --- a/drivers/watchdog/omap_wdt.c +++ b/drivers/watchdog/omap_wdt.c @@ -43,7 +43,7 @@ #include <linux/io.h> #include <linux/uaccess.h> #include <mach/hardware.h> -#include <mach/prcm.h> +#include <plat/prcm.h> #include "omap_wdt.h" diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c index d3c824d..c14ae86 100644 --- a/drivers/watchdog/riowd.c +++ b/drivers/watchdog/riowd.c @@ -10,7 +10,6 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/miscdevice.h> -#include <linux/smp_lock.h> #include <linux/watchdog.h> #include <linux/of.h> #include <linux/of_device.h> @@ -75,7 +74,6 @@ static void riowd_writereg(struct riowd *p, u8 val, int index) static int riowd_open(struct inode *inode, struct file *filp) { - cycle_kernel_lock(); nonseekable_open(inode, filp); return 0; } @@ -194,6 +192,8 @@ static int __devinit riowd_probe(struct of_device *op, printk(KERN_ERR PFX "Cannot map registers.\n"); goto out_free; } + /* Make miscdev useable right away */ + riowd_device = p; err = misc_register(&riowd_miscdev); if (err) { @@ -205,10 +205,10 @@ static int __devinit riowd_probe(struct of_device *op, "regs at %p\n", riowd_timeout, p->regs); dev_set_drvdata(&op->dev, p); - riowd_device = p; return 0; out_iounmap: + riowd_device = NULL; of_iounmap(&op->resource[0], p->regs, 2); out_free: diff --git a/drivers/zorro/zorro-driver.c b/drivers/zorro/zorro-driver.c index e6c4390..53180a3 100644 --- a/drivers/zorro/zorro-driver.c +++ b/drivers/zorro/zorro-driver.c @@ -156,5 +156,4 @@ postcore_initcall(zorro_driver_init); EXPORT_SYMBOL(zorro_match_device); EXPORT_SYMBOL(zorro_register_driver); EXPORT_SYMBOL(zorro_unregister_driver); -EXPORT_SYMBOL(zorro_dev_driver); EXPORT_SYMBOL(zorro_bus_type); |