From fdda387f73947e6ae511ec601f5b3c6fbb582aac Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Thu, 2 Nov 2006 11:19:21 -0500 Subject: [DLM] Add support for tcp communications The following patch adds a TCP based communications layer to the DLM which is compile time selectable. The existing SCTP layer gives the advantage of allowing multihoming, whereas the TCP layer has been heavily tested in previous versions of the DLM and is known to be robust and therefore can be used as a baseline for performance testing. Signed-off-by: Patrick Caulfield Signed-off-by: Steven Whitehouse --- fs/dlm/Kconfig | 17 + fs/dlm/Makefile | 4 +- fs/dlm/lowcomms-sctp.c | 1239 +++++++++++++++++++++++++++++++++++++++++++++++ fs/dlm/lowcomms-tcp.c | 1263 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/dlm/lowcomms.c | 1239 ----------------------------------------------- 5 files changed, 2522 insertions(+), 1240 deletions(-) create mode 100644 fs/dlm/lowcomms-sctp.c create mode 100644 fs/dlm/lowcomms-tcp.c delete mode 100644 fs/dlm/lowcomms.c (limited to 'fs/dlm') diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 81b2c64..c5985b8 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -9,6 +9,23 @@ config DLM A general purpose distributed lock manager for kernel or userspace applications. +choice + prompt "Select DLM communications protocol" + depends on DLM + default DLM_TCP + help + The DLM Can use TCP or SCTP for it's network communications. + SCTP supports multi-homed operations whereas TCP doesn't. + However, SCTP seems to have stability problems at the moment. + +config DLM_TCP + bool "TCP/IP" + +config DLM_SCTP + bool "SCTP" + +endchoice + config DLM_DEBUG bool "DLM debugging" depends on DLM diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 1832e02..6538894 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -4,7 +4,6 @@ dlm-y := ast.o \ dir.o \ lock.o \ lockspace.o \ - lowcomms.o \ main.o \ member.o \ memory.o \ @@ -17,3 +16,6 @@ dlm-y := ast.o \ util.o dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o +dlm-$(CONFIG_DLM_TCP) += lowcomms-tcp.o + +dlm-$(CONFIG_DLM_SCTP) += lowcomms-sctp.o \ No newline at end of file diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c new file mode 100644 index 0000000..6da6b14 --- /dev/null +++ b/fs/dlm/lowcomms-sctp.c @@ -0,0 +1,1239 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lowcomms.c + * + * This is the "low-level" comms layer. + * + * It is responsible for sending/receiving messages + * from other nodes in the cluster. + * + * Cluster nodes are referred to by their nodeids. nodeids are + * simply 32 bit numbers to the locking module - if they need to + * be expanded for the cluster infrastructure then that is it's + * responsibility. It is this layer's + * responsibility to resolve these into IP address or + * whatever it needs for inter-node communication. + * + * The comms level is two kernel threads that deal mainly with + * the receiving of messages from other nodes and passing them + * up to the mid-level comms layer (which understands the + * message format) for execution by the locking core, and + * a send thread which does all the setting up of connections + * to remote nodes and the sending of data. Threads are not allowed + * to send their own data because it may cause them to wait in times + * of high load. Also, this way, the sending thread can collect together + * messages bound for one node and send them in one block. + * + * I don't see any problem with the recv thread executing the locking + * code on behalf of remote processes as the locking code is + * short, efficient and never (well, hardly ever) waits. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "config.h" +#include "midcomms.h" + +static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; +static int dlm_local_count; +static int dlm_local_nodeid; + +/* One of these per connected node */ + +#define NI_INIT_PENDING 1 +#define NI_WRITE_PENDING 2 + +struct nodeinfo { + spinlock_t lock; + sctp_assoc_t assoc_id; + unsigned long flags; + struct list_head write_list; /* nodes with pending writes */ + struct list_head writequeue; /* outgoing writequeue_entries */ + spinlock_t writequeue_lock; + int nodeid; +}; + +static DEFINE_IDR(nodeinfo_idr); +static struct rw_semaphore nodeinfo_lock; +static int max_nodeid; + +struct cbuf { + unsigned base; + unsigned len; + unsigned mask; +}; + +/* Just the one of these, now. But this struct keeps + the connection-specific variables together */ + +#define CF_READ_PENDING 1 + +struct connection { + struct socket *sock; + unsigned long flags; + struct page *rx_page; + atomic_t waiting_requests; + struct cbuf cb; + int eagain_flag; +}; + +/* An entry waiting to be sent */ + +struct writequeue_entry { + struct list_head list; + struct page *page; + int offset; + int len; + int end; + int users; + struct nodeinfo *ni; +}; + +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) +#define CBUF_EMPTY(cb) ((cb)->len == 0) +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) + +#define CBUF_INIT(cb, size) \ +do { \ + (cb)->base = (cb)->len = 0; \ + (cb)->mask = ((size)-1); \ +} while(0) + +#define CBUF_EAT(cb, n) \ +do { \ + (cb)->len -= (n); \ + (cb)->base += (n); \ + (cb)->base &= (cb)->mask; \ +} while(0) + + +/* List of nodes which have writes pending */ +static struct list_head write_nodes; +static spinlock_t write_nodes_lock; + +/* Maximum number of incoming messages to process before + * doing a schedule() + */ +#define MAX_RX_MSG_COUNT 25 + +/* Manage daemons */ +static struct task_struct *recv_task; +static struct task_struct *send_task; +static wait_queue_head_t lowcomms_recv_wait; +static atomic_t accepting; + +/* The SCTP connection */ +static struct connection sctp_con; + + +static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) +{ + struct sockaddr_storage addr; + int error; + + if (!dlm_local_count) + return -1; + + error = dlm_nodeid_to_addr(nodeid, &addr); + if (error) + return error; + + if (dlm_local_addr[0]->ss_family == AF_INET) { + struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; + struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr; + ret4->sin_addr.s_addr = in4->sin_addr.s_addr; + } else { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; + memcpy(&ret6->sin6_addr, &in6->sin6_addr, + sizeof(in6->sin6_addr)); + } + + return 0; +} + +static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc) +{ + struct nodeinfo *ni; + int r; + int n; + + down_read(&nodeinfo_lock); + ni = idr_find(&nodeinfo_idr, nodeid); + up_read(&nodeinfo_lock); + + if (!ni && alloc) { + down_write(&nodeinfo_lock); + + ni = idr_find(&nodeinfo_idr, nodeid); + if (ni) + goto out_up; + + r = idr_pre_get(&nodeinfo_idr, alloc); + if (!r) + goto out_up; + + ni = kmalloc(sizeof(struct nodeinfo), alloc); + if (!ni) + goto out_up; + + r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n); + if (r) { + kfree(ni); + ni = NULL; + goto out_up; + } + if (n != nodeid) { + idr_remove(&nodeinfo_idr, n); + kfree(ni); + ni = NULL; + goto out_up; + } + memset(ni, 0, sizeof(struct nodeinfo)); + spin_lock_init(&ni->lock); + INIT_LIST_HEAD(&ni->writequeue); + spin_lock_init(&ni->writequeue_lock); + ni->nodeid = nodeid; + + if (nodeid > max_nodeid) + max_nodeid = nodeid; + out_up: + up_write(&nodeinfo_lock); + } + + return ni; +} + +/* Don't call this too often... */ +static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc) +{ + int i; + struct nodeinfo *ni; + + for (i=1; i<=max_nodeid; i++) { + ni = nodeid2nodeinfo(i, 0); + if (ni && ni->assoc_id == assoc) + return ni; + } + return NULL; +} + +/* Data or notification available on socket */ +static void lowcomms_data_ready(struct sock *sk, int count_unused) +{ + atomic_inc(&sctp_con.waiting_requests); + if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags)) + return; + + wake_up_interruptible(&lowcomms_recv_wait); +} + + +/* Add the port number to an IP6 or 4 sockaddr and return the address length. + Also padd out the struct with zeros to make comparisons meaningful */ + +static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, + int *addr_len) +{ + struct sockaddr_in *local4_addr; + struct sockaddr_in6 *local6_addr; + + if (!dlm_local_count) + return; + + if (!port) { + if (dlm_local_addr[0]->ss_family == AF_INET) { + local4_addr = (struct sockaddr_in *)dlm_local_addr[0]; + port = be16_to_cpu(local4_addr->sin_port); + } else { + local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0]; + port = be16_to_cpu(local6_addr->sin6_port); + } + } + + saddr->ss_family = dlm_local_addr[0]->ss_family; + if (dlm_local_addr[0]->ss_family == AF_INET) { + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; + in4_addr->sin_port = cpu_to_be16(port); + memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); + memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) - + sizeof(struct sockaddr_in)); + *addr_len = sizeof(struct sockaddr_in); + } else { + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; + in6_addr->sin6_port = cpu_to_be16(port); + memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) - + sizeof(struct sockaddr_in6)); + *addr_len = sizeof(struct sockaddr_in6); + } +} + +/* Close the connection and tidy up */ +static void close_connection(void) +{ + if (sctp_con.sock) { + sock_release(sctp_con.sock); + sctp_con.sock = NULL; + } + + if (sctp_con.rx_page) { + __free_page(sctp_con.rx_page); + sctp_con.rx_page = NULL; + } +} + +/* We only send shutdown messages to nodes that are not part of the cluster */ +static void send_shutdown(sctp_assoc_t associd) +{ + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct msghdr outmessage; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + int ret; + + outmessage.msg_name = NULL; + outmessage.msg_namelen = 0; + outmessage.msg_control = outcmsg; + outmessage.msg_controllen = sizeof(outcmsg); + outmessage.msg_flags = MSG_EOR; + + cmsg = CMSG_FIRSTHDR(&outmessage); + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + outmessage.msg_controllen = cmsg->cmsg_len; + sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); + + sinfo->sinfo_flags |= MSG_EOF; + sinfo->sinfo_assoc_id = associd; + + ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0); + + if (ret != 0) + log_print("send EOF to node failed: %d", ret); +} + + +/* INIT failed but we don't know which node... + restart INIT on all pending nodes */ +static void init_failed(void) +{ + int i; + struct nodeinfo *ni; + + for (i=1; i<=max_nodeid; i++) { + ni = nodeid2nodeinfo(i, 0); + if (!ni) + continue; + + if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) { + ni->assoc_id = 0; + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { + spin_lock_bh(&write_nodes_lock); + list_add_tail(&ni->write_list, &write_nodes); + spin_unlock_bh(&write_nodes_lock); + } + } + } + wake_up_process(send_task); +} + +/* Something happened to an association */ +static void process_sctp_notification(struct msghdr *msg, char *buf) +{ + union sctp_notification *sn = (union sctp_notification *)buf; + + if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { + switch (sn->sn_assoc_change.sac_state) { + + case SCTP_COMM_UP: + case SCTP_RESTART: + { + /* Check that the new node is in the lockspace */ + struct sctp_prim prim; + mm_segment_t fs; + int nodeid; + int prim_len, ret; + int addr_len; + struct nodeinfo *ni; + + /* This seems to happen when we received a connection + * too early... or something... anyway, it happens but + * we always seem to get a real message too, see + * receive_from_sock */ + + if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { + log_print("COMM_UP for invalid assoc ID %d", + (int)sn->sn_assoc_change.sac_assoc_id); + init_failed(); + return; + } + memset(&prim, 0, sizeof(struct sctp_prim)); + prim_len = sizeof(struct sctp_prim); + prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id; + + fs = get_fs(); + set_fs(get_ds()); + ret = sctp_con.sock->ops->getsockopt(sctp_con.sock, + IPPROTO_SCTP, SCTP_PRIMARY_ADDR, + (char*)&prim, &prim_len); + set_fs(fs); + if (ret < 0) { + struct nodeinfo *ni; + + log_print("getsockopt/sctp_primary_addr on " + "new assoc %d failed : %d", + (int)sn->sn_assoc_change.sac_assoc_id, ret); + + /* Retry INIT later */ + ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id); + if (ni) + clear_bit(NI_INIT_PENDING, &ni->flags); + return; + } + make_sockaddr(&prim.ssp_addr, 0, &addr_len); + if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + log_print("reject connect from unknown addr"); + send_shutdown(prim.ssp_assoc_id); + return; + } + + ni = nodeid2nodeinfo(nodeid, GFP_KERNEL); + if (!ni) + return; + + /* Save the assoc ID */ + spin_lock(&ni->lock); + ni->assoc_id = sn->sn_assoc_change.sac_assoc_id; + spin_unlock(&ni->lock); + + log_print("got new/restarted association %d nodeid %d", + (int)sn->sn_assoc_change.sac_assoc_id, nodeid); + + /* Send any pending writes */ + clear_bit(NI_INIT_PENDING, &ni->flags); + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { + spin_lock_bh(&write_nodes_lock); + list_add_tail(&ni->write_list, &write_nodes); + spin_unlock_bh(&write_nodes_lock); + } + wake_up_process(send_task); + } + break; + + case SCTP_COMM_LOST: + case SCTP_SHUTDOWN_COMP: + { + struct nodeinfo *ni; + + ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id); + if (ni) { + spin_lock(&ni->lock); + ni->assoc_id = 0; + spin_unlock(&ni->lock); + } + } + break; + + /* We don't know which INIT failed, so clear the PENDING flags + * on them all. if assoc_id is zero then it will then try + * again */ + + case SCTP_CANT_STR_ASSOC: + { + log_print("Can't start SCTP association - retrying"); + init_failed(); + } + break; + + default: + log_print("unexpected SCTP assoc change id=%d state=%d", + (int)sn->sn_assoc_change.sac_assoc_id, + sn->sn_assoc_change.sac_state); + } + } +} + +/* Data received from remote end */ +static int receive_from_sock(void) +{ + int ret = 0; + struct msghdr msg; + struct kvec iov[2]; + unsigned len; + int r; + struct sctp_sndrcvinfo *sinfo; + struct cmsghdr *cmsg; + struct nodeinfo *ni; + + /* These two are marginally too big for stack allocation, but this + * function is (currently) only called by dlm_recvd so static should be + * OK. + */ + static struct sockaddr_storage msgname; + static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + + if (sctp_con.sock == NULL) + goto out; + + if (sctp_con.rx_page == NULL) { + /* + * This doesn't need to be atomic, but I think it should + * improve performance if it is. + */ + sctp_con.rx_page = alloc_page(GFP_ATOMIC); + if (sctp_con.rx_page == NULL) + goto out_resched; + CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE); + } + + memset(&incmsg, 0, sizeof(incmsg)); + memset(&msgname, 0, sizeof(msgname)); + + memset(incmsg, 0, sizeof(incmsg)); + msg.msg_name = &msgname; + msg.msg_namelen = sizeof(msgname); + msg.msg_flags = 0; + msg.msg_control = incmsg; + msg.msg_controllen = sizeof(incmsg); + msg.msg_iovlen = 1; + + /* I don't see why this circular buffer stuff is necessary for SCTP + * which is a packet-based protocol, but the whole thing breaks under + * load without it! The overhead is minimal (and is in the TCP lowcomms + * anyway, of course) so I'll leave it in until I can figure out what's + * really happening. + */ + + /* + * iov[0] is the bit of the circular buffer between the current end + * point (cb.base + cb.len) and the end of the buffer. + */ + iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb); + iov[0].iov_base = page_address(sctp_con.rx_page) + + CBUF_DATA(&sctp_con.cb); + iov[1].iov_len = 0; + + /* + * iov[1] is the bit of the circular buffer between the start of the + * buffer and the start of the currently used section (cb.base) + */ + if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb); + iov[1].iov_len = sctp_con.cb.base; + iov[1].iov_base = page_address(sctp_con.rx_page); + msg.msg_iovlen = 2; + } + len = iov[0].iov_len + iov[1].iov_len; + + r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len, + MSG_NOSIGNAL | MSG_DONTWAIT); + if (ret <= 0) + goto out_close; + + msg.msg_control = incmsg; + msg.msg_controllen = sizeof(incmsg); + cmsg = CMSG_FIRSTHDR(&msg); + sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + + if (msg.msg_flags & MSG_NOTIFICATION) { + process_sctp_notification(&msg, page_address(sctp_con.rx_page)); + return 0; + } + + /* Is this a new association ? */ + ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL); + if (ni) { + ni->assoc_id = sinfo->sinfo_assoc_id; + if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) { + + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { + spin_lock_bh(&write_nodes_lock); + list_add_tail(&ni->write_list, &write_nodes); + spin_unlock_bh(&write_nodes_lock); + } + wake_up_process(send_task); + } + } + + /* INIT sends a message with length of 1 - ignore it */ + if (r == 1) + return 0; + + CBUF_ADD(&sctp_con.cb, ret); + ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid), + page_address(sctp_con.rx_page), + sctp_con.cb.base, sctp_con.cb.len, + PAGE_CACHE_SIZE); + if (ret < 0) + goto out_close; + CBUF_EAT(&sctp_con.cb, ret); + + out: + ret = 0; + goto out_ret; + + out_resched: + lowcomms_data_ready(sctp_con.sock->sk, 0); + ret = 0; + schedule(); + goto out_ret; + + out_close: + if (ret != -EAGAIN) + log_print("error reading from sctp socket: %d", ret); + out_ret: + return ret; +} + +/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */ +static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num) +{ + mm_segment_t fs; + int result = 0; + + fs = get_fs(); + set_fs(get_ds()); + if (num == 1) + result = sctp_con.sock->ops->bind(sctp_con.sock, + (struct sockaddr *) addr, addr_len); + else + result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP, + SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len); + set_fs(fs); + + if (result < 0) + log_print("Can't bind to port %d addr number %d", + dlm_config.tcp_port, num); + + return result; +} + +static void init_local(void) +{ + struct sockaddr_storage sas, *addr; + int i; + + dlm_local_nodeid = dlm_our_nodeid(); + + for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) { + if (dlm_our_addr(&sas, i)) + break; + + addr = kmalloc(sizeof(*addr), GFP_KERNEL); + if (!addr) + break; + memcpy(addr, &sas, sizeof(*addr)); + dlm_local_addr[dlm_local_count++] = addr; + } +} + +/* Initialise SCTP socket and bind to all interfaces */ +static int init_sock(void) +{ + mm_segment_t fs; + struct socket *sock = NULL; + struct sockaddr_storage localaddr; + struct sctp_event_subscribe subscribe; + int result = -EINVAL, num = 1, i, addr_len; + + if (!dlm_local_count) { + init_local(); + if (!dlm_local_count) { + log_print("no local IP address has been set"); + goto out; + } + } + + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET, + IPPROTO_SCTP, &sock); + if (result < 0) { + log_print("Can't create comms socket, check SCTP is loaded"); + goto out; + } + + /* Listen for events */ + memset(&subscribe, 0, sizeof(subscribe)); + subscribe.sctp_data_io_event = 1; + subscribe.sctp_association_event = 1; + subscribe.sctp_send_failure_event = 1; + subscribe.sctp_shutdown_event = 1; + subscribe.sctp_partial_delivery_event = 1; + + fs = get_fs(); + set_fs(get_ds()); + result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS, + (char *)&subscribe, sizeof(subscribe)); + set_fs(fs); + + if (result < 0) { + log_print("Failed to set SCTP_EVENTS on socket: result=%d", + result); + goto create_delsock; + } + + /* Init con struct */ + sock->sk->sk_user_data = &sctp_con; + sctp_con.sock = sock; + sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready; + + /* Bind to all interfaces. */ + for (i = 0; i < dlm_local_count; i++) { + memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); + make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len); + + result = add_bind_addr(&localaddr, addr_len, num); + if (result) + goto create_delsock; + ++num; + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + log_print("Can't set socket listening"); + goto create_delsock; + } + + return 0; + + create_delsock: + sock_release(sock); + sctp_con.sock = NULL; + out: + return result; +} + + +static struct writequeue_entry *new_writequeue_entry(gfp_t allocation) +{ + struct writequeue_entry *entry; + + entry = kmalloc(sizeof(struct writequeue_entry), allocation); + if (!entry) + return NULL; + + entry->page = alloc_page(allocation); + if (!entry->page) { + kfree(entry); + return NULL; + } + + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->users = 0; + + return entry; +} + +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +{ + struct writequeue_entry *e; + int offset = 0; + int users = 0; + struct nodeinfo *ni; + + if (!atomic_read(&accepting)) + return NULL; + + ni = nodeid2nodeinfo(nodeid, allocation); + if (!ni) + return NULL; + + spin_lock(&ni->writequeue_lock); + e = list_entry(ni->writequeue.prev, struct writequeue_entry, list); + if (((struct list_head *) e == &ni->writequeue) || + (PAGE_CACHE_SIZE - e->end < len)) { + e = NULL; + } else { + offset = e->end; + e->end += len; + users = e->users++; + } + spin_unlock(&ni->writequeue_lock); + + if (e) { + got_one: + if (users == 0) + kmap(e->page); + *ppc = page_address(e->page) + offset; + return e; + } + + e = new_writequeue_entry(allocation); + if (e) { + spin_lock(&ni->writequeue_lock); + offset = e->end; + e->end += len; + e->ni = ni; + users = e->users++; + list_add_tail(&e->list, &ni->writequeue); + spin_unlock(&ni->writequeue_lock); + goto got_one; + } + return NULL; +} + +void dlm_lowcomms_commit_buffer(void *arg) +{ + struct writequeue_entry *e = (struct writequeue_entry *) arg; + int users; + struct nodeinfo *ni = e->ni; + + if (!atomic_read(&accepting)) + return; + + spin_lock(&ni->writequeue_lock); + users = --e->users; + if (users) + goto out; + e->len = e->end - e->offset; + kunmap(e->page); + spin_unlock(&ni->writequeue_lock); + + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { + spin_lock_bh(&write_nodes_lock); + list_add_tail(&ni->write_list, &write_nodes); + spin_unlock_bh(&write_nodes_lock); + wake_up_process(send_task); + } + return; + + out: + spin_unlock(&ni->writequeue_lock); + return; +} + +static void free_entry(struct writequeue_entry *e) +{ + __free_page(e->page); + kfree(e); +} + +/* Initiate an SCTP association. In theory we could just use sendmsg() on + the first IP address and it should work, but this allows us to set up the + association before sending any valuable data that we can't afford to lose. + It also keeps the send path clean as it can now always use the association ID */ +static void initiate_association(int nodeid) +{ + struct sockaddr_storage rem_addr; + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct msghdr outmessage; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + int ret; + int addrlen; + char buf[1]; + struct kvec iov[1]; + struct nodeinfo *ni; + + log_print("Initiating association with node %d", nodeid); + + ni = nodeid2nodeinfo(nodeid, GFP_KERNEL); + if (!ni) + return; + + if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) { + log_print("no address for nodeid %d", nodeid); + return; + } + + make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen); + + outmessage.msg_name = &rem_addr; + outmessage.msg_namelen = addrlen; + outmessage.msg_control = outcmsg; + outmessage.msg_controllen = sizeof(outcmsg); + outmessage.msg_flags = MSG_EOR; + + iov[0].iov_base = buf; + iov[0].iov_len = 1; + + /* Real INIT messages seem to cause trouble. Just send a 1 byte message + we can afford to lose */ + cmsg = CMSG_FIRSTHDR(&outmessage); + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); + sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid); + + outmessage.msg_controllen = cmsg->cmsg_len; + ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1); + if (ret < 0) { + log_print("send INIT to node failed: %d", ret); + /* Try again later */ + clear_bit(NI_INIT_PENDING, &ni->flags); + } +} + +/* Send a message */ +static int send_to_sock(struct nodeinfo *ni) +{ + int ret = 0; + struct writequeue_entry *e; + int len, offset; + struct msghdr outmsg; + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + struct kvec iov; + + /* See if we need to init an association before we start + sending precious messages */ + spin_lock(&ni->lock); + if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { + spin_unlock(&ni->lock); + initiate_association(ni->nodeid); + return 0; + } + spin_unlock(&ni->lock); + + outmsg.msg_name = NULL; /* We use assoc_id */ + outmsg.msg_namelen = 0; + outmsg.msg_control = outcmsg; + outmsg.msg_controllen = sizeof(outcmsg); + outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR; + + cmsg = CMSG_FIRSTHDR(&outmsg); + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); + sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid); + sinfo->sinfo_assoc_id = ni->assoc_id; + outmsg.msg_controllen = cmsg->cmsg_len; + + spin_lock(&ni->writequeue_lock); + for (;;) { + if (list_empty(&ni->writequeue)) + break; + e = list_entry(ni->writequeue.next, struct writequeue_entry, + list); + len = e->len; + offset = e->offset; + BUG_ON(len == 0 && e->users == 0); + spin_unlock(&ni->writequeue_lock); + kmap(e->page); + + ret = 0; + if (len) { + iov.iov_base = page_address(e->page)+offset; + iov.iov_len = len; + + ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1, + len); + if (ret == -EAGAIN) { + sctp_con.eagain_flag = 1; + goto out; + } else if (ret < 0) + goto send_error; + } else { + /* Don't starve people filling buffers */ + schedule(); + } + + spin_lock(&ni->writequeue_lock); + e->offset += ret; + e->len -= ret; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + continue; + } + } + spin_unlock(&ni->writequeue_lock); + out: + return ret; + + send_error: + log_print("Error sending to node %d %d", ni->nodeid, ret); + spin_lock(&ni->lock); + if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { + ni->assoc_id = 0; + spin_unlock(&ni->lock); + initiate_association(ni->nodeid); + } else + spin_unlock(&ni->lock); + + return ret; +} + +/* Try to send any messages that are pending */ +static void process_output_queue(void) +{ + struct list_head *list; + struct list_head *temp; + + spin_lock_bh(&write_nodes_lock); + list_for_each_safe(list, temp, &write_nodes) { + struct nodeinfo *ni = + list_entry(list, struct nodeinfo, write_list); + clear_bit(NI_WRITE_PENDING, &ni->flags); + list_del(&ni->write_list); + + spin_unlock_bh(&write_nodes_lock); + + send_to_sock(ni); + spin_lock_bh(&write_nodes_lock); + } + spin_unlock_bh(&write_nodes_lock); +} + +/* Called after we've had -EAGAIN and been woken up */ +static void refill_write_queue(void) +{ + int i; + + for (i=1; i<=max_nodeid; i++) { + struct nodeinfo *ni = nodeid2nodeinfo(i, 0); + + if (ni) { + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { + spin_lock_bh(&write_nodes_lock); + list_add_tail(&ni->write_list, &write_nodes); + spin_unlock_bh(&write_nodes_lock); + } + } + } +} + +static void clean_one_writequeue(struct nodeinfo *ni) +{ + struct list_head *list; + struct list_head *temp; + + spin_lock(&ni->writequeue_lock); + list_for_each_safe(list, temp, &ni->writequeue) { + struct writequeue_entry *e = + list_entry(list, struct writequeue_entry, list); + list_del(&e->list); + free_entry(e); + } + spin_unlock(&ni->writequeue_lock); +} + +static void clean_writequeues(void) +{ + int i; + + for (i=1; i<=max_nodeid; i++) { + struct nodeinfo *ni = nodeid2nodeinfo(i, 0); + if (ni) + clean_one_writequeue(ni); + } +} + + +static void dealloc_nodeinfo(void) +{ + int i; + + for (i=1; i<=max_nodeid; i++) { + struct nodeinfo *ni = nodeid2nodeinfo(i, 0); + if (ni) { + idr_remove(&nodeinfo_idr, i); + kfree(ni); + } + } +} + +int dlm_lowcomms_close(int nodeid) +{ + struct nodeinfo *ni; + + ni = nodeid2nodeinfo(nodeid, 0); + if (!ni) + return -1; + + spin_lock(&ni->lock); + if (ni->assoc_id) { + ni->assoc_id = 0; + /* Don't send shutdown here, sctp will just queue it + till the node comes back up! */ + } + spin_unlock(&ni->lock); + + clean_one_writequeue(ni); + clear_bit(NI_INIT_PENDING, &ni->flags); + return 0; +} + +static int write_list_empty(void) +{ + int status; + + spin_lock_bh(&write_nodes_lock); + status = list_empty(&write_nodes); + spin_unlock_bh(&write_nodes_lock); + + return status; +} + +static int dlm_recvd(void *data) +{ + DECLARE_WAITQUEUE(wait, current); + + while (!kthread_should_stop()) { + int count = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&lowcomms_recv_wait, &wait); + if (!test_bit(CF_READ_PENDING, &sctp_con.flags)) + schedule(); + remove_wait_queue(&lowcomms_recv_wait, &wait); + set_current_state(TASK_RUNNING); + + if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) { + int ret; + + do { + ret = receive_from_sock(); + + /* Don't starve out everyone else */ + if (++count >= MAX_RX_MSG_COUNT) { + schedule(); + count = 0; + } + } while (!kthread_should_stop() && ret >=0); + } + schedule(); + } + + return 0; +} + +static int dlm_sendd(void *data) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (write_list_empty()) + schedule(); + set_current_state(TASK_RUNNING); + + if (sctp_con.eagain_flag) { + sctp_con.eagain_flag = 0; + refill_write_queue(); + } + process_output_queue(); + } + + remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); + + return 0; +} + +static void daemons_stop(void) +{ + kthread_stop(recv_task); + kthread_stop(send_task); +} + +static int daemons_start(void) +{ + struct task_struct *p; + int error; + + p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_recvd %d", error); + return error; + } + recv_task = p; + + p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_sendd %d", error); + kthread_stop(recv_task); + return error; + } + send_task = p; + + return 0; +} + +/* + * This is quite likely to sleep... + */ +int dlm_lowcomms_start(void) +{ + int error; + + error = init_sock(); + if (error) + goto fail_sock; + error = daemons_start(); + if (error) + goto fail_sock; + atomic_set(&accepting, 1); + return 0; + + fail_sock: + close_connection(); + return error; +} + +/* Set all the activity flags to prevent any socket activity. */ + +void dlm_lowcomms_stop(void) +{ + atomic_set(&accepting, 0); + sctp_con.flags = 0x7; + daemons_stop(); + clean_writequeues(); + close_connection(); + dealloc_nodeinfo(); + max_nodeid = 0; +} + +int dlm_lowcomms_init(void) +{ + init_waitqueue_head(&lowcomms_recv_wait); + spin_lock_init(&write_nodes_lock); + INIT_LIST_HEAD(&write_nodes); + init_rwsem(&nodeinfo_lock); + return 0; +} + +void dlm_lowcomms_exit(void) +{ + int i; + + for (i = 0; i < dlm_local_count; i++) + kfree(dlm_local_addr[i]); + dlm_local_count = 0; + dlm_local_nodeid = 0; +} + diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c new file mode 100644 index 0000000..7289e59 --- /dev/null +++ b/fs/dlm/lowcomms-tcp.c @@ -0,0 +1,1263 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lowcomms.c + * + * This is the "low-level" comms layer. + * + * It is responsible for sending/receiving messages + * from other nodes in the cluster. + * + * Cluster nodes are referred to by their nodeids. nodeids are + * simply 32 bit numbers to the locking module - if they need to + * be expanded for the cluster infrastructure then that is it's + * responsibility. It is this layer's + * responsibility to resolve these into IP address or + * whatever it needs for inter-node communication. + * + * The comms level is two kernel threads that deal mainly with + * the receiving of messages from other nodes and passing them + * up to the mid-level comms layer (which understands the + * message format) for execution by the locking core, and + * a send thread which does all the setting up of connections + * to remote nodes and the sending of data. Threads are not allowed + * to send their own data because it may cause them to wait in times + * of high load. Also, this way, the sending thread can collect together + * messages bound for one node and send them in one block. + * + * I don't see any problem with the recv thread executing the locking + * code on behalf of remote processes as the locking code is + * short, efficient and never waits. + * + */ + + +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "config.h" + +struct cbuf { + unsigned base; + unsigned len; + unsigned mask; +}; + +#ifndef FALSE +#define FALSE 0 +#define TRUE 1 +#endif +#define NODE_INCREMENT 32 + +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0) +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) +#define CBUF_EMPTY(cb) ((cb)->len == 0) +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) +#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \ + (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0) +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) + +/* Maximum number of incoming messages to process before + doing a schedule() +*/ +#define MAX_RX_MSG_COUNT 25 + +struct connection { + struct socket *sock; /* NULL if not connected */ + uint32_t nodeid; /* So we know who we are in the list */ + struct rw_semaphore sock_sem; /* Stop connect races */ + struct list_head read_list; /* On this list when ready for reading */ + struct list_head write_list; /* On this list when ready for writing */ + struct list_head state_list; /* On this list when ready to connect */ + unsigned long flags; /* bit 1,2 = We are on the read/write lists */ +#define CF_READ_PENDING 1 +#define CF_WRITE_PENDING 2 +#define CF_CONNECT_PENDING 3 +#define CF_IS_OTHERCON 4 + struct list_head writequeue; /* List of outgoing writequeue_entries */ + struct list_head listenlist; /* List of allocated listening sockets */ + spinlock_t writequeue_lock; + int (*rx_action) (struct connection *); /* What to do when active */ + struct page *rx_page; + struct cbuf cb; + int retries; + atomic_t waiting_requests; +#define MAX_CONNECT_RETRIES 3 + struct connection *othercon; +}; +#define sock2con(x) ((struct connection *)(x)->sk_user_data) + +/* An entry waiting to be sent */ +struct writequeue_entry { + struct list_head list; + struct page *page; + int offset; + int len; + int end; + int users; + struct connection *con; +}; + +static struct sockaddr_storage dlm_local_addr; + +/* Manage daemons */ +static struct task_struct *recv_task; +static struct task_struct *send_task; + +static wait_queue_t lowcomms_send_waitq_head; +static wait_queue_head_t lowcomms_send_waitq; +static wait_queue_t lowcomms_recv_waitq_head; +static wait_queue_head_t lowcomms_recv_waitq; + +/* An array of pointers to connections, indexed by NODEID */ +static struct connection **connections; +static struct semaphore connections_lock; +static kmem_cache_t *con_cache; +static int conn_array_size; +static atomic_t accepting; + +/* List of sockets that have reads pending */ +static struct list_head read_sockets; +static spinlock_t read_sockets_lock; + +/* List of sockets which have writes pending */ +static struct list_head write_sockets; +static spinlock_t write_sockets_lock; + +/* List of sockets which have connects pending */ +static struct list_head state_sockets; +static spinlock_t state_sockets_lock; + +static struct connection *nodeid2con(int nodeid, gfp_t allocation) +{ + struct connection *con = NULL; + + down(&connections_lock); + if (nodeid >= conn_array_size) { + int new_size = nodeid + NODE_INCREMENT; + struct connection **new_conns; + + new_conns = kmalloc(sizeof(struct connection *) * + new_size, allocation); + if (!new_conns) + goto finish; + + memset(new_conns, 0, sizeof(struct connection *) * new_size); + memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size); + conn_array_size = new_size; + kfree(connections); + connections = new_conns; + + } + + con = connections[nodeid]; + if (con == NULL && allocation) { + con = kmem_cache_alloc(con_cache, allocation); + if (!con) + goto finish; + + memset(con, 0, sizeof(*con)); + con->nodeid = nodeid; + init_rwsem(&con->sock_sem); + INIT_LIST_HEAD(&con->writequeue); + spin_lock_init(&con->writequeue_lock); + + connections[nodeid] = con; + } + + finish: + up(&connections_lock); + return con; +} + +/* Data available on socket or listen socket received a connect */ +static void lowcomms_data_ready(struct sock *sk, int count_unused) +{ + struct connection *con = sock2con(sk); + + atomic_inc(&con->waiting_requests); + if (test_and_set_bit(CF_READ_PENDING, &con->flags)) + return; + + spin_lock_bh(&read_sockets_lock); + list_add_tail(&con->read_list, &read_sockets); + spin_unlock_bh(&read_sockets_lock); + + wake_up_interruptible(&lowcomms_recv_waitq); +} + +static void lowcomms_write_space(struct sock *sk) +{ + struct connection *con = sock2con(sk); + + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + return; + + spin_lock_bh(&write_sockets_lock); + list_add_tail(&con->write_list, &write_sockets); + spin_unlock_bh(&write_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); +} + +static inline void lowcomms_connect_sock(struct connection *con) +{ + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) + return; + if (!atomic_read(&accepting)) + return; + + spin_lock_bh(&state_sockets_lock); + list_add_tail(&con->state_list, &state_sockets); + spin_unlock_bh(&state_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); +} + +static void lowcomms_state_change(struct sock *sk) +{ +/* struct connection *con = sock2con(sk); */ + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + lowcomms_write_space(sk); + break; + + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + case TCP_TIME_WAIT: + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: + case TCP_CLOSING: + /* FIXME: I think this causes more trouble than it solves. + lowcomms wil reconnect anyway when there is something to + send. This just attempts reconnection if a node goes down! + */ + /* lowcomms_connect_sock(con); */ + break; + + default: + printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state); + break; + } +} + +/* Make a socket active */ +static int add_sock(struct socket *sock, struct connection *con) +{ + con->sock = sock; + + /* Install a data_ready callback */ + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->sock->sk->sk_write_space = lowcomms_write_space; + con->sock->sk->sk_state_change = lowcomms_state_change; + + return 0; +} + +/* Add the port number to an IP6 or 4 sockaddr and return the address + length */ +static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, + int *addr_len) +{ + saddr->ss_family = dlm_local_addr.ss_family; + if (saddr->ss_family == AF_INET) { + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; + in4_addr->sin_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in); + } + else { + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; + in6_addr->sin6_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in6); + } +} + +/* Close a remote connection and tidy up */ +static void close_connection(struct connection *con, int and_other) +{ + down_write(&con->sock_sem); + + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + if (con->othercon && and_other) { + /* Argh! recursion in kernel code! + Actually, this isn't a list so it + will only re-enter once. + */ + close_connection(con->othercon, FALSE); + } + if (con->rx_page) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + con->retries = 0; + up_write(&con->sock_sem); +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con) +{ + int ret = 0; + struct msghdr msg; + struct iovec iov[2]; + mm_segment_t fs; + unsigned len; + int r; + int call_again_soon = 0; + + down_read(&con->sock_sem); + + if (con->sock == NULL) + goto out; + if (con->rx_page == NULL) { + /* + * This doesn't need to be atomic, but I think it should + * improve performance if it is. + */ + con->rx_page = alloc_page(GFP_ATOMIC); + if (con->rx_page == NULL) + goto out_resched; + CBUF_INIT(&con->cb, PAGE_CACHE_SIZE); + } + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = iov; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = 0; + + /* + * iov[0] is the bit of the circular buffer between the current end + * point (cb.base + cb.len) and the end of the buffer. + */ + iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb); + iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb); + iov[1].iov_len = 0; + + /* + * iov[1] is the bit of the circular buffer between the start of the + * buffer and the start of the currently used section (cb.base) + */ + if (CBUF_DATA(&con->cb) >= con->cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb); + iov[1].iov_len = con->cb.base; + iov[1].iov_base = page_address(con->rx_page); + msg.msg_iovlen = 2; + } + len = iov[0].iov_len + iov[1].iov_len; + + fs = get_fs(); + set_fs(get_ds()); + r = ret = sock_recvmsg(con->sock, &msg, len, + MSG_DONTWAIT | MSG_NOSIGNAL); + set_fs(fs); + + if (ret <= 0) + goto out_close; + if (ret == len) + call_again_soon = 1; + CBUF_ADD(&con->cb, ret); + ret = dlm_process_incoming_buffer(con->nodeid, + page_address(con->rx_page), + con->cb.base, con->cb.len, + PAGE_CACHE_SIZE); + if (ret == -EBADMSG) { + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, " + "iov_len=%u, iov_base[0]=%p, read=%d\n", + page_address(con->rx_page), con->cb.base, con->cb.len, + len, iov[0].iov_base, r); + } + if (ret < 0) + goto out_close; + CBUF_EAT(&con->cb, ret); + + if (CBUF_EMPTY(&con->cb) && !call_again_soon) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + + out: + if (call_again_soon) + goto out_resched; + up_read(&con->sock_sem); + ret = 0; + goto out_ret; + + out_resched: + lowcomms_data_ready(con->sock->sk, 0); + up_read(&con->sock_sem); + ret = 0; + schedule(); + goto out_ret; + + out_close: + up_read(&con->sock_sem); + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) { + close_connection(con, FALSE); + /* Reconnect when there is something to send */ + } + + out_ret: + return ret; +} + +/* Listening socket is busy, accept a connection */ +static int accept_from_sock(struct connection *con) +{ + int result; + struct sockaddr_storage peeraddr; + struct socket *newsock; + int len; + int nodeid; + struct connection *newcon; + + memset(&peeraddr, 0, sizeof(peeraddr)); + result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &newsock); + if (result < 0) + return -ENOMEM; + + down_read(&con->sock_sem); + + result = -ENOTCONN; + if (con->sock == NULL) + goto accept_err; + + newsock->type = con->sock->type; + newsock->ops = con->sock->ops; + + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); + if (result < 0) + goto accept_err; + + /* Get the connected socket's peer */ + memset(&peeraddr, 0, sizeof(peeraddr)); + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, + &len, 2)) { + result = -ECONNABORTED; + goto accept_err; + } + + /* Get the new node's NODEID */ + make_sockaddr(&peeraddr, 0, &len); + if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { + printk("dlm: connect from non cluster node\n"); + sock_release(newsock); + up_read(&con->sock_sem); + return -1; + } + + log_print("got connection from %d", nodeid); + + /* Check to see if we already have a connection to this node. This + * could happen if the two nodes initiate a connection at roughly + * the same time and the connections cross on the wire. + * TEMPORARY FIX: + * In this case we store the incoming one in "othercon" + */ + newcon = nodeid2con(nodeid, GFP_KERNEL); + if (!newcon) { + result = -ENOMEM; + goto accept_err; + } + down_write(&newcon->sock_sem); + if (newcon->sock) { + struct connection *othercon = newcon->othercon; + + if (!othercon) { + othercon = kmem_cache_alloc(con_cache, GFP_KERNEL); + if (!othercon) { + printk("dlm: failed to allocate incoming socket\n"); + up_write(&newcon->sock_sem); + result = -ENOMEM; + goto accept_err; + } + memset(othercon, 0, sizeof(*othercon)); + othercon->nodeid = nodeid; + othercon->rx_action = receive_from_sock; + init_rwsem(&othercon->sock_sem); + set_bit(CF_IS_OTHERCON, &othercon->flags); + newcon->othercon = othercon; + } + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + } + else { + newsock->sk->sk_user_data = newcon; + newcon->rx_action = receive_from_sock; + add_sock(newsock, newcon); + + } + + up_write(&newcon->sock_sem); + + /* + * Add it to the active queue in case we got data + * beween processing the accept adding the socket + * to the read_sockets list + */ + lowcomms_data_ready(newsock->sk, 0); + up_read(&con->sock_sem); + + return 0; + + accept_err: + up_read(&con->sock_sem); + sock_release(newsock); + + if (result != -EAGAIN) + printk("dlm: error accepting connection from node: %d\n", result); + return result; +} + +/* Connect a new socket to its peer */ +static int connect_to_sock(struct connection *con) +{ + int result = -EHOSTUNREACH; + struct sockaddr_storage saddr; + int addr_len; + struct socket *sock; + + if (con->nodeid == 0) { + log_print("attempt to connect sock 0 foiled"); + return 0; + } + + down_write(&con->sock_sem); + if (con->retries++ > MAX_CONNECT_RETRIES) + goto out; + + /* Some odd races can cause double-connects, ignore them */ + if (con->sock) { + result = 0; + goto out; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); + if (result < 0) + goto out_err; + + memset(&saddr, 0, sizeof(saddr)); + if (dlm_nodeid_to_addr(con->nodeid, &saddr)) + goto out_err; + + sock->sk->sk_user_data = con; + con->rx_action = receive_from_sock; + + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len); + + add_sock(sock, con); + + log_print("connecting to %d", con->nodeid); + result = + sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, + O_NONBLOCK); + if (result == -EINPROGRESS) + result = 0; + if (result != 0) + goto out_err; + + out: + up_write(&con->sock_sem); + /* + * Returning an error here means we've given up trying to connect to + * a remote node, otherwise we return 0 and reschedule the connetion + * attempt + */ + return result; + + out_err: + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. + */ + if (result != -EHOSTUNREACH && result != -ENETUNREACH && + result != -ENETDOWN && result != EINVAL + && result != -EPROTONOSUPPORT) { + lowcomms_connect_sock(con); + result = 0; + } + goto out; +} + +static struct socket *create_listen_sock(struct connection *con, struct sockaddr_storage *saddr) +{ + struct socket *sock = NULL; + mm_segment_t fs; + int result = 0; + int one = 1; + int addr_len; + + if (dlm_local_addr.ss_family == AF_INET) + addr_len = sizeof(struct sockaddr_in); + else + addr_len = sizeof(struct sockaddr_in6); + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); + if (result < 0) { + printk("dlm: Can't create listening comms socket\n"); + goto create_out; + } + + fs = get_fs(); + set_fs(get_ds()); + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one)); + set_fs(fs); + if (result < 0) { + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result); + } + sock->sk->sk_user_data = con; + con->rx_action = accept_from_sock; + con->sock = sock; + + /* Bind to our port */ + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); + if (result < 0) { + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port); + sock_release(sock); + sock = NULL; + con->sock = NULL; + goto create_out; + } + + fs = get_fs(); + set_fs(get_ds()); + + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one)); + set_fs(fs); + if (result < 0) { + printk("dlm: Set keepalive failed: %d\n", result); + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port); + sock_release(sock); + sock = NULL; + goto create_out; + } + + create_out: + return sock; +} + + +/* Listen on all interfaces */ +static int listen_for_all(void) +{ + struct socket *sock = NULL; + struct connection *con = nodeid2con(0, GFP_KERNEL); + int result = -EINVAL; + + /* We don't support multi-homed hosts */ + memset(con, 0, sizeof(*con)); + init_rwsem(&con->sock_sem); + spin_lock_init(&con->writequeue_lock); + INIT_LIST_HEAD(&con->writequeue); + set_bit(CF_IS_OTHERCON, &con->flags); + + sock = create_listen_sock(con, &dlm_local_addr); + if (sock) { + add_sock(sock, con); + result = 0; + } + else { + result = -EADDRINUSE; + } + + return result; +} + + + +static struct writequeue_entry *new_writequeue_entry(struct connection *con, + gfp_t allocation) +{ + struct writequeue_entry *entry; + + entry = kmalloc(sizeof(struct writequeue_entry), allocation); + if (!entry) + return NULL; + + entry->page = alloc_page(allocation); + if (!entry->page) { + kfree(entry); + return NULL; + } + + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->users = 0; + entry->con = con; + + return entry; +} + +void *dlm_lowcomms_get_buffer(int nodeid, int len, + gfp_t allocation, char **ppc) +{ + struct connection *con; + struct writequeue_entry *e; + int offset = 0; + int users = 0; + + if (!atomic_read(&accepting)) + return NULL; + + con = nodeid2con(nodeid, allocation); + if (!con) + return NULL; + + spin_lock(&con->writequeue_lock); + e = list_entry(con->writequeue.prev, struct writequeue_entry, list); + if (((struct list_head *) e == &con->writequeue) || + (PAGE_CACHE_SIZE - e->end < len)) { + e = NULL; + } else { + offset = e->end; + e->end += len; + users = e->users++; + } + spin_unlock(&con->writequeue_lock); + + if (e) { + got_one: + if (users == 0) + kmap(e->page); + *ppc = page_address(e->page) + offset; + return e; + } + + e = new_writequeue_entry(con, allocation); + if (e) { + spin_lock(&con->writequeue_lock); + offset = e->end; + e->end += len; + users = e->users++; + list_add_tail(&e->list, &con->writequeue); + spin_unlock(&con->writequeue_lock); + goto got_one; + } + return NULL; +} + +void dlm_lowcomms_commit_buffer(void *mh) +{ + struct writequeue_entry *e = (struct writequeue_entry *)mh; + struct connection *con = e->con; + int users; + + if (!atomic_read(&accepting)) + return; + + spin_lock(&con->writequeue_lock); + users = --e->users; + if (users) + goto out; + e->len = e->end - e->offset; + kunmap(e->page); + spin_unlock(&con->writequeue_lock); + + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) { + spin_lock_bh(&write_sockets_lock); + list_add_tail(&con->write_list, &write_sockets); + spin_unlock_bh(&write_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); + } + return; + + out: + spin_unlock(&con->writequeue_lock); + return; +} + +static void free_entry(struct writequeue_entry *e) +{ + __free_page(e->page); + kfree(e); +} + +/* Send a message */ +static int send_to_sock(struct connection *con) +{ + int ret = 0; + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int); + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + struct writequeue_entry *e; + int len, offset; + + down_read(&con->sock_sem); + if (con->sock == NULL) + goto out_connect; + + sendpage = con->sock->ops->sendpage; + + spin_lock(&con->writequeue_lock); + for (;;) { + e = list_entry(con->writequeue.next, struct writequeue_entry, + list); + if ((struct list_head *) e == &con->writequeue) + break; + + len = e->len; + offset = e->offset; + BUG_ON(len == 0 && e->users == 0); + spin_unlock(&con->writequeue_lock); + + ret = 0; + if (len) { + ret = sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) + goto out; + if (ret <= 0) + goto send_error; + } + else { + /* Don't starve people filling buffers */ + schedule(); + } + + spin_lock(&con->writequeue_lock); + e->offset += ret; + e->len -= ret; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + continue; + } + } + spin_unlock(&con->writequeue_lock); + out: + up_read(&con->sock_sem); + return ret; + + send_error: + up_read(&con->sock_sem); + close_connection(con, FALSE); + lowcomms_connect_sock(con); + return ret; + + out_connect: + up_read(&con->sock_sem); + lowcomms_connect_sock(con); + return 0; +} + +static void clean_one_writequeue(struct connection *con) +{ + struct list_head *list; + struct list_head *temp; + + spin_lock(&con->writequeue_lock); + list_for_each_safe(list, temp, &con->writequeue) { + struct writequeue_entry *e = + list_entry(list, struct writequeue_entry, list); + list_del(&e->list); + free_entry(e); + } + spin_unlock(&con->writequeue_lock); +} + +/* Called from recovery when it knows that a node has + left the cluster */ +int dlm_lowcomms_close(int nodeid) +{ + struct connection *con; + + if (!connections) + goto out; + + log_print("closing connection to node %d", nodeid); + con = nodeid2con(nodeid, 0); + if (con) { + clean_one_writequeue(con); + close_connection(con, TRUE); + atomic_set(&con->waiting_requests, 0); + } + return 0; + + out: + return -1; +} + +/* API send message call, may queue the request */ +/* N.B. This is the old interface - use the new one for new calls */ +int lowcomms_send_message(int nodeid, char *buf, int len, gfp_t allocation) +{ + struct writequeue_entry *e; + char *b; + + e = dlm_lowcomms_get_buffer(nodeid, len, allocation, &b); + if (e) { + memcpy(b, buf, len); + dlm_lowcomms_commit_buffer(e); + return 0; + } + return -ENOBUFS; +} + +/* Look for activity on active sockets */ +static void process_sockets(void) +{ + struct list_head *list; + struct list_head *temp; + int count = 0; + + spin_lock_bh(&read_sockets_lock); + list_for_each_safe(list, temp, &read_sockets) { + + struct connection *con = + list_entry(list, struct connection, read_list); + list_del(&con->read_list); + clear_bit(CF_READ_PENDING, &con->flags); + + spin_unlock_bh(&read_sockets_lock); + + /* This can reach zero if we are processing requests + * as they come in. + */ + if (atomic_read(&con->waiting_requests) == 0) { + spin_lock_bh(&read_sockets_lock); + continue; + } + + do { + con->rx_action(con); + + /* Don't starve out everyone else */ + if (++count >= MAX_RX_MSG_COUNT) { + schedule(); + count = 0; + } + + } while (!atomic_dec_and_test(&con->waiting_requests) && + !kthread_should_stop()); + + spin_lock_bh(&read_sockets_lock); + } + spin_unlock_bh(&read_sockets_lock); +} + +/* Try to send any messages that are pending + */ +static void process_output_queue(void) +{ + struct list_head *list; + struct list_head *temp; + int ret; + + spin_lock_bh(&write_sockets_lock); + list_for_each_safe(list, temp, &write_sockets) { + struct connection *con = + list_entry(list, struct connection, write_list); + clear_bit(CF_WRITE_PENDING, &con->flags); + list_del(&con->write_list); + + spin_unlock_bh(&write_sockets_lock); + + ret = send_to_sock(con); + if (ret < 0) { + } + spin_lock_bh(&write_sockets_lock); + } + spin_unlock_bh(&write_sockets_lock); +} + +static void process_state_queue(void) +{ + struct list_head *list; + struct list_head *temp; + int ret; + + spin_lock_bh(&state_sockets_lock); + list_for_each_safe(list, temp, &state_sockets) { + struct connection *con = + list_entry(list, struct connection, state_list); + list_del(&con->state_list); + clear_bit(CF_CONNECT_PENDING, &con->flags); + spin_unlock_bh(&state_sockets_lock); + + ret = connect_to_sock(con); + if (ret < 0) { + } + spin_lock_bh(&state_sockets_lock); + } + spin_unlock_bh(&state_sockets_lock); +} + + +/* Discard all entries on the write queues */ +static void clean_writequeues(void) +{ + int nodeid; + + for (nodeid = 1; nodeid < conn_array_size; nodeid++) { + struct connection *con = nodeid2con(nodeid, 0); + + if (con) + clean_one_writequeue(con); + } +} + +static int read_list_empty(void) +{ + int status; + + spin_lock_bh(&read_sockets_lock); + status = list_empty(&read_sockets); + spin_unlock_bh(&read_sockets_lock); + + return status; +} + +/* DLM Transport comms receive daemon */ +static int dlm_recvd(void *data) +{ + init_waitqueue_head(&lowcomms_recv_waitq); + init_waitqueue_entry(&lowcomms_recv_waitq_head, current); + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (read_list_empty()) + schedule(); + set_current_state(TASK_RUNNING); + + process_sockets(); + } + + return 0; +} + +static int write_and_state_lists_empty(void) +{ + int status; + + spin_lock_bh(&write_sockets_lock); + status = list_empty(&write_sockets); + spin_unlock_bh(&write_sockets_lock); + + spin_lock_bh(&state_sockets_lock); + if (list_empty(&state_sockets) == 0) + status = 0; + spin_unlock_bh(&state_sockets_lock); + + return status; +} + +/* DLM Transport send daemon */ +static int dlm_sendd(void *data) +{ + init_waitqueue_head(&lowcomms_send_waitq); + init_waitqueue_entry(&lowcomms_send_waitq_head, current); + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (write_and_state_lists_empty()) + schedule(); + set_current_state(TASK_RUNNING); + + process_state_queue(); + process_output_queue(); + } + + return 0; +} + +static void daemons_stop(void) +{ + kthread_stop(recv_task); + kthread_stop(send_task); +} + +static int daemons_start(void) +{ + struct task_struct *p; + int error; + + p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_recvd %d", error); + return error; + } + recv_task = p; + + p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_sendd %d", error); + kthread_stop(recv_task); + return error; + } + send_task = p; + + return 0; +} + +/* + * Return the largest buffer size we can cope with. + */ +int lowcomms_max_buffer_size(void) +{ + return PAGE_CACHE_SIZE; +} + +void dlm_lowcomms_stop(void) +{ + int i; + + atomic_set(&accepting, 0); + + /* Set all the activity flags to prevent any + socket activity. + */ + for (i = 0; i < conn_array_size; i++) { + if (connections[i]) + connections[i]->flags |= 0x7; + } + daemons_stop(); + clean_writequeues(); + + for (i = 0; i < conn_array_size; i++) { + if (connections[i]) { + close_connection(connections[i], TRUE); + if (connections[i]->othercon) + kmem_cache_free(con_cache, connections[i]->othercon); + kmem_cache_free(con_cache, connections[i]); + } + } + + kfree(connections); + connections = NULL; + + kmem_cache_destroy(con_cache); +} + +/* This is quite likely to sleep... */ +int dlm_lowcomms_start(void) +{ + int error = 0; + + error = -ENOTCONN; + + /* + * Temporarily initialise the waitq head so that lowcomms_send_message + * doesn't crash if it gets called before the thread is fully + * initialised + */ + init_waitqueue_head(&lowcomms_send_waitq); + + error = -ENOMEM; + connections = kmalloc(sizeof(struct connection *) * + NODE_INCREMENT, GFP_KERNEL); + if (!connections) + goto out; + + memset(connections, 0, + sizeof(struct connection *) * NODE_INCREMENT); + + conn_array_size = NODE_INCREMENT; + + if (dlm_our_addr(&dlm_local_addr, 0)) { + log_print("no local IP address has been set"); + goto fail_free_conn; + } + if (!dlm_our_addr(&dlm_local_addr, 1)) { + log_print("This dlm comms module does not support multi-homed clustering"); + goto fail_free_conn; + } + + con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), + __alignof__(struct connection), 0, NULL, NULL); + if (!con_cache) + goto fail_free_conn; + + + /* Start listening */ + error = listen_for_all(); + if (error) + goto fail_unlisten; + + error = daemons_start(); + if (error) + goto fail_unlisten; + + atomic_set(&accepting, 1); + + return 0; + + fail_unlisten: + close_connection(connections[0], 0); + kmem_cache_free(con_cache, connections[0]); + kmem_cache_destroy(con_cache); + + fail_free_conn: + kfree(connections); + + out: + return error; +} + +int dlm_lowcomms_init(void) +{ + INIT_LIST_HEAD(&read_sockets); + INIT_LIST_HEAD(&write_sockets); + INIT_LIST_HEAD(&state_sockets); + + spin_lock_init(&read_sockets_lock); + spin_lock_init(&write_sockets_lock); + spin_lock_init(&state_sockets_lock); + init_MUTEX(&connections_lock); + + return 0; +} + +void dlm_lowcomms_exit(void) +{ +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c deleted file mode 100644 index 6da6b14..0000000 --- a/fs/dlm/lowcomms.c +++ /dev/null @@ -1,1239 +0,0 @@ -/****************************************************************************** -******************************************************************************* -** -** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. -** -** This copyrighted material is made available to anyone wishing to use, -** modify, copy, or redistribute it subject to the terms and conditions -** of the GNU General Public License v.2. -** -******************************************************************************* -******************************************************************************/ - -/* - * lowcomms.c - * - * This is the "low-level" comms layer. - * - * It is responsible for sending/receiving messages - * from other nodes in the cluster. - * - * Cluster nodes are referred to by their nodeids. nodeids are - * simply 32 bit numbers to the locking module - if they need to - * be expanded for the cluster infrastructure then that is it's - * responsibility. It is this layer's - * responsibility to resolve these into IP address or - * whatever it needs for inter-node communication. - * - * The comms level is two kernel threads that deal mainly with - * the receiving of messages from other nodes and passing them - * up to the mid-level comms layer (which understands the - * message format) for execution by the locking core, and - * a send thread which does all the setting up of connections - * to remote nodes and the sending of data. Threads are not allowed - * to send their own data because it may cause them to wait in times - * of high load. Also, this way, the sending thread can collect together - * messages bound for one node and send them in one block. - * - * I don't see any problem with the recv thread executing the locking - * code on behalf of remote processes as the locking code is - * short, efficient and never (well, hardly ever) waits. - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "dlm_internal.h" -#include "lowcomms.h" -#include "config.h" -#include "midcomms.h" - -static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; -static int dlm_local_count; -static int dlm_local_nodeid; - -/* One of these per connected node */ - -#define NI_INIT_PENDING 1 -#define NI_WRITE_PENDING 2 - -struct nodeinfo { - spinlock_t lock; - sctp_assoc_t assoc_id; - unsigned long flags; - struct list_head write_list; /* nodes with pending writes */ - struct list_head writequeue; /* outgoing writequeue_entries */ - spinlock_t writequeue_lock; - int nodeid; -}; - -static DEFINE_IDR(nodeinfo_idr); -static struct rw_semaphore nodeinfo_lock; -static int max_nodeid; - -struct cbuf { - unsigned base; - unsigned len; - unsigned mask; -}; - -/* Just the one of these, now. But this struct keeps - the connection-specific variables together */ - -#define CF_READ_PENDING 1 - -struct connection { - struct socket *sock; - unsigned long flags; - struct page *rx_page; - atomic_t waiting_requests; - struct cbuf cb; - int eagain_flag; -}; - -/* An entry waiting to be sent */ - -struct writequeue_entry { - struct list_head list; - struct page *page; - int offset; - int len; - int end; - int users; - struct nodeinfo *ni; -}; - -#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) -#define CBUF_EMPTY(cb) ((cb)->len == 0) -#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) -#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) - -#define CBUF_INIT(cb, size) \ -do { \ - (cb)->base = (cb)->len = 0; \ - (cb)->mask = ((size)-1); \ -} while(0) - -#define CBUF_EAT(cb, n) \ -do { \ - (cb)->len -= (n); \ - (cb)->base += (n); \ - (cb)->base &= (cb)->mask; \ -} while(0) - - -/* List of nodes which have writes pending */ -static struct list_head write_nodes; -static spinlock_t write_nodes_lock; - -/* Maximum number of incoming messages to process before - * doing a schedule() - */ -#define MAX_RX_MSG_COUNT 25 - -/* Manage daemons */ -static struct task_struct *recv_task; -static struct task_struct *send_task; -static wait_queue_head_t lowcomms_recv_wait; -static atomic_t accepting; - -/* The SCTP connection */ -static struct connection sctp_con; - - -static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) -{ - struct sockaddr_storage addr; - int error; - - if (!dlm_local_count) - return -1; - - error = dlm_nodeid_to_addr(nodeid, &addr); - if (error) - return error; - - if (dlm_local_addr[0]->ss_family == AF_INET) { - struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; - struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr; - ret4->sin_addr.s_addr = in4->sin_addr.s_addr; - } else { - struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; - struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; - memcpy(&ret6->sin6_addr, &in6->sin6_addr, - sizeof(in6->sin6_addr)); - } - - return 0; -} - -static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc) -{ - struct nodeinfo *ni; - int r; - int n; - - down_read(&nodeinfo_lock); - ni = idr_find(&nodeinfo_idr, nodeid); - up_read(&nodeinfo_lock); - - if (!ni && alloc) { - down_write(&nodeinfo_lock); - - ni = idr_find(&nodeinfo_idr, nodeid); - if (ni) - goto out_up; - - r = idr_pre_get(&nodeinfo_idr, alloc); - if (!r) - goto out_up; - - ni = kmalloc(sizeof(struct nodeinfo), alloc); - if (!ni) - goto out_up; - - r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n); - if (r) { - kfree(ni); - ni = NULL; - goto out_up; - } - if (n != nodeid) { - idr_remove(&nodeinfo_idr, n); - kfree(ni); - ni = NULL; - goto out_up; - } - memset(ni, 0, sizeof(struct nodeinfo)); - spin_lock_init(&ni->lock); - INIT_LIST_HEAD(&ni->writequeue); - spin_lock_init(&ni->writequeue_lock); - ni->nodeid = nodeid; - - if (nodeid > max_nodeid) - max_nodeid = nodeid; - out_up: - up_write(&nodeinfo_lock); - } - - return ni; -} - -/* Don't call this too often... */ -static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc) -{ - int i; - struct nodeinfo *ni; - - for (i=1; i<=max_nodeid; i++) { - ni = nodeid2nodeinfo(i, 0); - if (ni && ni->assoc_id == assoc) - return ni; - } - return NULL; -} - -/* Data or notification available on socket */ -static void lowcomms_data_ready(struct sock *sk, int count_unused) -{ - atomic_inc(&sctp_con.waiting_requests); - if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags)) - return; - - wake_up_interruptible(&lowcomms_recv_wait); -} - - -/* Add the port number to an IP6 or 4 sockaddr and return the address length. - Also padd out the struct with zeros to make comparisons meaningful */ - -static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, - int *addr_len) -{ - struct sockaddr_in *local4_addr; - struct sockaddr_in6 *local6_addr; - - if (!dlm_local_count) - return; - - if (!port) { - if (dlm_local_addr[0]->ss_family == AF_INET) { - local4_addr = (struct sockaddr_in *)dlm_local_addr[0]; - port = be16_to_cpu(local4_addr->sin_port); - } else { - local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0]; - port = be16_to_cpu(local6_addr->sin6_port); - } - } - - saddr->ss_family = dlm_local_addr[0]->ss_family; - if (dlm_local_addr[0]->ss_family == AF_INET) { - struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; - in4_addr->sin_port = cpu_to_be16(port); - memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); - memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) - - sizeof(struct sockaddr_in)); - *addr_len = sizeof(struct sockaddr_in); - } else { - struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; - in6_addr->sin6_port = cpu_to_be16(port); - memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) - - sizeof(struct sockaddr_in6)); - *addr_len = sizeof(struct sockaddr_in6); - } -} - -/* Close the connection and tidy up */ -static void close_connection(void) -{ - if (sctp_con.sock) { - sock_release(sctp_con.sock); - sctp_con.sock = NULL; - } - - if (sctp_con.rx_page) { - __free_page(sctp_con.rx_page); - sctp_con.rx_page = NULL; - } -} - -/* We only send shutdown messages to nodes that are not part of the cluster */ -static void send_shutdown(sctp_assoc_t associd) -{ - static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; - struct msghdr outmessage; - struct cmsghdr *cmsg; - struct sctp_sndrcvinfo *sinfo; - int ret; - - outmessage.msg_name = NULL; - outmessage.msg_namelen = 0; - outmessage.msg_control = outcmsg; - outmessage.msg_controllen = sizeof(outcmsg); - outmessage.msg_flags = MSG_EOR; - - cmsg = CMSG_FIRSTHDR(&outmessage); - cmsg->cmsg_level = IPPROTO_SCTP; - cmsg->cmsg_type = SCTP_SNDRCV; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - outmessage.msg_controllen = cmsg->cmsg_len; - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); - memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - - sinfo->sinfo_flags |= MSG_EOF; - sinfo->sinfo_assoc_id = associd; - - ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0); - - if (ret != 0) - log_print("send EOF to node failed: %d", ret); -} - - -/* INIT failed but we don't know which node... - restart INIT on all pending nodes */ -static void init_failed(void) -{ - int i; - struct nodeinfo *ni; - - for (i=1; i<=max_nodeid; i++) { - ni = nodeid2nodeinfo(i, 0); - if (!ni) - continue; - - if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) { - ni->assoc_id = 0; - if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { - spin_lock_bh(&write_nodes_lock); - list_add_tail(&ni->write_list, &write_nodes); - spin_unlock_bh(&write_nodes_lock); - } - } - } - wake_up_process(send_task); -} - -/* Something happened to an association */ -static void process_sctp_notification(struct msghdr *msg, char *buf) -{ - union sctp_notification *sn = (union sctp_notification *)buf; - - if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { - switch (sn->sn_assoc_change.sac_state) { - - case SCTP_COMM_UP: - case SCTP_RESTART: - { - /* Check that the new node is in the lockspace */ - struct sctp_prim prim; - mm_segment_t fs; - int nodeid; - int prim_len, ret; - int addr_len; - struct nodeinfo *ni; - - /* This seems to happen when we received a connection - * too early... or something... anyway, it happens but - * we always seem to get a real message too, see - * receive_from_sock */ - - if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { - log_print("COMM_UP for invalid assoc ID %d", - (int)sn->sn_assoc_change.sac_assoc_id); - init_failed(); - return; - } - memset(&prim, 0, sizeof(struct sctp_prim)); - prim_len = sizeof(struct sctp_prim); - prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id; - - fs = get_fs(); - set_fs(get_ds()); - ret = sctp_con.sock->ops->getsockopt(sctp_con.sock, - IPPROTO_SCTP, SCTP_PRIMARY_ADDR, - (char*)&prim, &prim_len); - set_fs(fs); - if (ret < 0) { - struct nodeinfo *ni; - - log_print("getsockopt/sctp_primary_addr on " - "new assoc %d failed : %d", - (int)sn->sn_assoc_change.sac_assoc_id, ret); - - /* Retry INIT later */ - ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id); - if (ni) - clear_bit(NI_INIT_PENDING, &ni->flags); - return; - } - make_sockaddr(&prim.ssp_addr, 0, &addr_len); - if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) { - log_print("reject connect from unknown addr"); - send_shutdown(prim.ssp_assoc_id); - return; - } - - ni = nodeid2nodeinfo(nodeid, GFP_KERNEL); - if (!ni) - return; - - /* Save the assoc ID */ - spin_lock(&ni->lock); - ni->assoc_id = sn->sn_assoc_change.sac_assoc_id; - spin_unlock(&ni->lock); - - log_print("got new/restarted association %d nodeid %d", - (int)sn->sn_assoc_change.sac_assoc_id, nodeid); - - /* Send any pending writes */ - clear_bit(NI_INIT_PENDING, &ni->flags); - if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { - spin_lock_bh(&write_nodes_lock); - list_add_tail(&ni->write_list, &write_nodes); - spin_unlock_bh(&write_nodes_lock); - } - wake_up_process(send_task); - } - break; - - case SCTP_COMM_LOST: - case SCTP_SHUTDOWN_COMP: - { - struct nodeinfo *ni; - - ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id); - if (ni) { - spin_lock(&ni->lock); - ni->assoc_id = 0; - spin_unlock(&ni->lock); - } - } - break; - - /* We don't know which INIT failed, so clear the PENDING flags - * on them all. if assoc_id is zero then it will then try - * again */ - - case SCTP_CANT_STR_ASSOC: - { - log_print("Can't start SCTP association - retrying"); - init_failed(); - } - break; - - default: - log_print("unexpected SCTP assoc change id=%d state=%d", - (int)sn->sn_assoc_change.sac_assoc_id, - sn->sn_assoc_change.sac_state); - } - } -} - -/* Data received from remote end */ -static int receive_from_sock(void) -{ - int ret = 0; - struct msghdr msg; - struct kvec iov[2]; - unsigned len; - int r; - struct sctp_sndrcvinfo *sinfo; - struct cmsghdr *cmsg; - struct nodeinfo *ni; - - /* These two are marginally too big for stack allocation, but this - * function is (currently) only called by dlm_recvd so static should be - * OK. - */ - static struct sockaddr_storage msgname; - static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; - - if (sctp_con.sock == NULL) - goto out; - - if (sctp_con.rx_page == NULL) { - /* - * This doesn't need to be atomic, but I think it should - * improve performance if it is. - */ - sctp_con.rx_page = alloc_page(GFP_ATOMIC); - if (sctp_con.rx_page == NULL) - goto out_resched; - CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE); - } - - memset(&incmsg, 0, sizeof(incmsg)); - memset(&msgname, 0, sizeof(msgname)); - - memset(incmsg, 0, sizeof(incmsg)); - msg.msg_name = &msgname; - msg.msg_namelen = sizeof(msgname); - msg.msg_flags = 0; - msg.msg_control = incmsg; - msg.msg_controllen = sizeof(incmsg); - msg.msg_iovlen = 1; - - /* I don't see why this circular buffer stuff is necessary for SCTP - * which is a packet-based protocol, but the whole thing breaks under - * load without it! The overhead is minimal (and is in the TCP lowcomms - * anyway, of course) so I'll leave it in until I can figure out what's - * really happening. - */ - - /* - * iov[0] is the bit of the circular buffer between the current end - * point (cb.base + cb.len) and the end of the buffer. - */ - iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb); - iov[0].iov_base = page_address(sctp_con.rx_page) + - CBUF_DATA(&sctp_con.cb); - iov[1].iov_len = 0; - - /* - * iov[1] is the bit of the circular buffer between the start of the - * buffer and the start of the currently used section (cb.base) - */ - if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) { - iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb); - iov[1].iov_len = sctp_con.cb.base; - iov[1].iov_base = page_address(sctp_con.rx_page); - msg.msg_iovlen = 2; - } - len = iov[0].iov_len + iov[1].iov_len; - - r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len, - MSG_NOSIGNAL | MSG_DONTWAIT); - if (ret <= 0) - goto out_close; - - msg.msg_control = incmsg; - msg.msg_controllen = sizeof(incmsg); - cmsg = CMSG_FIRSTHDR(&msg); - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); - - if (msg.msg_flags & MSG_NOTIFICATION) { - process_sctp_notification(&msg, page_address(sctp_con.rx_page)); - return 0; - } - - /* Is this a new association ? */ - ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL); - if (ni) { - ni->assoc_id = sinfo->sinfo_assoc_id; - if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) { - - if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { - spin_lock_bh(&write_nodes_lock); - list_add_tail(&ni->write_list, &write_nodes); - spin_unlock_bh(&write_nodes_lock); - } - wake_up_process(send_task); - } - } - - /* INIT sends a message with length of 1 - ignore it */ - if (r == 1) - return 0; - - CBUF_ADD(&sctp_con.cb, ret); - ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid), - page_address(sctp_con.rx_page), - sctp_con.cb.base, sctp_con.cb.len, - PAGE_CACHE_SIZE); - if (ret < 0) - goto out_close; - CBUF_EAT(&sctp_con.cb, ret); - - out: - ret = 0; - goto out_ret; - - out_resched: - lowcomms_data_ready(sctp_con.sock->sk, 0); - ret = 0; - schedule(); - goto out_ret; - - out_close: - if (ret != -EAGAIN) - log_print("error reading from sctp socket: %d", ret); - out_ret: - return ret; -} - -/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */ -static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num) -{ - mm_segment_t fs; - int result = 0; - - fs = get_fs(); - set_fs(get_ds()); - if (num == 1) - result = sctp_con.sock->ops->bind(sctp_con.sock, - (struct sockaddr *) addr, addr_len); - else - result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP, - SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len); - set_fs(fs); - - if (result < 0) - log_print("Can't bind to port %d addr number %d", - dlm_config.tcp_port, num); - - return result; -} - -static void init_local(void) -{ - struct sockaddr_storage sas, *addr; - int i; - - dlm_local_nodeid = dlm_our_nodeid(); - - for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) { - if (dlm_our_addr(&sas, i)) - break; - - addr = kmalloc(sizeof(*addr), GFP_KERNEL); - if (!addr) - break; - memcpy(addr, &sas, sizeof(*addr)); - dlm_local_addr[dlm_local_count++] = addr; - } -} - -/* Initialise SCTP socket and bind to all interfaces */ -static int init_sock(void) -{ - mm_segment_t fs; - struct socket *sock = NULL; - struct sockaddr_storage localaddr; - struct sctp_event_subscribe subscribe; - int result = -EINVAL, num = 1, i, addr_len; - - if (!dlm_local_count) { - init_local(); - if (!dlm_local_count) { - log_print("no local IP address has been set"); - goto out; - } - } - - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET, - IPPROTO_SCTP, &sock); - if (result < 0) { - log_print("Can't create comms socket, check SCTP is loaded"); - goto out; - } - - /* Listen for events */ - memset(&subscribe, 0, sizeof(subscribe)); - subscribe.sctp_data_io_event = 1; - subscribe.sctp_association_event = 1; - subscribe.sctp_send_failure_event = 1; - subscribe.sctp_shutdown_event = 1; - subscribe.sctp_partial_delivery_event = 1; - - fs = get_fs(); - set_fs(get_ds()); - result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS, - (char *)&subscribe, sizeof(subscribe)); - set_fs(fs); - - if (result < 0) { - log_print("Failed to set SCTP_EVENTS on socket: result=%d", - result); - goto create_delsock; - } - - /* Init con struct */ - sock->sk->sk_user_data = &sctp_con; - sctp_con.sock = sock; - sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready; - - /* Bind to all interfaces. */ - for (i = 0; i < dlm_local_count; i++) { - memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); - make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len); - - result = add_bind_addr(&localaddr, addr_len, num); - if (result) - goto create_delsock; - ++num; - } - - result = sock->ops->listen(sock, 5); - if (result < 0) { - log_print("Can't set socket listening"); - goto create_delsock; - } - - return 0; - - create_delsock: - sock_release(sock); - sctp_con.sock = NULL; - out: - return result; -} - - -static struct writequeue_entry *new_writequeue_entry(gfp_t allocation) -{ - struct writequeue_entry *entry; - - entry = kmalloc(sizeof(struct writequeue_entry), allocation); - if (!entry) - return NULL; - - entry->page = alloc_page(allocation); - if (!entry->page) { - kfree(entry); - return NULL; - } - - entry->offset = 0; - entry->len = 0; - entry->end = 0; - entry->users = 0; - - return entry; -} - -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) -{ - struct writequeue_entry *e; - int offset = 0; - int users = 0; - struct nodeinfo *ni; - - if (!atomic_read(&accepting)) - return NULL; - - ni = nodeid2nodeinfo(nodeid, allocation); - if (!ni) - return NULL; - - spin_lock(&ni->writequeue_lock); - e = list_entry(ni->writequeue.prev, struct writequeue_entry, list); - if (((struct list_head *) e == &ni->writequeue) || - (PAGE_CACHE_SIZE - e->end < len)) { - e = NULL; - } else { - offset = e->end; - e->end += len; - users = e->users++; - } - spin_unlock(&ni->writequeue_lock); - - if (e) { - got_one: - if (users == 0) - kmap(e->page); - *ppc = page_address(e->page) + offset; - return e; - } - - e = new_writequeue_entry(allocation); - if (e) { - spin_lock(&ni->writequeue_lock); - offset = e->end; - e->end += len; - e->ni = ni; - users = e->users++; - list_add_tail(&e->list, &ni->writequeue); - spin_unlock(&ni->writequeue_lock); - goto got_one; - } - return NULL; -} - -void dlm_lowcomms_commit_buffer(void *arg) -{ - struct writequeue_entry *e = (struct writequeue_entry *) arg; - int users; - struct nodeinfo *ni = e->ni; - - if (!atomic_read(&accepting)) - return; - - spin_lock(&ni->writequeue_lock); - users = --e->users; - if (users) - goto out; - e->len = e->end - e->offset; - kunmap(e->page); - spin_unlock(&ni->writequeue_lock); - - if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { - spin_lock_bh(&write_nodes_lock); - list_add_tail(&ni->write_list, &write_nodes); - spin_unlock_bh(&write_nodes_lock); - wake_up_process(send_task); - } - return; - - out: - spin_unlock(&ni->writequeue_lock); - return; -} - -static void free_entry(struct writequeue_entry *e) -{ - __free_page(e->page); - kfree(e); -} - -/* Initiate an SCTP association. In theory we could just use sendmsg() on - the first IP address and it should work, but this allows us to set up the - association before sending any valuable data that we can't afford to lose. - It also keeps the send path clean as it can now always use the association ID */ -static void initiate_association(int nodeid) -{ - struct sockaddr_storage rem_addr; - static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; - struct msghdr outmessage; - struct cmsghdr *cmsg; - struct sctp_sndrcvinfo *sinfo; - int ret; - int addrlen; - char buf[1]; - struct kvec iov[1]; - struct nodeinfo *ni; - - log_print("Initiating association with node %d", nodeid); - - ni = nodeid2nodeinfo(nodeid, GFP_KERNEL); - if (!ni) - return; - - if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) { - log_print("no address for nodeid %d", nodeid); - return; - } - - make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen); - - outmessage.msg_name = &rem_addr; - outmessage.msg_namelen = addrlen; - outmessage.msg_control = outcmsg; - outmessage.msg_controllen = sizeof(outcmsg); - outmessage.msg_flags = MSG_EOR; - - iov[0].iov_base = buf; - iov[0].iov_len = 1; - - /* Real INIT messages seem to cause trouble. Just send a 1 byte message - we can afford to lose */ - cmsg = CMSG_FIRSTHDR(&outmessage); - cmsg->cmsg_level = IPPROTO_SCTP; - cmsg->cmsg_type = SCTP_SNDRCV; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); - memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid); - - outmessage.msg_controllen = cmsg->cmsg_len; - ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1); - if (ret < 0) { - log_print("send INIT to node failed: %d", ret); - /* Try again later */ - clear_bit(NI_INIT_PENDING, &ni->flags); - } -} - -/* Send a message */ -static int send_to_sock(struct nodeinfo *ni) -{ - int ret = 0; - struct writequeue_entry *e; - int len, offset; - struct msghdr outmsg; - static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; - struct cmsghdr *cmsg; - struct sctp_sndrcvinfo *sinfo; - struct kvec iov; - - /* See if we need to init an association before we start - sending precious messages */ - spin_lock(&ni->lock); - if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { - spin_unlock(&ni->lock); - initiate_association(ni->nodeid); - return 0; - } - spin_unlock(&ni->lock); - - outmsg.msg_name = NULL; /* We use assoc_id */ - outmsg.msg_namelen = 0; - outmsg.msg_control = outcmsg; - outmsg.msg_controllen = sizeof(outcmsg); - outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR; - - cmsg = CMSG_FIRSTHDR(&outmsg); - cmsg->cmsg_level = IPPROTO_SCTP; - cmsg->cmsg_type = SCTP_SNDRCV; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); - memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid); - sinfo->sinfo_assoc_id = ni->assoc_id; - outmsg.msg_controllen = cmsg->cmsg_len; - - spin_lock(&ni->writequeue_lock); - for (;;) { - if (list_empty(&ni->writequeue)) - break; - e = list_entry(ni->writequeue.next, struct writequeue_entry, - list); - len = e->len; - offset = e->offset; - BUG_ON(len == 0 && e->users == 0); - spin_unlock(&ni->writequeue_lock); - kmap(e->page); - - ret = 0; - if (len) { - iov.iov_base = page_address(e->page)+offset; - iov.iov_len = len; - - ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1, - len); - if (ret == -EAGAIN) { - sctp_con.eagain_flag = 1; - goto out; - } else if (ret < 0) - goto send_error; - } else { - /* Don't starve people filling buffers */ - schedule(); - } - - spin_lock(&ni->writequeue_lock); - e->offset += ret; - e->len -= ret; - - if (e->len == 0 && e->users == 0) { - list_del(&e->list); - free_entry(e); - continue; - } - } - spin_unlock(&ni->writequeue_lock); - out: - return ret; - - send_error: - log_print("Error sending to node %d %d", ni->nodeid, ret); - spin_lock(&ni->lock); - if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { - ni->assoc_id = 0; - spin_unlock(&ni->lock); - initiate_association(ni->nodeid); - } else - spin_unlock(&ni->lock); - - return ret; -} - -/* Try to send any messages that are pending */ -static void process_output_queue(void) -{ - struct list_head *list; - struct list_head *temp; - - spin_lock_bh(&write_nodes_lock); - list_for_each_safe(list, temp, &write_nodes) { - struct nodeinfo *ni = - list_entry(list, struct nodeinfo, write_list); - clear_bit(NI_WRITE_PENDING, &ni->flags); - list_del(&ni->write_list); - - spin_unlock_bh(&write_nodes_lock); - - send_to_sock(ni); - spin_lock_bh(&write_nodes_lock); - } - spin_unlock_bh(&write_nodes_lock); -} - -/* Called after we've had -EAGAIN and been woken up */ -static void refill_write_queue(void) -{ - int i; - - for (i=1; i<=max_nodeid; i++) { - struct nodeinfo *ni = nodeid2nodeinfo(i, 0); - - if (ni) { - if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { - spin_lock_bh(&write_nodes_lock); - list_add_tail(&ni->write_list, &write_nodes); - spin_unlock_bh(&write_nodes_lock); - } - } - } -} - -static void clean_one_writequeue(struct nodeinfo *ni) -{ - struct list_head *list; - struct list_head *temp; - - spin_lock(&ni->writequeue_lock); - list_for_each_safe(list, temp, &ni->writequeue) { - struct writequeue_entry *e = - list_entry(list, struct writequeue_entry, list); - list_del(&e->list); - free_entry(e); - } - spin_unlock(&ni->writequeue_lock); -} - -static void clean_writequeues(void) -{ - int i; - - for (i=1; i<=max_nodeid; i++) { - struct nodeinfo *ni = nodeid2nodeinfo(i, 0); - if (ni) - clean_one_writequeue(ni); - } -} - - -static void dealloc_nodeinfo(void) -{ - int i; - - for (i=1; i<=max_nodeid; i++) { - struct nodeinfo *ni = nodeid2nodeinfo(i, 0); - if (ni) { - idr_remove(&nodeinfo_idr, i); - kfree(ni); - } - } -} - -int dlm_lowcomms_close(int nodeid) -{ - struct nodeinfo *ni; - - ni = nodeid2nodeinfo(nodeid, 0); - if (!ni) - return -1; - - spin_lock(&ni->lock); - if (ni->assoc_id) { - ni->assoc_id = 0; - /* Don't send shutdown here, sctp will just queue it - till the node comes back up! */ - } - spin_unlock(&ni->lock); - - clean_one_writequeue(ni); - clear_bit(NI_INIT_PENDING, &ni->flags); - return 0; -} - -static int write_list_empty(void) -{ - int status; - - spin_lock_bh(&write_nodes_lock); - status = list_empty(&write_nodes); - spin_unlock_bh(&write_nodes_lock); - - return status; -} - -static int dlm_recvd(void *data) -{ - DECLARE_WAITQUEUE(wait, current); - - while (!kthread_should_stop()) { - int count = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&lowcomms_recv_wait, &wait); - if (!test_bit(CF_READ_PENDING, &sctp_con.flags)) - schedule(); - remove_wait_queue(&lowcomms_recv_wait, &wait); - set_current_state(TASK_RUNNING); - - if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) { - int ret; - - do { - ret = receive_from_sock(); - - /* Don't starve out everyone else */ - if (++count >= MAX_RX_MSG_COUNT) { - schedule(); - count = 0; - } - } while (!kthread_should_stop() && ret >=0); - } - schedule(); - } - - return 0; -} - -static int dlm_sendd(void *data) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (write_list_empty()) - schedule(); - set_current_state(TASK_RUNNING); - - if (sctp_con.eagain_flag) { - sctp_con.eagain_flag = 0; - refill_write_queue(); - } - process_output_queue(); - } - - remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); - - return 0; -} - -static void daemons_stop(void) -{ - kthread_stop(recv_task); - kthread_stop(send_task); -} - -static int daemons_start(void) -{ - struct task_struct *p; - int error; - - p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); - error = IS_ERR(p); - if (error) { - log_print("can't start dlm_recvd %d", error); - return error; - } - recv_task = p; - - p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); - error = IS_ERR(p); - if (error) { - log_print("can't start dlm_sendd %d", error); - kthread_stop(recv_task); - return error; - } - send_task = p; - - return 0; -} - -/* - * This is quite likely to sleep... - */ -int dlm_lowcomms_start(void) -{ - int error; - - error = init_sock(); - if (error) - goto fail_sock; - error = daemons_start(); - if (error) - goto fail_sock; - atomic_set(&accepting, 1); - return 0; - - fail_sock: - close_connection(); - return error; -} - -/* Set all the activity flags to prevent any socket activity. */ - -void dlm_lowcomms_stop(void) -{ - atomic_set(&accepting, 0); - sctp_con.flags = 0x7; - daemons_stop(); - clean_writequeues(); - close_connection(); - dealloc_nodeinfo(); - max_nodeid = 0; -} - -int dlm_lowcomms_init(void) -{ - init_waitqueue_head(&lowcomms_recv_wait); - spin_lock_init(&write_nodes_lock); - INIT_LIST_HEAD(&write_nodes); - init_rwsem(&nodeinfo_lock); - return 0; -} - -void dlm_lowcomms_exit(void) -{ - int i; - - for (i = 0; i < dlm_local_count; i++) - kfree(dlm_local_addr[i]); - dlm_local_count = 0; - dlm_local_nodeid = 0; -} - -- cgit v1.1 From 520698096436f7da5b9142e63e3bed5580c5f14e Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 2 Nov 2006 09:49:02 -0600 Subject: [DLM] res_recover_locks_count not reset when recover_locks is aborted Red Hat BZ 213684 If a node sends an lkb to the new master (RCOM_LOCK message) during recovery and recovery is then aborted on both nodes before it gets a reply, the res_recover_locks_count needs to be reset to 0 so that when the subsequent recovery comes along and sends the lkb to the new master again the assertion doesn't trigger that checks that counter is zero. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/recover.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/dlm') diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index a5e6d18..cf9f683 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -252,6 +252,7 @@ static void recover_list_clear(struct dlm_ls *ls) spin_lock(&ls->ls_recover_list_lock); list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) { list_del_init(&r->res_recover_list); + r->res_recover_locks_count = 0; dlm_put_rsb(r); ls->ls_recover_list_count--; } -- cgit v1.1 From 435618b75b82b5ee511cc01fcdda9c44adb2f4bd Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 2 Nov 2006 09:45:56 -0600 Subject: [DLM] status messages ping-pong between unmounted nodes Red Hat BZ 213682 If two nodes leave the lockspace (while unmounting the fs in the case of gfs) after one has sent a STATUS message to the other, STATUS/STATUS_REPLY messages will then ping-pong between the nodes when neither of them can find the lockspace in question any longer. We kill this by not sending another STATUS message when we get a STATUS_REPLY for an unknown lockspace. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/rcom.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 518239a..87b12f7 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -412,9 +412,10 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid) ls = dlm_find_lockspace_global(hd->h_lockspace); if (!ls) { - log_print("lockspace %x from %d not found", - hd->h_lockspace, nodeid); - send_ls_not_ready(nodeid, rc); + log_print("lockspace %x from %d type %x not found", + hd->h_lockspace, nodeid, rc->rc_type); + if (rc->rc_type == DLM_RCOM_STATUS) + send_ls_not_ready(nodeid, rc); return; } -- cgit v1.1 From d4400156d415540086c34a06e5d233122d6bf56a Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 31 Oct 2006 11:55:56 -0600 Subject: [DLM] fix requestqueue race Red Hat BZ 211914 There's a race between dlm_recoverd (1) enabling locking and (2) clearing out the requestqueue, and dlm_recvd (1) checking if locking is enabled and (2) adding a message to the requestqueue. An order of recoverd(1), recvd(1), recvd(2), recoverd(2) will result in a message being left on the requestqueue. The fix is to have dlm_recvd check if dlm_recoverd has enabled locking after taking the mutex for the requestqueue and if it has processing the message instead of queueing it. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/lock.c | 15 +++++++++++---- fs/dlm/requestqueue.c | 21 +++++++++++++++++---- fs/dlm/requestqueue.h | 2 +- 3 files changed, 29 insertions(+), 9 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 3f2befa..6088a16 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3028,10 +3028,17 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery) while (1) { if (dlm_locking_stopped(ls)) { - if (!recovery) - dlm_add_requestqueue(ls, nodeid, hd); - error = -EINTR; - goto out; + if (recovery) { + error = -EINTR; + goto out; + } + error = dlm_add_requestqueue(ls, nodeid, hd); + if (error == -EAGAIN) + continue; + else { + error = -EINTR; + goto out; + } } if (lock_recovery_try(ls)) diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 7b2b089..0226d2a 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -30,26 +30,39 @@ struct rq_entry { * lockspace is enabled on some while still suspended on others. */ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) +int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) { struct rq_entry *e; int length = hd->h_length; + int rv = 0; if (dlm_is_removed(ls, nodeid)) - return; + return 0; e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); if (!e) { log_print("dlm_add_requestqueue: out of memory\n"); - return; + return 0; } e->nodeid = nodeid; memcpy(e->request, hd, length); + /* We need to check dlm_locking_stopped() after taking the mutex to + avoid a race where dlm_recoverd enables locking and runs + process_requestqueue between our earlier dlm_locking_stopped check + and this addition to the requestqueue. */ + mutex_lock(&ls->ls_requestqueue_mutex); - list_add_tail(&e->list, &ls->ls_requestqueue); + if (dlm_locking_stopped(ls)) + list_add_tail(&e->list, &ls->ls_requestqueue); + else { + log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid); + kfree(e); + rv = -EAGAIN; + } mutex_unlock(&ls->ls_requestqueue_mutex); + return rv; } int dlm_process_requestqueue(struct dlm_ls *ls) diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h index 349f0d2..6a53ea0 100644 --- a/fs/dlm/requestqueue.h +++ b/fs/dlm/requestqueue.h @@ -13,7 +13,7 @@ #ifndef __REQUESTQUEUE_DOT_H__ #define __REQUESTQUEUE_DOT_H__ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd); +int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd); int dlm_process_requestqueue(struct dlm_ls *ls); void dlm_wait_requestqueue(struct dlm_ls *ls); void dlm_purge_requestqueue(struct dlm_ls *ls); -- cgit v1.1 From 91c0dc93a1a6bbdd79707ed311e48b4397df177f Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 31 Oct 2006 11:56:01 -0600 Subject: [DLM] fix aborted recovery during node removal Red Hat BZ 211914 With the new cluster infrastructure, dlm recovery for a node removal can be aborted and restarted for a node addition. When this happens, the restarted recovery isn't aware that it's doing recovery for the earlier removal as well as the addition. So, it then skips the recovery steps only required when nodes are removed. This can result in locks not being purged for failed/removed nodes. The fix is to check for removed nodes for which recovery has not been completed at the start of a new recovery sequence. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/member.c | 8 ++++++++ fs/dlm/recoverd.c | 7 +++++++ 2 files changed, 15 insertions(+) (limited to 'fs/dlm') diff --git a/fs/dlm/member.c b/fs/dlm/member.c index a3f7de7..85e2897 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -186,6 +186,14 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) struct dlm_member *memb, *safe; int i, error, found, pos = 0, neg = 0, low = -1; + /* previously removed members that we've not finished removing need to + count as a negative change so the "neg" recovery steps will happen */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + log_debug(ls, "prev removed member %d", memb->nodeid); + neg++; + } + /* move departed members from ls_nodes to ls_nodes_gone */ list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 362e3ef..4a1d602 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -164,6 +164,13 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) */ dlm_recover_rsbs(ls); + } else { + /* + * Other lockspace members may be going through the "neg" steps + * while also adding us to the lockspace, in which case they'll + * be looking for this status bit during dlm_recover_locks(). + */ + dlm_set_recover_status(ls, DLM_RS_LOCKS); } dlm_release_root_list(ls); -- cgit v1.1 From 2cdc98aaf072d573df10c503d3b3b0b74e2a6d06 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 31 Oct 2006 11:56:08 -0600 Subject: [DLM] fix stopping unstarted recovery Red Hat BZ 211914 When many nodes are joining a lockspace simultaneously, the dlm gets a quick sequence of stop/start events, a pair for adding each node. dlm_controld in user space sends dlm_recoverd in the kernel each stop and start event. dlm_controld will sometimes send the stop before dlm_recoverd has had a chance to take up the previously queued start. The stop aborts the processing of the previous start by setting the RECOVERY_STOP flag. dlm_recoverd is erroneously clearing this flag and ignoring the stop/abort if it happens to take up the start after the stop meant to abort it. The fix is to check the sequence number that's incremented for each stop/start before clearing the flag. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/recoverd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/dlm') diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 4a1d602..6e4ee94 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -219,6 +219,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) return error; } +/* The dlm_ls_start() that created the rv we take here may already have been + stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP + flag set. */ + static void do_ls_recovery(struct dlm_ls *ls) { struct dlm_recover *rv = NULL; @@ -226,7 +230,8 @@ static void do_ls_recovery(struct dlm_ls *ls) spin_lock(&ls->ls_recover_lock); rv = ls->ls_recover_args; ls->ls_recover_args = NULL; - clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + if (rv && ls->ls_recover_seq == rv->seq) + clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); spin_unlock(&ls->ls_recover_lock); if (rv) { -- cgit v1.1 From 4b77f2c93d052adca8cc8690b9b5e7f8798f4ddd Mon Sep 17 00:00:00 2001 From: David Teigland Date: Wed, 1 Nov 2006 09:31:48 -0600 Subject: [DLM] do full recover_locks barrier Red Hat BZ 211914 The previous patch "[DLM] fix aborted recovery during node removal" was incomplete as discovered with further testing. It set the bit for the RS_LOCKS barrier but did not then wait for the barrier. This is often ok, but sometimes it will cause yet another recovery hang. If it's a new node that also has the lowest nodeid that skips the barrier wait, then it misses the important step of collecting and reporting the barrier status from the other nodes (which is the job of the low nodeid in the barrier wait routine). Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/recoverd.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/dlm') diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 6e4ee94..8bb895f 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -168,9 +168,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) /* * Other lockspace members may be going through the "neg" steps * while also adding us to the lockspace, in which case they'll - * be looking for this status bit during dlm_recover_locks(). + * be doing the recover_locks (RS_LOCKS) barrier. */ dlm_set_recover_status(ls, DLM_RS_LOCKS); + + error = dlm_recover_locks_wait(ls); + if (error) { + log_error(ls, "recover_locks_wait failed %d", error); + goto fail; + } } dlm_release_root_list(ls); -- cgit v1.1 From 6f90a8b1b87f97144911790390d56f695b59db9b Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 10 Nov 2006 14:16:27 -0600 Subject: [DLM] clear sbflags on lock master RH BZ 211622 The ALTMODE flag can be set in the lock master's copy of the lock but never cleared, so ALTMODE will also be returned in a subsequent conversion of the lock when it shouldn't be. This results in lock_dlm incorrectly switching to the alternate lock mode when returning the result to gfs which then asserts when it sees the wrong lock state. The fix is to propagate the cleared sbflags value to the master node when the lock is requested. QA's d_rwrandirectlarge test triggers this bug very quickly. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/lock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/dlm') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 6088a16..30878de 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2372,6 +2372,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) { lkb->lkb_exflags = ms->m_exflags; + lkb->lkb_sbflags = ms->m_sbflags; lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | (ms->m_flags & 0x0000FFFF); } -- cgit v1.1 From b98c95af01c10827e3443157651eb469071391a3 Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Wed, 15 Nov 2006 12:29:24 -0500 Subject: [DLM] Fix DLM config The attached patch fixes the DLM config so that it selects the chosen network transport. It should fix the bug where DLM can be left selected when NET gets unselected. This incorporates all the comments received about this patch. Cc: Adrian Bunk Cc: Andrew Morton Signed-Off-By: Patrick Caulfield Signed-off-by: Steven Whitehouse --- fs/dlm/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/dlm') diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index c5985b8..b5654a2 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -1,10 +1,11 @@ menu "Distributed Lock Manager" - depends on INET && IP_SCTP && EXPERIMENTAL + depends on EXPERIMENTAL && INET config DLM tristate "Distributed Lock Manager (DLM)" depends on IPV6 || IPV6=n select CONFIGFS_FS + select IP_SCTP if DLM_SCTP help A general purpose distributed lock manager for kernel or userspace applications. -- cgit v1.1 From 2896ee37ccc1f9acb244c9b02becb74a43661009 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 27 Nov 2006 11:31:22 -0600 Subject: [DLM] fix add_requestqueue checking nodes list Requests that arrive after recovery has started are saved in the requestqueue and processed after recovery is done. Some of these requests are purged during recovery if they are from nodes that have been removed. We move the purging of the requests (dlm_purge_requestqueue) to later in the recovery sequence which allows the routine saving requests (dlm_add_requestqueue) to avoid filtering out requests by nodeid since the same will be done by the purge. The current code has add_requestqueue filtering by nodeid but doesn't hold any locks when accessing the list of current nodes. This also means that we need to call the purge routine when the lockspace is being shut down since the add routine will not be rejecting requests itself any more. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/lockspace.c | 2 ++ fs/dlm/recoverd.c | 16 ++++++++-------- fs/dlm/requestqueue.c | 7 ++++--- 3 files changed, 14 insertions(+), 11 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index f8842ca..791388b 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -22,6 +22,7 @@ #include "memory.h" #include "lock.h" #include "recover.h" +#include "requestqueue.h" #ifdef CONFIG_DLM_DEBUG int dlm_create_debug_file(struct dlm_ls *ls); @@ -684,6 +685,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) * Free structures on any other lists */ + dlm_purge_requestqueue(ls); kfree(ls->ls_recover_args); dlm_clear_free_entries(ls); dlm_clear_members(ls); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 8bb895f..9dc2f91 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -94,14 +94,6 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) } /* - * Purge directory-related requests that are saved in requestqueue. - * All dir requests from before recovery are invalid now due to the dir - * rebuild and will be resent by the requesting nodes. - */ - - dlm_purge_requestqueue(ls); - - /* * Wait for all nodes to complete directory rebuild. */ @@ -181,6 +173,14 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_release_root_list(ls); + /* + * Purge directory-related requests that are saved in requestqueue. + * All dir requests from before recovery are invalid now due to the dir + * rebuild and will be resent by the requesting nodes. + */ + + dlm_purge_requestqueue(ls); + dlm_set_recover_status(ls, DLM_RS_DONE); error = dlm_recover_done_wait(ls); if (error) { diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 0226d2a..65008d7 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -36,9 +36,6 @@ int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) int length = hd->h_length; int rv = 0; - if (dlm_is_removed(ls, nodeid)) - return 0; - e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); if (!e) { log_print("dlm_add_requestqueue: out of memory\n"); @@ -133,6 +130,10 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) { uint32_t type = ms->m_type; + /* the ls is being cleaned up and freed by release_lockspace */ + if (!ls->ls_count) + return 1; + if (dlm_is_removed(ls, nodeid)) return 1; -- cgit v1.1 From 1babdb453138f17b8ed3d1d5711089c4e2fa5ace Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 27 Nov 2006 13:18:41 -0600 Subject: [DLM] fix size of STATUS_REPLY message When the not_ready routine sends a "fake" status reply with blank status flags, it needs to use the correct size for a normal STATUS_REPLY by including the size of the would-be config parameters. We also fill in the non-existant config parameters with an invalid lvblen value so it's easier to notice if these invalid paratmers are ever being used. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/rcom.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/dlm') diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 87b12f7..6ac195c 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -370,9 +370,10 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; + struct rcom_config *rf; struct dlm_mhandle *mh; char *mb; - int mb_len = sizeof(struct dlm_rcom); + int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb); if (!mh) @@ -391,6 +392,9 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rc->rc_id = rc_in->rc_id; rc->rc_result = -ESRCH; + rf = (struct rcom_config *) rc->rc_buf; + rf->rf_lvblen = -1; + dlm_rcom_out(rc); dlm_lowcomms_commit_buffer(mh); -- cgit v1.1 From 98f176fb32f33795b6d0f83856008b932123ab38 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 27 Nov 2006 13:19:28 -0600 Subject: [DLM] don't accept replies to old recovery messages We often abort a recovery after sending a status request to a remote node. We want to ignore any potential status reply we get from the remote node. If we get one of these unwanted replies, we've often moved on to the next recovery message and incremented the message sequence counter, so the reply will be ignored due to the seq number. In some cases, we've not moved on to the next message so the seq number of the reply we want to ignore is still correct, causing the reply to be accepted. The next recovery message will then mistake this old reply as a new one. To fix this, we add the flag RCOM_WAIT to indicate when we can accept a new reply. We clear this flag if we abort recovery while waiting for a reply. Before the flag is set again (to allow new replies) we know that any old replies will be rejected due to their sequence number. We also initialize the recovery-message sequence number to a random value when a lockspace is first created. This makes it clear when messages are being rejected from an old instance of a lockspace that has since been recreated. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/dlm_internal.h | 4 +++- fs/dlm/lockspace.c | 2 ++ fs/dlm/rcom.c | 44 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 39 insertions(+), 11 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 1e5cd67..1ee8195 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -471,6 +471,7 @@ struct dlm_ls { char *ls_recover_buf; int ls_recover_nodeid; /* for debugging */ uint64_t ls_rcom_seq; + spinlock_t ls_rcom_spin; struct list_head ls_recover_list; spinlock_t ls_recover_list_lock; int ls_recover_list_count; @@ -488,7 +489,8 @@ struct dlm_ls { #define LSFL_RUNNING 1 #define LSFL_RECOVERY_STOP 2 #define LSFL_RCOM_READY 3 -#define LSFL_UEVENT_WAIT 4 +#define LSFL_RCOM_WAIT 4 +#define LSFL_UEVENT_WAIT 5 /* much of this is just saving user space pointers associated with the lock that we pass back to the user lib with an ast */ diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 791388b..59012b0 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -479,6 +479,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_recoverd_task = NULL; mutex_init(&ls->ls_recoverd_active); spin_lock_init(&ls->ls_recover_lock); + spin_lock_init(&ls->ls_rcom_spin); + get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); ls->ls_recover_status = 0; ls->ls_recover_seq = 0; ls->ls_recover_args = NULL; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 6ac195c..c42f2db 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -90,13 +90,28 @@ static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid) return 0; } +static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq) +{ + spin_lock(&ls->ls_rcom_spin); + *new_seq = ++ls->ls_rcom_seq; + set_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +static void disallow_sync_reply(struct dlm_ls *ls) +{ + spin_lock(&ls->ls_rcom_spin); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + int dlm_rcom_status(struct dlm_ls *ls, int nodeid) { struct dlm_rcom *rc; struct dlm_mhandle *mh; int error = 0; - memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); ls->ls_recover_nodeid = nodeid; if (nodeid == dlm_our_nodeid()) { @@ -108,12 +123,14 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); if (error) goto out; - rc->rc_id = ++ls->ls_rcom_seq; + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); send_rcom(ls, mh, rc); error = dlm_wait_function(ls, &rcom_response); - clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + disallow_sync_reply(ls); if (error) goto out; @@ -150,14 +167,20 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) { - if (rc_in->rc_id != ls->ls_rcom_seq) { - log_debug(ls, "reject old reply %d got %llx wanted %llx", - rc_in->rc_type, rc_in->rc_id, ls->ls_rcom_seq); - return; + spin_lock(&ls->ls_rcom_spin); + if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || + rc_in->rc_id != ls->ls_rcom_seq) { + log_debug(ls, "reject reply %d from %d seq %llx expect %llx", + rc_in->rc_type, rc_in->rc_header.h_nodeid, + rc_in->rc_id, ls->ls_rcom_seq); + goto out; } memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length); set_bit(LSFL_RCOM_READY, &ls->ls_flags); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); wake_up(&ls->ls_wait_general); + out: + spin_unlock(&ls->ls_rcom_spin); } static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -171,7 +194,6 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) struct dlm_mhandle *mh; int error = 0, len = sizeof(struct dlm_rcom); - memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); ls->ls_recover_nodeid = nodeid; if (nodeid == dlm_our_nodeid()) { @@ -185,12 +207,14 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) if (error) goto out; memcpy(rc->rc_buf, last_name, last_len); - rc->rc_id = ++ls->ls_rcom_seq; + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); send_rcom(ls, mh, rc); error = dlm_wait_function(ls, &rcom_response); - clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + disallow_sync_reply(ls); out: return error; } -- cgit v1.1 From 57adf7eede38d315e0e328c52484d6a596e9a238 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 29 Nov 2006 09:33:48 -0500 Subject: [DLM] fix format warnings in rcom.c and recoverd.c This fixes the following gcc warnings generated on the architectures where uint64_t != unsigned long long (e.g. ppc64). fs/dlm/rcom.c:154: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'uint64_t' fs/dlm/rcom.c:154: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'uint64_t' fs/dlm/recoverd.c:48: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'uint64_t' fs/dlm/recoverd.c:202: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'uint64_t' fs/dlm/recoverd.c:210: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'uint64_t' Signed-off-by: Ryusuke Konishi Signed-off-by: Patrick Caulfield Signed-off-by: Steven Whitehouse --- fs/dlm/rcom.c | 3 ++- fs/dlm/recoverd.c | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index c42f2db..4cc31be 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -172,7 +172,8 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) rc_in->rc_id != ls->ls_rcom_seq) { log_debug(ls, "reject reply %d from %d seq %llx expect %llx", rc_in->rc_type, rc_in->rc_header.h_nodeid, - rc_in->rc_id, ls->ls_rcom_seq); + (unsigned long long)rc_in->rc_id, + (unsigned long long)ls->ls_rcom_seq); goto out; } memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 9dc2f91..650536aa 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -45,7 +45,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) unsigned long start; int error, neg = 0; - log_debug(ls, "recover %llx", rv->seq); + log_debug(ls, "recover %llx", (unsigned long long)rv->seq); mutex_lock(&ls->ls_recoverd_active); @@ -212,7 +212,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_astd_wake(); - log_debug(ls, "recover %llx done: %u ms", rv->seq, + log_debug(ls, "recover %llx done: %u ms", + (unsigned long long)rv->seq, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); @@ -220,7 +221,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) fail: dlm_release_root_list(ls); - log_debug(ls, "recover %llx error %d", rv->seq, error); + log_debug(ls, "recover %llx error %d", + (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); return error; } -- cgit v1.1 From ac33d0710595579e3cfca42dde2257eb0b123f6d Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Wed, 6 Dec 2006 15:10:37 +0000 Subject: [DLM] Clean up lowcomms This fixes up most of the things pointed out by akpm and Pavel Machek with comments below indicating why some things have been left: Andrew Morton wrote: > >> +static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc) >> +{ >> + struct nodeinfo *ni; >> + int r; >> + int n; >> + >> + down_read(&nodeinfo_lock); > > Given that this function can sleep, I wonder if `alloc' is useful. > > I see lots of callers passing in a literal "0" for `alloc'. That's in fact > a secret (GFP_ATOMIC & ~__GFP_HIGH). I doubt if that's what you really > meant. Particularly as the code could at least have used __GFP_WAIT (aka > GFP_NOIO) which is much, much more reliable than "0". In fact "0" is the > least reliable mode possible. > > IOW, this is all bollixed up. When 0 is passed into nodeid2nodeinfo the function does not try to allocate a new structure at all. it's an indication that the caller only wants the nodeinfo struct for that nodeid if there actually is one in existance. I've tidied the function itself so it's more obvious, (and tidier!) >> +/* Data received from remote end */ >> +static int receive_from_sock(void) >> +{ >> + int ret = 0; >> + struct msghdr msg; >> + struct kvec iov[2]; >> + unsigned len; >> + int r; >> + struct sctp_sndrcvinfo *sinfo; >> + struct cmsghdr *cmsg; >> + struct nodeinfo *ni; >> + >> + /* These two are marginally too big for stack allocation, but this >> + * function is (currently) only called by dlm_recvd so static should be >> + * OK. >> + */ >> + static struct sockaddr_storage msgname; >> + static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; > > whoa. This is globally singly-threaded code?? Yes. it is only ever run in the context of dlm_recvd. >> >> +static void initiate_association(int nodeid) >> +{ >> + struct sockaddr_storage rem_addr; >> + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; > > Another static buffer to worry about. Globally singly-threaded code? Yes. Only ever called by dlm_sendd. >> + >> +/* Send a message */ >> +static int send_to_sock(struct nodeinfo *ni) >> +{ >> + int ret = 0; >> + struct writequeue_entry *e; >> + int len, offset; >> + struct msghdr outmsg; >> + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; > > Singly-threaded? Yep. >> >> +static void dealloc_nodeinfo(void) >> +{ >> + int i; >> + >> + for (i=1; i<=max_nodeid; i++) { >> + struct nodeinfo *ni = nodeid2nodeinfo(i, 0); >> + if (ni) { >> + idr_remove(&nodeinfo_idr, i); > > Didn't that need locking? Not. it's only ever called at DLM shutdown after all the other threads have been stopped. >> >> +static int write_list_empty(void) >> +{ >> + int status; >> + >> + spin_lock_bh(&write_nodes_lock); >> + status = list_empty(&write_nodes); >> + spin_unlock_bh(&write_nodes_lock); >> + >> + return status; >> +} > > This function's return value is meaningless. As soon as the lock gets > dropped, the return value can get out of sync with reality. > > Looking at the caller, this _might_ happen to be OK, but it's a nasty and > dangerous thing. Really the locking should be moved into the caller. It's just an optimisation to allow the caller to schedule if there is no work to do. if something arrives immediately afterwards then it will get picked up when the process re-awakes (and it will be woken by that arrival). The 'accepting' atomic has gone completely. as Andrew pointed out it didn't really achieve much anyway. I suspect it was a plaster over some other startup or shutdown bug to be honest. Signed-off-by: Patrick Caulfield Signed-off-by: Steven Whitehouse Cc: Andrew Morton Cc: Pavel Machek --- fs/dlm/lowcomms-sctp.c | 264 ++++++++++++++++++-------------------- fs/dlm/lowcomms-tcp.c | 342 +++++++++++++++++++------------------------------ fs/dlm/lowcomms.h | 2 - fs/dlm/main.c | 10 +- 4 files changed, 261 insertions(+), 357 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c index 6da6b14..fe158d7 100644 --- a/fs/dlm/lowcomms-sctp.c +++ b/fs/dlm/lowcomms-sctp.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -75,13 +75,13 @@ struct nodeinfo { }; static DEFINE_IDR(nodeinfo_idr); -static struct rw_semaphore nodeinfo_lock; -static int max_nodeid; +static DECLARE_RWSEM(nodeinfo_lock); +static int max_nodeid; struct cbuf { - unsigned base; - unsigned len; - unsigned mask; + unsigned int base; + unsigned int len; + unsigned int mask; }; /* Just the one of these, now. But this struct keeps @@ -90,9 +90,9 @@ struct cbuf { #define CF_READ_PENDING 1 struct connection { - struct socket *sock; + struct socket *sock; unsigned long flags; - struct page *rx_page; + struct page *rx_page; atomic_t waiting_requests; struct cbuf cb; int eagain_flag; @@ -102,36 +102,40 @@ struct connection { struct writequeue_entry { struct list_head list; - struct page *page; + struct page *page; int offset; int len; int end; int users; - struct nodeinfo *ni; + struct nodeinfo *ni; }; -#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) -#define CBUF_EMPTY(cb) ((cb)->len == 0) -#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) -#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) +static void cbuf_add(struct cbuf *cb, int n) +{ + cb->len += n; +} -#define CBUF_INIT(cb, size) \ -do { \ - (cb)->base = (cb)->len = 0; \ - (cb)->mask = ((size)-1); \ -} while(0) +static int cbuf_data(struct cbuf *cb) +{ + return ((cb->base + cb->len) & cb->mask); +} -#define CBUF_EAT(cb, n) \ -do { \ - (cb)->len -= (n); \ - (cb)->base += (n); \ - (cb)->base &= (cb)->mask; \ -} while(0) +static void cbuf_init(struct cbuf *cb, int size) +{ + cb->base = cb->len = 0; + cb->mask = size-1; +} +static void cbuf_eat(struct cbuf *cb, int n) +{ + cb->len -= n; + cb->base += n; + cb->base &= cb->mask; +} /* List of nodes which have writes pending */ -static struct list_head write_nodes; -static spinlock_t write_nodes_lock; +static LIST_HEAD(write_nodes); +static DEFINE_SPINLOCK(write_nodes_lock); /* Maximum number of incoming messages to process before * doing a schedule() @@ -141,8 +145,7 @@ static spinlock_t write_nodes_lock; /* Manage daemons */ static struct task_struct *recv_task; static struct task_struct *send_task; -static wait_queue_head_t lowcomms_recv_wait; -static atomic_t accepting; +static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_wait); /* The SCTP connection */ static struct connection sctp_con; @@ -161,11 +164,11 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) return error; if (dlm_local_addr[0]->ss_family == AF_INET) { - struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; + struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr; ret4->sin_addr.s_addr = in4->sin_addr.s_addr; } else { - struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; memcpy(&ret6->sin6_addr, &in6->sin6_addr, sizeof(in6->sin6_addr)); @@ -174,6 +177,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) return 0; } +/* If alloc is 0 here we will not attempt to allocate a new + nodeinfo struct */ static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc) { struct nodeinfo *ni; @@ -184,44 +189,45 @@ static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc) ni = idr_find(&nodeinfo_idr, nodeid); up_read(&nodeinfo_lock); - if (!ni && alloc) { - down_write(&nodeinfo_lock); + if (ni || !alloc) + return ni; - ni = idr_find(&nodeinfo_idr, nodeid); - if (ni) - goto out_up; + down_write(&nodeinfo_lock); - r = idr_pre_get(&nodeinfo_idr, alloc); - if (!r) - goto out_up; + ni = idr_find(&nodeinfo_idr, nodeid); + if (ni) + goto out_up; - ni = kmalloc(sizeof(struct nodeinfo), alloc); - if (!ni) - goto out_up; + r = idr_pre_get(&nodeinfo_idr, alloc); + if (!r) + goto out_up; - r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n); - if (r) { - kfree(ni); - ni = NULL; - goto out_up; - } - if (n != nodeid) { - idr_remove(&nodeinfo_idr, n); - kfree(ni); - ni = NULL; - goto out_up; - } - memset(ni, 0, sizeof(struct nodeinfo)); - spin_lock_init(&ni->lock); - INIT_LIST_HEAD(&ni->writequeue); - spin_lock_init(&ni->writequeue_lock); - ni->nodeid = nodeid; - - if (nodeid > max_nodeid) - max_nodeid = nodeid; - out_up: - up_write(&nodeinfo_lock); + ni = kmalloc(sizeof(struct nodeinfo), alloc); + if (!ni) + goto out_up; + + r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n); + if (r) { + kfree(ni); + ni = NULL; + goto out_up; } + if (n != nodeid) { + idr_remove(&nodeinfo_idr, n); + kfree(ni); + ni = NULL; + goto out_up; + } + memset(ni, 0, sizeof(struct nodeinfo)); + spin_lock_init(&ni->lock); + INIT_LIST_HEAD(&ni->writequeue); + spin_lock_init(&ni->writequeue_lock); + ni->nodeid = nodeid; + + if (nodeid > max_nodeid) + max_nodeid = nodeid; +out_up: + up_write(&nodeinfo_lock); return ni; } @@ -279,13 +285,13 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, in4_addr->sin_port = cpu_to_be16(port); memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) - - sizeof(struct sockaddr_in)); + sizeof(struct sockaddr_in)); *addr_len = sizeof(struct sockaddr_in); } else { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; in6_addr->sin6_port = cpu_to_be16(port); memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) - - sizeof(struct sockaddr_in6)); + sizeof(struct sockaddr_in6)); *addr_len = sizeof(struct sockaddr_in6); } } @@ -324,7 +330,7 @@ static void send_shutdown(sctp_assoc_t associd) cmsg->cmsg_type = SCTP_SNDRCV; cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); outmessage.msg_controllen = cmsg->cmsg_len; - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + sinfo = CMSG_DATA(cmsg); memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); sinfo->sinfo_flags |= MSG_EOF; @@ -387,7 +393,7 @@ static void process_sctp_notification(struct msghdr *msg, char *buf) if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { log_print("COMM_UP for invalid assoc ID %d", - (int)sn->sn_assoc_change.sac_assoc_id); + (int)sn->sn_assoc_change.sac_assoc_id); init_failed(); return; } @@ -398,15 +404,18 @@ static void process_sctp_notification(struct msghdr *msg, char *buf) fs = get_fs(); set_fs(get_ds()); ret = sctp_con.sock->ops->getsockopt(sctp_con.sock, - IPPROTO_SCTP, SCTP_PRIMARY_ADDR, - (char*)&prim, &prim_len); + IPPROTO_SCTP, + SCTP_PRIMARY_ADDR, + (char*)&prim, + &prim_len); set_fs(fs); if (ret < 0) { struct nodeinfo *ni; log_print("getsockopt/sctp_primary_addr on " "new assoc %d failed : %d", - (int)sn->sn_assoc_change.sac_assoc_id, ret); + (int)sn->sn_assoc_change.sac_assoc_id, + ret); /* Retry INIT later */ ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id); @@ -426,12 +435,10 @@ static void process_sctp_notification(struct msghdr *msg, char *buf) return; /* Save the assoc ID */ - spin_lock(&ni->lock); ni->assoc_id = sn->sn_assoc_change.sac_assoc_id; - spin_unlock(&ni->lock); log_print("got new/restarted association %d nodeid %d", - (int)sn->sn_assoc_change.sac_assoc_id, nodeid); + (int)sn->sn_assoc_change.sac_assoc_id, nodeid); /* Send any pending writes */ clear_bit(NI_INIT_PENDING, &ni->flags); @@ -507,13 +514,12 @@ static int receive_from_sock(void) sctp_con.rx_page = alloc_page(GFP_ATOMIC); if (sctp_con.rx_page == NULL) goto out_resched; - CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE); + cbuf_init(&sctp_con.cb, PAGE_CACHE_SIZE); } memset(&incmsg, 0, sizeof(incmsg)); memset(&msgname, 0, sizeof(msgname)); - memset(incmsg, 0, sizeof(incmsg)); msg.msg_name = &msgname; msg.msg_namelen = sizeof(msgname); msg.msg_flags = 0; @@ -532,17 +538,17 @@ static int receive_from_sock(void) * iov[0] is the bit of the circular buffer between the current end * point (cb.base + cb.len) and the end of the buffer. */ - iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb); + iov[0].iov_len = sctp_con.cb.base - cbuf_data(&sctp_con.cb); iov[0].iov_base = page_address(sctp_con.rx_page) + - CBUF_DATA(&sctp_con.cb); + cbuf_data(&sctp_con.cb); iov[1].iov_len = 0; /* * iov[1] is the bit of the circular buffer between the start of the * buffer and the start of the currently used section (cb.base) */ - if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) { - iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb); + if (cbuf_data(&sctp_con.cb) >= sctp_con.cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&sctp_con.cb); iov[1].iov_len = sctp_con.cb.base; iov[1].iov_base = page_address(sctp_con.rx_page); msg.msg_iovlen = 2; @@ -557,7 +563,7 @@ static int receive_from_sock(void) msg.msg_control = incmsg; msg.msg_controllen = sizeof(incmsg); cmsg = CMSG_FIRSTHDR(&msg); - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + sinfo = CMSG_DATA(cmsg); if (msg.msg_flags & MSG_NOTIFICATION) { process_sctp_notification(&msg, page_address(sctp_con.rx_page)); @@ -583,29 +589,29 @@ static int receive_from_sock(void) if (r == 1) return 0; - CBUF_ADD(&sctp_con.cb, ret); + cbuf_add(&sctp_con.cb, ret); ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid), page_address(sctp_con.rx_page), sctp_con.cb.base, sctp_con.cb.len, PAGE_CACHE_SIZE); if (ret < 0) goto out_close; - CBUF_EAT(&sctp_con.cb, ret); + cbuf_eat(&sctp_con.cb, ret); - out: +out: ret = 0; goto out_ret; - out_resched: +out_resched: lowcomms_data_ready(sctp_con.sock->sk, 0); ret = 0; - schedule(); + cond_resched(); goto out_ret; - out_close: +out_close: if (ret != -EAGAIN) log_print("error reading from sctp socket: %d", ret); - out_ret: +out_ret: return ret; } @@ -619,10 +625,12 @@ static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num) set_fs(get_ds()); if (num == 1) result = sctp_con.sock->ops->bind(sctp_con.sock, - (struct sockaddr *) addr, addr_len); + (struct sockaddr *) addr, + addr_len); else result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP, - SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len); + SCTP_SOCKOPT_BINDX_ADD, + (char *)addr, addr_len); set_fs(fs); if (result < 0) @@ -719,10 +727,10 @@ static int init_sock(void) return 0; - create_delsock: +create_delsock: sock_release(sock); sctp_con.sock = NULL; - out: +out: return result; } @@ -756,16 +764,13 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) int users = 0; struct nodeinfo *ni; - if (!atomic_read(&accepting)) - return NULL; - ni = nodeid2nodeinfo(nodeid, allocation); if (!ni) return NULL; spin_lock(&ni->writequeue_lock); e = list_entry(ni->writequeue.prev, struct writequeue_entry, list); - if (((struct list_head *) e == &ni->writequeue) || + if ((&e->list == &ni->writequeue) || (PAGE_CACHE_SIZE - e->end < len)) { e = NULL; } else { @@ -776,7 +781,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) spin_unlock(&ni->writequeue_lock); if (e) { - got_one: + got_one: if (users == 0) kmap(e->page); *ppc = page_address(e->page) + offset; @@ -803,9 +808,6 @@ void dlm_lowcomms_commit_buffer(void *arg) int users; struct nodeinfo *ni = e->ni; - if (!atomic_read(&accepting)) - return; - spin_lock(&ni->writequeue_lock); users = --e->users; if (users) @@ -822,7 +824,7 @@ void dlm_lowcomms_commit_buffer(void *arg) } return; - out: +out: spin_unlock(&ni->writequeue_lock); return; } @@ -878,7 +880,7 @@ static void initiate_association(int nodeid) cmsg->cmsg_level = IPPROTO_SCTP; cmsg->cmsg_type = SCTP_SNDRCV; cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + sinfo = CMSG_DATA(cmsg); memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid); @@ -892,7 +894,7 @@ static void initiate_association(int nodeid) } /* Send a message */ -static int send_to_sock(struct nodeinfo *ni) +static void send_to_sock(struct nodeinfo *ni) { int ret = 0; struct writequeue_entry *e; @@ -903,13 +905,13 @@ static int send_to_sock(struct nodeinfo *ni) struct sctp_sndrcvinfo *sinfo; struct kvec iov; - /* See if we need to init an association before we start + /* See if we need to init an association before we start sending precious messages */ spin_lock(&ni->lock); if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { spin_unlock(&ni->lock); initiate_association(ni->nodeid); - return 0; + return; } spin_unlock(&ni->lock); @@ -923,7 +925,7 @@ static int send_to_sock(struct nodeinfo *ni) cmsg->cmsg_level = IPPROTO_SCTP; cmsg->cmsg_type = SCTP_SNDRCV; cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + sinfo = CMSG_DATA(cmsg); memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid); sinfo->sinfo_assoc_id = ni->assoc_id; @@ -955,7 +957,7 @@ static int send_to_sock(struct nodeinfo *ni) goto send_error; } else { /* Don't starve people filling buffers */ - schedule(); + cond_resched(); } spin_lock(&ni->writequeue_lock); @@ -964,15 +966,16 @@ static int send_to_sock(struct nodeinfo *ni) if (e->len == 0 && e->users == 0) { list_del(&e->list); + kunmap(e->page); free_entry(e); continue; } } spin_unlock(&ni->writequeue_lock); - out: - return ret; +out: + return; - send_error: +send_error: log_print("Error sending to node %d %d", ni->nodeid, ret); spin_lock(&ni->lock); if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { @@ -982,7 +985,7 @@ static int send_to_sock(struct nodeinfo *ni) } else spin_unlock(&ni->lock); - return ret; + return; } /* Try to send any messages that are pending */ @@ -994,7 +997,7 @@ static void process_output_queue(void) spin_lock_bh(&write_nodes_lock); list_for_each_safe(list, temp, &write_nodes) { struct nodeinfo *ni = - list_entry(list, struct nodeinfo, write_list); + list_entry(list, struct nodeinfo, write_list); clear_bit(NI_WRITE_PENDING, &ni->flags); list_del(&ni->write_list); @@ -1106,7 +1109,7 @@ static int dlm_recvd(void *data) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&lowcomms_recv_wait, &wait); if (!test_bit(CF_READ_PENDING, &sctp_con.flags)) - schedule(); + cond_resched(); remove_wait_queue(&lowcomms_recv_wait, &wait); set_current_state(TASK_RUNNING); @@ -1118,12 +1121,12 @@ static int dlm_recvd(void *data) /* Don't starve out everyone else */ if (++count >= MAX_RX_MSG_COUNT) { - schedule(); + cond_resched(); count = 0; } } while (!kthread_should_stop() && ret >=0); } - schedule(); + cond_resched(); } return 0; @@ -1138,7 +1141,7 @@ static int dlm_sendd(void *data) while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); if (write_list_empty()) - schedule(); + cond_resched(); set_current_state(TASK_RUNNING); if (sctp_con.eagain_flag) { @@ -1166,7 +1169,7 @@ static int daemons_start(void) p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); error = IS_ERR(p); - if (error) { + if (error) { log_print("can't start dlm_recvd %d", error); return error; } @@ -1174,7 +1177,7 @@ static int daemons_start(void) p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); error = IS_ERR(p); - if (error) { + if (error) { log_print("can't start dlm_sendd %d", error); kthread_stop(recv_task); return error; @@ -1197,43 +1200,28 @@ int dlm_lowcomms_start(void) error = daemons_start(); if (error) goto fail_sock; - atomic_set(&accepting, 1); return 0; - fail_sock: +fail_sock: close_connection(); return error; } -/* Set all the activity flags to prevent any socket activity. */ - void dlm_lowcomms_stop(void) { - atomic_set(&accepting, 0); + int i; + sctp_con.flags = 0x7; daemons_stop(); clean_writequeues(); close_connection(); dealloc_nodeinfo(); max_nodeid = 0; -} -int dlm_lowcomms_init(void) -{ - init_waitqueue_head(&lowcomms_recv_wait); - spin_lock_init(&write_nodes_lock); - INIT_LIST_HEAD(&write_nodes); - init_rwsem(&nodeinfo_lock); - return 0; -} - -void dlm_lowcomms_exit(void) -{ - int i; + dlm_local_count = 0; + dlm_local_nodeid = 0; for (i = 0; i < dlm_local_count; i++) kfree(dlm_local_addr[i]); - dlm_local_count = 0; - dlm_local_nodeid = 0; } diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c index 7289e59..8f2791f 100644 --- a/fs/dlm/lowcomms-tcp.c +++ b/fs/dlm/lowcomms-tcp.c @@ -54,44 +54,59 @@ #include "config.h" struct cbuf { - unsigned base; - unsigned len; - unsigned mask; + unsigned int base; + unsigned int len; + unsigned int mask; }; -#ifndef FALSE -#define FALSE 0 -#define TRUE 1 -#endif #define NODE_INCREMENT 32 +static void cbuf_add(struct cbuf *cb, int n) +{ + cb->len += n; +} -#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0) -#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) -#define CBUF_EMPTY(cb) ((cb)->len == 0) -#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) -#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \ - (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0) -#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) +static int cbuf_data(struct cbuf *cb) +{ + return ((cb->base + cb->len) & cb->mask); +} + +static void cbuf_init(struct cbuf *cb, int size) +{ + cb->base = cb->len = 0; + cb->mask = size-1; +} + +static void cbuf_eat(struct cbuf *cb, int n) +{ + cb->len -= n; + cb->base += n; + cb->base &= cb->mask; +} + +static bool cbuf_empty(struct cbuf *cb) +{ + return cb->len == 0; +} /* Maximum number of incoming messages to process before - doing a schedule() + doing a cond_resched() */ #define MAX_RX_MSG_COUNT 25 struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ - struct rw_semaphore sock_sem; /* Stop connect races */ - struct list_head read_list; /* On this list when ready for reading */ - struct list_head write_list; /* On this list when ready for writing */ - struct list_head state_list; /* On this list when ready to connect */ + struct rw_semaphore sock_sem; /* Stop connect races */ + struct list_head read_list; /* On this list when ready for reading */ + struct list_head write_list; /* On this list when ready for writing */ + struct list_head state_list; /* On this list when ready to connect */ unsigned long flags; /* bit 1,2 = We are on the read/write lists */ #define CF_READ_PENDING 1 #define CF_WRITE_PENDING 2 #define CF_CONNECT_PENDING 3 #define CF_IS_OTHERCON 4 - struct list_head writequeue; /* List of outgoing writequeue_entries */ - struct list_head listenlist; /* List of allocated listening sockets */ + struct list_head writequeue; /* List of outgoing writequeue_entries */ + struct list_head listenlist; /* List of allocated listening sockets */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ struct page *rx_page; @@ -121,28 +136,27 @@ static struct task_struct *recv_task; static struct task_struct *send_task; static wait_queue_t lowcomms_send_waitq_head; -static wait_queue_head_t lowcomms_send_waitq; +static DECLARE_WAIT_QUEUE_HEAD(lowcomms_send_waitq); static wait_queue_t lowcomms_recv_waitq_head; -static wait_queue_head_t lowcomms_recv_waitq; +static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_waitq); /* An array of pointers to connections, indexed by NODEID */ static struct connection **connections; -static struct semaphore connections_lock; +static DECLARE_MUTEX(connections_lock); static kmem_cache_t *con_cache; static int conn_array_size; -static atomic_t accepting; /* List of sockets that have reads pending */ -static struct list_head read_sockets; -static spinlock_t read_sockets_lock; +static LIST_HEAD(read_sockets); +static DEFINE_SPINLOCK(read_sockets_lock); /* List of sockets which have writes pending */ -static struct list_head write_sockets; -static spinlock_t write_sockets_lock; +static LIST_HEAD(write_sockets); +static DEFINE_SPINLOCK(write_sockets_lock); /* List of sockets which have connects pending */ -static struct list_head state_sockets; -static spinlock_t state_sockets_lock; +static LIST_HEAD(state_sockets); +static DEFINE_SPINLOCK(state_sockets_lock); static struct connection *nodeid2con(int nodeid, gfp_t allocation) { @@ -153,12 +167,11 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) int new_size = nodeid + NODE_INCREMENT; struct connection **new_conns; - new_conns = kmalloc(sizeof(struct connection *) * + new_conns = kzalloc(sizeof(struct connection *) * new_size, allocation); if (!new_conns) goto finish; - memset(new_conns, 0, sizeof(struct connection *) * new_size); memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size); conn_array_size = new_size; kfree(connections); @@ -168,11 +181,10 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) con = connections[nodeid]; if (con == NULL && allocation) { - con = kmem_cache_alloc(con_cache, allocation); + con = kmem_cache_zalloc(con_cache, allocation); if (!con) goto finish; - memset(con, 0, sizeof(*con)); con->nodeid = nodeid; init_rwsem(&con->sock_sem); INIT_LIST_HEAD(&con->writequeue); @@ -181,7 +193,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) connections[nodeid] = con; } - finish: +finish: up(&connections_lock); return con; } @@ -220,8 +232,6 @@ static inline void lowcomms_connect_sock(struct connection *con) { if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) return; - if (!atomic_read(&accepting)) - return; spin_lock_bh(&state_sockets_lock); list_add_tail(&con->state_list, &state_sockets); @@ -232,31 +242,8 @@ static inline void lowcomms_connect_sock(struct connection *con) static void lowcomms_state_change(struct sock *sk) { -/* struct connection *con = sock2con(sk); */ - - switch (sk->sk_state) { - case TCP_ESTABLISHED: + if (sk->sk_state == TCP_ESTABLISHED) lowcomms_write_space(sk); - break; - - case TCP_FIN_WAIT1: - case TCP_FIN_WAIT2: - case TCP_TIME_WAIT: - case TCP_CLOSE: - case TCP_CLOSE_WAIT: - case TCP_LAST_ACK: - case TCP_CLOSING: - /* FIXME: I think this causes more trouble than it solves. - lowcomms wil reconnect anyway when there is something to - send. This just attempts reconnection if a node goes down! - */ - /* lowcomms_connect_sock(con); */ - break; - - default: - printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state); - break; - } } /* Make a socket active */ @@ -277,13 +264,12 @@ static int add_sock(struct socket *sock, struct connection *con) static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, int *addr_len) { - saddr->ss_family = dlm_local_addr.ss_family; - if (saddr->ss_family == AF_INET) { + saddr->ss_family = dlm_local_addr.ss_family; + if (saddr->ss_family == AF_INET) { struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; in4_addr->sin_port = cpu_to_be16(port); *addr_len = sizeof(struct sockaddr_in); - } - else { + } else { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; in6_addr->sin6_port = cpu_to_be16(port); *addr_len = sizeof(struct sockaddr_in6); @@ -291,7 +277,7 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, } /* Close a remote connection and tidy up */ -static void close_connection(struct connection *con, int and_other) +static void close_connection(struct connection *con, bool and_other) { down_write(&con->sock_sem); @@ -300,11 +286,8 @@ static void close_connection(struct connection *con, int and_other) con->sock = NULL; } if (con->othercon && and_other) { - /* Argh! recursion in kernel code! - Actually, this isn't a list so it - will only re-enter once. - */ - close_connection(con->othercon, FALSE); + /* Will only re-enter once. */ + close_connection(con->othercon, false); } if (con->rx_page) { __free_page(con->rx_page); @@ -337,7 +320,7 @@ static int receive_from_sock(struct connection *con) con->rx_page = alloc_page(GFP_ATOMIC); if (con->rx_page == NULL) goto out_resched; - CBUF_INIT(&con->cb, PAGE_CACHE_SIZE); + cbuf_init(&con->cb, PAGE_CACHE_SIZE); } msg.msg_control = NULL; @@ -352,16 +335,16 @@ static int receive_from_sock(struct connection *con) * iov[0] is the bit of the circular buffer between the current end * point (cb.base + cb.len) and the end of the buffer. */ - iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb); - iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb); + iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); + iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); iov[1].iov_len = 0; /* * iov[1] is the bit of the circular buffer between the start of the * buffer and the start of the currently used section (cb.base) */ - if (CBUF_DATA(&con->cb) >= con->cb.base) { - iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb); + if (cbuf_data(&con->cb) >= con->cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb); iov[1].iov_len = con->cb.base; iov[1].iov_base = page_address(con->rx_page); msg.msg_iovlen = 2; @@ -378,7 +361,7 @@ static int receive_from_sock(struct connection *con) goto out_close; if (ret == len) call_again_soon = 1; - CBUF_ADD(&con->cb, ret); + cbuf_add(&con->cb, ret); ret = dlm_process_incoming_buffer(con->nodeid, page_address(con->rx_page), con->cb.base, con->cb.len, @@ -391,35 +374,32 @@ static int receive_from_sock(struct connection *con) } if (ret < 0) goto out_close; - CBUF_EAT(&con->cb, ret); + cbuf_eat(&con->cb, ret); - if (CBUF_EMPTY(&con->cb) && !call_again_soon) { + if (cbuf_empty(&con->cb) && !call_again_soon) { __free_page(con->rx_page); con->rx_page = NULL; } - out: +out: if (call_again_soon) goto out_resched; up_read(&con->sock_sem); - ret = 0; - goto out_ret; + return 0; - out_resched: +out_resched: lowcomms_data_ready(con->sock->sk, 0); up_read(&con->sock_sem); - ret = 0; - schedule(); - goto out_ret; + cond_resched(); + return 0; - out_close: +out_close: up_read(&con->sock_sem); if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) { - close_connection(con, FALSE); + close_connection(con, false); /* Reconnect when there is something to send */ } - out_ret: return ret; } @@ -434,7 +414,8 @@ static int accept_from_sock(struct connection *con) struct connection *newcon; memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &newsock); + result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &newsock); if (result < 0) return -ENOMEM; @@ -462,7 +443,7 @@ static int accept_from_sock(struct connection *con) /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { - printk("dlm: connect from non cluster node\n"); + printk("dlm: connect from non cluster node\n"); sock_release(newsock); up_read(&con->sock_sem); return -1; @@ -483,17 +464,16 @@ static int accept_from_sock(struct connection *con) } down_write(&newcon->sock_sem); if (newcon->sock) { - struct connection *othercon = newcon->othercon; + struct connection *othercon = newcon->othercon; if (!othercon) { - othercon = kmem_cache_alloc(con_cache, GFP_KERNEL); + othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); if (!othercon) { printk("dlm: failed to allocate incoming socket\n"); up_write(&newcon->sock_sem); result = -ENOMEM; goto accept_err; } - memset(othercon, 0, sizeof(*othercon)); othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; init_rwsem(&othercon->sock_sem); @@ -523,7 +503,7 @@ static int accept_from_sock(struct connection *con) return 0; - accept_err: +accept_err: up_read(&con->sock_sem); sock_release(newsock); @@ -533,7 +513,7 @@ static int accept_from_sock(struct connection *con) } /* Connect a new socket to its peer */ -static int connect_to_sock(struct connection *con) +static void connect_to_sock(struct connection *con) { int result = -EHOSTUNREACH; struct sockaddr_storage saddr; @@ -542,7 +522,7 @@ static int connect_to_sock(struct connection *con) if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); - return 0; + return; } down_write(&con->sock_sem); @@ -556,13 +536,14 @@ static int connect_to_sock(struct connection *con) } /* Create a socket to communicate with */ - result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); + result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); if (result < 0) goto out_err; memset(&saddr, 0, sizeof(saddr)); if (dlm_nodeid_to_addr(con->nodeid, &saddr)) - goto out_err; + goto out_err; sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; @@ -574,22 +555,13 @@ static int connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, - O_NONBLOCK); + O_NONBLOCK); if (result == -EINPROGRESS) result = 0; - if (result != 0) - goto out_err; - - out: - up_write(&con->sock_sem); - /* - * Returning an error here means we've given up trying to connect to - * a remote node, otherwise we return 0 and reschedule the connetion - * attempt - */ - return result; + if (result == 0) + goto out; - out_err: +out_err: if (con->sock) { sock_release(con->sock); con->sock = NULL; @@ -604,12 +576,15 @@ static int connect_to_sock(struct connection *con) lowcomms_connect_sock(con); result = 0; } - goto out; +out: + up_write(&con->sock_sem); + return; } -static struct socket *create_listen_sock(struct connection *con, struct sockaddr_storage *saddr) +static struct socket *create_listen_sock(struct connection *con, + struct sockaddr_storage *saddr) { - struct socket *sock = NULL; + struct socket *sock = NULL; mm_segment_t fs; int result = 0; int one = 1; @@ -629,10 +604,12 @@ static struct socket *create_listen_sock(struct connection *con, struct sockaddr fs = get_fs(); set_fs(get_ds()); - result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one)); + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&one, sizeof(one)); set_fs(fs); if (result < 0) { - printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result); + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n", + result); } sock->sk->sk_user_data = con; con->rx_action = accept_from_sock; @@ -652,7 +629,8 @@ static struct socket *create_listen_sock(struct connection *con, struct sockaddr fs = get_fs(); set_fs(get_ds()); - result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one)); + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&one, sizeof(one)); set_fs(fs); if (result < 0) { printk("dlm: Set keepalive failed: %d\n", result); @@ -666,7 +644,7 @@ static struct socket *create_listen_sock(struct connection *con, struct sockaddr goto create_out; } - create_out: +create_out: return sock; } @@ -679,10 +657,6 @@ static int listen_for_all(void) int result = -EINVAL; /* We don't support multi-homed hosts */ - memset(con, 0, sizeof(*con)); - init_rwsem(&con->sock_sem); - spin_lock_init(&con->writequeue_lock); - INIT_LIST_HEAD(&con->writequeue); set_bit(CF_IS_OTHERCON, &con->flags); sock = create_listen_sock(con, &dlm_local_addr); @@ -731,16 +705,12 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, int offset = 0; int users = 0; - if (!atomic_read(&accepting)) - return NULL; - con = nodeid2con(nodeid, allocation); if (!con) return NULL; - spin_lock(&con->writequeue_lock); e = list_entry(con->writequeue.prev, struct writequeue_entry, list); - if (((struct list_head *) e == &con->writequeue) || + if ((&e->list == &con->writequeue) || (PAGE_CACHE_SIZE - e->end < len)) { e = NULL; } else { @@ -751,7 +721,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, spin_unlock(&con->writequeue_lock); if (e) { - got_one: + got_one: if (users == 0) kmap(e->page); *ppc = page_address(e->page) + offset; @@ -777,10 +747,6 @@ void dlm_lowcomms_commit_buffer(void *mh) struct connection *con = e->con; int users; - if (!atomic_read(&accepting)) - return; - - spin_lock(&con->writequeue_lock); users = --e->users; if (users) goto out; @@ -797,7 +763,7 @@ void dlm_lowcomms_commit_buffer(void *mh) } return; - out: +out: spin_unlock(&con->writequeue_lock); return; } @@ -809,7 +775,7 @@ static void free_entry(struct writequeue_entry *e) } /* Send a message */ -static int send_to_sock(struct connection *con) +static void send_to_sock(struct connection *con) { int ret = 0; ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int); @@ -846,7 +812,7 @@ static int send_to_sock(struct connection *con) } else { /* Don't starve people filling buffers */ - schedule(); + cond_resched(); } spin_lock(&con->writequeue_lock); @@ -855,25 +821,26 @@ static int send_to_sock(struct connection *con) if (e->len == 0 && e->users == 0) { list_del(&e->list); + kunmap(e->page); free_entry(e); continue; } } spin_unlock(&con->writequeue_lock); - out: +out: up_read(&con->sock_sem); - return ret; + return; - send_error: +send_error: up_read(&con->sock_sem); - close_connection(con, FALSE); + close_connection(con, false); lowcomms_connect_sock(con); - return ret; + return; - out_connect: +out_connect: up_read(&con->sock_sem); lowcomms_connect_sock(con); - return 0; + return; } static void clean_one_writequeue(struct connection *con) @@ -904,12 +871,12 @@ int dlm_lowcomms_close(int nodeid) con = nodeid2con(nodeid, 0); if (con) { clean_one_writequeue(con); - close_connection(con, TRUE); + close_connection(con, true); atomic_set(&con->waiting_requests, 0); } return 0; - out: +out: return -1; } @@ -940,7 +907,7 @@ static void process_sockets(void) list_for_each_safe(list, temp, &read_sockets) { struct connection *con = - list_entry(list, struct connection, read_list); + list_entry(list, struct connection, read_list); list_del(&con->read_list); clear_bit(CF_READ_PENDING, &con->flags); @@ -959,7 +926,7 @@ static void process_sockets(void) /* Don't starve out everyone else */ if (++count >= MAX_RX_MSG_COUNT) { - schedule(); + cond_resched(); count = 0; } @@ -977,20 +944,16 @@ static void process_output_queue(void) { struct list_head *list; struct list_head *temp; - int ret; spin_lock_bh(&write_sockets_lock); list_for_each_safe(list, temp, &write_sockets) { struct connection *con = - list_entry(list, struct connection, write_list); + list_entry(list, struct connection, write_list); clear_bit(CF_WRITE_PENDING, &con->flags); list_del(&con->write_list); spin_unlock_bh(&write_sockets_lock); - - ret = send_to_sock(con); - if (ret < 0) { - } + send_to_sock(con); spin_lock_bh(&write_sockets_lock); } spin_unlock_bh(&write_sockets_lock); @@ -1000,19 +963,16 @@ static void process_state_queue(void) { struct list_head *list; struct list_head *temp; - int ret; spin_lock_bh(&state_sockets_lock); list_for_each_safe(list, temp, &state_sockets) { struct connection *con = - list_entry(list, struct connection, state_list); + list_entry(list, struct connection, state_list); list_del(&con->state_list); clear_bit(CF_CONNECT_PENDING, &con->flags); spin_unlock_bh(&state_sockets_lock); - ret = connect_to_sock(con); - if (ret < 0) { - } + connect_to_sock(con); spin_lock_bh(&state_sockets_lock); } spin_unlock_bh(&state_sockets_lock); @@ -1046,14 +1006,13 @@ static int read_list_empty(void) /* DLM Transport comms receive daemon */ static int dlm_recvd(void *data) { - init_waitqueue_head(&lowcomms_recv_waitq); init_waitqueue_entry(&lowcomms_recv_waitq_head, current); add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head); while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); if (read_list_empty()) - schedule(); + cond_resched(); set_current_state(TASK_RUNNING); process_sockets(); @@ -1081,14 +1040,13 @@ static int write_and_state_lists_empty(void) /* DLM Transport send daemon */ static int dlm_sendd(void *data) { - init_waitqueue_head(&lowcomms_send_waitq); init_waitqueue_entry(&lowcomms_send_waitq_head, current); add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head); while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); if (write_and_state_lists_empty()) - schedule(); + cond_resched(); set_current_state(TASK_RUNNING); process_state_queue(); @@ -1111,7 +1069,7 @@ static int daemons_start(void) p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); error = IS_ERR(p); - if (error) { + if (error) { log_print("can't start dlm_recvd %d", error); return error; } @@ -1119,7 +1077,7 @@ static int daemons_start(void) p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); error = IS_ERR(p); - if (error) { + if (error) { log_print("can't start dlm_sendd %d", error); kthread_stop(recv_task); return error; @@ -1141,21 +1099,20 @@ void dlm_lowcomms_stop(void) { int i; - atomic_set(&accepting, 0); - - /* Set all the activity flags to prevent any + /* Set all the flags to prevent any socket activity. */ for (i = 0; i < conn_array_size; i++) { if (connections[i]) - connections[i]->flags |= 0x7; + connections[i]->flags |= 0xFF; } + daemons_stop(); clean_writequeues(); for (i = 0; i < conn_array_size; i++) { if (connections[i]) { - close_connection(connections[i], TRUE); + close_connection(connections[i], true); if (connections[i]->othercon) kmem_cache_free(con_cache, connections[i]->othercon); kmem_cache_free(con_cache, connections[i]); @@ -1173,24 +1130,12 @@ int dlm_lowcomms_start(void) { int error = 0; - error = -ENOTCONN; - - /* - * Temporarily initialise the waitq head so that lowcomms_send_message - * doesn't crash if it gets called before the thread is fully - * initialised - */ - init_waitqueue_head(&lowcomms_send_waitq); - error = -ENOMEM; - connections = kmalloc(sizeof(struct connection *) * + connections = kzalloc(sizeof(struct connection *) * NODE_INCREMENT, GFP_KERNEL); if (!connections) goto out; - memset(connections, 0, - sizeof(struct connection *) * NODE_INCREMENT); - conn_array_size = NODE_INCREMENT; if (dlm_our_addr(&dlm_local_addr, 0)) { @@ -1203,7 +1148,8 @@ int dlm_lowcomms_start(void) } con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), - __alignof__(struct connection), 0, NULL, NULL); + __alignof__(struct connection), 0, + NULL, NULL); if (!con_cache) goto fail_free_conn; @@ -1217,40 +1163,20 @@ int dlm_lowcomms_start(void) if (error) goto fail_unlisten; - atomic_set(&accepting, 1); - return 0; - fail_unlisten: - close_connection(connections[0], 0); +fail_unlisten: + close_connection(connections[0], false); kmem_cache_free(con_cache, connections[0]); kmem_cache_destroy(con_cache); - fail_free_conn: +fail_free_conn: kfree(connections); - out: +out: return error; } -int dlm_lowcomms_init(void) -{ - INIT_LIST_HEAD(&read_sockets); - INIT_LIST_HEAD(&write_sockets); - INIT_LIST_HEAD(&state_sockets); - - spin_lock_init(&read_sockets_lock); - spin_lock_init(&write_sockets_lock); - spin_lock_init(&state_sockets_lock); - init_MUTEX(&connections_lock); - - return 0; -} - -void dlm_lowcomms_exit(void) -{ -} - /* * Overrides for Emacs so that we follow Linus's tabbing style. * Emacs will notice this stuff at the end of the file and automatically diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 2d045e0..a9a9618 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -14,8 +14,6 @@ #ifndef __LOWCOMMS_DOT_H__ #define __LOWCOMMS_DOT_H__ -int dlm_lowcomms_init(void); -void dlm_lowcomms_exit(void); int dlm_lowcomms_start(void); void dlm_lowcomms_stop(void); int dlm_lowcomms_close(int nodeid); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index a8da8dc..162fbae 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -16,7 +16,6 @@ #include "lock.h" #include "user.h" #include "memory.h" -#include "lowcomms.h" #include "config.h" #ifdef CONFIG_DLM_DEBUG @@ -47,20 +46,14 @@ static int __init init_dlm(void) if (error) goto out_config; - error = dlm_lowcomms_init(); - if (error) - goto out_debug; - error = dlm_user_init(); if (error) - goto out_lowcomms; + goto out_debug; printk("DLM (built %s %s) installed\n", __DATE__, __TIME__); return 0; - out_lowcomms: - dlm_lowcomms_exit(); out_debug: dlm_unregister_debugfs(); out_config: @@ -76,7 +69,6 @@ static int __init init_dlm(void) static void __exit exit_dlm(void) { dlm_user_exit(); - dlm_lowcomms_exit(); dlm_config_exit(); dlm_memory_exit(); dlm_lockspace_exit(); -- cgit v1.1