diff options
Diffstat (limited to 'arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c')
-rw-r--r-- | arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c | 2858 |
1 files changed, 2858 insertions, 0 deletions
diff --git a/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c b/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c new file mode 100644 index 0000000..047547f --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c @@ -0,0 +1,2858 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Server (offload) side Linux-specific functions and callbacks. + */ + + +#include "pvtcp.h" + +#if defined(CONFIG_NET_NS) +#include <linux/nsproxy.h> +#include <linux/un.h> +#endif + +#include <net/ipv6.h> +#include <linux/kobject.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/cred.h> + + +/* The PVSock address (127.238.0.1) in binary form, host byte order. */ +#define PVTCP_PVSOCK_ADDR 0x7fee0001 +#define PVTCP_PVSOCK_NET 0x7fee0000 +#define PVTCP_PVSOCK_MASK 0x000000ff + +/* From mvpkm */ +extern uid_t Mvpkm_vmwareUid; + +/* + * Credentials to back socket file pointer. Used in Android ICS network + * data usage accounting to bill guest data to MVP. + */ +static struct cred _cred; +static struct file _file = { + .f_cred = &_cred, +}; + +/* From pvtcp_off_io_linux.c */ +extern CommOSAtomic PvtcpOutputAIOSection; +extern void PvtcpOffLargeDgramBufInit(void); + +static const unsigned short portRangeBase = 7000; +static const unsigned int portRangeSize = 31; +static int hooksRegistered = 0; + +static inline int PvtcpTestPortIndexBit(unsigned int addr, + unsigned int portIdx); +/** + * @note + * Netfilter hooks: + * + * We decide to drop each packet based on the following criteria: + * 1) Destination address is to a pvsock address AND + * 3) (NOT(uid == 0 OR uid == vmwareUid)) OR + * 4) (type == UDP AND NOT(port-in-pvsock-range))) + */ + +/** + * @brief Netfilter hook. Restricts LOCAL_OUT packets. + * See note above to filter policy. + * @param skb skbuff + * @param inet6 is this socket ipv4 or ipv6? + * @return NF_ACCEPT if the packet is allowed through, NF_DROP otherwise + */ +static inline unsigned int +PvsockNfHook(struct sk_buff *skb, int inet6) +{ + uid_t uid; + unsigned int port; + struct socket *sock; + unsigned int addr = inet6 ? + ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]) : + ntohl(ip_hdr(skb)->daddr); + + if (likely((addr ^ PVTCP_PVSOCK_NET) & ~PVTCP_PVSOCK_MASK)) { + /* Not a pvsock address. */ + return NF_ACCEPT; + } + + sock = skb->sk->sk_socket; + if (unlikely(!sock)) { + return NF_ACCEPT; + } + + /* + * Guest (kernel) sockets can send to other guest sockets, + * Root can send to whoever it wants, no checks. + */ + uid = (sock->file ? sock->file->f_cred->uid : 0); + if (uid == 0 || (sock->type != SOCK_STREAM && sock->type != SOCK_DGRAM)) { + return NF_ACCEPT; + } + + /* + * Only vmware can send to guest. + */ + if (likely(uid == Mvpkm_vmwareUid)) { + if (sock->type == SOCK_DGRAM) { + /* + * Deny sending to UDP port in pvsock range, if receiving socket was + * not created by the guest with this pvsock address. Drop all other + * UDP packets. + */ + port = ntohs(udp_hdr(skb)->dest) - portRangeBase; + if ((port < portRangeSize) && + PvtcpTestPortIndexBit(htonl(addr), port)) { + return NF_ACCEPT; + } + return NF_DROP; + } + /* + * TCP is all-good. + */ + return NF_ACCEPT; + } + + return NF_DROP; +} + + +/** + * @brief AF_INET4 Netfilter hook. Restricts LOCAL_OUT packets. + * See note above to filter policy. + * @param hooknum netfilter hook number + * @param skb skbuff + * @param in rx net_device + * @param out out net_device + * @param okfn ignored + * @return NF_ACCEPT if the packet is allowed through, NF_DROP otherwise + */ +static unsigned int +Inet4NfHook(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return PvsockNfHook(skb, 0); +} + +/** + * @brief AF_INET6 Netfilter hook. Restricts LOCAL_OUT packets. + * See note above to filter policy. + * @param hooknum netfilter hook number + * @param skb skbuff + * @param in rx net_device + * @param out out net_device + * @param okfn ignored + * @return NF_ACCEPT if the packet is allowed through, NF_DROP otherwise + */ +static unsigned int +Inet6NfHook(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (!ipv6_addr_v4mapped(&ipv6_hdr(skb)->daddr)) { + /* Not ipv4-mapped, so not a pvsock address. */ + return NF_ACCEPT; + } + + return PvsockNfHook(skb, 1); +} + + +static struct nf_hook_ops netfilterHooks[] = { + { + .hook = Inet4NfHook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_SECURITY + }, + { + .hook = Inet6NfHook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_SECURITY + } +}; + + +#if !defined(CONFIG_SYSFS) +#error "The pvTCP offload module requires sysfs!" +#endif + +/* + * State kobject, attributes and type. + */ + +typedef struct PvtcpStateKObj { + struct kobject kobj; + CommTranspInitArgs transpArgs; + unsigned int pvsockAddr; + int useNS; + int haveNS; +} PvtcpStateKObj; + + +typedef struct PvtcpStateKObjAttr { + struct attribute attr; + ssize_t (*show)(PvtcpStateKObj *stateKObj, char *buf); + ssize_t (*store)(PvtcpStateKObj *stateKObj, const char *buf, size_t count); +} PvtcpStateKObjAttr; + + +/** + * @brief Releases state a kobject. + * @param kobj (embedded) state kobject. + */ + +static void +StateKObjRelease(struct kobject *kobj) +{ + kfree(container_of(kobj, PvtcpStateKObj, kobj)); +} + + +/** + * @brief Sysfs show function for all pvtcp attributes. + * @param kobj (embedded) state kobject. + * @param attr pvtcp attribute to show. + * @param buf output buffer. + * @return number of bytes written or negative error code. + */ + +static ssize_t +StateKObjShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + PvtcpStateKObjAttr *stateAttr = container_of(attr, PvtcpStateKObjAttr, attr); + PvtcpStateKObj *stateKObj = container_of(kobj, PvtcpStateKObj, kobj); + + if (stateAttr->show) { + return stateAttr->show(stateKObj, buf); + } + + return -EIO; +} + + +/** + * @brief Sysfs store function for all pvtcp attributes. + * @param kobj (embedded) state kobject. + * @param attr pvtcp attribute to show. + * @param buf input buffer. + * @param count input buffer length. + * @return number of bytes consumed or negative error code. + */ + +static ssize_t +StateKObjStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count) +{ + PvtcpStateKObjAttr *stateAttr = container_of(attr, PvtcpStateKObjAttr, attr); + PvtcpStateKObj *stateKObj = container_of(kobj, PvtcpStateKObj, kobj); + + if (stateAttr->store) { + return stateAttr->store(stateKObj, buf, count); + } + + return -EIO; +} + + +static struct sysfs_ops StateKObjSysfsOps = { + .show = StateKObjShow, + .store = StateKObjStore +}; + + +/** + * @brief Show function for the comm_info pvtcp attribute. + * @param stateKObj state kobject. + * @param buf output buffer. + * @return number of bytes written or negative error code. + */ + +static ssize_t +StateKObjCommInfoShow(PvtcpStateKObj *stateKObj, + char *buf) +{ + unsigned int typeHash; + + /* + * In the offload module, the transport arguments' type field has been + * assigned the matching index in the versions array at probe time. + * Recover and print out the type hash. + */ + + typeHash = CommTransp_GetType(pvtcpVersions[stateKObj->transpArgs.type]); + + return snprintf(buf, PAGE_SIZE, "ID=%u,%u\nCAPACITY=%u\nTYPE=0x%0x\n", + stateKObj->transpArgs.id.d32[0], + stateKObj->transpArgs.id.d32[1], + stateKObj->transpArgs.capacity, + typeHash); +} + + +/** + * @brief Show function for the pvsock_addr pvtcp attribute. + * @param stateKObj state kobject. + * @param buf output buffer. + * @return number of bytes written or negative error code. + */ + +static ssize_t +StateKObjPvsockAddrShow(PvtcpStateKObj *stateKObj, + char *buf) +{ + union { + unsigned int raw; + unsigned char bytes[4]; + } addr; + + addr.raw = stateKObj->pvsockAddr; + return snprintf(buf, PAGE_SIZE, "%u.%u.%u.%u\n", + (unsigned int)addr.bytes[0], (unsigned int)addr.bytes[1], + (unsigned int)addr.bytes[2], (unsigned int)addr.bytes[3]); +} + + +/** + * @brief Show function for the use_ns pvtcp attribute. + * @param stateKObj state kobject. + * @param buf output buffer. + * @return number of bytes written or negative error code. + */ + +static ssize_t +StateKObjUseNSShow(PvtcpStateKObj *stateKObj, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", stateKObj->useNS); +} + + +/** + * @brief Store function for the use_ns pvtcp attribute. + * @param stateKObj state kobject. + * @param buf input buffer. + * @param count input buffer length. + * @return number of bytes consumed or negative error code. + */ + +static ssize_t +StateKObjUseNSStore(PvtcpStateKObj *stateKObj, + const char *buf, + size_t count) +{ + int rc = -EINVAL; + + /* coverity[secure_coding] */ + if (stateKObj->haveNS && (sscanf(buf, "%d", &stateKObj->useNS) == 1)) { + stateKObj->useNS = !!stateKObj->useNS; + rc = count; + } + + return rc; +} + + +static PvtcpStateKObjAttr stateKObjCommInfoAttr = + __ATTR(comm_info, 0444, StateKObjCommInfoShow, NULL); + +static PvtcpStateKObjAttr stateKObjPvsockAddrAttr = + __ATTR(pvsock_addr, 0444, StateKObjPvsockAddrShow, NULL); + +static PvtcpStateKObjAttr stateKObjUseNSAttr = + __ATTR(use_ns, 0644, StateKObjUseNSShow, StateKObjUseNSStore); + + +static struct attribute *stateKObjDefaultAttrs[] = { + &stateKObjCommInfoAttr.attr, + &stateKObjPvsockAddrAttr.attr, + &stateKObjUseNSAttr.attr, + NULL +}; + + +static struct kobj_type stateKType = { + .sysfs_ops = &StateKObjSysfsOps, + .release = StateKObjRelease, + .default_attrs = stateKObjDefaultAttrs +}; + + +/* + * Initialization of module entry and exit callbacks. + */ + +static int Init(void *args); +static void Exit(void); + +COMM_OS_MOD_INIT(Init, Exit); + + +/* + * AIO socket read buffers, stats and other global state. + */ + +static CommOSMutex globalLock; +static char perCpuBuf[NR_CPUS][PVTCP_SOCK_BUF_SIZE]; + +#define PVTCP_OFF_MAX_LB_ADDRS 255 +static unsigned int loopbackAddrs[PVTCP_OFF_MAX_LB_ADDRS] = { + 0xffffffff, // Network address always on, all ports allowed. + 0x7fffffff // Host address not yet on, all ports allowed. + // All the rest zeroed out. +}; + +static const unsigned int loopbackReserved = 0x00000001 << 31; + + +#define PvtcpTestLoopbackBit(entry, mask) \ + ((entry) & (mask)) + +#define PvtcpSetLoopbackBit(entry, mask) \ + ((entry) |= (mask)) + +#define PvtcpResetLoopbackBit(entry, mask) \ + ((entry) &= ~(mask)) + + +static inline int +PvtcpTestPortIndexBit(unsigned int addr, + unsigned int portIdx) +{ + return PvtcpTestLoopbackBit(loopbackAddrs[*((unsigned char *)&addr + 3)], + BIT(portIdx)); +} + + +static inline void +PvtcpSetPortIndexBit(unsigned int addr, + unsigned int portIdx) +{ + PvtcpSetLoopbackBit(loopbackAddrs[*((unsigned char *)&addr + 3)], + BIT(portIdx)); +} + + +static inline void +PvtcpResetPortIndexBit(unsigned int addr, + unsigned int portIdx) +{ + PvtcpResetLoopbackBit(loopbackAddrs[*((unsigned char *)&addr + 3)], + BIT(portIdx)); +} + + +unsigned int pvtcpLoopbackOffAddr; + +unsigned long long pvtcpOffDgramAllocations = 0; + +/* + * Destructor shim addresses and function pointer + */ + +extern void asmDestructorShim(struct sock*); + + +/* + * Functions. + */ + +/** + * @brief Release a socket, NULLing out the fake file field to avoid confusing + * Linux on the release path + * @param sock socket to release + */ +static void +SockReleaseWrapper(struct socket *sock) +{ + sock->file = NULL; + sock_release(sock); +} + +/** + * @brief Gets a new loopback address in the 127.238.0.255 network. + * Note that the first address, 127.238.0.1, is always the host's. + * @return new address or -1U if none is available. + */ + +static unsigned int +GetLoopbackAddr(void) +{ + static unsigned char addrTempl[4] = { 127, 238, 0, 0 }; + unsigned int rc = -1U; + unsigned int idx; + struct socket *sock; + + CommOS_MutexLock(&globalLock); + for (idx = 1; idx < PVTCP_OFF_MAX_LB_ADDRS; idx++) { + if (!PvtcpTestLoopbackBit(loopbackAddrs[idx], loopbackReserved)) { + addrTempl[3] = (unsigned char)idx; + memcpy(&rc, addrTempl, sizeof rc); + + /* Create a dgram socket to configure/bring-up the lo:N interface. */ + + if (!sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock)) { + int err; + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr = { .s_addr = rc } + }; + struct ifreq ifr = { + .ifr_flags = IFF_UP + }; + + snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "lo:%u", idx); + memcpy(&ifr.ifr_addr, &sin, sizeof ifr.ifr_addr); + err = kernel_sock_ioctl(sock, SIOCSIFADDR, (unsigned long)&ifr); + sock_release(sock); + if (err) { + CommOS_Log(("%s: Could not set loopback address (ioctl)!\n", + __FUNCTION__)); + rc = -1U; + continue; /* Try next address. */ + } else { + PvtcpSetLoopbackBit(loopbackAddrs[idx], loopbackReserved); + CommOS_Debug(("%s: Allocated loopback address [%u.%u.%u.%u].\n", + __FUNCTION__, + addrTempl[0], addrTempl[1], + addrTempl[2], addrTempl[3])); + break; + } + } else { + CommOS_Log(("%s: Could not set loopback address (create)!\n", + __FUNCTION__)); + rc = -1U; + break; + } + } + } + if (idx == PVTCP_OFF_MAX_LB_ADDRS) { + CommOS_Log(("%s: loopback address range exceeded!\n", __FUNCTION__)); + } + + CommOS_MutexUnlock(&globalLock); + return rc; +} + + +/** + * @brief Puts back a loopback address in the 127.238.0.255 network. + * @param uaddr address to put back. + */ + +static void +PutLoopbackAddr(unsigned int uaddr) +{ + const unsigned char addrTempl[3] = { 127, 238, 0 }; + unsigned char addr[4]; + unsigned int idx; + struct socket *sock; + + memcpy(addr, &uaddr, sizeof uaddr); + if (memcmp(addrTempl, addr, sizeof addrTempl)) { + return; + } + + idx = addr[3]; + if ((idx == 0) || (idx >= PVTCP_OFF_MAX_LB_ADDRS)) { + return; + } + + CommOS_MutexLock(&globalLock); + if (!PvtcpTestLoopbackBit(loopbackAddrs[idx], loopbackReserved)) { + CommOS_Debug(("%s: loopback entry [%u] already freed.\n", + __FUNCTION__, idx)); + goto out; + } + + if (!sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock)) { + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr = { .s_addr = uaddr } + }; + struct ifreq ifr = { + .ifr_flags = 0 + }; + + snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "lo:%u", idx); + memcpy(&ifr.ifr_addr, &sin, sizeof ifr.ifr_addr); + kernel_sock_ioctl(sock, SIOCSIFFLAGS, (unsigned long)&ifr); + sock_release(sock); + loopbackAddrs[idx] = 0; // Zero everything out. + CommOS_Debug(("%s: Deallocated loopback address [%u.%u.%u.%u].\n", + __FUNCTION__, addr[0], addr[1], addr[2], addr[3])); + } else { + CommOS_Log(("%s: Could not delete loopback address!\n", + __FUNCTION__)); + } + +out: + CommOS_MutexUnlock(&globalLock); +} + + +/** + * @brief Retrieves and retains the namespace associated with a channel. + * A server must be listening for requests to retrieve the pid of the + * process owning the net namespace for the passed context/vm id. + * Communication takes place over a datagram socket in the AF_UNIX family, + * bound to "/usr/lib/vmware/pvtcp/config/serv_addr". + * @param state channel state for which to retrieve the network namespace. + * @sideeffect If an associated namespace is found, it is retained and saved + * in the state object. + */ + +static void +GetNetNamespace(PvtcpState *state) +{ +#if defined(CONFIG_NET_NS) && !defined(PVTCP_NET_NS_DISABLE) + CommTranspInitArgs args; + pid_t pidn; + struct pid *pid; + struct task_struct *tsk; + struct nsproxy *nsproxy; + struct net *ns; + struct socket *sock; + struct sockaddr_un addr = { + .sun_family = AF_UNIX + }; + struct timeval timeout = { + .tv_sec = 3000, + .tv_usec = 0 + }; + const int passcred = 1; + char buf[64]; + struct kvec vec; + const char *sockname = "pvtcp-vpn"; /* abstract namespace for AF_UNIX/LOCAL sockets */ + const size_t socknamelen = strlen(sockname); + + struct msghdr msg = { + .msg_name = (struct sockaddr *)&addr, + .msg_namelen = 1 + offsetof(struct sockaddr_un, sun_path) + socknamelen + }; + + + if (!state) { + return; + } + + args = CommSvc_GetTranspInitArgs(state->channel); + ns = NULL; + pidn = 0; + + if (sock_create_kern(AF_UNIX, SOCK_DGRAM, 0, &sock)) { + CommOS_Debug(("%s: Can't create config socket!\n", __FUNCTION__)); + goto out; + } + if (kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&timeout, sizeof timeout)) { + sock_release(sock); + CommOS_Debug(("%s: Can't set timeout on config socket!\n", __FUNCTION__)); + goto out; + } + if (kernel_setsockopt(sock, SOL_SOCKET, SO_PASSCRED, + (char *)&passcred, sizeof passcred)) { + sock_release(sock); + CommOS_Debug(("%s: Can't set passcred on config socket!\n", + __FUNCTION__)); + goto out; + } + + /* + * Send the configuration request and receive the reply: + * - the request carries the VM/guest ID as used in the transport + * arguments used to create the channel. + * - the reply is expected to contain the pid of the namespace owner. + */ + + memset(buf, 0, sizeof buf); + snprintf(buf, sizeof buf, "%u\n", args.id.d32[0]); + buf[sizeof buf - 1] = '\0'; + vec.iov_base = buf; + vec.iov_len = strlen(buf); + + /* use anonymous name */ + addr.sun_path[0] = 0; + memcpy(addr.sun_path+1, sockname, socknamelen); + + if (kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len) <= 0) { + sock_release(sock); + CommOS_Debug(("%s: Could not send config request for vm [%u]!\n", + __FUNCTION__, args.id.d32[0])); + goto out; + } + + memset(buf, 0, sizeof buf); + vec.iov_base = buf; + vec.iov_len = sizeof buf; + if (kernel_recvmsg(sock, &msg, &vec, 1, vec.iov_len, 0) <= 0) { + CommOS_Debug(("%s: Could not receive config reply for vm [%u]!\n", + __FUNCTION__, args.id.d32[0])); + } else { + buf[sizeof buf - 1] = '\0'; + /* coverity[secure_coding] */ + sscanf(buf, "%d", &pidn); + } + sock_release(sock); + + if (!pidn) { + goto out; + } + + pid = find_get_pid(pidn); + if (pid) { + tsk = pid_task(pid, PIDTYPE_PID); + if (tsk) { + rcu_read_lock(); + nsproxy = task_nsproxy(tsk); + if (nsproxy && nsproxy->net_ns) { + ns = maybe_get_net(nsproxy->net_ns); + } + rcu_read_unlock(); + } + put_pid(pid); + } + +out: + if (!ns) { + CommOS_Debug(("%s: Not using a namespace for vm [%u].\n", + __FUNCTION__, args.id.d32[0])); + ns = &init_net; + } else { + CommOS_Debug(("%s: Found the net namespace for vm [%u].\n", + __FUNCTION__, args.id.d32[0])); + } +#else + void *ns = NULL; +#endif + + state->namespace = ns; +} + + +/** + * @brief Releases the network namespace associated with a channel state. + * @param namespace namespace to be released. + * @sideeffect If the namespace is not the initial one, it is released. + */ + +static void +PutNetNamespace(void *namespace) +{ +#if defined(CONFIG_NET_NS) && !defined(PVTCP_NET_NS_DISABLE) + if (namespace && (namespace != &init_net)) { + put_net((struct net *)namespace); + } +#endif +} + + +/** + * @brief Offload state constructor called when a channel is created. + * The function first calls the default state allocator; it then retrieves + * the n/w namespace associated with this client, retains it and stores it + * in the state object. Finally, it creates a sysfs node. + * @param[in,out] channel channel to initialize. + * @return pointer to a new state structure or NULL. + * @sideeffect Allocates memory. + */ + +static void * +StateAlloc(CommChannel channel) +{ + extern struct kset *Mvpkm_FindVMNamedKSet(int, const char *); + PvtcpState *state = NULL; + PvtcpIf *loopbackNetif = NULL; + PvtcpStateKObj *stateKObj = NULL; + struct kset *kset = NULL; + int rc; + CommTranspInitArgs transpArgs; + + transpArgs = CommSvc_GetTranspInitArgs(channel); + + /* + * The transport ID is assigned in an implementation-dependent way. + * (see lib/comm/comm_transp.h for transport type definitions.) + * However, the first 32 bits are expected to denote the guest/VM ID, + * while the last 32 bits are a resource handle within that VM. On MVP, + * transports map to queue pairs, which follow this convention. + */ + + kset = Mvpkm_FindVMNamedKSet((int)transpArgs.id.d32[0], "devices"); + if (!kset) { + CommOS_Debug(("%s: Could not find sysfs '.../vm/N/devices' kset!\n", + __FUNCTION__)); + goto error; + } + + state = PvtcpStateAlloc(channel); + if (!state) { + CommOS_Debug(("%s: Could not allocate state!\n", __FUNCTION__)); + goto error; + } + + /* coverity[leaked_storage] */ + stateKObj = kzalloc(sizeof *stateKObj, GFP_KERNEL); + if (!stateKObj) { + CommOS_Debug(("%s: Could not allocate state kobject!\n", __FUNCTION__)); + goto error; + } + + stateKObj->kobj.kset = kset; + /* coverity[leaked_storage] */ + rc = kobject_init_and_add(&stateKObj->kobj, &stateKType, NULL, "pvtcp"); + if (rc) { + CommOS_Debug(("%s: Could not add state kobject to parent kset [%d]!\n", + __FUNCTION__, rc)); + goto error; + } + + loopbackNetif = PvtcpStateFindIf(state, pvtcpIfLoopbackInet4); + BUG_ON(loopbackNetif == NULL); + loopbackNetif->conf.addr.in.s_addr = GetLoopbackAddr(); + if (loopbackNetif->conf.addr.in.s_addr == -1U) { + CommOS_Log(("%s: Could not allocate loopback address!\n", __FUNCTION__)); + goto error; + } + + GetNetNamespace(state); + + stateKObj->transpArgs = transpArgs; + stateKObj->pvsockAddr = loopbackNetif->conf.addr.in.s_addr; +#if defined(CONFIG_NET_NS) + stateKObj->haveNS = (state->namespace != &init_net); + stateKObj->useNS = stateKObj->haveNS; +#endif + state->extra = stateKObj; + + _cred.uid = _cred.gid = _cred.suid = _cred.sgid = + _cred.euid = _cred.egid = _cred.fsuid = _cred.fsgid = Mvpkm_vmwareUid; + + +out: + if (kset) { + kset_put(kset); + } + return state; + +error: + if (stateKObj) { + kobject_del(&stateKObj->kobj); + kobject_put(&stateKObj->kobj); + } + if (loopbackNetif && (loopbackNetif->conf.addr.in.s_addr != -1U)) { + PutLoopbackAddr(loopbackNetif->conf.addr.in.s_addr); + } + if (state) { + PvtcpStateFree(state); + state = NULL; + } + goto out; +} + + +/** + * @brief Offload state destructor called when a channel is closed. + * The function releases this client's n/w namespace and then calls the + * default state deallocator. + * @param arg pointer to state structure. + * @sideeffect Destroys all netifs and their sockets, deallocates memory. + */ + +static void +StateFree(void *arg) +{ + PvtcpState *state = arg; + PvtcpIf *loopbackNetif; + void *namespace; + + if (!state) { + return; + } + + if (state->extra) { + PvtcpStateKObj *stateKObj = state->extra; + + kobject_del(&stateKObj->kobj); + kobject_put(&stateKObj->kobj); + } + + namespace = state->namespace; + loopbackNetif = PvtcpStateFindIf(state, pvtcpIfLoopbackInet4); + BUG_ON(loopbackNetif == NULL); + PutLoopbackAddr(loopbackNetif->conf.addr.in.s_addr); + PvtcpStateFree(state); + PutNetNamespace(namespace); +} + + +/** + * @brief Releases socket. This function is called when the channel state + * owning the socket is closed. + * @param[in,out] pvsk PV socket to release. + * @sideeffect the socket eventually gets deallocated. + */ + +void +PvtcpReleaseSocket(PvtcpSock *pvsk) +{ + struct socket *sock = SkFromPvsk(pvsk)->sk_socket; + + SOCK_IN_LOCK(pvsk); + SOCK_OUT_LOCK(pvsk); + pvsk->peerSockSet = 0; + SockReleaseWrapper(sock); + SOCK_OUT_UNLOCK(pvsk); + SOCK_IN_UNLOCK(pvsk); + CommOS_Debug(("%s: [0x%p].\n", __FUNCTION__, pvsk)); +} + + +/** + * @brief Tests if the passed address is 127.238.0.1 or 127.0.0.1. + * @param pvsk socket to test. + * @param addr inet4 address to test. + * @return > 1: morph and propagate new address to caller, 1: just morph, + * 0: don't morph, < 0 (-EADDRNOTAVAIL): bad loopback. + */ + +static inline int +TestLoopbackInet4(PvtcpSock *pvsk, + unsigned int addr) +{ + if (!ipv4_is_loopback(addr)) { + return 0; + } + + if (addr != htonl(PVTCP_PVSOCK_ADDR)) { + if (addr != htonl(INADDR_LOOPBACK)) { + return -EADDRNOTAVAIL; + } + if (PvtcpHasSockNamespace(pvsk)) { + /* We don't morph normal 127.0.0.1 when NS present. */ + + return 0; + } + return 2; + } + + return 1; +} + + +/** + * @brief Tests if the passed address is 127.238.0.1 or 127.0.0.1 and the + * socket has a namespace. If yes, the address will be morphed into + * the actual loopback address, then a bind() is performed. + * Note that the function returns EADDRNOTAVAIL for any other loopbacks. + * @param pvsk socket to test. + * @param[in,out] addr inet4 address to test. + * @param port port to bind, or zero for any port. + * @return 1 if bind should be performed by caller, bind return code otherwise. + */ + +int +PvtcpTestAndBindLoopbackInet4(PvtcpSock *pvsk, + unsigned int *addr, + unsigned short port) +{ + int rc; + struct sockaddr_in sin; + unsigned int morphedAddr; + int propagate = 0; + + rc = TestLoopbackInet4(pvsk, *addr); + switch (rc) { + case 2: + propagate = 1; // Fall through. + case 1: + break; // Proceed with morphing. + case 0: + return 1; // Don't morph, let bind() be done by caller. + default: + return rc; + } + + if (pvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4) { + /* The socket has already been morphed/bound. */ + + morphedAddr = pvsk->netif->conf.addr.in.s_addr; + rc = 0; + goto out; + } + + /* + * Move the socket to the initial namespace before binding it + * such that the loopback address is accessible to the host. + */ + + PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL); + PvtcpStateAddSocket(pvsk->channel, pvtcpIfLoopbackInet4, pvsk); + morphedAddr = pvsk->netif->conf.addr.in.s_addr; + memset(&sin, 0, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_port = port; + sin.sin_addr.s_addr = morphedAddr; + + /* Bind to the channel loopback address. */ + + rc = kernel_bind(SkFromPvsk(pvsk)->sk_socket, + (struct sockaddr *)&sin, sizeof sin); + if (rc) { + PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_CHANNEL); + PvtcpStateAddSocket(pvsk->channel, pvtcpIfUnbound, pvsk); + } else { + /* + * Bind succeeded on pvsock address. + * If this is a pvsock UDP reserved port, record it. + */ + + port = ntohs(port) - portRangeBase; + if ((SkFromPvsk(pvsk)->sk_socket->type == SOCK_DGRAM) && + (port < portRangeSize)) { + CommOS_MutexLock(&globalLock); + PvtcpSetPortIndexBit(pvsk->netif->conf.addr.in.s_addr, port); + CommOS_MutexUnlock(&globalLock); + } + + /* + * pvsock data usage shouldn't be counted as MVP external traffic. + */ + SkFromPvsk(pvsk)->sk_socket->file = NULL; + } + +out: + if (propagate) { + *addr = morphedAddr; + } + return rc; +} + + +/** + * @brief Tests if the passed address is IPV4-mapped 127.238.0.1 or 127.0.0.1, + * clean ::1, and whether the socket has a namespace. + * If needed, the address will be morphed into the actual loopback address, + * then a bind() is performed. + * Note that the function returns EADDRNOTAVAIL for any other loopbacks. + * @param pvsk socket to test. + * @param[in,out] addr0 first 64 bits of inet6 address to test. + * @param[in,out] addr1 last 64 bits of inet6 address to test. + * @param port port to bind, or zero for any port. + * @return 1 if bind should be performed by caller, bind return code otherwise. + */ + +int +PvtcpTestAndBindLoopbackInet6(PvtcpSock *pvsk, + unsigned long long *addr0, + unsigned long long *addr1, + unsigned short port) +{ + int rc; + struct sockaddr_in6 sin6; + union { + unsigned long long halves[2]; + struct in6_addr in6; + } in6Addr = { + .halves = { *addr0, *addr1 } + }; + int propagate = 0; + const int ipv6Only = 0; + + if (ipv6_addr_loopback(&in6Addr.in6)) { + if (PvtcpHasSockNamespace(pvsk)) { + return 1; + } + + /* Remember that we were passed '::1'. */ + + PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_IPV6_LOOP, 1); + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &in6Addr.in6); + } + + if (!ipv6_addr_v4mapped(&in6Addr.in6)) { + /* If the address is not ipv4-mapped, stop testing. */ + + return 1; + } + + rc = TestLoopbackInet4(pvsk, in6Addr.in6.s6_addr32[3]); + switch (rc) { + case 2: + propagate = 1; // Fall through. + case 1: + break; // Proceed with morphing. + case 0: + return 1; // Don't morph, let bind() be done by caller. + default: + return rc; + } + + if (pvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4) { + /* The socket has already been morphed/bound. */ + + ipv6_addr_set_v4mapped(pvsk->netif->conf.addr.in.s_addr, &in6Addr.in6); + rc = 0; + goto out; + } + + /* + * Move the socket to the initial namespace before binding it + * such that the loopback address is accessible to the host. + */ + + PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL); + PvtcpStateAddSocket(pvsk->channel, pvtcpIfLoopbackInet4, pvsk); + ipv6_addr_set_v4mapped(pvsk->netif->conf.addr.in.s_addr, &in6Addr.in6); + memset(&sin6, 0, sizeof sin6); + sin6.sin6_family = AF_INET6; + sin6.sin6_port = port; + sin6.sin6_addr = in6Addr.in6; + + /* + * Ensure we can use ipv4 mapped addresses and bind to the channel + * loopback address. + */ + + (void)kernel_setsockopt(SkFromPvsk(pvsk)->sk_socket, IPPROTO_IPV6, + IPV6_V6ONLY, (char *)&ipv6Only, sizeof ipv6Only); + rc = kernel_bind(SkFromPvsk(pvsk)->sk_socket, + (struct sockaddr *)&sin6, sizeof sin6); + if (rc) { + PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_CHANNEL); + PvtcpStateAddSocket(pvsk->channel, pvtcpIfUnbound, pvsk); + } else { + /* + * Bind succeeded on pvsock address. + * If this is a pvsock UDP reserved port, record it. + */ + + port = ntohs(port) - portRangeBase; + if ((SkFromPvsk(pvsk)->sk_socket->type == SOCK_DGRAM) && + (port < portRangeSize)) { + CommOS_MutexLock(&globalLock); + PvtcpSetPortIndexBit(pvsk->netif->conf.addr.in.s_addr, port); + CommOS_MutexUnlock(&globalLock); + } + + /* + * pvsock data usage shouldn't be counted as MVP external traffic. + */ + SkFromPvsk(pvsk)->sk_socket->file = NULL; + } + +out: + if (propagate) { + *addr0 = in6Addr.halves[0]; + *addr1 = in6Addr.halves[1]; + } + return rc; +} + + +/** + * @brief Resets a 127.238.0.N address to 127.0.0.1. + * @param pvsk socket whose address needs resetting. + * @param[in,out] addr inet4 address to reset. + */ + +void +PvtcpResetLoopbackInet4(PvtcpSock *pvsk, + unsigned int *addr) +{ + if (!PvtcpHasSockNamespace(pvsk)) { + static const unsigned int pvsockAddr = htonl(PVTCP_PVSOCK_ADDR); + + if (!memcmp(&pvsockAddr, addr, 3) && memcmp(&pvsockAddr, addr, 4)) { + /* If it's a pvsock address but _not_ the host's, overwrite it. */ + + *addr = htonl(INADDR_LOOPBACK); + } + } +} + + +/** + * @brief Resets an IPV4-mapped ::ffff:127.238.0.N IPV6 address to loopback. + * @param pvsk socket whose address needs resetting. + * @param[in,out] in6 inet6 address to reset. + */ + +void +PvtcpResetLoopbackInet6(PvtcpSock *pvsk, + struct in6_addr *in6) +{ + if (!PvtcpHasSockNamespace(pvsk) && ipv6_addr_v4mapped(in6)) { + if (PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_IPV6_LOOP)) { + /* If the original address came in as ::1, we reset as such. */ + + static const struct in6_addr in6Loopback = IN6ADDR_LOOPBACK_INIT; + + *in6 = in6Loopback; + } else { + PvtcpResetLoopbackInet4(pvsk, &in6->s6_addr32[3]); + } + } +} + + +/** + * @brief Called at module load time. It registers with the Comm runtime. + * @param args initialization arguments + * @return zero if successful, -1 otherwise + * @sideeffect Leaves the module loaded + */ + +static int +Init(void *args) +{ + int rc = -1; + +#if !defined(PVTCP_DISABLE_NETFILTER) + rc = nf_register_hooks(netfilterHooks, ARRAY_SIZE(netfilterHooks)); + if (rc) { + CommOS_Log(("%s: Could not register netfilter hooks!\n", __FUNCTION__)); + goto out; + } else { + CommOS_Debug(("%s: Registered netfilter hooks.\n", __FUNCTION__)); + } + hooksRegistered = 1; +#else + CommOS_Log(("%s: Netfilter hooks disabled.\n", __FUNCTION__)); +#endif + + CommOS_MutexInit(&globalLock); + CommOS_WriteAtomic(&PvtcpOutputAIOSection, 0); + PvtcpOffLargeDgramBufInit(); + + pvtcpImpl.owner = CommOS_ModuleSelf(); + pvtcpImpl.stateCtor = StateAlloc; + pvtcpImpl.stateDtor = StateFree; + if (CommSvc_RegisterImpl(&pvtcpImpl) == 0) { + rc = 0; + pvtcpLoopbackOffAddr = GetLoopbackAddr(); + if (pvtcpLoopbackOffAddr == -1U) { + CommOS_Log(("%s: Could not allocate offload loopback address!\n", + __FUNCTION__)); + rc = -1; + CommSvc_UnregisterImpl(&pvtcpImpl); + } + } + +out: + if (rc) { + if (hooksRegistered) { + nf_unregister_hooks(netfilterHooks, ARRAY_SIZE(netfilterHooks)); + } + } + return rc; +} + + +/** + * @brief Called at module unload time. It shuts down pvtcp. + * @sideeffect Total and utter destruction. + */ + +static void +Exit(void) +{ + PutLoopbackAddr(pvtcpLoopbackOffAddr); + CommSvc_UnregisterImpl(&pvtcpImpl); +#if !defined(PVTCP_DISABLE_NETFILTER) + if (hooksRegistered) { + nf_unregister_hooks(netfilterHooks, ARRAY_SIZE(netfilterHooks)); + CommOS_Debug(("%s: Netfilter hooks unregistered.\n", __FUNCTION__)); + } +#endif + CommOS_Log(("%s: Allocations of large datagrams: %llu.\n", + __FUNCTION__, pvtcpOffDgramAllocations)); +} + + +/* + * Socket callback interceptors. + */ + +/** + * @brief Callback called when socket is destroyed. + * @param[in,out] sk socket to cleanup + * @return 0 if socket memory is freed, < 0 otherwise (no-op) + * @sideeffect Send queue buffers are deallocated + */ + +int +DestructCB(struct sock *sk) +{ + PvtcpOffBuf *internalBuf; + PvtcpOffBuf *tmp; + PvtcpSock *pvsk = PvskFromSk(sk); + + if (!pvsk || + (SkFromPvsk(pvsk) != sk) || + (pvsk->destruct == asmDestructorShim)) { + /* Module put _not_ to be performed by asmDestructorShim. */ + + CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__)); + return -1; + } + + CommOS_ListForEachSafe(&pvsk->queue, internalBuf, tmp, link) { + CommOS_ListDel(&internalBuf->link); + PvtcpBufFree(PvtcpOffBufFromInternal(internalBuf)); + } + if (pvsk->destruct) { + pvsk->destruct(sk); + } + + if (pvsk->rpcReply) { + CommOS_Kfree(pvsk->rpcReply); + } + CommOS_Kfree(pvsk); + + /* + * Module put is performed by asmDestructorShim. + */ + + return 0; +} + + +/** + * @brief Callback called when socket state changes occur. + * @param sk socket specified socket which changed state + * @sideeffect A writer task may be scheduled + */ + +static void +StateChangeCB(struct sock *sk) +{ + PvtcpSock *pvsk = PvskFromSk(sk); + + if (!pvsk || + (SkFromPvsk(pvsk) != sk) || + (pvsk->stateChange == StateChangeCB)) { + CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__)); + return; + } + + /* + * The socket (spin) lock is held when this function is called. + */ + + CommOS_Debug(("%s: [0x%p] sk_state [%u] sk_err [%d] sk_err_soft [%d].\n", + __FUNCTION__, pvsk, sk->sk_state, + sk->sk_err, sk->sk_err_soft)); + if (pvsk->stateChange) { + pvsk->stateChange(sk); + } + if (sk->sk_state == TCP_ESTABLISHED) { + PvskSetOpFlag(pvsk, PVTCP_OP_CONNECT); + } + PvtcpSchedSock(pvsk); +} + + +/** + * @brief Callback called when an error is set on the socket. + * @param sk socket the error happened on + * @sideeffect A writer task may be scheduled + */ + +static void +ErrorReportCB(struct sock *sk) +{ + PvtcpSock *pvsk = PvskFromSk(sk); + + if (!pvsk || + (SkFromPvsk(pvsk) != sk) || + (pvsk->errorReport == ErrorReportCB)) { + CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored\n", __FUNCTION__)); + return; + } + + /* + * The socket (spin) lock is held when this function is called. + * Interesting sk_err-s: + * ECONNRESET - tcp_disconnect(), tcp_reset() + * ECONNREFUSED - tcp_reset() + * EPIPE - tcp_reset() + * ETIMEDOUT - tcp_write_error() + * EHOSTUNREACH, etc. - tcp_v4_error()??, icmp errors + * etc. - __udp4_lib_err(), icmp errors + */ + + CommOS_Debug(("%s: [0x%p] sk_err [%d] sk_err_soft [%d].\n", + __FUNCTION__, pvsk, sk->sk_err, sk->sk_err_soft)); + if (pvsk->errorReport) { + pvsk->errorReport(sk); + } + pvsk->err = sk->sk_err; + PvtcpSchedSock(pvsk); +} + + +/** + * @brief Callback called when data is available to be read from a socket. + * @param sk socket in question + * @param bytes number of bytes to read + * @sideeffect A writer task is scheduled _iff_ the peer can safely + * receive. + */ + +static void +DataReadyCB(struct sock *sk, + int bytes) +{ + PvtcpSock *pvsk = PvskFromSk(sk); + + if (!pvsk || + (SkFromPvsk(pvsk) != sk) || + (pvsk->dataReady == DataReadyCB)) { + CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__)); + return; + } + + /* + * The socket (spin) lock is held when this function is called. + */ + + if (pvsk->dataReady) { + pvsk->dataReady(sk, bytes); + } + if (sk->sk_state == TCP_LISTEN) { + CommOS_Debug(("%s: Listen socket ready to accept [0x%p].\n", + __FUNCTION__, pvsk)); + } + PvtcpSchedSock(pvsk); +} + + +/** + * @brief Callback called when writing is possible on a socket. + * @param sk socket in question + * @sideeffect An AIO thread is scheduled. + */ + +static void +WriteSpaceCB(struct sock *sk) +{ + PvtcpSock *pvsk = PvskFromSk(sk); + + if (!pvsk || + (SkFromPvsk(pvsk) != sk) || + (pvsk->writeSpace == WriteSpaceCB)) { + CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__)); + return; + } + + /* + * The socket (spin) lock is held when this function is called. + */ + + if (pvsk->writeSpace) { + pvsk->writeSpace(sk); + } + PvtcpSchedSock(pvsk); +} + + +/** + * @brief Initializes a newly created socket for offload operations. + * @param[in,out] sock socket to initialize + * @param channel channel to update + * @param peerSock peer PV socket of this socket + * @param parentPvsk parent of this socket or NULL + * @return zero on success, error code otherwise + */ + +static int +SockAllocInit(struct socket *sock, + CommChannel channel, + unsigned long long peerSock, + PvtcpSock *parentPvsk) +{ + struct sock *sk; + PvtcpSock *pvsk; + int sndBuf = PVTCP_SOCK_RCVSIZE * 4; + + if (!sock || !channel || !peerSock) { + return -EINVAL; + } + + sk = sock->sk; + sk->sk_user_data = NULL; + + pvsk = CommOS_Kmalloc(sizeof *pvsk); + if (!pvsk) { + return -ENOMEM; + } + + if (PvtcpOffSockInit(pvsk, channel)) { + CommOS_Kfree(pvsk); + return -ENOMEM; + } + + /* + * PVTCP sockets should be billed against the vmware uid. + */ + sk->sk_socket->file = &_file; + + /* Set peer (pv) socket. */ + pvsk->peerSock = peerSock; + pvsk->peerSockSet = 1; + + /* Set up back pointer. */ + pvsk->sk = sk; + + /* Keep track of new socket. */ + if (PvtcpStateAddSocket(channel, pvtcpIfUnbound, pvsk) != 0) { + CommOS_Kfree(pvsk); + return -ENOMEM; + } + + /* + * Keep pvtcp around for at least the lifetime of this socket + */ + CommOS_ModuleGet(pvtcpImpl.owner); + + if (!parentPvsk) { + pvsk->destruct = sk->sk_destruct; + sk->sk_destruct = asmDestructorShim; + pvsk->stateChange = sk->sk_state_change; + sk->sk_state_change = StateChangeCB; + pvsk->errorReport = sk->sk_error_report; + sk->sk_error_report = ErrorReportCB; + pvsk->dataReady = sk->sk_data_ready; + sk->sk_data_ready = DataReadyCB; + pvsk->writeSpace = sk->sk_write_space; + sk->sk_write_space = WriteSpaceCB; + } else { + /* + * Copy the parent's saved callbacks. The parent pvsk is only passed + * when creating/initializing a socket after an 'accept'. + */ + + pvsk->destruct = parentPvsk->destruct; + sk->sk_destruct = asmDestructorShim; + pvsk->stateChange = parentPvsk->stateChange; + sk->sk_state_change = StateChangeCB; + pvsk->errorReport = parentPvsk->errorReport; + sk->sk_error_report = ErrorReportCB; + pvsk->dataReady = parentPvsk->dataReady; + sk->sk_data_ready = DataReadyCB; + pvsk->writeSpace = parentPvsk->writeSpace; + sk->sk_write_space = WriteSpaceCB; + + if (parentPvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4) { + /* The parent socket was morphed/bound. */ + + PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL); + PvtcpStateAddSocket(pvsk->channel, pvtcpIfLoopbackInet4, pvsk); + } + } + + /* Install forward socket reference. */ + sk->sk_user_data = pvsk; + + /* + * Force the send buffer size high enough, such that we don't lose the + * just-a-bit-over-the-limit bytes. This is mainly needed for datagrams. + * Note that we always apply flow control between host and guest modules, + * according to the sizing model; so this is not artificially inflated. + */ + + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUFFORCE, + (void *)&sndBuf, sizeof sndBuf); + + return 0; +} + + +/** + * @brief Allocates a pvsk socket for error reporting (create operation). + * @param err error code to report to PV side + * @param channel channel error socket belongs to + * @param peerSock peer PV socket of this socket + * @return error socket on success, NULL otherwise + */ + +static PvtcpSock * +SockAllocErrInit(int err, + CommChannel channel, + unsigned long long peerSock) +{ + PvtcpSock *pvsk; + + if (!channel || !peerSock) { + return NULL; + } + + pvsk = CommOS_Kmalloc(sizeof *pvsk); + if (!pvsk) { + return NULL; + } + + if (PvtcpOffSockInit(pvsk, channel)) { + CommOS_Kfree(pvsk); + return NULL; + } + + /* Set peer (pv) socket and error. */ + pvsk->peerSock = peerSock; + pvsk->peerSockSet = 1; + pvsk->err = err; + + /* Set up back pointer to NULL such that PvtcpPutSock deallocates it. */ + pvsk->sk = NULL; + return pvsk; +} + + +/* + * Offload operations. + */ + +/** + * @brief Creates an offload socket and schedules it for reply. + * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect A writer task is scheduled, which will send reply back. + */ + +void +PvtcpCreateOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + int rc; + struct socket *sock; + PvtcpSock *pvsk; + PvtcpState *state = (PvtcpState *)upperLayerState; + const int enable = 1; + + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + +#if defined(PVTCP_IPV6_DISABLE) + if (packet->data16 == AF_INET6) { + CommOS_Debug(("%s: AF_INET6 support is disabled.\n", __FUNCTION__)); + rc = -EAFNOSUPPORT; + } else +#endif + { + rc = sock_create_kern(packet->data16, packet->data32, + packet->data32ex, &sock); + } + + if (!rc) { + rc = SockAllocInit(sock, channel, packet->data64, NULL); + if (rc) { + SockReleaseWrapper(sock); + goto fail; + } + kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (void *)&enable, sizeof enable); + pvsk = PvskFromSk(sock->sk); + if (state->extra && + ((PvtcpStateKObj *)(state->extra))->useNS) { + PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_CHANNEL); + } else { + PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL); + } + PvtcpStateAddSocket(pvsk->channel, pvtcpIfUnbound, pvsk); + PvskSetOpFlag(pvsk, PVTCP_OP_CREATE); + } else { + CommOS_Debug(("%s: Error creating offload socket: %d\n", + __FUNCTION__, rc)); + /* + * Pass -rc so we follow error conventions for other reply ops. + * The error code is fixed by the PV side so error codes are properly + * reported. + */ + pvsk = SockAllocErrInit(-rc, channel, packet->data64); + if (!pvsk) { + goto fail; + } + } + + PvtcpSchedSock(pvsk); + return; + +fail: + CommOS_Log(("%s: BOOG ** FAILED TO CREATE OFFLOAD SOCKET [%d] " + "_AND_ ERROR REPORTING SOCKET!\n" + " PV SIDE MAY BE LOCKED UP UNTIL CREATE RPC TIMES OUT!", + __FUNCTION__, rc)); +} + + +/** + * @brief Schedules an offload socket to be removed. + * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect A writer task is scheduled, which will send reply back and + * then release the socket. + */ + +void +PvtcpReleaseOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); + struct sock *sk = SkFromPvsk(pvsk); + + /* + * Check if this is a pvsock datagram socket bound on a reserved port. + * If so, reset the bit such that filtering drops rogue packets. + */ + + if ((sk->sk_socket->type == SOCK_DGRAM) && + (pvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4)) { + unsigned short port = 0; + + if (sk->sk_family == AF_INET) { + struct sockaddr_in sin = { .sin_family = AF_INET }; + int addrLen = sizeof sin; + + if(!kernel_getsockname(sk->sk_socket, + (struct sockaddr *)&sin, &addrLen)) { + port = sin.sin_port; + } + } else { /* AF_INET6 */ + struct sockaddr_in6 sin = { .sin6_family = AF_INET6 }; + int addrLen = sizeof sin; + + if(!kernel_getsockname(sk->sk_socket, + (struct sockaddr *)&sin, &addrLen)) { + port = sin.sin6_port; + } + } + + port = ntohs(port) - portRangeBase; + if (port < portRangeSize) { + CommOS_MutexLock(&globalLock); + PvtcpResetPortIndexBit(pvsk->netif->conf.addr.in.s_addr, port); + CommOS_MutexUnlock(&globalLock); + } + } + + /* + * - hold the socket before setting the 'release' flag and until after + * the call to PvtcpSchedSock(): if the socket had already been scheduled + * ReleaseAIO may run, find the flag set and release this socket while + * it's being unlocked here. + * + * - hold the dispatch lock until done to ensure that subsequent Ops for + * this socket see peerSockSet == 0. + */ + + PvtcpHoldSock(pvsk); + SOCK_STATE_LOCK(pvsk); + pvsk->peerSockSet = 0; + SOCK_STATE_UNLOCK(pvsk); + PvskSetOpFlag(pvsk, PVTCP_OP_RELEASE); + PvtcpSchedSock(pvsk); + PvtcpPutSock(pvsk); + PVTCP_UNLOCK_DISP_DISCARD_VEC(); +} + + +/** + * @brief Binds an offload socket to a given address + * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect A writer task is scheduled, which will send reply back + */ + +void +PvtcpBindOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); + struct sock *sk = SkFromPvsk(pvsk); + struct sockaddr *addr; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + int reuseAddr; + int addrLen; + int rc; + + PvtcpHoldSock(pvsk); + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + + /* + * The socket-level option SO_REUSEADDR is set in the common socket code, + * meaning that we cannot intercept it in the guest pvtcp implementation. + * In order to respect the setting, the guest would pass the current + * setting in 'bind' requests. + * If the guest requires 'reuse address' setting, the value is incremented + * such that we differentiate between: 0) not requested, 1) 'false' and + * 2) 'true'. + */ + + reuseAddr = COMM_OPF_GET_VAL(packet->flags); + if ((reuseAddr == 1) || (reuseAddr == 2)) { + /* Explicit request, so decrement the value. */ + + reuseAddr--; + kernel_setsockopt(sk->sk_socket, SOL_SOCKET, SO_REUSEADDR, + (void *)&reuseAddr, sizeof reuseAddr); + } + + if (sk->sk_family == AF_INET) { + memset(&sin, 0, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_port = packet->data16; + sin.sin_addr.s_addr = (unsigned int)packet->data64ex; + addr = (struct sockaddr *)&sin; + addrLen = sizeof sin; + + rc = PvtcpTestAndBindLoopbackInet4(pvsk, &sin.sin_addr.s_addr, + sin.sin_port); + if (rc <= 0) { + /* Bind has already happened. */ + + pvsk->err = -rc; + goto out; + } + } else { /* AF_INET6 */ + memset(&sin6, 0, sizeof sin6); + sin6.sin6_family = AF_INET6; + sin6.sin6_port = packet->data16; + addr = (struct sockaddr *)&sin6; + addrLen = sizeof sin6; + + rc = PvtcpTestAndBindLoopbackInet6(pvsk, &packet->data64ex, + &packet->data64ex2, sin6.sin6_port); + if (rc <= 0) { + /* Bind has already happened. */ + + pvsk->err = -rc; + goto out; + } + PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0], + packet->data64ex, packet->data64ex2); + } + + /* coverity[check_return] */ + pvsk->err = -kernel_bind(sk->sk_socket, addr, addrLen); + +out: + PvskSetOpFlag(pvsk, PVTCP_OP_BIND); + PvtcpSchedSock(pvsk); + PvtcpPutSock(pvsk); +} + + +/** + * @brief Sets a socket option. + * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect A writer task is scheduled, which will send reply back + */ +void +PvtcpSetSockOptOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); + struct sock *sk = SkFromPvsk(pvsk); + struct socket *sock = sk->sk_socket; + unsigned int optlen = packet->len - sizeof *packet; + + PvtcpHoldSock(pvsk); + + if ((vecLen != 1) || (vec[0].iov_len != optlen) || (optlen < sizeof(int))) { + pvsk->rpcStatus = -EINVAL; + goto out; + } + + if (packet->data32 == SOL_TCP) { + /* + * The back-end implementation must always run in 'nodelay' mode. + * Consequently, we ignore, but we cache the TCP_NODELAY and TCP_CORK + * settings such that getsockopt() can return them as they were 'set'. + * Applications use these settings for performance; pvtcp does quite + * well if it's not interfered with. + */ + + int on; + + switch (packet->data32ex) { + case TCP_NODELAY: + memcpy(&on, vec[0].iov_base, sizeof on); + PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_TCP_NODELAY, on); + pvsk->rpcStatus = 0; + goto out; + case TCP_CORK: + memcpy(&on, vec[0].iov_base, sizeof on); + PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_TCP_CORK, on); + pvsk->rpcStatus = 0; + goto out; + } + } + + pvsk->rpcStatus = kernel_setsockopt(sock, + packet->data32, + packet->data32ex, + vec[0].iov_base, + optlen); + +out: + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + PvskSetOpFlag(pvsk, PVTCP_OP_SETSOCKOPT); + PvtcpSchedSock(pvsk); + PvtcpPutSock(pvsk); +} + + +/** + * @brief Retrieves a socket option. + * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect A writer task is scheduled, which will send reply back + */ +void +PvtcpGetSockOptOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); + struct sock *sk = SkFromPvsk(pvsk); + struct socket *sock = sk->sk_socket; + unsigned int optLen = (unsigned int)(packet->data64ex); + char *optBuf; + int rc = 0; + + PvtcpHoldSock(pvsk); + + if ((optLen < sizeof(int)) || (optLen > PVTCP_SOCK_SAFE_RCVSIZE)) { + pvsk->rpcStatus = -EINVAL; + goto out; + } + + optBuf = CommOS_Kmalloc(optLen); + if (!optBuf) { + pvsk->rpcStatus = -EINVAL; + goto out; + } + + if (packet->data32 == SOL_TCP) { + /* + * See comment in PvtcpSetSockOptOp() regarding special treatment for + * the TCP_NODELAY and TCP_CORK settings. + */ + + int on; + + switch (packet->data32ex) { + case TCP_NODELAY: + on = PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_TCP_NODELAY); + optLen = sizeof on; + memcpy(optBuf, &on, optLen); + goto done; + case TCP_CORK: + on = PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_TCP_CORK); + optLen = sizeof on; + memcpy(optBuf, &on, optLen); + goto done; + } + } + + rc = kernel_getsockopt(sock, packet->data32, + packet->data32ex, optBuf, &optLen); + +done: + if (!rc) { + pvsk->rpcReply = optBuf; + CommOS_MemBarrier(); + pvsk->rpcStatus = (int)optLen; + } else { + CommOS_Kfree(optBuf); + pvsk->rpcStatus = rc; + } + +out: + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + PvskSetOpFlag(pvsk, PVTCP_OP_GETSOCKOPT); + PvtcpSchedSock(pvsk); + PvtcpPutSock(pvsk); +} + + +/** + * @brief Performs ioctl on offload socket. + * @param channel communication channel with offloader + * @param state state associated with this channel + * @param packet packet header received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + */ + +void +PvtcpIoctlOp(CommChannel channel, + void *state, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, state); + struct sock *sk = SkFromPvsk(pvsk); + struct socket *sock = sk->sk_socket; + + PvtcpHoldSock(pvsk); + + /* Not implemented yet. */ + + (void)sock; + pvsk->rpcStatus = -ENOIOCTLCMD; + + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + PvskSetOpFlag(pvsk, PVTCP_OP_IOCTL); + PvtcpSchedSock(pvsk); + PvtcpPutSock(pvsk); +} + + +/** + * @brief Marks a socket for listening to incoming connections + * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect A writer task is scheduled, which will send reply back + */ + +void +PvtcpListenOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); + struct sock *sk = SkFromPvsk(pvsk); + int backlog = (int)packet->data32; + + PvtcpHoldSock(pvsk); + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + + pvsk->err = -kernel_listen(sk->sk_socket, backlog); + PvskSetOpFlag(pvsk, PVTCP_OP_LISTEN); + PvtcpSchedSock(pvsk); + PvtcpPutSock(pvsk); +} + + +/** + * @brief Accepts a connected socket + * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect A writer task is scheduled, which will send reply back. + */ + +void +PvtcpAcceptOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + int rc; + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); + struct sock *sk = SkFromPvsk(pvsk); + struct socket *newsock = NULL; + + PvtcpHoldSock(pvsk); + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + + rc = kernel_accept(sk->sk_socket, &newsock, O_NONBLOCK); + if (rc == 0) { + rc = SockAllocInit(newsock, channel, packet->data64ex, pvsk); + if (rc) { + SockReleaseWrapper(newsock); + } + } + + if (rc == 0) { + struct sock *newsk = newsock->sk; + PvtcpSock *newpvsk = PvskFromSk(newsk); + + /* We temporarily use the state field to cache parent socket. */ + + newpvsk->state = (PvtcpState *)pvsk; + PvskSetOpFlag(newpvsk, PVTCP_OP_ACCEPT); + PvtcpSchedSock(newpvsk); + } else { + pvsk->err = -rc; + PvskSetOpFlag(pvsk, PVTCP_OP_ACCEPT); + PvtcpSchedSock(pvsk); + } + + PvtcpPutSock(pvsk); +} + + +/** + * @brief Connects an offload socket to given address + * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect A writer task is scheduled, which will send reply back + */ + +void +PvtcpConnectOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); + struct sock *sk = SkFromPvsk(pvsk); + struct sockaddr *addr; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + int addrLen; + int flags = 0; + int rc = 0; + int disconnect = 0; + + PvtcpHoldSock(pvsk); + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + + if (sk->sk_family == AF_INET) { + addr = (struct sockaddr *)&sin; + addrLen = sizeof sin; + memset(&sin, 0, sizeof sin); + sin.sin_port = packet->data16; + sin.sin_addr.s_addr = (unsigned int)packet->data64ex; + if (COMM_OPF_GET_VAL(packet->flags)) { + sin.sin_family = AF_UNSPEC; + disconnect = 1; + goto connect; + } + sin.sin_family = AF_INET; + PvtcpTestAndBindLoopbackInet4(pvsk, &sin.sin_addr.s_addr, 0); + } else { /* AF_INET6 */ + addr = (struct sockaddr *)&sin6; + addrLen = sizeof sin6; + memset(&sin6, 0, sizeof sin6); + sin6.sin6_port = packet->data16; + if (COMM_OPF_GET_VAL(packet->flags)) { + sin6.sin6_family = AF_UNSPEC; + PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0], + packet->data64ex, packet->data64ex2); + disconnect = 1; + goto connect; + } + sin6.sin6_family = AF_INET6; + PvtcpTestAndBindLoopbackInet6(pvsk, &packet->data64ex, + &packet->data64ex2, 0); + PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0], + packet->data64ex, packet->data64ex2); + } + +connect: + rc = kernel_connect(sk->sk_socket, addr, addrLen, flags | O_NONBLOCK); + + /* + * For datagram sockets, ErrorReportCB is not called, so we need to + * explicitly set the pvsk error to be returned back to the guest. + * This should not be used on SOCK_STREAM sockets. You have been + * warned. + */ + + if (rc && (sk->sk_socket->type == SOCK_DGRAM)) { + pvsk->err = -rc; + } + + /* + * Quite likely, stream actual connect requests will set err to EINPROGRESS. + * That's fine, error_report will trigger an AIO/flow-op reply. When the + * connection is established, state_change schedules an AIO/connect reply. + * Record whether the request was a disconnect. + */ + + PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT, disconnect); + PvskSetOpFlag(pvsk, PVTCP_OP_CONNECT); + PvtcpSchedSock(pvsk); + PvtcpPutSock(pvsk); +} + + +/** + * @brief Initiates socket shutdown on an offload socket + * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect Socket queue will be drained and socket shutdown performed. + */ + +void +PvtcpShutdownOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); + int how = (int)packet->data32; + + PvtcpHoldSock(pvsk); + if ((how == SHUT_RD) || (how == SHUT_RDWR)) { + kernel_sock_shutdown(SkFromPvsk(pvsk)->sk_socket, SHUT_RD); + PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_RD, 1); + } + if ((how == SHUT_WR) || (how == SHUT_RDWR)) { + PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_WR, 1); + } + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + PvtcpSchedSock(pvsk); + PvtcpPutSock(pvsk); +} + + +/* + * AIO functions called from the main AIO processing function. + * Most of these functions complete processing initiated by the corresponding + * offload operations above. + */ + +/** + * @brief Processes socket release in an AIO thread. This function is + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket to release. + * @sideeffect the socket will be released upon return from this function. + */ + +static inline void +ReleaseAIO(PvtcpSock *pvsk) +{ + struct sock *sk = SkFromPvsk(pvsk); + struct socket *sock = sk->sk_socket; + CommPacket packet = { + .len = sizeof packet, + .flags = 0, + .opCode = PVTCP_OP_RELEASE, + .data64 = pvsk->peerSock, + .data64ex = PvtcpGetHandle(pvsk) + }; + unsigned long long timeout = COMM_MAX_TO; + + SOCK_OUT_LOCK(pvsk); + CommSvc_Write(pvsk->channel, &packet, &timeout); +#if defined(PVTCP_FULL_DEBUG) + CommOS_Debug(("%s: Sent 'Release' [0x%p] -> 0x%0x] reply.\n", + __FUNCTION__, pvsk, (unsigned)(pvsk->peerSock))); +#endif + /* + * 'sk' goes away in the final ProcessAIO::sock_put() + */ + SockReleaseWrapper(sock); + SOCK_OUT_UNLOCK(pvsk); + + PvtcpStateRemoveSocket(pvsk->channel, pvsk); +} + + +/** + * @brief Processes socket create reply in an AIO thread. This function is + * called with the socket 'in' lock taken. + * @param[in,out] pvsk newly created socket to send ack for. + */ + +static inline void +CreateAIO(PvtcpSock *pvsk) +{ + struct sock *sk; + struct socket *sock; + CommPacket packet = { + .len = sizeof packet, + .flags = 0, + .opCode = PVTCP_OP_CREATE, + .data64 = pvsk->peerSock, + }; + unsigned long long timeout = COMM_MAX_TO; + int rc; + + sk = SkFromPvsk(pvsk); + if (!sk) { + /* + * This is a create-error socket. The error reply has been sent out + * already, by PvtcpFlowAIO(). This is a paranoid safety measure, as + * PVTCP_OP_CREATE OpFlag should not have been set. + */ + + return; + } + + sock = sk->sk_socket; + packet.data64ex = PvtcpGetHandle(pvsk); + + rc = CommSvc_Write(pvsk->channel, &packet, &timeout); + if (rc != packet.len) { + /* We mustn't leak it if PV can't get a hold of it. */ + + PvtcpStateRemoveSocket(pvsk->channel, pvsk); + SockReleaseWrapper(sock); + CommOS_Log(("%s: BOOG -- Couldn't send 'Create' reply [0x%p]!\n", + __FUNCTION__, sk)); + } else { +#if defined(PVTCP_FULL_DEBUG) + CommOS_Debug(("%s: Sent 'Create' [0x%p] reply [%d].\n", + __FUNCTION__, pvsk, rc)); +#endif + } +} + + +/** + * @brief Processes socket bind in an AIO thread. This function is + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket being bound. + */ + +static inline void +BindAIO(PvtcpSock *pvsk) +{ + struct sock *sk = SkFromPvsk(pvsk); + struct socket *sock = sk->sk_socket; + CommPacket packet = { + .len = sizeof packet, + .flags = 0, + .opCode = PVTCP_OP_BIND, + .data64 = pvsk->peerSock + }; + unsigned long long timeout = COMM_MAX_TO; + int rc; + + if (pvsk->peerSockSet) { + if (sk->sk_family == AF_INET) { + struct sockaddr_in sin = { .sin_family = AF_INET }; + int addrLen = sizeof sin; + + rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen); + if (rc == 0) { + packet.data16 = sin.sin_port; + PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr); + packet.data64ex = (unsigned long long)sin.sin_addr.s_addr; + } + } else { /* AF_INET6 */ + struct sockaddr_in6 sin = { .sin6_family = AF_INET6 }; + int addrLen = sizeof sin; + + rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen); + if (rc == 0) { + packet.data16 = sin.sin6_port; + PvtcpResetLoopbackInet6(pvsk, &sin.sin6_addr); + PvtcpI6AddrPack(&sin.sin6_addr.s6_addr32[0], + &packet.data64ex, &packet.data64ex2); + } + } + + if (rc) { + COMM_OPF_SET_ERR(packet.flags); + packet.data32ex = (unsigned int)(-rc); + packet.opCode = PVTCP_OP_FLOW; + } + CommSvc_Write(pvsk->channel, &packet, &timeout); +#if defined(PVTCP_FULL_DEBUG) + CommOS_Debug(("%s: Sent 'Bind' [0x%p, %d] reply.\n", + __FUNCTION__, pvsk, rc)); +#endif + } +} + + +/** + * @brief Sends result of setsockopt back to guest. + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket that was modified. + */ + +static inline void +SetSockOptAIO(PvtcpSock *pvsk) +{ + CommPacket packet; + unsigned long long timeout; + + packet.len = sizeof packet; + packet.flags = 0; + packet.opCode = PVTCP_OP_SETSOCKOPT; + packet.data64 = pvsk->peerSock; + packet.data32 = (unsigned int)(pvsk->rpcStatus); + timeout = COMM_MAX_TO; + CommSvc_Write(pvsk->channel, &packet, &timeout); + pvsk->rpcStatus = 0; +} + + +/** + * @brief Sends result of getsockopt back to guest. + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket that was modified. + */ + +static inline void +GetSockOptAIO(PvtcpSock *pvsk) +{ + CommPacket packet = { + .opCode = PVTCP_OP_GETSOCKOPT, + .flags = 0 + }; + unsigned long long timeout = COMM_MAX_TO; + + struct kvec vec[1]; + struct kvec *inVec = vec; + unsigned int vecLen = 1; + unsigned int iovOffset = 0; + + if (pvsk->rpcStatus > 0) { + packet.len = sizeof packet + pvsk->rpcStatus; + vec[0].iov_base = pvsk->rpcReply; + vec[0].iov_len = pvsk->rpcStatus; + } else { + vecLen = 0; + } + + packet.data64 = pvsk->peerSock; + packet.data32 = pvsk->rpcStatus; + + CommSvc_WriteVec(pvsk->channel, &packet, &inVec, &vecLen, + &timeout, &iovOffset); + + if (pvsk->rpcReply) { + CommOS_Kfree(pvsk->rpcReply); + pvsk->rpcReply = NULL; + } + pvsk->rpcStatus = 0; +} + + +/** + * @brief Sends result of ioctl back to guest. + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket that was modified. + */ + +static inline void +IoctlAIO(PvtcpSock *pvsk) +{ + CommPacket packet = { + .len = sizeof packet, + .opCode = PVTCP_OP_IOCTL, + .flags = 0 + }; + unsigned long long timeout = COMM_MAX_TO; + + packet.data64 = pvsk->peerSock; + packet.data32 = pvsk->rpcStatus; + CommSvc_Write(pvsk->channel, &packet, &timeout); + pvsk->rpcStatus = 0; +} + + +/** + * @brief Processes socket listen reply in an AIO thread. This function is + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket being put in listen mode. + */ + +static inline void +ListenAIO(PvtcpSock *pvsk) +{ + struct sock *sk = SkFromPvsk(pvsk); + CommPacket packet = { + .len = sizeof packet, + .flags = 0, + .opCode = PVTCP_OP_LISTEN, + .data64 = pvsk->peerSock + }; + unsigned long long timeout = COMM_MAX_TO; + + if (pvsk->peerSockSet) { + if (sk->sk_state != TCP_LISTEN) { + COMM_OPF_SET_ERR(packet.flags); + packet.data32ex = (unsigned int)pvsk->err; + packet.opCode = PVTCP_OP_FLOW; + } + + CommSvc_Write(pvsk->channel, &packet, &timeout); +#if defined(PVTCP_FULL_DEBUG) + CommOS_Debug(("%s: Sent 'Listen' [0x%p] reply.\n", __FUNCTION__, pvsk)); +#endif + } +} + + +/** + * @brief Processes socket accept reply in an AIO thread. This function is + * called with the socket 'in' lock taken. + * @param[in,out] pvsk new socket or socket to accept on (see PvtcpAcceptOp). + */ + +static inline void +AcceptAIO(PvtcpSock *pvsk) +{ + struct sock *sk = SkFromPvsk(pvsk); + struct socket *sock = sk->sk_socket; + CommPacket packet = { + .len = sizeof packet, + .flags = 0, + .opCode = PVTCP_OP_ACCEPT + }; + unsigned long long timeout = COMM_MAX_TO; + const int enable = 1; + int rc; + + if (pvsk->peerSockSet) { + unsigned long long payloadSocks[2] = { 0, 0 }; + struct kvec payloadVec[] = { + { .iov_base = &payloadSocks, .iov_len = sizeof payloadSocks } + }; + struct kvec *payload = payloadVec; + unsigned int payloadLen = 1; + unsigned int iovOffset = 0; + + packet.len = sizeof packet + sizeof payloadSocks; + + /* + * accept() succeeded, so this is the child socket; its state field + * was temporarily changed to hold the parent/accepting socket. + * The newly accepted socket and its peer need to be put in a + * payload since we use up all available header fields with + * addressing information. Finally, the state field is restored. + */ + + packet.data64 = ((PvtcpSock *)pvsk->state)->peerSock; + pvsk->state = CommSvc_GetState(pvsk->channel); + + payloadSocks[0] = pvsk->peerSock; + payloadSocks[1] = PvtcpGetHandle(pvsk); + + rc = 0; + if (sk->sk_family == AF_INET) { + struct sockaddr_in sin = { .sin_family = AF_INET }; + int addrLen = sizeof sin; + + rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &addrLen); + if (rc == 0) { + packet.data16 = sin.sin_port; + PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr); + packet.data64ex = (unsigned long long)sin.sin_addr.s_addr; + } + } else { /* AF_INET6 */ + struct sockaddr_in6 sin = { .sin6_family = AF_INET6 }; + int addrLen = sizeof sin; + + rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &addrLen); + if (rc == 0) { + packet.data16 = sin.sin6_port; + PvtcpResetLoopbackInet6(pvsk, &sin.sin6_addr); + PvtcpI6AddrPack(&sin.sin6_addr.s6_addr32[0], + &packet.data64ex, &packet.data64ex2); + } + } + + if (rc == 0) { + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (void *)&enable, sizeof enable); + kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (void *)&enable, sizeof enable); + kernel_setsockopt(sock, SOL_SOCKET, SO_OOBINLINE, + (void *)&enable, sizeof enable); + } else { + PvtcpStateRemoveSocket(pvsk->channel, pvsk); + SockReleaseWrapper(sock); + COMM_OPF_SET_ERR(packet.flags); + packet.data32ex = (unsigned int)ECONNABORTED; + packet.len = sizeof packet; + packet.opCode = PVTCP_OP_FLOW; + } + + rc = CommSvc_WriteVec(pvsk->channel, &packet, + &payload, &payloadLen, &timeout, &iovOffset); + if ((rc != packet.len) && !COMM_OPF_TEST_ERR(packet.flags)) { + /* Mustn't leak the new socket if PV can't get a hold of it. */ + + PvtcpStateRemoveSocket(pvsk->channel, pvsk); + SockReleaseWrapper(sock); + } +#if defined(PVTCP_FULL_DEBUG) + CommOS_Debug(("%s: Sent 'Accept' [0x%p] reply.\n", __FUNCTION__, pvsk)); +#endif + } +} + + +/** + * @brief Processes socket connect in an AIO thread. This function is + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket being connected. + */ + +static inline void +ConnectAIO(PvtcpSock *pvsk) +{ + struct sock *sk = SkFromPvsk(pvsk); + struct socket *sock = sk->sk_socket; + CommPacket packet = { + .len = sizeof packet, + .flags = 0, + .opCode = PVTCP_OP_CONNECT, + .data64 = pvsk->peerSock + }; + unsigned long long timeout = COMM_MAX_TO; + const int enable = 1; + int rc; + + if (!pvsk->peerSockSet || + (!PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT) && + (sk->sk_state != TCP_ESTABLISHED))) { + return; + } + + if (PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT)) { + COMM_OPF_SET_VAL(packet.flags, 1); + PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT, 0); + } else if (sk->sk_state == TCP_ESTABLISHED) { + if (sk->sk_family == AF_INET) { + struct sockaddr_in sin = { .sin_family = AF_INET }; + int addrLen = sizeof sin; + + rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen); + if (rc == 0) { + packet.data16 = sin.sin_port; + PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr); + packet.data64ex = (unsigned long long)sin.sin_addr.s_addr; + } + } else { /* AF_INET6 */ + struct sockaddr_in6 sin = { .sin6_family = AF_INET6 }; + int addrLen = sizeof sin; + + rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen); + if (rc == 0) { + packet.data16 = sin.sin6_port; + PvtcpResetLoopbackInet6(pvsk, &sin.sin6_addr); + PvtcpI6AddrPack(&sin.sin6_addr.s6_addr32[0], + &packet.data64ex, &packet.data64ex2); + } + } + + if (rc == 0) { + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (void *)&enable, sizeof enable); + kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (void *)&enable, sizeof enable); + kernel_setsockopt(sock, SOL_SOCKET, SO_OOBINLINE, + (void *)&enable, sizeof enable); + } else { + COMM_OPF_SET_ERR(packet.flags); + packet.data32ex = ECONNABORTED; + packet.opCode = PVTCP_OP_FLOW; + } + } + + CommSvc_Write(pvsk->channel, &packet, &timeout); +#if defined(PVTCP_FULL_DEBUG) + CommOS_Debug(("%s: Sent 'Connect' [0x%p] reply.\n", __FUNCTION__, pvsk)); +#endif +} + + +/** + * @brief Server side main asynchronous processing function. It writes to + * socket queued output buffers, it reads from socket and outputs to PV; it + * also completes operation processing and sends applicable replies to PV. + * Finally, processes error reporting and delta size acks. + * @param arg socket work item. + */ + +void +PvtcpProcessAIO(CommOSWork *arg) +{ + PvtcpSock *pvsk = container_of(arg, PvtcpSock, work); + struct sock *sk = SkFromPvsk(pvsk); + + if (!SOCK_OUT_TRYLOCK(pvsk)) { + /* + * Queued output processing. If trylock failed, we don't retry. + * There are only two reasons for not being able to take the lock: + * - IoOp() has it -- when done, it reschedules us if we're not running. + * - OutputAIO() is already running on another core. + */ + + if (sk && sk->sk_socket) { + PvtcpOutputAIO(pvsk); + } + SOCK_OUT_UNLOCK(pvsk); + } + + /* All other processing needs the socket IN lock. */ + + if (!SOCK_IN_TRYLOCK(pvsk)) { + + if (sk && sk->sk_socket) { + int err; + + /* Input processing. */ + + /* + * Workqueue handlers are pinned to a CPU core and therefore not + * migratable. No need to disable preemption. + */ + err = PvtcpInputAIO(pvsk, perCpuBuf[smp_processor_id()]); + + /* Error and ack notifications. */ + + PvtcpFlowAIO(pvsk, err); + + if (!pvsk->opFlags) { + /* No other operations need to be completed. */ + + goto doneInUnlock; + } + + if (PvskTestOpFlag(pvsk, PVTCP_OP_RELEASE)) { + PvskResetOpFlag(pvsk, PVTCP_OP_RELEASE); + ReleaseAIO(pvsk); + + /* All possible in-flight operations must be dropped. */ + goto doneInUnlock; + } + + if (PvskTestOpFlag(pvsk, PVTCP_OP_CREATE)) { + /* No state locking required. */ + + PvskResetOpFlag(pvsk, PVTCP_OP_CREATE); + CreateAIO(pvsk); + } + + if (PvskTestOpFlag(pvsk, PVTCP_OP_BIND)) { + PvskResetOpFlag(pvsk, PVTCP_OP_BIND); + BindAIO(pvsk); + } + + if (PvskTestOpFlag(pvsk, PVTCP_OP_SETSOCKOPT)) { + PvskResetOpFlag(pvsk, PVTCP_OP_SETSOCKOPT); + SetSockOptAIO(pvsk); + } + + if (PvskTestOpFlag(pvsk, PVTCP_OP_GETSOCKOPT)) { + PvskResetOpFlag(pvsk, PVTCP_OP_GETSOCKOPT); + GetSockOptAIO(pvsk); + } + + if (PvskTestOpFlag(pvsk, PVTCP_OP_IOCTL)) { + PvskResetOpFlag(pvsk, PVTCP_OP_IOCTL); + IoctlAIO(pvsk); + } + + if (PvskTestOpFlag(pvsk, PVTCP_OP_LISTEN)) { + PvskResetOpFlag(pvsk, PVTCP_OP_LISTEN); + ListenAIO(pvsk); + } + + if (PvskTestOpFlag(pvsk, PVTCP_OP_ACCEPT)) { + PvskResetOpFlag(pvsk, PVTCP_OP_ACCEPT); + AcceptAIO(pvsk); + } + + if (PvskTestOpFlag(pvsk, PVTCP_OP_CONNECT)) { + PvskResetOpFlag(pvsk, PVTCP_OP_CONNECT); + ConnectAIO(pvsk); + } + +doneInUnlock: + SOCK_IN_UNLOCK(pvsk); + } else { + /* + * Special case for error sockets which don't have a sk. + * Note that this socket was created by SockAllocErrInit() and so + * no 'real' socket sits atop it and is not present on any state + * netif list. The socket has a refcnt of one and it will get + * deallocated by the PvtcpPutSock() call below, so we don't need + * to unlock it. + */ + + PvtcpFlowAIO(pvsk, -ENETDOWN); + } + } else { + if ((pvsk->peerSockSet || PvskTestOpFlag(pvsk, PVTCP_OP_RELEASE)) && + sk && sk->sk_socket) { + PvtcpSchedSock(pvsk); + } + } + + PvtcpPutSock(pvsk); +} |