diff options
Diffstat (limited to 'arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c')
-rw-r--r-- | arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c | 2858 |
1 files changed, 0 insertions, 2858 deletions
diff --git a/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c b/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c deleted file mode 100644 index 047547f..0000000 --- a/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c +++ /dev/null @@ -1,2858 +0,0 @@ -/* - * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server - * - * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; see the file COPYING. If not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#line 5 - -/** - * @file - * - * @brief Server (offload) side Linux-specific functions and callbacks. - */ - - -#include "pvtcp.h" - -#if defined(CONFIG_NET_NS) -#include <linux/nsproxy.h> -#include <linux/un.h> -#endif - -#include <net/ipv6.h> -#include <linux/kobject.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv6.h> -#include <linux/cred.h> - - -/* The PVSock address (127.238.0.1) in binary form, host byte order. */ -#define PVTCP_PVSOCK_ADDR 0x7fee0001 -#define PVTCP_PVSOCK_NET 0x7fee0000 -#define PVTCP_PVSOCK_MASK 0x000000ff - -/* From mvpkm */ -extern uid_t Mvpkm_vmwareUid; - -/* - * Credentials to back socket file pointer. Used in Android ICS network - * data usage accounting to bill guest data to MVP. - */ -static struct cred _cred; -static struct file _file = { - .f_cred = &_cred, -}; - -/* From pvtcp_off_io_linux.c */ -extern CommOSAtomic PvtcpOutputAIOSection; -extern void PvtcpOffLargeDgramBufInit(void); - -static const unsigned short portRangeBase = 7000; -static const unsigned int portRangeSize = 31; -static int hooksRegistered = 0; - -static inline int PvtcpTestPortIndexBit(unsigned int addr, - unsigned int portIdx); -/** - * @note - * Netfilter hooks: - * - * We decide to drop each packet based on the following criteria: - * 1) Destination address is to a pvsock address AND - * 3) (NOT(uid == 0 OR uid == vmwareUid)) OR - * 4) (type == UDP AND NOT(port-in-pvsock-range))) - */ - -/** - * @brief Netfilter hook. Restricts LOCAL_OUT packets. - * See note above to filter policy. - * @param skb skbuff - * @param inet6 is this socket ipv4 or ipv6? - * @return NF_ACCEPT if the packet is allowed through, NF_DROP otherwise - */ -static inline unsigned int -PvsockNfHook(struct sk_buff *skb, int inet6) -{ - uid_t uid; - unsigned int port; - struct socket *sock; - unsigned int addr = inet6 ? - ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]) : - ntohl(ip_hdr(skb)->daddr); - - if (likely((addr ^ PVTCP_PVSOCK_NET) & ~PVTCP_PVSOCK_MASK)) { - /* Not a pvsock address. */ - return NF_ACCEPT; - } - - sock = skb->sk->sk_socket; - if (unlikely(!sock)) { - return NF_ACCEPT; - } - - /* - * Guest (kernel) sockets can send to other guest sockets, - * Root can send to whoever it wants, no checks. - */ - uid = (sock->file ? sock->file->f_cred->uid : 0); - if (uid == 0 || (sock->type != SOCK_STREAM && sock->type != SOCK_DGRAM)) { - return NF_ACCEPT; - } - - /* - * Only vmware can send to guest. - */ - if (likely(uid == Mvpkm_vmwareUid)) { - if (sock->type == SOCK_DGRAM) { - /* - * Deny sending to UDP port in pvsock range, if receiving socket was - * not created by the guest with this pvsock address. Drop all other - * UDP packets. - */ - port = ntohs(udp_hdr(skb)->dest) - portRangeBase; - if ((port < portRangeSize) && - PvtcpTestPortIndexBit(htonl(addr), port)) { - return NF_ACCEPT; - } - return NF_DROP; - } - /* - * TCP is all-good. - */ - return NF_ACCEPT; - } - - return NF_DROP; -} - - -/** - * @brief AF_INET4 Netfilter hook. Restricts LOCAL_OUT packets. - * See note above to filter policy. - * @param hooknum netfilter hook number - * @param skb skbuff - * @param in rx net_device - * @param out out net_device - * @param okfn ignored - * @return NF_ACCEPT if the packet is allowed through, NF_DROP otherwise - */ -static unsigned int -Inet4NfHook(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return PvsockNfHook(skb, 0); -} - -/** - * @brief AF_INET6 Netfilter hook. Restricts LOCAL_OUT packets. - * See note above to filter policy. - * @param hooknum netfilter hook number - * @param skb skbuff - * @param in rx net_device - * @param out out net_device - * @param okfn ignored - * @return NF_ACCEPT if the packet is allowed through, NF_DROP otherwise - */ -static unsigned int -Inet6NfHook(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!ipv6_addr_v4mapped(&ipv6_hdr(skb)->daddr)) { - /* Not ipv4-mapped, so not a pvsock address. */ - return NF_ACCEPT; - } - - return PvsockNfHook(skb, 1); -} - - -static struct nf_hook_ops netfilterHooks[] = { - { - .hook = Inet4NfHook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_SECURITY - }, - { - .hook = Inet6NfHook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_SECURITY - } -}; - - -#if !defined(CONFIG_SYSFS) -#error "The pvTCP offload module requires sysfs!" -#endif - -/* - * State kobject, attributes and type. - */ - -typedef struct PvtcpStateKObj { - struct kobject kobj; - CommTranspInitArgs transpArgs; - unsigned int pvsockAddr; - int useNS; - int haveNS; -} PvtcpStateKObj; - - -typedef struct PvtcpStateKObjAttr { - struct attribute attr; - ssize_t (*show)(PvtcpStateKObj *stateKObj, char *buf); - ssize_t (*store)(PvtcpStateKObj *stateKObj, const char *buf, size_t count); -} PvtcpStateKObjAttr; - - -/** - * @brief Releases state a kobject. - * @param kobj (embedded) state kobject. - */ - -static void -StateKObjRelease(struct kobject *kobj) -{ - kfree(container_of(kobj, PvtcpStateKObj, kobj)); -} - - -/** - * @brief Sysfs show function for all pvtcp attributes. - * @param kobj (embedded) state kobject. - * @param attr pvtcp attribute to show. - * @param buf output buffer. - * @return number of bytes written or negative error code. - */ - -static ssize_t -StateKObjShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - PvtcpStateKObjAttr *stateAttr = container_of(attr, PvtcpStateKObjAttr, attr); - PvtcpStateKObj *stateKObj = container_of(kobj, PvtcpStateKObj, kobj); - - if (stateAttr->show) { - return stateAttr->show(stateKObj, buf); - } - - return -EIO; -} - - -/** - * @brief Sysfs store function for all pvtcp attributes. - * @param kobj (embedded) state kobject. - * @param attr pvtcp attribute to show. - * @param buf input buffer. - * @param count input buffer length. - * @return number of bytes consumed or negative error code. - */ - -static ssize_t -StateKObjStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t count) -{ - PvtcpStateKObjAttr *stateAttr = container_of(attr, PvtcpStateKObjAttr, attr); - PvtcpStateKObj *stateKObj = container_of(kobj, PvtcpStateKObj, kobj); - - if (stateAttr->store) { - return stateAttr->store(stateKObj, buf, count); - } - - return -EIO; -} - - -static struct sysfs_ops StateKObjSysfsOps = { - .show = StateKObjShow, - .store = StateKObjStore -}; - - -/** - * @brief Show function for the comm_info pvtcp attribute. - * @param stateKObj state kobject. - * @param buf output buffer. - * @return number of bytes written or negative error code. - */ - -static ssize_t -StateKObjCommInfoShow(PvtcpStateKObj *stateKObj, - char *buf) -{ - unsigned int typeHash; - - /* - * In the offload module, the transport arguments' type field has been - * assigned the matching index in the versions array at probe time. - * Recover and print out the type hash. - */ - - typeHash = CommTransp_GetType(pvtcpVersions[stateKObj->transpArgs.type]); - - return snprintf(buf, PAGE_SIZE, "ID=%u,%u\nCAPACITY=%u\nTYPE=0x%0x\n", - stateKObj->transpArgs.id.d32[0], - stateKObj->transpArgs.id.d32[1], - stateKObj->transpArgs.capacity, - typeHash); -} - - -/** - * @brief Show function for the pvsock_addr pvtcp attribute. - * @param stateKObj state kobject. - * @param buf output buffer. - * @return number of bytes written or negative error code. - */ - -static ssize_t -StateKObjPvsockAddrShow(PvtcpStateKObj *stateKObj, - char *buf) -{ - union { - unsigned int raw; - unsigned char bytes[4]; - } addr; - - addr.raw = stateKObj->pvsockAddr; - return snprintf(buf, PAGE_SIZE, "%u.%u.%u.%u\n", - (unsigned int)addr.bytes[0], (unsigned int)addr.bytes[1], - (unsigned int)addr.bytes[2], (unsigned int)addr.bytes[3]); -} - - -/** - * @brief Show function for the use_ns pvtcp attribute. - * @param stateKObj state kobject. - * @param buf output buffer. - * @return number of bytes written or negative error code. - */ - -static ssize_t -StateKObjUseNSShow(PvtcpStateKObj *stateKObj, - char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%d\n", stateKObj->useNS); -} - - -/** - * @brief Store function for the use_ns pvtcp attribute. - * @param stateKObj state kobject. - * @param buf input buffer. - * @param count input buffer length. - * @return number of bytes consumed or negative error code. - */ - -static ssize_t -StateKObjUseNSStore(PvtcpStateKObj *stateKObj, - const char *buf, - size_t count) -{ - int rc = -EINVAL; - - /* coverity[secure_coding] */ - if (stateKObj->haveNS && (sscanf(buf, "%d", &stateKObj->useNS) == 1)) { - stateKObj->useNS = !!stateKObj->useNS; - rc = count; - } - - return rc; -} - - -static PvtcpStateKObjAttr stateKObjCommInfoAttr = - __ATTR(comm_info, 0444, StateKObjCommInfoShow, NULL); - -static PvtcpStateKObjAttr stateKObjPvsockAddrAttr = - __ATTR(pvsock_addr, 0444, StateKObjPvsockAddrShow, NULL); - -static PvtcpStateKObjAttr stateKObjUseNSAttr = - __ATTR(use_ns, 0644, StateKObjUseNSShow, StateKObjUseNSStore); - - -static struct attribute *stateKObjDefaultAttrs[] = { - &stateKObjCommInfoAttr.attr, - &stateKObjPvsockAddrAttr.attr, - &stateKObjUseNSAttr.attr, - NULL -}; - - -static struct kobj_type stateKType = { - .sysfs_ops = &StateKObjSysfsOps, - .release = StateKObjRelease, - .default_attrs = stateKObjDefaultAttrs -}; - - -/* - * Initialization of module entry and exit callbacks. - */ - -static int Init(void *args); -static void Exit(void); - -COMM_OS_MOD_INIT(Init, Exit); - - -/* - * AIO socket read buffers, stats and other global state. - */ - -static CommOSMutex globalLock; -static char perCpuBuf[NR_CPUS][PVTCP_SOCK_BUF_SIZE]; - -#define PVTCP_OFF_MAX_LB_ADDRS 255 -static unsigned int loopbackAddrs[PVTCP_OFF_MAX_LB_ADDRS] = { - 0xffffffff, // Network address always on, all ports allowed. - 0x7fffffff // Host address not yet on, all ports allowed. - // All the rest zeroed out. -}; - -static const unsigned int loopbackReserved = 0x00000001 << 31; - - -#define PvtcpTestLoopbackBit(entry, mask) \ - ((entry) & (mask)) - -#define PvtcpSetLoopbackBit(entry, mask) \ - ((entry) |= (mask)) - -#define PvtcpResetLoopbackBit(entry, mask) \ - ((entry) &= ~(mask)) - - -static inline int -PvtcpTestPortIndexBit(unsigned int addr, - unsigned int portIdx) -{ - return PvtcpTestLoopbackBit(loopbackAddrs[*((unsigned char *)&addr + 3)], - BIT(portIdx)); -} - - -static inline void -PvtcpSetPortIndexBit(unsigned int addr, - unsigned int portIdx) -{ - PvtcpSetLoopbackBit(loopbackAddrs[*((unsigned char *)&addr + 3)], - BIT(portIdx)); -} - - -static inline void -PvtcpResetPortIndexBit(unsigned int addr, - unsigned int portIdx) -{ - PvtcpResetLoopbackBit(loopbackAddrs[*((unsigned char *)&addr + 3)], - BIT(portIdx)); -} - - -unsigned int pvtcpLoopbackOffAddr; - -unsigned long long pvtcpOffDgramAllocations = 0; - -/* - * Destructor shim addresses and function pointer - */ - -extern void asmDestructorShim(struct sock*); - - -/* - * Functions. - */ - -/** - * @brief Release a socket, NULLing out the fake file field to avoid confusing - * Linux on the release path - * @param sock socket to release - */ -static void -SockReleaseWrapper(struct socket *sock) -{ - sock->file = NULL; - sock_release(sock); -} - -/** - * @brief Gets a new loopback address in the 127.238.0.255 network. - * Note that the first address, 127.238.0.1, is always the host's. - * @return new address or -1U if none is available. - */ - -static unsigned int -GetLoopbackAddr(void) -{ - static unsigned char addrTempl[4] = { 127, 238, 0, 0 }; - unsigned int rc = -1U; - unsigned int idx; - struct socket *sock; - - CommOS_MutexLock(&globalLock); - for (idx = 1; idx < PVTCP_OFF_MAX_LB_ADDRS; idx++) { - if (!PvtcpTestLoopbackBit(loopbackAddrs[idx], loopbackReserved)) { - addrTempl[3] = (unsigned char)idx; - memcpy(&rc, addrTempl, sizeof rc); - - /* Create a dgram socket to configure/bring-up the lo:N interface. */ - - if (!sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock)) { - int err; - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr = { .s_addr = rc } - }; - struct ifreq ifr = { - .ifr_flags = IFF_UP - }; - - snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "lo:%u", idx); - memcpy(&ifr.ifr_addr, &sin, sizeof ifr.ifr_addr); - err = kernel_sock_ioctl(sock, SIOCSIFADDR, (unsigned long)&ifr); - sock_release(sock); - if (err) { - CommOS_Log(("%s: Could not set loopback address (ioctl)!\n", - __FUNCTION__)); - rc = -1U; - continue; /* Try next address. */ - } else { - PvtcpSetLoopbackBit(loopbackAddrs[idx], loopbackReserved); - CommOS_Debug(("%s: Allocated loopback address [%u.%u.%u.%u].\n", - __FUNCTION__, - addrTempl[0], addrTempl[1], - addrTempl[2], addrTempl[3])); - break; - } - } else { - CommOS_Log(("%s: Could not set loopback address (create)!\n", - __FUNCTION__)); - rc = -1U; - break; - } - } - } - if (idx == PVTCP_OFF_MAX_LB_ADDRS) { - CommOS_Log(("%s: loopback address range exceeded!\n", __FUNCTION__)); - } - - CommOS_MutexUnlock(&globalLock); - return rc; -} - - -/** - * @brief Puts back a loopback address in the 127.238.0.255 network. - * @param uaddr address to put back. - */ - -static void -PutLoopbackAddr(unsigned int uaddr) -{ - const unsigned char addrTempl[3] = { 127, 238, 0 }; - unsigned char addr[4]; - unsigned int idx; - struct socket *sock; - - memcpy(addr, &uaddr, sizeof uaddr); - if (memcmp(addrTempl, addr, sizeof addrTempl)) { - return; - } - - idx = addr[3]; - if ((idx == 0) || (idx >= PVTCP_OFF_MAX_LB_ADDRS)) { - return; - } - - CommOS_MutexLock(&globalLock); - if (!PvtcpTestLoopbackBit(loopbackAddrs[idx], loopbackReserved)) { - CommOS_Debug(("%s: loopback entry [%u] already freed.\n", - __FUNCTION__, idx)); - goto out; - } - - if (!sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock)) { - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr = { .s_addr = uaddr } - }; - struct ifreq ifr = { - .ifr_flags = 0 - }; - - snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "lo:%u", idx); - memcpy(&ifr.ifr_addr, &sin, sizeof ifr.ifr_addr); - kernel_sock_ioctl(sock, SIOCSIFFLAGS, (unsigned long)&ifr); - sock_release(sock); - loopbackAddrs[idx] = 0; // Zero everything out. - CommOS_Debug(("%s: Deallocated loopback address [%u.%u.%u.%u].\n", - __FUNCTION__, addr[0], addr[1], addr[2], addr[3])); - } else { - CommOS_Log(("%s: Could not delete loopback address!\n", - __FUNCTION__)); - } - -out: - CommOS_MutexUnlock(&globalLock); -} - - -/** - * @brief Retrieves and retains the namespace associated with a channel. - * A server must be listening for requests to retrieve the pid of the - * process owning the net namespace for the passed context/vm id. - * Communication takes place over a datagram socket in the AF_UNIX family, - * bound to "/usr/lib/vmware/pvtcp/config/serv_addr". - * @param state channel state for which to retrieve the network namespace. - * @sideeffect If an associated namespace is found, it is retained and saved - * in the state object. - */ - -static void -GetNetNamespace(PvtcpState *state) -{ -#if defined(CONFIG_NET_NS) && !defined(PVTCP_NET_NS_DISABLE) - CommTranspInitArgs args; - pid_t pidn; - struct pid *pid; - struct task_struct *tsk; - struct nsproxy *nsproxy; - struct net *ns; - struct socket *sock; - struct sockaddr_un addr = { - .sun_family = AF_UNIX - }; - struct timeval timeout = { - .tv_sec = 3000, - .tv_usec = 0 - }; - const int passcred = 1; - char buf[64]; - struct kvec vec; - const char *sockname = "pvtcp-vpn"; /* abstract namespace for AF_UNIX/LOCAL sockets */ - const size_t socknamelen = strlen(sockname); - - struct msghdr msg = { - .msg_name = (struct sockaddr *)&addr, - .msg_namelen = 1 + offsetof(struct sockaddr_un, sun_path) + socknamelen - }; - - - if (!state) { - return; - } - - args = CommSvc_GetTranspInitArgs(state->channel); - ns = NULL; - pidn = 0; - - if (sock_create_kern(AF_UNIX, SOCK_DGRAM, 0, &sock)) { - CommOS_Debug(("%s: Can't create config socket!\n", __FUNCTION__)); - goto out; - } - if (kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&timeout, sizeof timeout)) { - sock_release(sock); - CommOS_Debug(("%s: Can't set timeout on config socket!\n", __FUNCTION__)); - goto out; - } - if (kernel_setsockopt(sock, SOL_SOCKET, SO_PASSCRED, - (char *)&passcred, sizeof passcred)) { - sock_release(sock); - CommOS_Debug(("%s: Can't set passcred on config socket!\n", - __FUNCTION__)); - goto out; - } - - /* - * Send the configuration request and receive the reply: - * - the request carries the VM/guest ID as used in the transport - * arguments used to create the channel. - * - the reply is expected to contain the pid of the namespace owner. - */ - - memset(buf, 0, sizeof buf); - snprintf(buf, sizeof buf, "%u\n", args.id.d32[0]); - buf[sizeof buf - 1] = '\0'; - vec.iov_base = buf; - vec.iov_len = strlen(buf); - - /* use anonymous name */ - addr.sun_path[0] = 0; - memcpy(addr.sun_path+1, sockname, socknamelen); - - if (kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len) <= 0) { - sock_release(sock); - CommOS_Debug(("%s: Could not send config request for vm [%u]!\n", - __FUNCTION__, args.id.d32[0])); - goto out; - } - - memset(buf, 0, sizeof buf); - vec.iov_base = buf; - vec.iov_len = sizeof buf; - if (kernel_recvmsg(sock, &msg, &vec, 1, vec.iov_len, 0) <= 0) { - CommOS_Debug(("%s: Could not receive config reply for vm [%u]!\n", - __FUNCTION__, args.id.d32[0])); - } else { - buf[sizeof buf - 1] = '\0'; - /* coverity[secure_coding] */ - sscanf(buf, "%d", &pidn); - } - sock_release(sock); - - if (!pidn) { - goto out; - } - - pid = find_get_pid(pidn); - if (pid) { - tsk = pid_task(pid, PIDTYPE_PID); - if (tsk) { - rcu_read_lock(); - nsproxy = task_nsproxy(tsk); - if (nsproxy && nsproxy->net_ns) { - ns = maybe_get_net(nsproxy->net_ns); - } - rcu_read_unlock(); - } - put_pid(pid); - } - -out: - if (!ns) { - CommOS_Debug(("%s: Not using a namespace for vm [%u].\n", - __FUNCTION__, args.id.d32[0])); - ns = &init_net; - } else { - CommOS_Debug(("%s: Found the net namespace for vm [%u].\n", - __FUNCTION__, args.id.d32[0])); - } -#else - void *ns = NULL; -#endif - - state->namespace = ns; -} - - -/** - * @brief Releases the network namespace associated with a channel state. - * @param namespace namespace to be released. - * @sideeffect If the namespace is not the initial one, it is released. - */ - -static void -PutNetNamespace(void *namespace) -{ -#if defined(CONFIG_NET_NS) && !defined(PVTCP_NET_NS_DISABLE) - if (namespace && (namespace != &init_net)) { - put_net((struct net *)namespace); - } -#endif -} - - -/** - * @brief Offload state constructor called when a channel is created. - * The function first calls the default state allocator; it then retrieves - * the n/w namespace associated with this client, retains it and stores it - * in the state object. Finally, it creates a sysfs node. - * @param[in,out] channel channel to initialize. - * @return pointer to a new state structure or NULL. - * @sideeffect Allocates memory. - */ - -static void * -StateAlloc(CommChannel channel) -{ - extern struct kset *Mvpkm_FindVMNamedKSet(int, const char *); - PvtcpState *state = NULL; - PvtcpIf *loopbackNetif = NULL; - PvtcpStateKObj *stateKObj = NULL; - struct kset *kset = NULL; - int rc; - CommTranspInitArgs transpArgs; - - transpArgs = CommSvc_GetTranspInitArgs(channel); - - /* - * The transport ID is assigned in an implementation-dependent way. - * (see lib/comm/comm_transp.h for transport type definitions.) - * However, the first 32 bits are expected to denote the guest/VM ID, - * while the last 32 bits are a resource handle within that VM. On MVP, - * transports map to queue pairs, which follow this convention. - */ - - kset = Mvpkm_FindVMNamedKSet((int)transpArgs.id.d32[0], "devices"); - if (!kset) { - CommOS_Debug(("%s: Could not find sysfs '.../vm/N/devices' kset!\n", - __FUNCTION__)); - goto error; - } - - state = PvtcpStateAlloc(channel); - if (!state) { - CommOS_Debug(("%s: Could not allocate state!\n", __FUNCTION__)); - goto error; - } - - /* coverity[leaked_storage] */ - stateKObj = kzalloc(sizeof *stateKObj, GFP_KERNEL); - if (!stateKObj) { - CommOS_Debug(("%s: Could not allocate state kobject!\n", __FUNCTION__)); - goto error; - } - - stateKObj->kobj.kset = kset; - /* coverity[leaked_storage] */ - rc = kobject_init_and_add(&stateKObj->kobj, &stateKType, NULL, "pvtcp"); - if (rc) { - CommOS_Debug(("%s: Could not add state kobject to parent kset [%d]!\n", - __FUNCTION__, rc)); - goto error; - } - - loopbackNetif = PvtcpStateFindIf(state, pvtcpIfLoopbackInet4); - BUG_ON(loopbackNetif == NULL); - loopbackNetif->conf.addr.in.s_addr = GetLoopbackAddr(); - if (loopbackNetif->conf.addr.in.s_addr == -1U) { - CommOS_Log(("%s: Could not allocate loopback address!\n", __FUNCTION__)); - goto error; - } - - GetNetNamespace(state); - - stateKObj->transpArgs = transpArgs; - stateKObj->pvsockAddr = loopbackNetif->conf.addr.in.s_addr; -#if defined(CONFIG_NET_NS) - stateKObj->haveNS = (state->namespace != &init_net); - stateKObj->useNS = stateKObj->haveNS; -#endif - state->extra = stateKObj; - - _cred.uid = _cred.gid = _cred.suid = _cred.sgid = - _cred.euid = _cred.egid = _cred.fsuid = _cred.fsgid = Mvpkm_vmwareUid; - - -out: - if (kset) { - kset_put(kset); - } - return state; - -error: - if (stateKObj) { - kobject_del(&stateKObj->kobj); - kobject_put(&stateKObj->kobj); - } - if (loopbackNetif && (loopbackNetif->conf.addr.in.s_addr != -1U)) { - PutLoopbackAddr(loopbackNetif->conf.addr.in.s_addr); - } - if (state) { - PvtcpStateFree(state); - state = NULL; - } - goto out; -} - - -/** - * @brief Offload state destructor called when a channel is closed. - * The function releases this client's n/w namespace and then calls the - * default state deallocator. - * @param arg pointer to state structure. - * @sideeffect Destroys all netifs and their sockets, deallocates memory. - */ - -static void -StateFree(void *arg) -{ - PvtcpState *state = arg; - PvtcpIf *loopbackNetif; - void *namespace; - - if (!state) { - return; - } - - if (state->extra) { - PvtcpStateKObj *stateKObj = state->extra; - - kobject_del(&stateKObj->kobj); - kobject_put(&stateKObj->kobj); - } - - namespace = state->namespace; - loopbackNetif = PvtcpStateFindIf(state, pvtcpIfLoopbackInet4); - BUG_ON(loopbackNetif == NULL); - PutLoopbackAddr(loopbackNetif->conf.addr.in.s_addr); - PvtcpStateFree(state); - PutNetNamespace(namespace); -} - - -/** - * @brief Releases socket. This function is called when the channel state - * owning the socket is closed. - * @param[in,out] pvsk PV socket to release. - * @sideeffect the socket eventually gets deallocated. - */ - -void -PvtcpReleaseSocket(PvtcpSock *pvsk) -{ - struct socket *sock = SkFromPvsk(pvsk)->sk_socket; - - SOCK_IN_LOCK(pvsk); - SOCK_OUT_LOCK(pvsk); - pvsk->peerSockSet = 0; - SockReleaseWrapper(sock); - SOCK_OUT_UNLOCK(pvsk); - SOCK_IN_UNLOCK(pvsk); - CommOS_Debug(("%s: [0x%p].\n", __FUNCTION__, pvsk)); -} - - -/** - * @brief Tests if the passed address is 127.238.0.1 or 127.0.0.1. - * @param pvsk socket to test. - * @param addr inet4 address to test. - * @return > 1: morph and propagate new address to caller, 1: just morph, - * 0: don't morph, < 0 (-EADDRNOTAVAIL): bad loopback. - */ - -static inline int -TestLoopbackInet4(PvtcpSock *pvsk, - unsigned int addr) -{ - if (!ipv4_is_loopback(addr)) { - return 0; - } - - if (addr != htonl(PVTCP_PVSOCK_ADDR)) { - if (addr != htonl(INADDR_LOOPBACK)) { - return -EADDRNOTAVAIL; - } - if (PvtcpHasSockNamespace(pvsk)) { - /* We don't morph normal 127.0.0.1 when NS present. */ - - return 0; - } - return 2; - } - - return 1; -} - - -/** - * @brief Tests if the passed address is 127.238.0.1 or 127.0.0.1 and the - * socket has a namespace. If yes, the address will be morphed into - * the actual loopback address, then a bind() is performed. - * Note that the function returns EADDRNOTAVAIL for any other loopbacks. - * @param pvsk socket to test. - * @param[in,out] addr inet4 address to test. - * @param port port to bind, or zero for any port. - * @return 1 if bind should be performed by caller, bind return code otherwise. - */ - -int -PvtcpTestAndBindLoopbackInet4(PvtcpSock *pvsk, - unsigned int *addr, - unsigned short port) -{ - int rc; - struct sockaddr_in sin; - unsigned int morphedAddr; - int propagate = 0; - - rc = TestLoopbackInet4(pvsk, *addr); - switch (rc) { - case 2: - propagate = 1; // Fall through. - case 1: - break; // Proceed with morphing. - case 0: - return 1; // Don't morph, let bind() be done by caller. - default: - return rc; - } - - if (pvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4) { - /* The socket has already been morphed/bound. */ - - morphedAddr = pvsk->netif->conf.addr.in.s_addr; - rc = 0; - goto out; - } - - /* - * Move the socket to the initial namespace before binding it - * such that the loopback address is accessible to the host. - */ - - PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL); - PvtcpStateAddSocket(pvsk->channel, pvtcpIfLoopbackInet4, pvsk); - morphedAddr = pvsk->netif->conf.addr.in.s_addr; - memset(&sin, 0, sizeof sin); - sin.sin_family = AF_INET; - sin.sin_port = port; - sin.sin_addr.s_addr = morphedAddr; - - /* Bind to the channel loopback address. */ - - rc = kernel_bind(SkFromPvsk(pvsk)->sk_socket, - (struct sockaddr *)&sin, sizeof sin); - if (rc) { - PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_CHANNEL); - PvtcpStateAddSocket(pvsk->channel, pvtcpIfUnbound, pvsk); - } else { - /* - * Bind succeeded on pvsock address. - * If this is a pvsock UDP reserved port, record it. - */ - - port = ntohs(port) - portRangeBase; - if ((SkFromPvsk(pvsk)->sk_socket->type == SOCK_DGRAM) && - (port < portRangeSize)) { - CommOS_MutexLock(&globalLock); - PvtcpSetPortIndexBit(pvsk->netif->conf.addr.in.s_addr, port); - CommOS_MutexUnlock(&globalLock); - } - - /* - * pvsock data usage shouldn't be counted as MVP external traffic. - */ - SkFromPvsk(pvsk)->sk_socket->file = NULL; - } - -out: - if (propagate) { - *addr = morphedAddr; - } - return rc; -} - - -/** - * @brief Tests if the passed address is IPV4-mapped 127.238.0.1 or 127.0.0.1, - * clean ::1, and whether the socket has a namespace. - * If needed, the address will be morphed into the actual loopback address, - * then a bind() is performed. - * Note that the function returns EADDRNOTAVAIL for any other loopbacks. - * @param pvsk socket to test. - * @param[in,out] addr0 first 64 bits of inet6 address to test. - * @param[in,out] addr1 last 64 bits of inet6 address to test. - * @param port port to bind, or zero for any port. - * @return 1 if bind should be performed by caller, bind return code otherwise. - */ - -int -PvtcpTestAndBindLoopbackInet6(PvtcpSock *pvsk, - unsigned long long *addr0, - unsigned long long *addr1, - unsigned short port) -{ - int rc; - struct sockaddr_in6 sin6; - union { - unsigned long long halves[2]; - struct in6_addr in6; - } in6Addr = { - .halves = { *addr0, *addr1 } - }; - int propagate = 0; - const int ipv6Only = 0; - - if (ipv6_addr_loopback(&in6Addr.in6)) { - if (PvtcpHasSockNamespace(pvsk)) { - return 1; - } - - /* Remember that we were passed '::1'. */ - - PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_IPV6_LOOP, 1); - ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &in6Addr.in6); - } - - if (!ipv6_addr_v4mapped(&in6Addr.in6)) { - /* If the address is not ipv4-mapped, stop testing. */ - - return 1; - } - - rc = TestLoopbackInet4(pvsk, in6Addr.in6.s6_addr32[3]); - switch (rc) { - case 2: - propagate = 1; // Fall through. - case 1: - break; // Proceed with morphing. - case 0: - return 1; // Don't morph, let bind() be done by caller. - default: - return rc; - } - - if (pvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4) { - /* The socket has already been morphed/bound. */ - - ipv6_addr_set_v4mapped(pvsk->netif->conf.addr.in.s_addr, &in6Addr.in6); - rc = 0; - goto out; - } - - /* - * Move the socket to the initial namespace before binding it - * such that the loopback address is accessible to the host. - */ - - PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL); - PvtcpStateAddSocket(pvsk->channel, pvtcpIfLoopbackInet4, pvsk); - ipv6_addr_set_v4mapped(pvsk->netif->conf.addr.in.s_addr, &in6Addr.in6); - memset(&sin6, 0, sizeof sin6); - sin6.sin6_family = AF_INET6; - sin6.sin6_port = port; - sin6.sin6_addr = in6Addr.in6; - - /* - * Ensure we can use ipv4 mapped addresses and bind to the channel - * loopback address. - */ - - (void)kernel_setsockopt(SkFromPvsk(pvsk)->sk_socket, IPPROTO_IPV6, - IPV6_V6ONLY, (char *)&ipv6Only, sizeof ipv6Only); - rc = kernel_bind(SkFromPvsk(pvsk)->sk_socket, - (struct sockaddr *)&sin6, sizeof sin6); - if (rc) { - PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_CHANNEL); - PvtcpStateAddSocket(pvsk->channel, pvtcpIfUnbound, pvsk); - } else { - /* - * Bind succeeded on pvsock address. - * If this is a pvsock UDP reserved port, record it. - */ - - port = ntohs(port) - portRangeBase; - if ((SkFromPvsk(pvsk)->sk_socket->type == SOCK_DGRAM) && - (port < portRangeSize)) { - CommOS_MutexLock(&globalLock); - PvtcpSetPortIndexBit(pvsk->netif->conf.addr.in.s_addr, port); - CommOS_MutexUnlock(&globalLock); - } - - /* - * pvsock data usage shouldn't be counted as MVP external traffic. - */ - SkFromPvsk(pvsk)->sk_socket->file = NULL; - } - -out: - if (propagate) { - *addr0 = in6Addr.halves[0]; - *addr1 = in6Addr.halves[1]; - } - return rc; -} - - -/** - * @brief Resets a 127.238.0.N address to 127.0.0.1. - * @param pvsk socket whose address needs resetting. - * @param[in,out] addr inet4 address to reset. - */ - -void -PvtcpResetLoopbackInet4(PvtcpSock *pvsk, - unsigned int *addr) -{ - if (!PvtcpHasSockNamespace(pvsk)) { - static const unsigned int pvsockAddr = htonl(PVTCP_PVSOCK_ADDR); - - if (!memcmp(&pvsockAddr, addr, 3) && memcmp(&pvsockAddr, addr, 4)) { - /* If it's a pvsock address but _not_ the host's, overwrite it. */ - - *addr = htonl(INADDR_LOOPBACK); - } - } -} - - -/** - * @brief Resets an IPV4-mapped ::ffff:127.238.0.N IPV6 address to loopback. - * @param pvsk socket whose address needs resetting. - * @param[in,out] in6 inet6 address to reset. - */ - -void -PvtcpResetLoopbackInet6(PvtcpSock *pvsk, - struct in6_addr *in6) -{ - if (!PvtcpHasSockNamespace(pvsk) && ipv6_addr_v4mapped(in6)) { - if (PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_IPV6_LOOP)) { - /* If the original address came in as ::1, we reset as such. */ - - static const struct in6_addr in6Loopback = IN6ADDR_LOOPBACK_INIT; - - *in6 = in6Loopback; - } else { - PvtcpResetLoopbackInet4(pvsk, &in6->s6_addr32[3]); - } - } -} - - -/** - * @brief Called at module load time. It registers with the Comm runtime. - * @param args initialization arguments - * @return zero if successful, -1 otherwise - * @sideeffect Leaves the module loaded - */ - -static int -Init(void *args) -{ - int rc = -1; - -#if !defined(PVTCP_DISABLE_NETFILTER) - rc = nf_register_hooks(netfilterHooks, ARRAY_SIZE(netfilterHooks)); - if (rc) { - CommOS_Log(("%s: Could not register netfilter hooks!\n", __FUNCTION__)); - goto out; - } else { - CommOS_Debug(("%s: Registered netfilter hooks.\n", __FUNCTION__)); - } - hooksRegistered = 1; -#else - CommOS_Log(("%s: Netfilter hooks disabled.\n", __FUNCTION__)); -#endif - - CommOS_MutexInit(&globalLock); - CommOS_WriteAtomic(&PvtcpOutputAIOSection, 0); - PvtcpOffLargeDgramBufInit(); - - pvtcpImpl.owner = CommOS_ModuleSelf(); - pvtcpImpl.stateCtor = StateAlloc; - pvtcpImpl.stateDtor = StateFree; - if (CommSvc_RegisterImpl(&pvtcpImpl) == 0) { - rc = 0; - pvtcpLoopbackOffAddr = GetLoopbackAddr(); - if (pvtcpLoopbackOffAddr == -1U) { - CommOS_Log(("%s: Could not allocate offload loopback address!\n", - __FUNCTION__)); - rc = -1; - CommSvc_UnregisterImpl(&pvtcpImpl); - } - } - -out: - if (rc) { - if (hooksRegistered) { - nf_unregister_hooks(netfilterHooks, ARRAY_SIZE(netfilterHooks)); - } - } - return rc; -} - - -/** - * @brief Called at module unload time. It shuts down pvtcp. - * @sideeffect Total and utter destruction. - */ - -static void -Exit(void) -{ - PutLoopbackAddr(pvtcpLoopbackOffAddr); - CommSvc_UnregisterImpl(&pvtcpImpl); -#if !defined(PVTCP_DISABLE_NETFILTER) - if (hooksRegistered) { - nf_unregister_hooks(netfilterHooks, ARRAY_SIZE(netfilterHooks)); - CommOS_Debug(("%s: Netfilter hooks unregistered.\n", __FUNCTION__)); - } -#endif - CommOS_Log(("%s: Allocations of large datagrams: %llu.\n", - __FUNCTION__, pvtcpOffDgramAllocations)); -} - - -/* - * Socket callback interceptors. - */ - -/** - * @brief Callback called when socket is destroyed. - * @param[in,out] sk socket to cleanup - * @return 0 if socket memory is freed, < 0 otherwise (no-op) - * @sideeffect Send queue buffers are deallocated - */ - -int -DestructCB(struct sock *sk) -{ - PvtcpOffBuf *internalBuf; - PvtcpOffBuf *tmp; - PvtcpSock *pvsk = PvskFromSk(sk); - - if (!pvsk || - (SkFromPvsk(pvsk) != sk) || - (pvsk->destruct == asmDestructorShim)) { - /* Module put _not_ to be performed by asmDestructorShim. */ - - CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__)); - return -1; - } - - CommOS_ListForEachSafe(&pvsk->queue, internalBuf, tmp, link) { - CommOS_ListDel(&internalBuf->link); - PvtcpBufFree(PvtcpOffBufFromInternal(internalBuf)); - } - if (pvsk->destruct) { - pvsk->destruct(sk); - } - - if (pvsk->rpcReply) { - CommOS_Kfree(pvsk->rpcReply); - } - CommOS_Kfree(pvsk); - - /* - * Module put is performed by asmDestructorShim. - */ - - return 0; -} - - -/** - * @brief Callback called when socket state changes occur. - * @param sk socket specified socket which changed state - * @sideeffect A writer task may be scheduled - */ - -static void -StateChangeCB(struct sock *sk) -{ - PvtcpSock *pvsk = PvskFromSk(sk); - - if (!pvsk || - (SkFromPvsk(pvsk) != sk) || - (pvsk->stateChange == StateChangeCB)) { - CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__)); - return; - } - - /* - * The socket (spin) lock is held when this function is called. - */ - - CommOS_Debug(("%s: [0x%p] sk_state [%u] sk_err [%d] sk_err_soft [%d].\n", - __FUNCTION__, pvsk, sk->sk_state, - sk->sk_err, sk->sk_err_soft)); - if (pvsk->stateChange) { - pvsk->stateChange(sk); - } - if (sk->sk_state == TCP_ESTABLISHED) { - PvskSetOpFlag(pvsk, PVTCP_OP_CONNECT); - } - PvtcpSchedSock(pvsk); -} - - -/** - * @brief Callback called when an error is set on the socket. - * @param sk socket the error happened on - * @sideeffect A writer task may be scheduled - */ - -static void -ErrorReportCB(struct sock *sk) -{ - PvtcpSock *pvsk = PvskFromSk(sk); - - if (!pvsk || - (SkFromPvsk(pvsk) != sk) || - (pvsk->errorReport == ErrorReportCB)) { - CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored\n", __FUNCTION__)); - return; - } - - /* - * The socket (spin) lock is held when this function is called. - * Interesting sk_err-s: - * ECONNRESET - tcp_disconnect(), tcp_reset() - * ECONNREFUSED - tcp_reset() - * EPIPE - tcp_reset() - * ETIMEDOUT - tcp_write_error() - * EHOSTUNREACH, etc. - tcp_v4_error()??, icmp errors - * etc. - __udp4_lib_err(), icmp errors - */ - - CommOS_Debug(("%s: [0x%p] sk_err [%d] sk_err_soft [%d].\n", - __FUNCTION__, pvsk, sk->sk_err, sk->sk_err_soft)); - if (pvsk->errorReport) { - pvsk->errorReport(sk); - } - pvsk->err = sk->sk_err; - PvtcpSchedSock(pvsk); -} - - -/** - * @brief Callback called when data is available to be read from a socket. - * @param sk socket in question - * @param bytes number of bytes to read - * @sideeffect A writer task is scheduled _iff_ the peer can safely - * receive. - */ - -static void -DataReadyCB(struct sock *sk, - int bytes) -{ - PvtcpSock *pvsk = PvskFromSk(sk); - - if (!pvsk || - (SkFromPvsk(pvsk) != sk) || - (pvsk->dataReady == DataReadyCB)) { - CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__)); - return; - } - - /* - * The socket (spin) lock is held when this function is called. - */ - - if (pvsk->dataReady) { - pvsk->dataReady(sk, bytes); - } - if (sk->sk_state == TCP_LISTEN) { - CommOS_Debug(("%s: Listen socket ready to accept [0x%p].\n", - __FUNCTION__, pvsk)); - } - PvtcpSchedSock(pvsk); -} - - -/** - * @brief Callback called when writing is possible on a socket. - * @param sk socket in question - * @sideeffect An AIO thread is scheduled. - */ - -static void -WriteSpaceCB(struct sock *sk) -{ - PvtcpSock *pvsk = PvskFromSk(sk); - - if (!pvsk || - (SkFromPvsk(pvsk) != sk) || - (pvsk->writeSpace == WriteSpaceCB)) { - CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__)); - return; - } - - /* - * The socket (spin) lock is held when this function is called. - */ - - if (pvsk->writeSpace) { - pvsk->writeSpace(sk); - } - PvtcpSchedSock(pvsk); -} - - -/** - * @brief Initializes a newly created socket for offload operations. - * @param[in,out] sock socket to initialize - * @param channel channel to update - * @param peerSock peer PV socket of this socket - * @param parentPvsk parent of this socket or NULL - * @return zero on success, error code otherwise - */ - -static int -SockAllocInit(struct socket *sock, - CommChannel channel, - unsigned long long peerSock, - PvtcpSock *parentPvsk) -{ - struct sock *sk; - PvtcpSock *pvsk; - int sndBuf = PVTCP_SOCK_RCVSIZE * 4; - - if (!sock || !channel || !peerSock) { - return -EINVAL; - } - - sk = sock->sk; - sk->sk_user_data = NULL; - - pvsk = CommOS_Kmalloc(sizeof *pvsk); - if (!pvsk) { - return -ENOMEM; - } - - if (PvtcpOffSockInit(pvsk, channel)) { - CommOS_Kfree(pvsk); - return -ENOMEM; - } - - /* - * PVTCP sockets should be billed against the vmware uid. - */ - sk->sk_socket->file = &_file; - - /* Set peer (pv) socket. */ - pvsk->peerSock = peerSock; - pvsk->peerSockSet = 1; - - /* Set up back pointer. */ - pvsk->sk = sk; - - /* Keep track of new socket. */ - if (PvtcpStateAddSocket(channel, pvtcpIfUnbound, pvsk) != 0) { - CommOS_Kfree(pvsk); - return -ENOMEM; - } - - /* - * Keep pvtcp around for at least the lifetime of this socket - */ - CommOS_ModuleGet(pvtcpImpl.owner); - - if (!parentPvsk) { - pvsk->destruct = sk->sk_destruct; - sk->sk_destruct = asmDestructorShim; - pvsk->stateChange = sk->sk_state_change; - sk->sk_state_change = StateChangeCB; - pvsk->errorReport = sk->sk_error_report; - sk->sk_error_report = ErrorReportCB; - pvsk->dataReady = sk->sk_data_ready; - sk->sk_data_ready = DataReadyCB; - pvsk->writeSpace = sk->sk_write_space; - sk->sk_write_space = WriteSpaceCB; - } else { - /* - * Copy the parent's saved callbacks. The parent pvsk is only passed - * when creating/initializing a socket after an 'accept'. - */ - - pvsk->destruct = parentPvsk->destruct; - sk->sk_destruct = asmDestructorShim; - pvsk->stateChange = parentPvsk->stateChange; - sk->sk_state_change = StateChangeCB; - pvsk->errorReport = parentPvsk->errorReport; - sk->sk_error_report = ErrorReportCB; - pvsk->dataReady = parentPvsk->dataReady; - sk->sk_data_ready = DataReadyCB; - pvsk->writeSpace = parentPvsk->writeSpace; - sk->sk_write_space = WriteSpaceCB; - - if (parentPvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4) { - /* The parent socket was morphed/bound. */ - - PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL); - PvtcpStateAddSocket(pvsk->channel, pvtcpIfLoopbackInet4, pvsk); - } - } - - /* Install forward socket reference. */ - sk->sk_user_data = pvsk; - - /* - * Force the send buffer size high enough, such that we don't lose the - * just-a-bit-over-the-limit bytes. This is mainly needed for datagrams. - * Note that we always apply flow control between host and guest modules, - * according to the sizing model; so this is not artificially inflated. - */ - - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUFFORCE, - (void *)&sndBuf, sizeof sndBuf); - - return 0; -} - - -/** - * @brief Allocates a pvsk socket for error reporting (create operation). - * @param err error code to report to PV side - * @param channel channel error socket belongs to - * @param peerSock peer PV socket of this socket - * @return error socket on success, NULL otherwise - */ - -static PvtcpSock * -SockAllocErrInit(int err, - CommChannel channel, - unsigned long long peerSock) -{ - PvtcpSock *pvsk; - - if (!channel || !peerSock) { - return NULL; - } - - pvsk = CommOS_Kmalloc(sizeof *pvsk); - if (!pvsk) { - return NULL; - } - - if (PvtcpOffSockInit(pvsk, channel)) { - CommOS_Kfree(pvsk); - return NULL; - } - - /* Set peer (pv) socket and error. */ - pvsk->peerSock = peerSock; - pvsk->peerSockSet = 1; - pvsk->err = err; - - /* Set up back pointer to NULL such that PvtcpPutSock deallocates it. */ - pvsk->sk = NULL; - return pvsk; -} - - -/* - * Offload operations. - */ - -/** - * @brief Creates an offload socket and schedules it for reply. - * @param channel communication channel with offloader - * @param upperLayerState state associated with this channel - * @param packet first packet received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - * @sideeffect A writer task is scheduled, which will send reply back. - */ - -void -PvtcpCreateOp(CommChannel channel, - void *upperLayerState, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - int rc; - struct socket *sock; - PvtcpSock *pvsk; - PvtcpState *state = (PvtcpState *)upperLayerState; - const int enable = 1; - - PVTCP_UNLOCK_DISP_DISCARD_VEC(); - -#if defined(PVTCP_IPV6_DISABLE) - if (packet->data16 == AF_INET6) { - CommOS_Debug(("%s: AF_INET6 support is disabled.\n", __FUNCTION__)); - rc = -EAFNOSUPPORT; - } else -#endif - { - rc = sock_create_kern(packet->data16, packet->data32, - packet->data32ex, &sock); - } - - if (!rc) { - rc = SockAllocInit(sock, channel, packet->data64, NULL); - if (rc) { - SockReleaseWrapper(sock); - goto fail; - } - kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (void *)&enable, sizeof enable); - pvsk = PvskFromSk(sock->sk); - if (state->extra && - ((PvtcpStateKObj *)(state->extra))->useNS) { - PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_CHANNEL); - } else { - PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL); - } - PvtcpStateAddSocket(pvsk->channel, pvtcpIfUnbound, pvsk); - PvskSetOpFlag(pvsk, PVTCP_OP_CREATE); - } else { - CommOS_Debug(("%s: Error creating offload socket: %d\n", - __FUNCTION__, rc)); - /* - * Pass -rc so we follow error conventions for other reply ops. - * The error code is fixed by the PV side so error codes are properly - * reported. - */ - pvsk = SockAllocErrInit(-rc, channel, packet->data64); - if (!pvsk) { - goto fail; - } - } - - PvtcpSchedSock(pvsk); - return; - -fail: - CommOS_Log(("%s: BOOG ** FAILED TO CREATE OFFLOAD SOCKET [%d] " - "_AND_ ERROR REPORTING SOCKET!\n" - " PV SIDE MAY BE LOCKED UP UNTIL CREATE RPC TIMES OUT!", - __FUNCTION__, rc)); -} - - -/** - * @brief Schedules an offload socket to be removed. - * @param channel communication channel with offloader - * @param upperLayerState state associated with this channel - * @param packet first packet received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - * @sideeffect A writer task is scheduled, which will send reply back and - * then release the socket. - */ - -void -PvtcpReleaseOp(CommChannel channel, - void *upperLayerState, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); - struct sock *sk = SkFromPvsk(pvsk); - - /* - * Check if this is a pvsock datagram socket bound on a reserved port. - * If so, reset the bit such that filtering drops rogue packets. - */ - - if ((sk->sk_socket->type == SOCK_DGRAM) && - (pvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4)) { - unsigned short port = 0; - - if (sk->sk_family == AF_INET) { - struct sockaddr_in sin = { .sin_family = AF_INET }; - int addrLen = sizeof sin; - - if(!kernel_getsockname(sk->sk_socket, - (struct sockaddr *)&sin, &addrLen)) { - port = sin.sin_port; - } - } else { /* AF_INET6 */ - struct sockaddr_in6 sin = { .sin6_family = AF_INET6 }; - int addrLen = sizeof sin; - - if(!kernel_getsockname(sk->sk_socket, - (struct sockaddr *)&sin, &addrLen)) { - port = sin.sin6_port; - } - } - - port = ntohs(port) - portRangeBase; - if (port < portRangeSize) { - CommOS_MutexLock(&globalLock); - PvtcpResetPortIndexBit(pvsk->netif->conf.addr.in.s_addr, port); - CommOS_MutexUnlock(&globalLock); - } - } - - /* - * - hold the socket before setting the 'release' flag and until after - * the call to PvtcpSchedSock(): if the socket had already been scheduled - * ReleaseAIO may run, find the flag set and release this socket while - * it's being unlocked here. - * - * - hold the dispatch lock until done to ensure that subsequent Ops for - * this socket see peerSockSet == 0. - */ - - PvtcpHoldSock(pvsk); - SOCK_STATE_LOCK(pvsk); - pvsk->peerSockSet = 0; - SOCK_STATE_UNLOCK(pvsk); - PvskSetOpFlag(pvsk, PVTCP_OP_RELEASE); - PvtcpSchedSock(pvsk); - PvtcpPutSock(pvsk); - PVTCP_UNLOCK_DISP_DISCARD_VEC(); -} - - -/** - * @brief Binds an offload socket to a given address - * @param channel communication channel with offloader - * @param upperLayerState state associated with this channel - * @param packet first packet received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - * @sideeffect A writer task is scheduled, which will send reply back - */ - -void -PvtcpBindOp(CommChannel channel, - void *upperLayerState, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); - struct sock *sk = SkFromPvsk(pvsk); - struct sockaddr *addr; - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - int reuseAddr; - int addrLen; - int rc; - - PvtcpHoldSock(pvsk); - PVTCP_UNLOCK_DISP_DISCARD_VEC(); - - /* - * The socket-level option SO_REUSEADDR is set in the common socket code, - * meaning that we cannot intercept it in the guest pvtcp implementation. - * In order to respect the setting, the guest would pass the current - * setting in 'bind' requests. - * If the guest requires 'reuse address' setting, the value is incremented - * such that we differentiate between: 0) not requested, 1) 'false' and - * 2) 'true'. - */ - - reuseAddr = COMM_OPF_GET_VAL(packet->flags); - if ((reuseAddr == 1) || (reuseAddr == 2)) { - /* Explicit request, so decrement the value. */ - - reuseAddr--; - kernel_setsockopt(sk->sk_socket, SOL_SOCKET, SO_REUSEADDR, - (void *)&reuseAddr, sizeof reuseAddr); - } - - if (sk->sk_family == AF_INET) { - memset(&sin, 0, sizeof sin); - sin.sin_family = AF_INET; - sin.sin_port = packet->data16; - sin.sin_addr.s_addr = (unsigned int)packet->data64ex; - addr = (struct sockaddr *)&sin; - addrLen = sizeof sin; - - rc = PvtcpTestAndBindLoopbackInet4(pvsk, &sin.sin_addr.s_addr, - sin.sin_port); - if (rc <= 0) { - /* Bind has already happened. */ - - pvsk->err = -rc; - goto out; - } - } else { /* AF_INET6 */ - memset(&sin6, 0, sizeof sin6); - sin6.sin6_family = AF_INET6; - sin6.sin6_port = packet->data16; - addr = (struct sockaddr *)&sin6; - addrLen = sizeof sin6; - - rc = PvtcpTestAndBindLoopbackInet6(pvsk, &packet->data64ex, - &packet->data64ex2, sin6.sin6_port); - if (rc <= 0) { - /* Bind has already happened. */ - - pvsk->err = -rc; - goto out; - } - PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0], - packet->data64ex, packet->data64ex2); - } - - /* coverity[check_return] */ - pvsk->err = -kernel_bind(sk->sk_socket, addr, addrLen); - -out: - PvskSetOpFlag(pvsk, PVTCP_OP_BIND); - PvtcpSchedSock(pvsk); - PvtcpPutSock(pvsk); -} - - -/** - * @brief Sets a socket option. - * @param channel communication channel with offloader - * @param upperLayerState state associated with this channel - * @param packet first packet received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - * @sideeffect A writer task is scheduled, which will send reply back - */ -void -PvtcpSetSockOptOp(CommChannel channel, - void *upperLayerState, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); - struct sock *sk = SkFromPvsk(pvsk); - struct socket *sock = sk->sk_socket; - unsigned int optlen = packet->len - sizeof *packet; - - PvtcpHoldSock(pvsk); - - if ((vecLen != 1) || (vec[0].iov_len != optlen) || (optlen < sizeof(int))) { - pvsk->rpcStatus = -EINVAL; - goto out; - } - - if (packet->data32 == SOL_TCP) { - /* - * The back-end implementation must always run in 'nodelay' mode. - * Consequently, we ignore, but we cache the TCP_NODELAY and TCP_CORK - * settings such that getsockopt() can return them as they were 'set'. - * Applications use these settings for performance; pvtcp does quite - * well if it's not interfered with. - */ - - int on; - - switch (packet->data32ex) { - case TCP_NODELAY: - memcpy(&on, vec[0].iov_base, sizeof on); - PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_TCP_NODELAY, on); - pvsk->rpcStatus = 0; - goto out; - case TCP_CORK: - memcpy(&on, vec[0].iov_base, sizeof on); - PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_TCP_CORK, on); - pvsk->rpcStatus = 0; - goto out; - } - } - - pvsk->rpcStatus = kernel_setsockopt(sock, - packet->data32, - packet->data32ex, - vec[0].iov_base, - optlen); - -out: - PVTCP_UNLOCK_DISP_DISCARD_VEC(); - PvskSetOpFlag(pvsk, PVTCP_OP_SETSOCKOPT); - PvtcpSchedSock(pvsk); - PvtcpPutSock(pvsk); -} - - -/** - * @brief Retrieves a socket option. - * @param channel communication channel with offloader - * @param upperLayerState state associated with this channel - * @param packet first packet received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - * @sideeffect A writer task is scheduled, which will send reply back - */ -void -PvtcpGetSockOptOp(CommChannel channel, - void *upperLayerState, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); - struct sock *sk = SkFromPvsk(pvsk); - struct socket *sock = sk->sk_socket; - unsigned int optLen = (unsigned int)(packet->data64ex); - char *optBuf; - int rc = 0; - - PvtcpHoldSock(pvsk); - - if ((optLen < sizeof(int)) || (optLen > PVTCP_SOCK_SAFE_RCVSIZE)) { - pvsk->rpcStatus = -EINVAL; - goto out; - } - - optBuf = CommOS_Kmalloc(optLen); - if (!optBuf) { - pvsk->rpcStatus = -EINVAL; - goto out; - } - - if (packet->data32 == SOL_TCP) { - /* - * See comment in PvtcpSetSockOptOp() regarding special treatment for - * the TCP_NODELAY and TCP_CORK settings. - */ - - int on; - - switch (packet->data32ex) { - case TCP_NODELAY: - on = PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_TCP_NODELAY); - optLen = sizeof on; - memcpy(optBuf, &on, optLen); - goto done; - case TCP_CORK: - on = PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_TCP_CORK); - optLen = sizeof on; - memcpy(optBuf, &on, optLen); - goto done; - } - } - - rc = kernel_getsockopt(sock, packet->data32, - packet->data32ex, optBuf, &optLen); - -done: - if (!rc) { - pvsk->rpcReply = optBuf; - CommOS_MemBarrier(); - pvsk->rpcStatus = (int)optLen; - } else { - CommOS_Kfree(optBuf); - pvsk->rpcStatus = rc; - } - -out: - PVTCP_UNLOCK_DISP_DISCARD_VEC(); - PvskSetOpFlag(pvsk, PVTCP_OP_GETSOCKOPT); - PvtcpSchedSock(pvsk); - PvtcpPutSock(pvsk); -} - - -/** - * @brief Performs ioctl on offload socket. - * @param channel communication channel with offloader - * @param state state associated with this channel - * @param packet packet header received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - */ - -void -PvtcpIoctlOp(CommChannel channel, - void *state, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, state); - struct sock *sk = SkFromPvsk(pvsk); - struct socket *sock = sk->sk_socket; - - PvtcpHoldSock(pvsk); - - /* Not implemented yet. */ - - (void)sock; - pvsk->rpcStatus = -ENOIOCTLCMD; - - PVTCP_UNLOCK_DISP_DISCARD_VEC(); - PvskSetOpFlag(pvsk, PVTCP_OP_IOCTL); - PvtcpSchedSock(pvsk); - PvtcpPutSock(pvsk); -} - - -/** - * @brief Marks a socket for listening to incoming connections - * @param channel communication channel with offloader - * @param upperLayerState state associated with this channel - * @param packet first packet received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - * @sideeffect A writer task is scheduled, which will send reply back - */ - -void -PvtcpListenOp(CommChannel channel, - void *upperLayerState, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); - struct sock *sk = SkFromPvsk(pvsk); - int backlog = (int)packet->data32; - - PvtcpHoldSock(pvsk); - PVTCP_UNLOCK_DISP_DISCARD_VEC(); - - pvsk->err = -kernel_listen(sk->sk_socket, backlog); - PvskSetOpFlag(pvsk, PVTCP_OP_LISTEN); - PvtcpSchedSock(pvsk); - PvtcpPutSock(pvsk); -} - - -/** - * @brief Accepts a connected socket - * @param channel communication channel with offloader - * @param upperLayerState state associated with this channel - * @param packet first packet received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - * @sideeffect A writer task is scheduled, which will send reply back. - */ - -void -PvtcpAcceptOp(CommChannel channel, - void *upperLayerState, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - int rc; - PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); - struct sock *sk = SkFromPvsk(pvsk); - struct socket *newsock = NULL; - - PvtcpHoldSock(pvsk); - PVTCP_UNLOCK_DISP_DISCARD_VEC(); - - rc = kernel_accept(sk->sk_socket, &newsock, O_NONBLOCK); - if (rc == 0) { - rc = SockAllocInit(newsock, channel, packet->data64ex, pvsk); - if (rc) { - SockReleaseWrapper(newsock); - } - } - - if (rc == 0) { - struct sock *newsk = newsock->sk; - PvtcpSock *newpvsk = PvskFromSk(newsk); - - /* We temporarily use the state field to cache parent socket. */ - - newpvsk->state = (PvtcpState *)pvsk; - PvskSetOpFlag(newpvsk, PVTCP_OP_ACCEPT); - PvtcpSchedSock(newpvsk); - } else { - pvsk->err = -rc; - PvskSetOpFlag(pvsk, PVTCP_OP_ACCEPT); - PvtcpSchedSock(pvsk); - } - - PvtcpPutSock(pvsk); -} - - -/** - * @brief Connects an offload socket to given address - * @param channel communication channel with offloader - * @param upperLayerState state associated with this channel - * @param packet first packet received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - * @sideeffect A writer task is scheduled, which will send reply back - */ - -void -PvtcpConnectOp(CommChannel channel, - void *upperLayerState, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); - struct sock *sk = SkFromPvsk(pvsk); - struct sockaddr *addr; - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - int addrLen; - int flags = 0; - int rc = 0; - int disconnect = 0; - - PvtcpHoldSock(pvsk); - PVTCP_UNLOCK_DISP_DISCARD_VEC(); - - if (sk->sk_family == AF_INET) { - addr = (struct sockaddr *)&sin; - addrLen = sizeof sin; - memset(&sin, 0, sizeof sin); - sin.sin_port = packet->data16; - sin.sin_addr.s_addr = (unsigned int)packet->data64ex; - if (COMM_OPF_GET_VAL(packet->flags)) { - sin.sin_family = AF_UNSPEC; - disconnect = 1; - goto connect; - } - sin.sin_family = AF_INET; - PvtcpTestAndBindLoopbackInet4(pvsk, &sin.sin_addr.s_addr, 0); - } else { /* AF_INET6 */ - addr = (struct sockaddr *)&sin6; - addrLen = sizeof sin6; - memset(&sin6, 0, sizeof sin6); - sin6.sin6_port = packet->data16; - if (COMM_OPF_GET_VAL(packet->flags)) { - sin6.sin6_family = AF_UNSPEC; - PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0], - packet->data64ex, packet->data64ex2); - disconnect = 1; - goto connect; - } - sin6.sin6_family = AF_INET6; - PvtcpTestAndBindLoopbackInet6(pvsk, &packet->data64ex, - &packet->data64ex2, 0); - PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0], - packet->data64ex, packet->data64ex2); - } - -connect: - rc = kernel_connect(sk->sk_socket, addr, addrLen, flags | O_NONBLOCK); - - /* - * For datagram sockets, ErrorReportCB is not called, so we need to - * explicitly set the pvsk error to be returned back to the guest. - * This should not be used on SOCK_STREAM sockets. You have been - * warned. - */ - - if (rc && (sk->sk_socket->type == SOCK_DGRAM)) { - pvsk->err = -rc; - } - - /* - * Quite likely, stream actual connect requests will set err to EINPROGRESS. - * That's fine, error_report will trigger an AIO/flow-op reply. When the - * connection is established, state_change schedules an AIO/connect reply. - * Record whether the request was a disconnect. - */ - - PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT, disconnect); - PvskSetOpFlag(pvsk, PVTCP_OP_CONNECT); - PvtcpSchedSock(pvsk); - PvtcpPutSock(pvsk); -} - - -/** - * @brief Initiates socket shutdown on an offload socket - * @param channel communication channel with offloader - * @param upperLayerState state associated with this channel - * @param packet first packet received in reply - * @param vec payload buffer descriptors - * @param vecLen payload buffer descriptor count - * @sideeffect Socket queue will be drained and socket shutdown performed. - */ - -void -PvtcpShutdownOp(CommChannel channel, - void *upperLayerState, - CommPacket *packet, - struct kvec *vec, - unsigned int vecLen) -{ - PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); - int how = (int)packet->data32; - - PvtcpHoldSock(pvsk); - if ((how == SHUT_RD) || (how == SHUT_RDWR)) { - kernel_sock_shutdown(SkFromPvsk(pvsk)->sk_socket, SHUT_RD); - PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_RD, 1); - } - if ((how == SHUT_WR) || (how == SHUT_RDWR)) { - PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_WR, 1); - } - PVTCP_UNLOCK_DISP_DISCARD_VEC(); - PvtcpSchedSock(pvsk); - PvtcpPutSock(pvsk); -} - - -/* - * AIO functions called from the main AIO processing function. - * Most of these functions complete processing initiated by the corresponding - * offload operations above. - */ - -/** - * @brief Processes socket release in an AIO thread. This function is - * called with the socket 'in' lock taken. - * @param[in,out] pvsk socket to release. - * @sideeffect the socket will be released upon return from this function. - */ - -static inline void -ReleaseAIO(PvtcpSock *pvsk) -{ - struct sock *sk = SkFromPvsk(pvsk); - struct socket *sock = sk->sk_socket; - CommPacket packet = { - .len = sizeof packet, - .flags = 0, - .opCode = PVTCP_OP_RELEASE, - .data64 = pvsk->peerSock, - .data64ex = PvtcpGetHandle(pvsk) - }; - unsigned long long timeout = COMM_MAX_TO; - - SOCK_OUT_LOCK(pvsk); - CommSvc_Write(pvsk->channel, &packet, &timeout); -#if defined(PVTCP_FULL_DEBUG) - CommOS_Debug(("%s: Sent 'Release' [0x%p] -> 0x%0x] reply.\n", - __FUNCTION__, pvsk, (unsigned)(pvsk->peerSock))); -#endif - /* - * 'sk' goes away in the final ProcessAIO::sock_put() - */ - SockReleaseWrapper(sock); - SOCK_OUT_UNLOCK(pvsk); - - PvtcpStateRemoveSocket(pvsk->channel, pvsk); -} - - -/** - * @brief Processes socket create reply in an AIO thread. This function is - * called with the socket 'in' lock taken. - * @param[in,out] pvsk newly created socket to send ack for. - */ - -static inline void -CreateAIO(PvtcpSock *pvsk) -{ - struct sock *sk; - struct socket *sock; - CommPacket packet = { - .len = sizeof packet, - .flags = 0, - .opCode = PVTCP_OP_CREATE, - .data64 = pvsk->peerSock, - }; - unsigned long long timeout = COMM_MAX_TO; - int rc; - - sk = SkFromPvsk(pvsk); - if (!sk) { - /* - * This is a create-error socket. The error reply has been sent out - * already, by PvtcpFlowAIO(). This is a paranoid safety measure, as - * PVTCP_OP_CREATE OpFlag should not have been set. - */ - - return; - } - - sock = sk->sk_socket; - packet.data64ex = PvtcpGetHandle(pvsk); - - rc = CommSvc_Write(pvsk->channel, &packet, &timeout); - if (rc != packet.len) { - /* We mustn't leak it if PV can't get a hold of it. */ - - PvtcpStateRemoveSocket(pvsk->channel, pvsk); - SockReleaseWrapper(sock); - CommOS_Log(("%s: BOOG -- Couldn't send 'Create' reply [0x%p]!\n", - __FUNCTION__, sk)); - } else { -#if defined(PVTCP_FULL_DEBUG) - CommOS_Debug(("%s: Sent 'Create' [0x%p] reply [%d].\n", - __FUNCTION__, pvsk, rc)); -#endif - } -} - - -/** - * @brief Processes socket bind in an AIO thread. This function is - * called with the socket 'in' lock taken. - * @param[in,out] pvsk socket being bound. - */ - -static inline void -BindAIO(PvtcpSock *pvsk) -{ - struct sock *sk = SkFromPvsk(pvsk); - struct socket *sock = sk->sk_socket; - CommPacket packet = { - .len = sizeof packet, - .flags = 0, - .opCode = PVTCP_OP_BIND, - .data64 = pvsk->peerSock - }; - unsigned long long timeout = COMM_MAX_TO; - int rc; - - if (pvsk->peerSockSet) { - if (sk->sk_family == AF_INET) { - struct sockaddr_in sin = { .sin_family = AF_INET }; - int addrLen = sizeof sin; - - rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen); - if (rc == 0) { - packet.data16 = sin.sin_port; - PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr); - packet.data64ex = (unsigned long long)sin.sin_addr.s_addr; - } - } else { /* AF_INET6 */ - struct sockaddr_in6 sin = { .sin6_family = AF_INET6 }; - int addrLen = sizeof sin; - - rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen); - if (rc == 0) { - packet.data16 = sin.sin6_port; - PvtcpResetLoopbackInet6(pvsk, &sin.sin6_addr); - PvtcpI6AddrPack(&sin.sin6_addr.s6_addr32[0], - &packet.data64ex, &packet.data64ex2); - } - } - - if (rc) { - COMM_OPF_SET_ERR(packet.flags); - packet.data32ex = (unsigned int)(-rc); - packet.opCode = PVTCP_OP_FLOW; - } - CommSvc_Write(pvsk->channel, &packet, &timeout); -#if defined(PVTCP_FULL_DEBUG) - CommOS_Debug(("%s: Sent 'Bind' [0x%p, %d] reply.\n", - __FUNCTION__, pvsk, rc)); -#endif - } -} - - -/** - * @brief Sends result of setsockopt back to guest. - * called with the socket 'in' lock taken. - * @param[in,out] pvsk socket that was modified. - */ - -static inline void -SetSockOptAIO(PvtcpSock *pvsk) -{ - CommPacket packet; - unsigned long long timeout; - - packet.len = sizeof packet; - packet.flags = 0; - packet.opCode = PVTCP_OP_SETSOCKOPT; - packet.data64 = pvsk->peerSock; - packet.data32 = (unsigned int)(pvsk->rpcStatus); - timeout = COMM_MAX_TO; - CommSvc_Write(pvsk->channel, &packet, &timeout); - pvsk->rpcStatus = 0; -} - - -/** - * @brief Sends result of getsockopt back to guest. - * called with the socket 'in' lock taken. - * @param[in,out] pvsk socket that was modified. - */ - -static inline void -GetSockOptAIO(PvtcpSock *pvsk) -{ - CommPacket packet = { - .opCode = PVTCP_OP_GETSOCKOPT, - .flags = 0 - }; - unsigned long long timeout = COMM_MAX_TO; - - struct kvec vec[1]; - struct kvec *inVec = vec; - unsigned int vecLen = 1; - unsigned int iovOffset = 0; - - if (pvsk->rpcStatus > 0) { - packet.len = sizeof packet + pvsk->rpcStatus; - vec[0].iov_base = pvsk->rpcReply; - vec[0].iov_len = pvsk->rpcStatus; - } else { - vecLen = 0; - } - - packet.data64 = pvsk->peerSock; - packet.data32 = pvsk->rpcStatus; - - CommSvc_WriteVec(pvsk->channel, &packet, &inVec, &vecLen, - &timeout, &iovOffset); - - if (pvsk->rpcReply) { - CommOS_Kfree(pvsk->rpcReply); - pvsk->rpcReply = NULL; - } - pvsk->rpcStatus = 0; -} - - -/** - * @brief Sends result of ioctl back to guest. - * called with the socket 'in' lock taken. - * @param[in,out] pvsk socket that was modified. - */ - -static inline void -IoctlAIO(PvtcpSock *pvsk) -{ - CommPacket packet = { - .len = sizeof packet, - .opCode = PVTCP_OP_IOCTL, - .flags = 0 - }; - unsigned long long timeout = COMM_MAX_TO; - - packet.data64 = pvsk->peerSock; - packet.data32 = pvsk->rpcStatus; - CommSvc_Write(pvsk->channel, &packet, &timeout); - pvsk->rpcStatus = 0; -} - - -/** - * @brief Processes socket listen reply in an AIO thread. This function is - * called with the socket 'in' lock taken. - * @param[in,out] pvsk socket being put in listen mode. - */ - -static inline void -ListenAIO(PvtcpSock *pvsk) -{ - struct sock *sk = SkFromPvsk(pvsk); - CommPacket packet = { - .len = sizeof packet, - .flags = 0, - .opCode = PVTCP_OP_LISTEN, - .data64 = pvsk->peerSock - }; - unsigned long long timeout = COMM_MAX_TO; - - if (pvsk->peerSockSet) { - if (sk->sk_state != TCP_LISTEN) { - COMM_OPF_SET_ERR(packet.flags); - packet.data32ex = (unsigned int)pvsk->err; - packet.opCode = PVTCP_OP_FLOW; - } - - CommSvc_Write(pvsk->channel, &packet, &timeout); -#if defined(PVTCP_FULL_DEBUG) - CommOS_Debug(("%s: Sent 'Listen' [0x%p] reply.\n", __FUNCTION__, pvsk)); -#endif - } -} - - -/** - * @brief Processes socket accept reply in an AIO thread. This function is - * called with the socket 'in' lock taken. - * @param[in,out] pvsk new socket or socket to accept on (see PvtcpAcceptOp). - */ - -static inline void -AcceptAIO(PvtcpSock *pvsk) -{ - struct sock *sk = SkFromPvsk(pvsk); - struct socket *sock = sk->sk_socket; - CommPacket packet = { - .len = sizeof packet, - .flags = 0, - .opCode = PVTCP_OP_ACCEPT - }; - unsigned long long timeout = COMM_MAX_TO; - const int enable = 1; - int rc; - - if (pvsk->peerSockSet) { - unsigned long long payloadSocks[2] = { 0, 0 }; - struct kvec payloadVec[] = { - { .iov_base = &payloadSocks, .iov_len = sizeof payloadSocks } - }; - struct kvec *payload = payloadVec; - unsigned int payloadLen = 1; - unsigned int iovOffset = 0; - - packet.len = sizeof packet + sizeof payloadSocks; - - /* - * accept() succeeded, so this is the child socket; its state field - * was temporarily changed to hold the parent/accepting socket. - * The newly accepted socket and its peer need to be put in a - * payload since we use up all available header fields with - * addressing information. Finally, the state field is restored. - */ - - packet.data64 = ((PvtcpSock *)pvsk->state)->peerSock; - pvsk->state = CommSvc_GetState(pvsk->channel); - - payloadSocks[0] = pvsk->peerSock; - payloadSocks[1] = PvtcpGetHandle(pvsk); - - rc = 0; - if (sk->sk_family == AF_INET) { - struct sockaddr_in sin = { .sin_family = AF_INET }; - int addrLen = sizeof sin; - - rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &addrLen); - if (rc == 0) { - packet.data16 = sin.sin_port; - PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr); - packet.data64ex = (unsigned long long)sin.sin_addr.s_addr; - } - } else { /* AF_INET6 */ - struct sockaddr_in6 sin = { .sin6_family = AF_INET6 }; - int addrLen = sizeof sin; - - rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &addrLen); - if (rc == 0) { - packet.data16 = sin.sin6_port; - PvtcpResetLoopbackInet6(pvsk, &sin.sin6_addr); - PvtcpI6AddrPack(&sin.sin6_addr.s6_addr32[0], - &packet.data64ex, &packet.data64ex2); - } - } - - if (rc == 0) { - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (void *)&enable, sizeof enable); - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (void *)&enable, sizeof enable); - kernel_setsockopt(sock, SOL_SOCKET, SO_OOBINLINE, - (void *)&enable, sizeof enable); - } else { - PvtcpStateRemoveSocket(pvsk->channel, pvsk); - SockReleaseWrapper(sock); - COMM_OPF_SET_ERR(packet.flags); - packet.data32ex = (unsigned int)ECONNABORTED; - packet.len = sizeof packet; - packet.opCode = PVTCP_OP_FLOW; - } - - rc = CommSvc_WriteVec(pvsk->channel, &packet, - &payload, &payloadLen, &timeout, &iovOffset); - if ((rc != packet.len) && !COMM_OPF_TEST_ERR(packet.flags)) { - /* Mustn't leak the new socket if PV can't get a hold of it. */ - - PvtcpStateRemoveSocket(pvsk->channel, pvsk); - SockReleaseWrapper(sock); - } -#if defined(PVTCP_FULL_DEBUG) - CommOS_Debug(("%s: Sent 'Accept' [0x%p] reply.\n", __FUNCTION__, pvsk)); -#endif - } -} - - -/** - * @brief Processes socket connect in an AIO thread. This function is - * called with the socket 'in' lock taken. - * @param[in,out] pvsk socket being connected. - */ - -static inline void -ConnectAIO(PvtcpSock *pvsk) -{ - struct sock *sk = SkFromPvsk(pvsk); - struct socket *sock = sk->sk_socket; - CommPacket packet = { - .len = sizeof packet, - .flags = 0, - .opCode = PVTCP_OP_CONNECT, - .data64 = pvsk->peerSock - }; - unsigned long long timeout = COMM_MAX_TO; - const int enable = 1; - int rc; - - if (!pvsk->peerSockSet || - (!PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT) && - (sk->sk_state != TCP_ESTABLISHED))) { - return; - } - - if (PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT)) { - COMM_OPF_SET_VAL(packet.flags, 1); - PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT, 0); - } else if (sk->sk_state == TCP_ESTABLISHED) { - if (sk->sk_family == AF_INET) { - struct sockaddr_in sin = { .sin_family = AF_INET }; - int addrLen = sizeof sin; - - rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen); - if (rc == 0) { - packet.data16 = sin.sin_port; - PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr); - packet.data64ex = (unsigned long long)sin.sin_addr.s_addr; - } - } else { /* AF_INET6 */ - struct sockaddr_in6 sin = { .sin6_family = AF_INET6 }; - int addrLen = sizeof sin; - - rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen); - if (rc == 0) { - packet.data16 = sin.sin6_port; - PvtcpResetLoopbackInet6(pvsk, &sin.sin6_addr); - PvtcpI6AddrPack(&sin.sin6_addr.s6_addr32[0], - &packet.data64ex, &packet.data64ex2); - } - } - - if (rc == 0) { - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (void *)&enable, sizeof enable); - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (void *)&enable, sizeof enable); - kernel_setsockopt(sock, SOL_SOCKET, SO_OOBINLINE, - (void *)&enable, sizeof enable); - } else { - COMM_OPF_SET_ERR(packet.flags); - packet.data32ex = ECONNABORTED; - packet.opCode = PVTCP_OP_FLOW; - } - } - - CommSvc_Write(pvsk->channel, &packet, &timeout); -#if defined(PVTCP_FULL_DEBUG) - CommOS_Debug(("%s: Sent 'Connect' [0x%p] reply.\n", __FUNCTION__, pvsk)); -#endif -} - - -/** - * @brief Server side main asynchronous processing function. It writes to - * socket queued output buffers, it reads from socket and outputs to PV; it - * also completes operation processing and sends applicable replies to PV. - * Finally, processes error reporting and delta size acks. - * @param arg socket work item. - */ - -void -PvtcpProcessAIO(CommOSWork *arg) -{ - PvtcpSock *pvsk = container_of(arg, PvtcpSock, work); - struct sock *sk = SkFromPvsk(pvsk); - - if (!SOCK_OUT_TRYLOCK(pvsk)) { - /* - * Queued output processing. If trylock failed, we don't retry. - * There are only two reasons for not being able to take the lock: - * - IoOp() has it -- when done, it reschedules us if we're not running. - * - OutputAIO() is already running on another core. - */ - - if (sk && sk->sk_socket) { - PvtcpOutputAIO(pvsk); - } - SOCK_OUT_UNLOCK(pvsk); - } - - /* All other processing needs the socket IN lock. */ - - if (!SOCK_IN_TRYLOCK(pvsk)) { - - if (sk && sk->sk_socket) { - int err; - - /* Input processing. */ - - /* - * Workqueue handlers are pinned to a CPU core and therefore not - * migratable. No need to disable preemption. - */ - err = PvtcpInputAIO(pvsk, perCpuBuf[smp_processor_id()]); - - /* Error and ack notifications. */ - - PvtcpFlowAIO(pvsk, err); - - if (!pvsk->opFlags) { - /* No other operations need to be completed. */ - - goto doneInUnlock; - } - - if (PvskTestOpFlag(pvsk, PVTCP_OP_RELEASE)) { - PvskResetOpFlag(pvsk, PVTCP_OP_RELEASE); - ReleaseAIO(pvsk); - - /* All possible in-flight operations must be dropped. */ - goto doneInUnlock; - } - - if (PvskTestOpFlag(pvsk, PVTCP_OP_CREATE)) { - /* No state locking required. */ - - PvskResetOpFlag(pvsk, PVTCP_OP_CREATE); - CreateAIO(pvsk); - } - - if (PvskTestOpFlag(pvsk, PVTCP_OP_BIND)) { - PvskResetOpFlag(pvsk, PVTCP_OP_BIND); - BindAIO(pvsk); - } - - if (PvskTestOpFlag(pvsk, PVTCP_OP_SETSOCKOPT)) { - PvskResetOpFlag(pvsk, PVTCP_OP_SETSOCKOPT); - SetSockOptAIO(pvsk); - } - - if (PvskTestOpFlag(pvsk, PVTCP_OP_GETSOCKOPT)) { - PvskResetOpFlag(pvsk, PVTCP_OP_GETSOCKOPT); - GetSockOptAIO(pvsk); - } - - if (PvskTestOpFlag(pvsk, PVTCP_OP_IOCTL)) { - PvskResetOpFlag(pvsk, PVTCP_OP_IOCTL); - IoctlAIO(pvsk); - } - - if (PvskTestOpFlag(pvsk, PVTCP_OP_LISTEN)) { - PvskResetOpFlag(pvsk, PVTCP_OP_LISTEN); - ListenAIO(pvsk); - } - - if (PvskTestOpFlag(pvsk, PVTCP_OP_ACCEPT)) { - PvskResetOpFlag(pvsk, PVTCP_OP_ACCEPT); - AcceptAIO(pvsk); - } - - if (PvskTestOpFlag(pvsk, PVTCP_OP_CONNECT)) { - PvskResetOpFlag(pvsk, PVTCP_OP_CONNECT); - ConnectAIO(pvsk); - } - -doneInUnlock: - SOCK_IN_UNLOCK(pvsk); - } else { - /* - * Special case for error sockets which don't have a sk. - * Note that this socket was created by SockAllocErrInit() and so - * no 'real' socket sits atop it and is not present on any state - * netif list. The socket has a refcnt of one and it will get - * deallocated by the PvtcpPutSock() call below, so we don't need - * to unlock it. - */ - - PvtcpFlowAIO(pvsk, -ENETDOWN); - } - } else { - if ((pvsk->peerSockSet || PvskTestOpFlag(pvsk, PVTCP_OP_RELEASE)) && - sk && sk->sk_socket) { - PvtcpSchedSock(pvsk); - } - } - - PvtcpPutSock(pvsk); -} |