Diffstat (limited to 'arch/arm/mvp/mvpkm/mksck_kernel.c')
-rw-r--r--  arch/arm/mvp/mvpkm/mksck_kernel.c  2589
1 file changed, 2589 insertions, 0 deletions
diff --git a/arch/arm/mvp/mvpkm/mksck_kernel.c b/arch/arm/mvp/mvpkm/mksck_kernel.c
new file mode 100644
index 0000000..6811a68
--- /dev/null
+++ b/arch/arm/mvp/mvpkm/mksck_kernel.c
@@ -0,0 +1,2589 @@
+/*
+ * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support
+ *
+ * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; see the file COPYING. If not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#line 5
+
+/**
+ * @file
+ *
+ * @brief The monitor/kernel socket interface kernel extension.
+ */
+
+#define __KERNEL_SYSCALLS__
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/proc_fs.h>
+#include <linux/fcntl.h>
+#include <linux/syscalls.h>
+#include <linux/kmod.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/rcupdate.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <net/sock.h>
+
+#include <asm/memory.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include "mvp.h"
+#include "actions.h"
+#include "mvpkm_kernel.h"
+#include "mksck_kernel.h"
+#include "mksck_sockaddr.h"
+#include "mutex_kernel.h"
+
+void NORETURN FatalError(char const *file,
+ int line,
+ FECode feCode,
+ int bugno,
+ char const *fmt,
+ ...)
+{
+ /* Lock around printing the error details so that the messages from multiple
+ * threads are not interleaved. */
+ static DEFINE_MUTEX(fatalErrorMutex);
+ mutex_lock(&fatalErrorMutex);
+
+ FATALERROR_COMMON(printk, vprintk, file, line, feCode, bugno, fmt);
+
+ dump_stack();
+
+ /* done printing */
+ mutex_unlock(&fatalErrorMutex);
+
+ /* do_exit below exits the current thread but does not crash the kernel.
+ * Hence the stack dump will actually be readable from other user threads.
+ */
+ do_exit(1);
+}
+
+
+/*
+ * The project uses a new address family: AF_MKSCK. Ideally this address
+ * family would be accepted by the Linux community and a permanent number
+ * assigned. This, however, is only a dream; not even the x86 team
+ * has been able to pull it off.
+ *
+ * Instead we ASSUME that DECnet is dead and re-use its address family number.
+ * This is what the x86 world is moving to in the latest versions.
+ */
+
+static struct proto mksckProto = {
+ .name = "AF_MKSCK",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof (struct sock),
+};
+
+static int MksckCreate(struct net *net,
+ struct socket *sock,
+ int protocol,
+ int kern);
+
+static struct net_proto_family mksckFamilyOps = {
+ .family = AF_MKSCK,
+ .owner = THIS_MODULE,
+ .create = MksckCreate,
+};
+
+static int MksckFault(struct vm_area_struct *vma, struct vm_fault *vmf);
+
+
+/**
+ * @brief Linux vma operations for receive windows established via Mksck
+ * mmap.
+ */
+static struct vm_operations_struct mksckVMOps = {
+ .fault = MksckFault
+};
+
+/*
+ * List of hosts and guests we know about.
+ */
+static spinlock_t mksckPageListLock;
+static MksckPage *mksckPages[MKSCK_MAX_SHARES];
+
+/*
+ * The following functions form the AF_MKSCK DGRAM operations.
+ */
+static int MksckRelease(struct socket *sock);
+static int MksckBacklogRcv(struct sock *sk, struct sk_buff *skb);
+static void MksckSkDestruct(struct sock *sk);
+static int MksckBind(struct socket *sock,
+ struct sockaddr *addr,
+ int addrLen);
+static int MksckBindGeneric(struct sock *sk,
+ Mksck_Address addr);
+static int MksckDgramRecvMsg(struct kiocb *kiocb,
+ struct socket *sock,
+ struct msghdr *msg,
+ size_t len,
+ int flags);
+static int MksckDgramSendMsg(struct kiocb *kiocb,
+ struct socket *sock,
+ struct msghdr *msg,
+ size_t len);
+static int MksckGetName(struct socket *sock,
+ struct sockaddr *addr,
+ int *addrLen,
+ int peer);
+static unsigned int MksckPoll(struct file *filp,
+ struct socket *sock,
+ poll_table *wait);
+static int MksckDgramConnect(struct socket *sock,
+ struct sockaddr *addr,
+ int addrLen,
+ int flags);
+static int MksckMMap(struct file *file,
+ struct socket *sock,
+ struct vm_area_struct *vma);
+
+static void MksckPageRelease(MksckPage *mksckPage);
+
+static struct proto_ops mksckDgramOps = {
+ .family = AF_MKSCK,
+ .owner = THIS_MODULE,
+ .release = MksckRelease,
+ .bind = MksckBind,
+ .connect = MksckDgramConnect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = MksckGetName,
+ .poll = MksckPoll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown, /* MksckShutdown, */
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = MksckDgramSendMsg,
+ .recvmsg = MksckDgramRecvMsg,
+ .mmap = MksckMMap,
+ .sendpage = sock_no_sendpage,
+};
+
+
+/**
+ * @brief Initialize the MKSCK protocol
+ *
+ * @return 0 on success, -errno on failure
+ */
+int
+Mksck_Init(void)
+{
+ int err;
+
+ spin_lock_init(&mksckPageListLock);
+
+ /*
+ * Create a slab to allocate socket structs from.
+ */
+ err = proto_register(&mksckProto, 1);
+ if (err != 0) {
+ printk(KERN_INFO
+ "Mksck_Init: Cannot register MKSCK protocol, errno = %d.\n", err);
+ return err;
+ }
+
+ /*
+ * Register the socket family
+ */
+ err = sock_register(&mksckFamilyOps);
+ if (err < 0) {
+ printk(KERN_INFO
+ "Mksck_Init: Could not register address family AF_MKSCK"
+ " (errno = %d).\n", err);
+ return err;
+ }
+
+ return 0;
+}
+
+
+/**
+ * @brief De-register the MKSCK protocol
+ */
+void
+Mksck_Exit(void)
+{
+ sock_unregister(mksckFamilyOps.family);
+ proto_unregister(&mksckProto);
+}
+
+
+/**
+ * @brief Create a new MKSCK socket
+ *
+ * @param net network namespace (2.6.24 or above)
+ * @param sock user socket structure
+ * @param protocol protocol to be used
+ * @param kern called from kernel mode
+ *
+ * @return 0 on success, -errno on failure
+ */
+static int
+MksckCreate(struct net *net,
+ struct socket *sock,
+ int protocol,
+ int kern)
+{
+ struct sock *sk;
+ uid_t currentUid = current_euid();
+
+ if (!(currentUid == 0 ||
+ currentUid == Mvpkm_vmwareUid)) {
+ printk(KERN_WARNING
+ "MksckCreate: rejected from process %s tgid=%d, pid=%d euid:%d.\n",
+ current->comm,
+ task_tgid_vnr(current),
+ task_pid_vnr(current),
+ currentUid);
+ return -EPERM;
+ }
+
+ if (!sock) {
+ return -EINVAL;
+ }
+
+ if (protocol) {
+ return -EPROTONOSUPPORT;
+ }
+
+ switch (sock->type) {
+ case SOCK_DGRAM: {
+ sock->ops = &mksckDgramOps;
+ break;
+ }
+ default: {
+ return -ESOCKTNOSUPPORT;
+ }
+ }
+
+ sock->state = SS_UNCONNECTED;
+
+ /*
+ * Most recently (in 2.6.24), sk_alloc() was changed to expect the
+ * network namespace, and the option to zero the sock was dropped.
+ */
+ sk = sk_alloc(net, mksckFamilyOps.family, GFP_KERNEL, &mksckProto);
+
+ if (!sk) {
+ return -ENOMEM;
+ }
+
+ sock_init_data(sock, sk);
+
+ sk->sk_type = SOCK_DGRAM;
+ sk->sk_destruct = MksckSkDestruct;
+ sk->sk_backlog_rcv = MksckBacklogRcv;
+
+ /*
+ * On socket lock...
+ *
+ * A bound socket will have an associated private area, the Mksck
+ * structure part of MksckPage. That area is pointed to by
+ * sk->sk_protinfo. In addition, a connected socket will have the
+ * peer field in its associated area set to point to the associated
+ * private area of the peer socket. A mechanism is needed to ensure
+ * that these private areas are not freed while they are being
+ * accessed within the scope of a function. A simple lock would not
+ * suffice as the interface functions (like MksckDgramRecvMsg())
+ * may block. Hence a reference count mechanism is employed. When
+ * the mentioned references (sk->sk_protinfo and mksck->peer) to
+ * the respective private areas are set a refcount is incremented,
+ * and decremented when the references are deleted.
+ *
+ * The refcounts of areas pointed to by sk->sk_protinfo and
+ * mksck->peer will be decremented under the lock of the socket.
+ * Hence these private areas cannot disappear as long as the socket
+ * lock is held.
+ *
+ * The interface functions will have one of the following
+ * structures:
+ *
+ * simpleFn(sk)
+ * {
+ * lock_sock(sk);
+ * if ((mksck = sk->sk_protinfo)) {
+ * <non-blocking use of mksck>
+ * }
+ * release_sock(sk);
+ * }
+ *
+ * complexFn(sk)
+ * {
+ * lock_sock(sk);
+ * if ((mksck = sk->sk_protinfo)) {
+ * IncRefc(mksck);
+ * }
+ * release_sock(sk);
+ *
+ * if (mksck) {
+ * <use of mksck in a potentially blocking manner>
+ * DecRefc(mksck);
+ * }
+ * }
+ */
+ sk->sk_protinfo = NULL;
+ sock_reset_flag(sk, SOCK_DONE);
+
+ return 0;
+}
+
+
+/**
+ * @brief Delete a MKSCK socket
+ *
+ * @param sock user socket structure
+ *
+ * @return 0 on success, -errno on failure
+ */
+static int
+MksckRelease(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ lock_sock(sk);
+ sock_orphan(sk);
+ release_sock(sk);
+ sock_put(sk);
+ }
+
+ sock->sk = NULL;
+ sock->state = SS_FREE;
+
+ return 0;
+}
+
+
+static int
+MksckBacklogRcv(struct sock *sk, struct sk_buff *skb)
+{
+ /*
+ * We should never get these as we never queue an skb.
+ */
+ printk("MksckBacklogRcv: should never get here\n");
+ return -EIO;
+}
+
+
+/**
+ * @brief Callback at socket destruction
+ *
+ * @param sk pointer to kernel socket structure
+ */
+static void
+MksckSkDestruct(struct sock *sk)
+{
+ Mksck *mksck;
+
+ lock_sock(sk);
+ mksck = sk->sk_protinfo;
+
+ if (mksck != NULL) {
+ sk->sk_protinfo = NULL;
+ Mksck_CloseCommon(mksck);
+ }
+
+ if (sk->sk_user_data != NULL) {
+ sock_kfree_s(sk, sk->sk_user_data, sizeof(int));
+ sk->sk_user_data = NULL;
+ }
+
+ release_sock(sk);
+}
+
+
+/**
+ * @brief Set the local address of a MKSCK socket
+ *
+ * @param sk kernel socket structure
+ * @param addr the new address of the socket
+ *
+ * @return 0 on success, -errno on failure
+ *
+ * If addr.port is undefined a new random port is assigned.
+ * If addr.vmId is undefined then the vmId computed from the tgid is used.
+ * Hence the vmId of a socket is not always the one derived from the calling
+ * process.
+ *
+ * Assumed that the socket is locked.
+ * This function is called on explicit binds (MksckBind) and on implicit ones
+ * (at send time).
+ */
+static int
+MksckBindGeneric(struct sock *sk,
+ Mksck_Address addr)
+{
+ int err;
+ Mksck *mksck;
+ MksckPage *mksckPage;
+
+ if (sk->sk_protinfo != NULL) {
+ return -EISCONN;
+ }
+
+ /*
+ * Locate the page for the given host and increment its reference
+ * count so it can't get freed off while we are working on it.
+ */
+ if (addr.vmId == MKSCK_VMID_UNDEF) {
+ mksckPage = MksckPage_GetFromTgidIncRefc();
+ } else {
+ printk(KERN_WARNING "MksckBind: host bind called on vmid 0x%X\n", addr.vmId);
+ mksckPage = MksckPage_GetFromVmIdIncRefc(addr.vmId);
+ }
+
+ if (mksckPage == NULL) {
+ printk(KERN_INFO "MksckBind: no mksckPage for vm 0x%X\n", addr.vmId);
+ return -ENETUNREACH;
+ }
+ addr.vmId = mksckPage->vmId;
+
+ /*
+ * Before we can find an unused socket port on the page we have to
+ * lock the page for exclusive access so another thread can't
+ * allocate the same port.
+ */
+ err = Mutex_Lock(&mksckPage->mutex, MutexModeEX);
+ if (err < 0) {
+ goto outDec;
+ }
+
+ addr.port = MksckPage_GetFreePort(mksckPage, addr.port);
+ if (addr.port == MKSCK_PORT_UNDEF) {
+ err = -EINVAL;
+ goto outUnlockDec;
+ }
+
+ /*
+ * At this point we have the mksckPage locked for exclusive access
+ * and its reference count incremented. Also, addr is completely
+ * filled in with vmId and port that we want to bind.
+ *
+ * Find an available mksck struct on the shared page and initialize
+ * it.
+ */
+ mksck = MksckPage_AllocSocket(mksckPage, addr);
+ if (mksck == NULL) {
+ err = -EMFILE;
+ goto outUnlockDec;
+ }
+
+ /*
+ * Stable, release mutex. Leave mksckPage->refCount incremented so
+ * mksckPage can't be freed until socket is closed.
+ */
+ Mutex_Unlock(&mksckPage->mutex, MutexModeEX);
+
+ /*
+ * This is why we start mksck->refCount at 1. When sk_protinfo gets
+ * cleared, we decrement mksck->refCount.
+ */
+ sk->sk_protinfo = mksck;
+
+ PRINTK(KERN_DEBUG "MksckBind: socket bound to %08X\n", mksck->addr.addr);
+
+ return 0;
+
+outUnlockDec:
+ Mutex_Unlock(&mksckPage->mutex, MutexModeEX);
+outDec:
+ MksckPage_DecRefc(mksckPage);
+ return err;
+}
+
+
+/**
+ * @brief Test if the socket is already bound to a local address and,
+ * if not, bind it to an unused address.
+ *
+ * @param sk kernel socket structure
+ * @return 0 on success, -errno on failure
+ *
+ * Assumed that the socket is locked.
+ */
+static inline int
+MksckTryBind(struct sock *sk)
+{
+ int err = 0;
+
+ if (!sk->sk_protinfo) {
+ static const Mksck_Address addr = { .addr = MKSCK_ADDR_UNDEF };
+ err = MksckBindGeneric(sk, addr);
+ }
+ return err;
+}
+
+
+
+/**
+ * @brief Set the address of a MKSCK socket (user call)
+ *
+ * @param sock user socket structure
+ * @param addr the new address of the socket
+ * @param addrLen length of the address
+ *
+ * @return 0 on success, -errno on failure
+ */
+static int
+MksckBind(struct socket *sock,
+ struct sockaddr *addr,
+ int addrLen)
+{
+ int err;
+ struct sock *sk = sock->sk;
+ struct sockaddr_mk *addrMk = (struct sockaddr_mk *)addr;
+
+ if (addrLen != sizeof *addrMk) {
+ return -EINVAL;
+ }
+ if (addrMk->mk_family != AF_MKSCK) {
+ return -EAFNOSUPPORT;
+ }
+
+ /*
+ * Obtain the socket lock and call the generic Bind function.
+ */
+ lock_sock(sk);
+ err = MksckBindGeneric(sk, addrMk->mk_addr);
+ release_sock(sk);
+
+ return err;
+}
+
+/**
+ * @brief Lock the peer socket by locating it, incrementing its refc
+ * @param addr the address of the peer socket
+ * @param[out] peerMksckR set to the locked peer socket pointer
+ * upon successful lookup
+ * @return 0 on success, -errno on failure
+ */
+static int
+LockPeer(Mksck_Address addr, Mksck **peerMksckR)
+{
+ int err = 0;
+ MksckPage *peerMksckPage = MksckPage_GetFromVmIdIncRefc(addr.vmId);
+ Mksck *peerMksck;
+
+ /*
+ * Find corresponding destination shared page and increment its
+ * reference count so it can't be freed while we are sending to the
+ * socket. Make sure that the address is indeed an address of a
+ * monitor/guest socket.
+ */
+ if (peerMksckPage == NULL) {
+ printk(KERN_INFO "LockPeer: vmId %x is not in use!\n", addr.vmId);
+ return -ENETUNREACH;
+ }
+ if (!peerMksckPage->isGuest) {
+ MksckPage_DecRefc(peerMksckPage);
+ printk(KERN_INFO "LockPeer: vmId %x does not belong to a guest!\n",
+ addr.vmId);
+ return -ENETUNREACH;
+ }
+
+
+ err = Mutex_Lock(&peerMksckPage->mutex, MutexModeSH);
+ if (err < 0) {
+ MksckPage_DecRefc(peerMksckPage);
+ return err;
+ }
+
+ /*
+ * Find corresponding destination socket on that shared page and
+ * increment its reference count so it can't be freed while we are
+ * trying to send to it.
+ */
+ peerMksck = MksckPage_GetFromAddr(peerMksckPage, addr);
+
+ if (peerMksck) {
+ ATOMIC_ADDV(peerMksck->refCount, 1);
+ *peerMksckR = peerMksck;
+ } else {
+ printk(KERN_INFO "LockPeer: addr %x is not a defined socket!\n",
+ addr.addr);
+ err = -ENETUNREACH;
+ }
+
+ Mutex_Unlock(&peerMksckPage->mutex, MutexModeSH);
+ MksckPage_DecRefc(peerMksckPage);
+ return err;
+}
+
+/**
+ * @brief Set the peer address of a MKSCK socket
+ *
+ * @param sock user socket structure
+ * @param addr the address of the peer socket
+ * @param addrLen length of the address
+ * @param flags flags
+ *
+ * @return 0 on success, -errno on failure
+ */
+static int
+MksckDgramConnect(struct socket *sock,
+ struct sockaddr *addr,
+ int addrLen,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ Mksck *mksck;
+ struct sockaddr_mk *peerAddrMk = (struct sockaddr_mk *)addr;
+ int err = 0;
+
+ if (addrLen != sizeof *peerAddrMk) {
+ printk(KERN_INFO "MksckConnect: wrong address length!\n");
+ return -EINVAL;
+ }
+ if (peerAddrMk->mk_family != AF_MKSCK) {
+ printk(KERN_INFO "MksckConnect: wrong address family!\n");
+ return -EAFNOSUPPORT;
+ }
+
+ lock_sock(sk);
+
+ if ((err = MksckTryBind(sk))) {
+ goto releaseSock;
+ }
+ mksck = sk->sk_protinfo;
+
+ /*
+ * First sever any past peer connections
+ */
+ Mksck_DisconnectPeer(mksck);
+ sock->state = SS_UNCONNECTED;
+
+ /*
+ * Then build new connections ...
+ */
+ if (peerAddrMk->mk_addr.addr != MKSCK_ADDR_UNDEF) {
+ sock->state = SS_CONNECTED;
+ mksck->peerAddr = peerAddrMk->mk_addr;
+ err = LockPeer(mksck->peerAddr, &mksck->peer);
+ PRINTK(KERN_DEBUG "MksckConnect: socket %x is connected to %x!\n",
+ mksck->addr.addr, mksck->peerAddr.addr);
+ }
+
+releaseSock:
+ release_sock(sk);
+
+ return err;
+}
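+
+/*
+ * For reference, a minimal host-side usage sketch of the bind/connect path
+ * above. This is hypothetical userland code, not part of this module; the
+ * sockaddr_mk fields and MKSCK_* constants are assumed to come from
+ * mksck_sockaddr.h, and error handling is omitted:
+ *
+ *   int fd = socket(AF_MKSCK, SOCK_DGRAM, 0);
+ *
+ *   struct sockaddr_mk me = { .mk_family = AF_MKSCK };
+ *   me.mk_addr.vmId = MKSCK_VMID_UNDEF;  // use the vmId derived from our tgid
+ *   me.mk_addr.port = MKSCK_PORT_UNDEF;  // let MksckBindGeneric pick a port
+ *   bind(fd, (struct sockaddr *)&me, sizeof me);
+ *
+ *   struct sockaddr_mk peer = { .mk_family = AF_MKSCK };
+ *   peer.mk_addr = monitorAddr;          // address of a guest/monitor socket
+ *   connect(fd, (struct sockaddr *)&peer, sizeof peer);
+ *   send(fd, buf, len, 0);               // or sendto() on an unconnected socket
+ */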
+
+
+/**
+ * @brief Return the local or peer address of a MKSCK socket
+ *
+ * @param sock user socket structure
+ * @param addr buffer where the requested address is returned
+ * @param addrLen length of the returned address
+ * @param peer 1 if the peer address is sought
+ *
+ * @return 0 on success, -errno on failure
+ */
+static int
+MksckGetName(struct socket *sock,
+ struct sockaddr *addr,
+ int *addrLen,
+ int peer)
+{
+ int err;
+ Mksck *mksck;
+ struct sock *sk = sock->sk;
+
+ // MAX_SOCK_ADDR is size of *addr, Linux doesn't export it!
+ // ASSERT_ON_COMPILE(sizeof (struct sockaddr_mk) <= MAX_SOCK_ADDR);
+
+ lock_sock(sk);
+ mksck = sk->sk_protinfo;
+
+ if (mksck == NULL) {
+ if (peer) {
+ err = -ENOTCONN;
+ } else {
+ ((struct sockaddr_mk *)addr)->mk_family = AF_MKSCK;
+ ((struct sockaddr_mk *)addr)->mk_addr.addr = MKSCK_ADDR_UNDEF;
+ *addrLen = sizeof (struct sockaddr_mk);
+ err = 0;
+ }
+ } else if (!peer) {
+ ((struct sockaddr_mk *)addr)->mk_family = AF_MKSCK;
+ ((struct sockaddr_mk *)addr)->mk_addr = mksck->addr;
+ *addrLen = sizeof (struct sockaddr_mk);
+ err = 0;
+ } else if (mksck->peerAddr.addr == MKSCK_ADDR_UNDEF) {
+ err = -ENOTCONN;
+ } else {
+ ((struct sockaddr_mk *)addr)->mk_family = AF_MKSCK;
+ ((struct sockaddr_mk *)addr)->mk_addr = mksck->peerAddr;
+ *addrLen = sizeof (struct sockaddr_mk);
+ err = 0;
+ }
+
+ release_sock(sk);
+
+ return err;
+}
+
+
+/**
+ * @brief VMX polling for a received packet from the VMM.
+ *
+ * @param filp kernel file pointer to poll on
+ * @param sock user socket structure
+ * @param wait kernel polling table to register with, if not null
+ *
+ * @return poll mask derived from the socket state.
+ */
+static unsigned int MksckPoll(struct file *filp,
+ struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask = 0;
+ Mksck *mksck = NULL;
+ uint32 read;
+ int err;
+
+ lock_sock(sk);
+ if ((err = MksckTryBind(sk))) {
+ release_sock(sk);
+ return err;
+ }
+ mksck = sk->sk_protinfo;
+
+ /*
+ * To avoid mksck disappearing right after the release_sock the
+ * refcount needs to be incremented. For more details read the
+ * block comment on locking in MksckCreate.
+ */
+ ATOMIC_ADDV(mksck->refCount, 1);
+ release_sock(sk);
+
+ /*
+ * Wait to make sure this is the only thread trying to access socket.
+ */
+ if ((err = Mutex_Lock(&mksck->mutex, MutexModeEX)) < 0) {
+ /* we might get in this situation if we are signaled
+ (select() may handle this, so leave) */
+ PRINTK(KERN_INFO "MksckPoll: try to abort\n");
+ return mask;
+ }
+
+ /*
+ * See if packet in ring.
+ */
+ read = mksck->read;
+ if (read != mksck->write) {
+ mask |= POLLIN | POLLRDNORM; /* readable, socket is unlocked */
+ /* Note that if we ever implement support for POLLOUT, we SHOULD
+ replace this Mutex_Unlock with Mutex_UnlPoll, because there is no
+ way of knowing which event the user actually intends to sleep on */
+ Mutex_Unlock(&mksck->mutex, MutexModeEX);
+ } else {
+ Mutex_UnlPoll(&mksck->mutex, MutexModeEX, MKSCK_CVAR_FILL, filp, wait);
+ }
+
+ /*
+ * Note that locking rules differ a little inside MksckPoll, since we are
+ * not only given a pointer to the struct socket but also a pointer to a
+ * struct file. This means that during the whole operation of this function
+ * and during any pending wait (registered with poll_wait()), the file itself
+ * is reference counted, and we can rely on that 'upper' reference
+ * counting to prevent the Mksck from being torn down. That holds true since
+ * sockets are never re-bound!
+ */
+ Mksck_DecRefc(mksck);
+ return mask;
+}
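+
+/*
+ * Illustrative userland sketch: waiting for an inbound datagram with poll().
+ * Hypothetical host code, not part of this module; error handling omitted:
+ *
+ *   struct pollfd pfd = { .fd = fd, .events = POLLIN };
+ *   if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
+ *       // a datagram is queued, recvmsg() will not block
+ *   }
+ */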
+
+/**
+ * @brief Manage a set of Mksck_PageDesc from a message or a stored array.
+ *
+ * @param pd set of Mksck_PageDesc
+ * @param pages Mksck_PageDesc pages count for this management operation
+ * @param incr ternary used to indicate if we want to reference (+1), or
+ * dereference (-1), or count (0) 4k pages
+ *
+ * @return length of bytes processed.
+ */
+static size_t
+MksckPageDescManage(Mksck_PageDesc *pd,
+ uint32 pages,
+ int incr)
+{
+ size_t payloadLen = 0;
+ uint32 i;
+
+ for (i = 0; i < pages && pd[i].mpn != INVALID_MPN; ++i) {
+ uint32 j;
+
+ for (j = 0; j < 1 << pd[i].order; ++j) {
+ struct page *page;
+ MPN currMPN = pd[i].mpn + j;
+
+ /*
+ * The monitor tried to send an invalid MPN, bad.
+ */
+ if (!pfn_valid(currMPN)) {
+ printk("MksckPageDescManage: Invalid MPN %x\n", currMPN);
+ } else {
+ page = pfn_to_page(currMPN);
+
+ if (incr == +1) {
+ get_page(page);
+ }
+ if (incr == -1) {
+ put_page(page);
+ }
+ }
+
+ payloadLen += PAGE_SIZE;
+ }
+ }
+
+ return payloadLen;
+}
+
+/**
+ * @brief Management values to be used as third parameter of MksckPageDescManage
+ */
+#define MANAGE_INCREMENT +1
+#define MANAGE_DECREMENT -1
+#define MANAGE_COUNT 0
+
+
+/**
+ * @brief Map a set of Mksck_PageDesc from a message or a stored array.
+ *
+ * @param pd set of Mksck_PageDesc
+ * @param pages pages count for this mapping
+ * @param iov vectored user virtual addresses of the recv commands
+ * @param iovCount size for iov parameter
+ * @param vma virtual memory area used for the mapping; note that
+ * this is required whenever MksckPageDescMap is used
+ * on an indirect PageDesc context (i.e. whenever iov is
+ * not computed by the kernel but by ourselves).
+ *
+ * Since find_vma() and vm_insert_page() are used, this function must
+ * be called with current's mmap_sem locked, or inside an MMap operation.
+ *
+ * @return length of bytes mapped.
+ */
+static size_t
+MksckPageDescMap(Mksck_PageDesc *pd,
+ uint32 pages,
+ struct iovec *iov,
+ int iovCount,
+ struct vm_area_struct *vma)
+{
+ size_t payloadLen = 0;
+ uint32 i;
+
+ for (i = 0; i < pages && pd[i].mpn != INVALID_MPN; ++i) {
+ uint32 j;
+
+ for (j = 0; j < 1 << pd[i].order; ++j) {
+ HUVA huva = 0;
+ struct page *page;
+ MPN currMPN = pd[i].mpn + j;
+
+ while (iovCount > 0 && iov->iov_len == 0) {
+ iovCount--;
+ iov++;
+ }
+
+ if (iovCount == 0) {
+ printk("MksckPageDescMap: Invalid iov length\n");
+ goto map_done;
+ }
+
+ huva = (HUVA)iov->iov_base;
+
+ /*
+ * iovecs for receiving the typed component of the message should
+ * have page aligned base and size sufficient for page descriptor's
+ * mappings.
+ */
+ if (huva & (PAGE_SIZE - 1) || iov->iov_len < PAGE_SIZE) {
+ printk("MksckPageDescMap: Invalid huva %x or iov_len %d\n",
+ huva,
+ iov->iov_len);
+ goto map_done;
+ }
+
+ /*
+ * Might be in a new vma...
+ */
+ if (vma == NULL || huva < vma->vm_start || huva >= vma->vm_end) {
+ vma = find_vma(current->mm, huva);
+
+ /*
+ * Couldn't find a matching vma for huva.
+ */
+ if (vma == NULL ||
+ huva < vma->vm_start ||
+ vma->vm_ops != &mksckVMOps) {
+ printk("MksckPageDescMap: Invalid vma\n");
+ goto map_done;
+ }
+ }
+
+ /*
+ * The monitor tried to send an invalid MPN, bad.
+ */
+ if (!pfn_valid(currMPN)) {
+ printk("MksckPageDescMap: Invalid MPN %x\n", currMPN);
+ } else {
+ int rc;
+
+ page = pfn_to_page(currMPN);
+
+ /*
+ * Map into the receive window.
+ */
+ rc = vm_insert_page(vma, huva, page);
+ if (rc) {
+ printk("MksckPageDescMap: Failed to insert %x at %x, error %d\n",
+ currMPN,
+ huva,
+ rc);
+ goto map_done;
+ }
+
+ ASSERT(iov->iov_len >= PAGE_SIZE);
+ iov->iov_base += PAGE_SIZE;
+ iov->iov_len -= PAGE_SIZE;
+ }
+
+ payloadLen += PAGE_SIZE;
+ }
+ }
+
+map_done:
+ return payloadLen;
+}
+
+
+/**
+ * @brief Check if the provided MsgHdr still has room for a receive operation.
+ *
+ * @param msg user buffer
+ * @return 1 if the MsgHdr has IO space available to receive a mapping, 0 otherwise.
+ */
+static int
+MsgHdrHasAvailableRoom(struct msghdr *msg)
+{
+ struct iovec *vec = msg->msg_iov;
+ uint32 count = msg->msg_iovlen;
+
+ while (count > 0 && vec->iov_len == 0) {
+ count--;
+ vec++;
+ }
+
+ return (count != 0);
+}
+
+
+/**
+ * Whenever a typed message is received from the monitor, we may choose to store
+ * all the page descriptor content in a linked list of descriptors, using the
+ * following context structure.
+ */
+typedef struct MksckPageDescInfo {
+ struct MksckPageDescInfo *next;
+ uint32 flags;
+ uint32 pages;
+ uint32 mapCounts;
+ Mksck_PageDesc descs[0];
+} MksckPageDescInfo;
+
+static void MksckPageDescSkDestruct(struct sock *sk);
+static int MksckPageDescMMap(struct file *file,
+ struct socket *sock,
+ struct vm_area_struct *vma);
+static int MksckPageDescIoctl(struct socket *sock,
+ unsigned int cmd,
+ unsigned long arg);
+
+/**
+ * @brief Delete a page descriptor container socket
+ *
+ * @param sock user socket structure
+ * @return 0 on success, -errno on failure
+ */
+static int
+MksckPageDescRelease(struct socket *sock)
+{
+ /* This is generic socket release */
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ lock_sock(sk);
+ sock_orphan(sk);
+ release_sock(sk);
+ sock_put(sk);
+ }
+
+ sock->sk = NULL;
+ sock->state = SS_FREE;
+
+ return 0;
+}
+
+
+/**
+ * Whenever a typed message is received from the monitor, we may choose to store
+ * all the page descriptor content for a future mapping. The context must be
+ * usable by host userland, which means it is exposed through a file descriptor,
+ * and as a security measure we define a strict set of operations that are used
+ * only for that purpose. This set of operations is reduced to leaving the
+ * default "PageDesc(s) accumulating" mode (via ioctl), mapping the context,
+ * and generic socket destruction.
+ */
+static struct proto_ops mksckPageDescOps = {
+ .family = AF_MKSCK,
+ .owner = THIS_MODULE,
+ .release = MksckPageDescRelease,
+ .bind = sock_no_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = sock_no_poll,
+ .ioctl = MksckPageDescIoctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .mmap = MksckPageDescMMap,
+ .sendpage = sock_no_sendpage,
+};
+
+
+/**
+ * @brief Create a PageDesc context, backed by a file descriptor, or accumulate to an existing one.
+ *
+ * @param sock user socket structure
+ * @param msg user buffer to receive the file descriptor as ancillary data
+ * @param pd source descriptor part of a message
+ * @param pages pages count for this mapping
+ *
+ * @return error if negative, 0 otherwise
+ *
+ */
+static int
+MksckPageDescToFd(struct socket *sock,
+ struct msghdr *msg,
+ Mksck_PageDesc *pd,
+ uint32 pages)
+{
+ int retval;
+ int newfd;
+ struct socket *newsock;
+ struct sock *newsk;
+ struct sock *sk = sock->sk;
+ MksckPageDescInfo **pmpdi, *mpdi;
+ lock_sock(sk);
+
+ /*
+ * Relation between any mk socket and the PageDesc context is as follows:
+ *
+ * From the mk socket to the PageDesc context:
+ * - sk->sk_user_data is a WEAK LINK, containing only a file descriptor
+ * numerical value such that accumulating is keyed on it.
+ *
+ * From the PageDesc context to the mk socket:
+ * - sk->sk_protinfo contains a MksckPageDescInfo struct.
+ * - sk->sk_user_data is a pointer REF-COUNTED sock_hold() LINK, also it is
+ * rarely dereferenced but usually used to check that the
+ * right socket pair is used. Full dereferencing is used
+ * only to break the described links.
+ */
+ if (sk->sk_user_data) {
+ MksckPageDescInfo *mpdi2;
+
+ /* continue any previous on-going mapping, i.e accumulate */
+ newfd = *((int *)sk->sk_user_data);
+ newsock = sockfd_lookup(newfd, &retval); // promote the weak link
+ if (!newsock) {
+ retval = -EINVAL;
+ goto endProcessingReleaseSock;
+ }
+
+ newsk = newsock->sk;
+ lock_sock(newsk);
+ sockfd_put(newsock);
+
+ if (((struct sock *)newsk->sk_user_data) != sk) {
+ /* One way of getting into this situation would be for userland to dup
+ the file descriptor just received, close the original number, and
+ open a new mk socket in the very same spot. The userland code has
+ many ways of interacting with the kernel without this driver
+ code being notified. */
+ retval = -EINVAL;
+ release_sock(newsk);
+ goto endProcessingReleaseSock;
+ }
+
+ mpdi = sock_kmalloc(newsk, sizeof(MksckPageDescInfo) +
+ pages*sizeof(Mksck_PageDesc), GFP_KERNEL);
+ if (IS_ERR(mpdi)) {
+ retval = PTR_ERR(mpdi);
+ release_sock(newsk);
+ goto endProcessingReleaseSock;
+ }
+
+ /* There is no mandatory need for us to notify userland of the
+ progress in "appending" to the file descriptor, but it
+ would feel strange if userland had no means of
+ telling whether the received message was simply thrown away. So, in
+ order to be consistent we fill in the ancillary message both when
+ "creating" and when "appending to" file descriptors. */
+ retval = put_cmsg(msg, SOL_DECNET, 0, sizeof(int), &newfd);
+ if (retval < 0) {
+ goto endProcessingKFreeReleaseSock;
+ }
+
+ release_sock(sk);
+
+ mpdi2 = (MksckPageDescInfo *)newsk->sk_protinfo;
+ while (mpdi2->next) {
+ mpdi2 = mpdi2->next;
+ }
+ pmpdi = &(mpdi2->next);
+
+ } else {
+ /* Create a new socket, new context and a new file descriptor. */
+ retval = sock_create(sk->sk_family, sock->type, 0, &newsock);
+ if (retval < 0) {
+ goto endProcessingReleaseSock;
+ }
+
+ newsk = newsock->sk;
+ lock_sock(newsk);
+ newsk->sk_destruct = &MksckPageDescSkDestruct;
+ newsk->sk_user_data = sk;
+ sock_hold(sk); // keeps a reference to parent mk socket
+ newsock->ops = &mksckPageDescOps;
+
+ mpdi = sock_kmalloc(newsk, sizeof(MksckPageDescInfo) +
+ pages*sizeof(Mksck_PageDesc), GFP_KERNEL);
+ if (IS_ERR(mpdi)) {
+ retval = PTR_ERR(mpdi);
+ goto endProcessingFreeNewSock;
+ }
+
+ sk->sk_user_data = sock_kmalloc(sk, sizeof(int), GFP_KERNEL);
+ if (IS_ERR(sk->sk_user_data)) {
+ retval = PTR_ERR(sk->sk_user_data);
+ sk->sk_user_data = NULL;
+ goto endProcessingKFreeAndNewSock;
+ }
+
+ /* mapping to a file descriptor may fail if another thread is closing files
+ in parallel with sock_map_fd/sock_alloc_fd, or if kernel memory is exhausted */
+ newfd = sock_map_fd(newsock, O_CLOEXEC);
+ if (newfd < 0) {
+ retval = newfd;
+ sock_kfree_s(sk, sk->sk_user_data, sizeof(int));
+ sk->sk_user_data = NULL;
+ goto endProcessingKFreeAndNewSock;
+ }
+
+ /* notify userland of the new file descriptor, like AF_UNIX ancillary data */
+ retval = put_cmsg(msg, SOL_DECNET, 0, sizeof(int), &newfd);
+ if (retval < 0) {
+ sock_kfree_s(sk, sk->sk_user_data, sizeof(int));
+ sk->sk_user_data = NULL;
+ sock_kfree_s(newsk, mpdi, sizeof(MksckPageDescInfo) +
+ mpdi->pages*sizeof(Mksck_PageDesc));
+ release_sock(newsk);
+ sockfd_put(newsock);
+ sock_release(newsock);
+ put_unused_fd(newfd);
+ goto endProcessingReleaseSock;
+ }
+
+ *(int*)sk->sk_user_data = newfd;
+ release_sock(sk);
+ pmpdi = (MksckPageDescInfo **)(&(newsk->sk_protinfo));
+ }
+
+ mpdi->next = NULL;
+ mpdi->flags = 0;
+ mpdi->mapCounts = 0;
+ mpdi->pages = pages;
+ memcpy(mpdi->descs, pd, pages*sizeof(Mksck_PageDesc));
+
+ *pmpdi = mpdi; // link
+ release_sock(newsk);
+
+ /* increment all reference counters for the pages */
+ MksckPageDescManage(pd, pages, MANAGE_INCREMENT);
+ return 0;
+
+endProcessingKFreeAndNewSock:
+ sock_kfree_s(newsk, mpdi, sizeof(MksckPageDescInfo) +
+ mpdi->pages*sizeof(Mksck_PageDesc));
+endProcessingFreeNewSock:
+ release_sock(newsk);
+ sock_release(newsock);
+ release_sock(sk);
+ return retval;
+
+endProcessingKFreeReleaseSock:
+ sock_kfree_s(newsk, mpdi, sizeof(MksckPageDescInfo) +
+ mpdi->pages*sizeof(Mksck_PageDesc));
+ release_sock(newsk);
+endProcessingReleaseSock:
+ release_sock(sk);
+ return retval;
+}
+
+/**
+ * @brief Callback at socket destruction
+ *
+ * @param sk pointer to kernel socket structure
+ */
+static void
+MksckPageDescSkDestruct(struct sock *sk)
+{
+ struct sock *mkSk = NULL;
+ MksckPageDescInfo *mpdi;
+ lock_sock(sk);
+ mpdi = sk->sk_protinfo;
+ while (mpdi) {
+ MksckPageDescInfo *next = mpdi->next;
+ MksckPageDescManage(mpdi->descs, mpdi->pages,
+ MANAGE_DECREMENT);
+ sock_kfree_s(sk, mpdi, sizeof(MksckPageDescInfo) +
+ mpdi->pages*sizeof(Mksck_PageDesc));
+ mpdi = next;
+ }
+ if (sk->sk_user_data) {
+ mkSk = (struct sock *)sk->sk_user_data;
+ sk->sk_user_data = NULL;
+ }
+ sk->sk_protinfo = NULL;
+ release_sock(sk);
+ /* clean the monki socket that we are holding */
+ if (mkSk) {
+ lock_sock(mkSk);
+ sock_kfree_s(mkSk, mkSk->sk_user_data, sizeof(int));
+ mkSk->sk_user_data = NULL;
+ release_sock(mkSk);
+ sock_put(mkSk); // revert of sock_hold()
+ }
+}
+
+/**
+ * @brief The mmap operation of the PageDesc context file descriptor.
+ *
+ * The mmap command is used to map any detached (i.e. no longer accumulating)
+ * PageDesc context, filled with the content from its parent communication mk
+ * socket. Mapping may only be done a specified number of times, after which
+ * the PageDesc context becomes unusable (as a security restriction).
+ *
+ * Also note that mapping from an offset different from zero is considered
+ * an invalid userland operation.
+ *
+ * @param file user file structure
+ * @param sock user socket structure
+ * @param vma virtual memory area structure
+ *
+ * @return error code, 0 on success
+ */
+static int
+MksckPageDescMMap(struct file *file,
+ struct socket *sock,
+ struct vm_area_struct *vma)
+{
+ struct sock *sk = sock->sk;
+ MksckPageDescInfo *mpdi;
+ struct iovec iov;
+ unsigned long vm_flags;
+ int freed = 0;
+
+ iov.iov_base = (void*)vma->vm_start;
+ iov.iov_len = vma->vm_end - vma->vm_start;
+
+ lock_sock(sk);
+ mpdi = sk->sk_protinfo;
+
+ // vma->vm_pgoff is checked, since offsetting the map is not supported
+ if (!mpdi || sk->sk_user_data || vma->vm_pgoff) {
+ release_sock(sk);
+ printk(KERN_INFO "MMAP failed for virt %lx size %lx\n",
+ vma->vm_start, vma->vm_end - vma->vm_start);
+ return -EINVAL;
+ }
+
+ vm_flags = mpdi->flags;
+ if ((vma->vm_flags & ~vm_flags) & (VM_READ|VM_WRITE)) {
+ release_sock(sk);
+ return -EACCES;
+ }
+
+ while (mpdi) {
+ MksckPageDescInfo *next = mpdi->next;
+ MksckPageDescMap(mpdi->descs, mpdi->pages, &iov, 1, vma);
+ if (mpdi->mapCounts && !--mpdi->mapCounts) {
+ MksckPageDescManage(mpdi->descs, mpdi->pages,
+ MANAGE_DECREMENT);
+ sock_kfree_s(sk, mpdi, sizeof(MksckPageDescInfo) +
+ mpdi->pages*sizeof(Mksck_PageDesc));
+ freed = 1;
+ }
+ mpdi = next;
+ }
+
+ if (freed) {
+ sk->sk_protinfo = NULL;
+ }
+ vma->vm_ops = &mksckVMOps;
+ release_sock(sk);
+ return 0;
+}
+
+/**
+ * @brief The ioctl operation of the PageDesc context file descriptor.
+ *
+ * The ioctl MKSCK_DETACH command is used to detach the PageDesc context
+ * from its parent communication mk socket. Once detached, the context
+ * can be used to map the transferred PageDesc(s) of the typed messages
+ * accumulated into it.
+ *
+ * @param sock user socket structure
+ * @param cmd select which cmd function needs to be performed
+ * @param arg argument for command
+ *
+ * @return error code, 0 on success
+ */
+static int
+MksckPageDescIoctl(struct socket *sock,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ struct sock *monkiSk = NULL;
+ struct sock *sk = sock->sk;
+ MksckPageDescInfo *mpdi;
+ int retval = 0;
+
+ switch (cmd) {
+ /**
+ * ioctl MKSCK_DETACH (in and out):
+ * Detach, compute size and define allowed protection access rights
+ *
+ * [in]: unsigned long flags, similar to prot argument of mmap()
+ * unsigned long number of available further mappings
+ * with 0 meaning unlimited number of mappings
+ * [out]: unsigned long size of the available mappable area
+ */
+ case MKSCK_DETACH: {
+ unsigned long ul[2];
+ lock_sock(sk);
+ mpdi = sk->sk_protinfo;
+ // read unsigned long argument that contains the mmap alike flags
+ if (copy_from_user(ul, (void *)arg, sizeof ul)) {
+ retval = -EFAULT;
+ // check that the file descriptor has a parent and some context there
+ } else if (!mpdi || !sk->sk_user_data) {
+ retval = -EINVAL;
+ } else {
+ /* compute mapping protection bits from argument and size of the
+ * mapping, that is also given back to userland as unsigned long.
+ */
+ uint32 flags = calc_vm_prot_bits(ul[0]);
+ ul[0] = 0;
+ while (mpdi) {
+ MksckPageDescInfo *next = mpdi->next;
+ ul[0] += MksckPageDescManage(mpdi->descs, mpdi->pages,
+ MANAGE_COUNT);
+ mpdi->mapCounts = ul[1];
+ mpdi = next;
+ }
+ if (copy_to_user((void *)arg, ul, sizeof(ul[0]))) {
+ retval = -EFAULT;
+ } else {
+ mpdi = sk->sk_protinfo;
+ mpdi->flags = flags;
+ monkiSk = (struct sock *)sk->sk_user_data;
+ sk->sk_user_data = NULL;
+ }
+ }
+ release_sock(sk);
+ // clean the monki socket that we are holding
+ if ((sk = monkiSk)) {
+ lock_sock(sk);
+ sock_kfree_s(sk, sk->sk_user_data, sizeof(int));
+ sk->sk_user_data = NULL;
+ release_sock(sk);
+ sock_put(sk);
+ }
+ break;
+ }
+ default: {
+ retval = -EINVAL;
+ break;
+ }
+ }
+ return retval;
+}
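+
+/*
+ * Illustrative userland sketch of the detach/map protocol described above.
+ * This is hypothetical host code, not part of this module; 'pdFd' is a
+ * PageDesc context file descriptor obtained from recvmsg() ancillary data
+ * (see MksckDgramRecvMsg below), and error handling is omitted:
+ *
+ *   unsigned long ul[2] = { PROT_READ, 1 };  // allowed protection, 1 mapping
+ *   ioctl(pdFd, MKSCK_DETACH, ul);           // ul[0] now holds the mappable size
+ *
+ *   void *win = mmap(NULL, ul[0], PROT_READ, MAP_SHARED, pdFd, 0);
+ *   // ... use the mapped typed-message pages ...
+ *   munmap(win, ul[0]);
+ *   close(pdFd);
+ */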
+
+
+/**
+ * @brief VMX receiving a packet from VMM.
+ *
+ * @param kiocb kernel io control block (unused)
+ * @param sock user socket structure
+ * @param msg user buffer to receive the packet
+ * @param len size of the user buffer
+ * @param flags flags
+ *
+ * @return -errno on failure, else length of untyped portion + total number
+ * of bytes mapped for typed portion.
+ */
+static int
+MksckDgramRecvMsg(struct kiocb *kiocb,
+ struct socket *sock,
+ struct msghdr *msg,
+ size_t len,
+ int flags)
+{
+ int err = 0;
+ struct sock *sk = sock->sk;
+ Mksck *mksck;
+ Mksck_Datagram *dg;
+ struct sockaddr_mk *fromAddr;
+ uint32 read;
+ struct iovec *iov;
+ size_t payloadLen, untypedLen;
+ uint32 iovCount;
+
+ if (flags & MSG_OOB || flags & MSG_ERRQUEUE) {
+ return -EOPNOTSUPP;
+ }
+
+ if ((msg->msg_name != NULL) && (msg->msg_namelen < sizeof *fromAddr)) {
+ return -EINVAL;
+ }
+
+ lock_sock(sk);
+ if ((err = MksckTryBind(sk))) {
+ release_sock(sk);
+ return err;
+ }
+ mksck = sk->sk_protinfo;
+
+ /*
+ * To avoid mksck disappearing right after the release_sock the
+ * refcount needs to be incremented. For more details read the
+ * block comment on locking in MksckCreate.
+ */
+ ATOMIC_ADDV(mksck->refCount, 1);
+ release_sock(sk);
+
+ /*
+ * Get pointer to next packet in ring to be dequeued.
+ */
+ while (1) {
+
+ /*
+ * Wait to make sure this is the only thread trying to access socket.
+ */
+ if ((err = Mutex_Lock(&mksck->mutex, MutexModeEX)) < 0) {
+ goto decRefc;
+ }
+
+ /*
+ * See if packet in ring.
+ */
+ read = mksck->read;
+ if (read != mksck->write) {
+ break;
+ }
+
+ /*
+ * Nothing there, if user wants us not to block then just return EAGAIN.
+ */
+ if (flags & MSG_DONTWAIT) {
+ Mutex_Unlock(&mksck->mutex, MutexModeEX);
+ err = -EAGAIN;
+ goto decRefc;
+ }
+
+ /*
+ * Nothing there, unlock socket and wait for data.
+ */
+ mksck->foundEmpty ++;
+ err = Mutex_UnlSleep(&mksck->mutex, MutexModeEX, MKSCK_CVAR_FILL);
+ if (err < 0) {
+ PRINTK(KERN_INFO "MksckDgramRecvMsg: aborted\n");
+ goto decRefc;
+ }
+ }
+
+ /*
+ * Point to packet in ring.
+ */
+ dg = (void *)&mksck->buff[read];
+
+ /*
+ * Provide the address of the sender.
+ */
+ if (msg->msg_name != NULL) {
+ fromAddr = (void *)msg->msg_name;
+ fromAddr->mk_addr = dg->fromAddr;
+ fromAddr->mk_family = AF_MKSCK;
+ msg->msg_namelen = sizeof *fromAddr;
+ } else {
+ msg->msg_namelen = 0;
+ }
+
+ /*
+ * Copy data from ring buffer to caller's buffer and remove packet from
+ * ring buffer.
+ */
+ iov = msg->msg_iov;
+ iovCount = msg->msg_iovlen;
+ payloadLen = untypedLen =
+ dg->len - dg->pages * sizeof(Mksck_PageDesc) - dg->pad;
+
+ /*
+ * Handle the untyped portion of the message.
+ */
+ if (untypedLen <= len) {
+ err = memcpy_toiovec(iov,
+ dg->data,
+ untypedLen);
+ if (err < 0) {
+ printk("MksckDgramRecvMsg: Failed to memcpy_to_iovec untyped message component "
+ "(buf len %d datagram len %d (untyped %d))\n",
+ len,
+ dg->len,
+ untypedLen);
+ }
+ } else {
+ err = -EINVAL;
+ }
+
+ /*
+ * Map in the typed descriptor.
+ */
+ if (err >= 0 && dg->pages > 0) {
+ Mksck_PageDesc *pd = (Mksck_PageDesc *)(dg->data + untypedLen + dg->pad);
+
+ /*
+ * There are 3 ways of receiving typed messages from the monitor.
+ * - The typed message is mapped directly into a VMA. To indicate this the
+ * userland sets msg_controllen == 0.
+ * - The typed message is mapped directly into a VMA and a file descriptor
+ * created for further mappings on the host (in same userland address
+ * space or an alternate userland address space). In this case
+ * msg_controllen should be set to sizeof(fd).
+ * - The typed message is not mapped directly into a VMA, but a file
+ * descriptor is created for later mapping on the host. In this case
+ * msg_controllen should be set to sizeof(fd) and the supplied iovec
+ * shall not specify a receive window.
+ *
+ * The conjuncts below decide which of these 3 cases we've encountered.
+ */
+
+ if ((msg->msg_controllen <= 0) ||
+ ((err = MksckPageDescToFd(sock, msg, pd, dg->pages)) != 0) ||
+ (MsgHdrHasAvailableRoom(msg) != 0)) {
+
+ down_write(&current->mm->mmap_sem); // lock for a change of mapping
+ payloadLen += MksckPageDescMap(pd, dg->pages, iov, iovCount, NULL);
+ up_write(&current->mm->mmap_sem);
+ }
+ }
+
+ /*
+ * Now that packet is removed, it is safe to unlock socket so another thread
+ * can do a recv(). We also want to wake someone waiting for room to insert
+ * a new packet.
+ */
+ if ((err >= 0) && Mksck_IncReadIndex(mksck, read, dg)) {
+ Mutex_UnlWake(&mksck->mutex, MutexModeEX, MKSCK_CVAR_ROOM, true);
+ } else {
+ Mutex_Unlock(&mksck->mutex, MutexModeEX);
+ }
+
+ /*
+ * If memcpy error, return error status.
+ * Otherwise, return number of bytes copied.
+ */
+ if (err >= 0) {
+ err = payloadLen;
+ }
+
+decRefc:
+ Mksck_DecRefc(mksck);
+ return err;
+}
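+
+/*
+ * Illustrative userland receive sketch for the three cases described above.
+ * This is hypothetical host code, not part of this module. It assumes the
+ * untyped payload size of the expected message is known (UNTYPED_LEN) so that
+ * the second iovec stays page aligned, and that 'win' is a receive window
+ * previously established by calling mmap() on the socket (see MksckMMap
+ * below); error handling is omitted:
+ *
+ *   char untyped[UNTYPED_LEN];
+ *   char cbuf[CMSG_SPACE(sizeof(int))];
+ *   struct iovec iov[2] = {
+ *       { .iov_base = untyped, .iov_len = sizeof untyped },  // untyped bytes
+ *       { .iov_base = win,     .iov_len = WIN_SIZE },        // typed mappings
+ *   };
+ *   struct msghdr msg = {
+ *       .msg_iov = iov, .msg_iovlen = 2,
+ *       .msg_control = cbuf, .msg_controllen = sizeof cbuf,  // also want an fd
+ *   };
+ *   int n = recvmsg(fd, &msg, 0);                  // untyped bytes + mapped bytes
+ *
+ *   struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);      // SOL_DECNET ancillary data
+ *   int pdFd = cm ? *(int *)CMSG_DATA(cm) : -1;    // PageDesc context fd
+ *
+ * Leaving msg_control NULL selects the map-only case; omitting the receive
+ * window iovec selects the fd-only case.
+ */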
+
+
+/**
+ * @brief VMX sending a packet to VMM.
+ *
+ * @param kiocb kernel io control block
+ * @param sock user socket structure
+ * @param msg packet to be transmitted
+ * @param len length of the packet
+ *
+ * @return length of the sent msg on success, -errno on failure
+ */
+static int
+MksckDgramSendMsg(struct kiocb *kiocb,
+ struct socket *sock,
+ struct msghdr *msg,
+ size_t len)
+{
+ int err = 0;
+ struct sock *sk = sock->sk;
+ Mksck *peerMksck;
+ Mksck_Datagram *dg;
+ uint32 needed;
+ uint32 write;
+ Mksck_Address fromAddr;
+
+ if (msg->msg_flags & MSG_OOB) {
+ return -EOPNOTSUPP;
+ }
+
+ if (len > MKSCK_XFER_MAX) {
+ return -EMSGSIZE;
+ }
+
+ /*
+ * In the next locked section peerMksck pointer needs to be set and
+ * its refcount needs to be incremented.
+ */
+ lock_sock(sk);
+ do {
+ Mksck *mksck;
+ Mksck_Address peerAddr =
+ { .addr = (msg->msg_name ?
+ ((struct sockaddr_mk *)msg->msg_name)->mk_addr.addr :
+ MKSCK_ADDR_UNDEF) };
+
+ if ((err = MksckTryBind(sk))) {
+ break;
+ }
+ mksck = sk->sk_protinfo;
+ fromAddr = mksck->addr;
+
+ /*
+ * If the socket is connected, use that address (no sendto for
+ * connected sockets). Otherwise, use the provided address if any.
+ */
+ if ((peerMksck = mksck->peer)) {
+ if (peerAddr.addr != MKSCK_ADDR_UNDEF &&
+ peerAddr.addr != mksck->peerAddr.addr) {
+ err = -EISCONN;
+ break;
+ }
+ /*
+ * To avoid mksckPeer disappearing right after the
+ * release_sock the refcount needs to be incremented. For
+ * more details read the block comment on locking in
+ * MksckCreate.
+ */
+ ATOMIC_ADDV(peerMksck->refCount, 1);
+ } else if (peerAddr.addr == MKSCK_ADDR_UNDEF) {
+ err = -ENOTCONN;
+ } else {
+ /*
+ * LockPeer also increments the refc on the peer.
+ */
+ err = LockPeer(peerAddr, &peerMksck);
+ }
+ } while(0);
+ release_sock(sk);
+
+ if (err) {
+ return err;
+ }
+
+ /*
+ * Get pointer to sufficient empty space in ring buffer.
+ */
+ needed = MKSCK_DGSIZE(len);
+ while (1) {
+ /*
+ * Wait to make sure this is the only thread trying to write to ring.
+ */
+ if ((err = Mutex_Lock(&peerMksck->mutex, MutexModeEX)) < 0) {
+ goto decRefc;
+ }
+
+ /*
+ * Check if socket can receive data.
+ */
+ if (peerMksck->shutDown & MKSCK_SHUT_RD) {
+ err = -ENOTCONN;
+ goto unlockDecRefc;
+ }
+
+ /*
+ * See if there is room for the packet.
+ */
+ write = Mksck_FindSendRoom(peerMksck, needed);
+ if (write != MKSCK_FINDSENDROOM_FULL) {
+ break;
+ }
+
+ /*
+ * No room, unlock socket and maybe wait for room.
+ */
+ if (msg->msg_flags & MSG_DONTWAIT) {
+ err = -EAGAIN;
+ goto unlockDecRefc;
+ }
+
+ peerMksck->foundFull ++;
+ err = Mutex_UnlSleep(&peerMksck->mutex,
+ MutexModeEX,
+ MKSCK_CVAR_ROOM);
+ if (err < 0) {
+ PRINTK(KERN_INFO "MksckDgramSendMsg: aborted\n");
+ goto decRefc;
+ }
+ }
+
+ /*
+ * Point to room in ring and fill in message.
+ */
+ dg = (void *)&peerMksck->buff[write];
+
+ dg->fromAddr = fromAddr;
+ dg->len = len;
+
+ if ((err = memcpy_fromiovec(dg->data, msg->msg_iov, len)) != 0) {
+ goto unlockDecRefc;
+ }
+
+ /*
+ * Increment past message.
+ */
+ Mksck_IncWriteIndex(peerMksck, write, needed);
+
+ /*
+ * Unlock socket and wake someone trying to receive, ie, we filled
+ * in a message.
+ */
+ Mutex_UnlWake(&peerMksck->mutex, MutexModeEX, MKSCK_CVAR_FILL, false);
+
+ /*
+ * Maybe guest is in a general 'wait for interrupt' wait or
+ * grinding away executing guest instructions.
+ *
+ * If it has a receive callback armed for the socket and is
+ * waiting for a message, just wake it up. Else send an IPI to the CPU
+ * running the guest so it will interrupt whatever it is doing and
+ * read the message.
+ *
+ * Holding the mksckPage->mutex prevents mksckPage->vmHKVA from
+ * clearing on us.
+ */
+ if (peerMksck->rcvCBEntryMVA != 0) {
+ MksckPage *peerMksckPage = Mksck_ToSharedPage(peerMksck);
+
+ if ((err = Mutex_Lock(&peerMksckPage->mutex, MutexModeSH)) == 0) {
+ uint32 sockIdx = peerMksck->index;
+ MvpkmVM *vm = (MvpkmVM *) peerMksckPage->vmHKVA;
+
+ /*
+ * The destruction of vm and wsp is blocked by the
+ * mksckPage->mutex.
+ */
+ if (vm) {
+ WorldSwitchPage *wsp = vm->wsp;
+
+ ASSERT(sockIdx < 8 * sizeof peerMksckPage->wakeVMMRecv);
+ ATOMIC_ORV(peerMksckPage->wakeVMMRecv, 1U << sockIdx);
+
+ if (wsp) {
+ Mvpkm_WakeGuest(vm, ACTION_MKSCK);
+ }
+ }
+ Mutex_Unlock(&peerMksckPage->mutex, MutexModeSH);
+ }
+ }
+
+ /*
+ * If all are happy tell the caller the number of transferred bytes.
+ */
+ if (!err) {
+ err = len;
+ }
+
+ /*
+ * Now that we are done with target socket, allow it to be freed.
+ */
+decRefc:
+ Mksck_DecRefc(peerMksck);
+ return err;
+
+unlockDecRefc:
+ Mutex_Unlock(&peerMksck->mutex, MutexModeEX);
+ goto decRefc;
+}
+
+
+/**
+ * @brief Page fault handler for receive windows. Since the host process
+ * should not be faulting in this region and only be accessing
+ * memory that has been established via a typed message transfer,
+ * we always signal the fault back to the process.
+ */
+static int
+MksckFault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+/**
+ * @brief Establish a region in the host process suitable for use as a
+ * receive window.
+ *
+ * @param file file reference (ignored).
+ * @param sock user socket structure.
+ * @param vma Linux virtual memory area defining the region.
+ *
+ * @return 0 on success, otherwise error code.
+ */
+static int
+MksckMMap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+{
+ /*
+ * All the hard work is done in MksckDgramRecvMsg. Here we simply mark the
+ * vma as belonging to Mksck.
+ */
+ vma->vm_ops = &mksckVMOps;
+
+ return 0;
+}
+
+/**
+ * @brief This gets called after returning from the monitor.
+ * Since the monitor doesn't directly wake VMX threads when it sends
+ * something to VMX (for efficiency), this routine checks for the
+ * omitted wakes and does them.
+ * @param mksckPage some shared page that the monitor writes packets to, i.e.
+ * a host shared page
+ */
+void
+Mksck_WakeBlockedSockets(MksckPage *mksckPage)
+{
+ Mksck *mksck;
+ uint32 i, wakeHostRecv;
+
+ wakeHostRecv = mksckPage->wakeHostRecv;
+ if (wakeHostRecv != 0) {
+ mksckPage->wakeHostRecv = 0;
+ for (i = 0; wakeHostRecv != 0; i ++) {
+ if (wakeHostRecv & 1) {
+ mksck = &mksckPage->sockets[i];
+ Mutex_CondSig(&mksck->mutex, MKSCK_CVAR_FILL, true);
+ }
+ wakeHostRecv >>= 1;
+ }
+ }
+}
+
+/**
+ * @brief allocate and initialize a shared page.
+ * @return pointer to shared page.<br>
+ * NULL on error
+ */
+MksckPage *
+MksckPageAlloc(void)
+{
+ uint32 jj;
+ /*
+ * Ask for pages in the virtual kernel space. There is no
+ * requirement to be physically contiguous.
+ */
+ MksckPage *mksckPage = vmalloc(MKSCKPAGE_SIZE);
+
+ if (mksckPage) {
+
+ /*
+ * Initialize its contents. Start refCount at 1 and decrement it
+ * when the worldswitch or VM page gets freed.
+ */
+ memset(mksckPage, 0, MKSCKPAGE_SIZE);
+ ATOMIC_SETV(mksckPage->refCount, 1);
+ mksckPage->portStore = MKSCK_PORT_HIGH;
+
+ Mutex_Init(&mksckPage->mutex);
+ for (jj = 0; jj<MKSCK_SOCKETS_PER_PAGE; jj++) {
+ Mutex_Init(&mksckPage->sockets[jj].mutex);
+ }
+ }
+
+ return mksckPage;
+}
+
+/**
+ * @brief Release the allocated pages.
+ * @param mksckPage the address of the mksckPage to be released
+ */
+static void
+MksckPageRelease(MksckPage *mksckPage)
+{
+ int ii;
+
+ for (ii = 0; ii<MKSCK_SOCKETS_PER_PAGE; ii++) {
+ Mutex_Destroy(&mksckPage->sockets[ii].mutex);
+ }
+ Mutex_Destroy(&mksckPage->mutex);
+
+ vfree(mksckPage);
+}
+
+/**
+ * @brief Using the tgid locate the vmid of this process.
+ * Assumed that mksckPageListLock is held
+ * @return the vmId if page is already allocated,
+ * the first vacant vmid if not yet allocated.<br>
+ * MKSCK_VMID_UNDEF if no slot is vacant
+ */
+static inline Mksck_VmId
+GetHostVmId(void)
+{
+ uint32 jj;
+ Mksck_VmId vmId, vmIdFirstVacant = MKSCK_VMID_UNDEF;
+ MksckPage *mksckPage;
+ uint32 tgid = task_tgid_vnr(current);
+ /*
+ * Assign a unique vmId to the shared page. Start the search from
+ * the vmId that is the result of hashing tgid to 15 bits. As a
+ * used page with a given vmId can occupy only a given slot in the
+ * mksckPages array, it is enough to search through the
+ * MKSCK_MAX_SHARES slots for a vacancy.
+ */
+ for (jj = 0, vmId = MKSCK_TGID2VMID(tgid);
+ jj < MKSCK_MAX_SHARES;
+ jj++, vmId++) {
+ if (vmId > MKSCK_VMID_HIGH) {
+ vmId = 0;
+ }
+ mksckPage = mksckPages[MKSCK_VMID2IDX(vmId)];
+
+ if (mksckPage) {
+ if (mksckPage->tgid == tgid &&
+ !mksckPage->isGuest) {
+ return mksckPage->vmId;
+ }
+
+ } else if (vmIdFirstVacant == MKSCK_VMID_UNDEF) {
+ vmIdFirstVacant = vmId;
+ }
+ }
+ return vmIdFirstVacant;
+}
+
+
+/**
+ * @brief Locate the first empty slot
+ * Assumed that mksckPageListLock is held
+ * @return the first vacant vmid.<br>
+ * MKSCK_VMID_UNDEF if no slot is vacant
+ */
+static inline Mksck_VmId
+GetNewGuestVmId(void)
+{
+ Mksck_VmId vmId;
+
+ for (vmId = 0; vmId < MKSCK_MAX_SHARES; vmId++) {
+ if (!mksckPages[MKSCK_VMID2IDX(vmId)]) {
+ return vmId;
+ }
+ }
+ return MKSCK_VMID_UNDEF;
+}
+
+
+/**
+ * @brief Find shared page for a given idx. The page referred to by the
+ * idx should exist and be locked by the caller.
+ * @param idx index of the page in the array
+ * @return pointer to shared page
+ */
+MksckPage *
+MksckPage_GetFromIdx(uint32 idx)
+{
+ MksckPage *mksckPage = mksckPages[idx];
+ ASSERT(mksckPage);
+ ASSERT(idx<MKSCK_MAX_SHARES);
+ ASSERT(ATOMIC_GETO(mksckPage->refCount));
+ return mksckPage;
+}
+
+/**
+ * @brief find shared page for a given vmId
+ * The vmid should exist and be locked by the caller.
+ * @param vmId vmId to look for, either an host vmId or a guest vmId
+ * @return pointer to shared page
+ */
+MksckPage *
+MksckPage_GetFromVmId(Mksck_VmId vmId)
+{
+ MksckPage *mksckPage = mksckPages[MKSCK_VMID2IDX(vmId)];
+ ASSERT(mksckPage);
+ ASSERT(mksckPage->vmId == vmId);
+ ASSERT(ATOMIC_GETO(mksckPage->refCount));
+ return mksckPage;
+}
+
+
+/**
+ * @brief find shared page for a given vmId
+ * @param vmId vmId to look for, either an host vmId or a guest vmId
+ * @return NULL: no such shared page exists<br>
+ * else: pointer to shared page.
+ * Call Mksck_DecRefc() when done with pointer
+ */
+MksckPage *
+MksckPage_GetFromVmIdIncRefc(Mksck_VmId vmId)
+{
+ MksckPage *mksckPage;
+
+ spin_lock(&mksckPageListLock);
+ mksckPage = mksckPages[MKSCK_VMID2IDX(vmId)];
+
+ if (!mksckPage || (mksckPage->vmId != vmId)) {
+ printk(KERN_INFO "MksckPage_GetFromVmIdIncRefc: vmId %04X not found\n",
+ vmId);
+ mksckPage = NULL;
+ } else {
+ ATOMIC_ADDV(mksckPage->refCount, 1);
+ }
+ spin_unlock(&mksckPageListLock);
+ return mksckPage;
+}
+
+
+/**
+ * @brief find or allocate shared page using tgid
+ * @return NULL: no such shared page exists<br>
+ * else: pointer to shared page.
+ * Call Mksck_DecRefc() when done with pointer
+ */
+MksckPage *
+MksckPage_GetFromTgidIncRefc(void)
+{
+ MksckPage *mksckPage;
+ Mksck_VmId vmId;
+
+ while (1) {
+ spin_lock(&mksckPageListLock);
+ vmId = GetHostVmId();
+
+ if (vmId == MKSCK_VMID_UNDEF) {
+ /*
+ * No vmId has been allocated yet and there is no free slot.
+ */
+ spin_unlock(&mksckPageListLock);
+ return NULL;
+ }
+
+ mksckPage = mksckPages[MKSCK_VMID2IDX(vmId)];
+ if (mksckPage != NULL) {
+ /*
+ * There is a vmid already allocated, increment the refc on it.
+ */
+ ATOMIC_ADDV(mksckPage->refCount, 1);
+ spin_unlock(&mksckPageListLock);
+ return mksckPage;
+ }
+
+ /*
+ * Have to release spinlock to allocate a new page.
+ */
+ spin_unlock(&mksckPageListLock);
+ mksckPage = MksckPageAlloc();
+ if (mksckPage == NULL) {
+ return NULL;
+ }
+
+ /*
+ * Re-lock and make sure no one else allocated while unlocked.
+ * If someone else did allocate, free ours off and use theirs.
+ */
+ spin_lock(&mksckPageListLock);
+ vmId = GetHostVmId();
+ if ((vmId != MKSCK_VMID_UNDEF) &&
+ (mksckPages[MKSCK_VMID2IDX(vmId)] == NULL)) {
+ break;
+ }
+ spin_unlock(&mksckPageListLock);
+ MksckPageRelease(mksckPage);
+ }
+
+ /*
+ * This is a successful new allocation. Insert it into the table
+ * and initialize the fields.
+ */
+ mksckPages[MKSCK_VMID2IDX(vmId)] = mksckPage;
+ mksckPage->vmId = vmId;
+ mksckPage->isGuest = false;
+ mksckPage->vmHKVA = 0;
+ mksckPage->tgid = task_tgid_vnr(current);
+ printk(KERN_DEBUG "New host mksck page is allocated: idx %x, vmId %x, tgid %d\n",
+ MKSCK_VMID2IDX(vmId), vmId, mksckPage->tgid);
+
+ spin_unlock(&mksckPageListLock);
+ return mksckPage;
+}
+
+/**
+ * @brief Initialize the VMX provided wsp. Allocate communication page.
+ * @param vm which virtual machine we're running
+ * @return 0 if all OK, error value otherwise
+ */
+int
+Mksck_WspInitialize(MvpkmVM *vm)
+{
+ WorldSwitchPage *wsp = vm->wsp;
+ int err;
+ Mksck_VmId vmId;
+ MksckPage *mksckPage;
+
+ if (wsp->guestId) {
+ err = -EBUSY;
+ } else if (!(mksckPage = MksckPageAlloc())) {
+ err = -ENOMEM;
+ } else {
+ spin_lock(&mksckPageListLock);
+
+ if ((vmId = GetNewGuestVmId()) == MKSCK_VMID_UNDEF) {
+
+ err = -EMFILE;
+ MksckPageRelease(mksckPage);
+
+ printk(KERN_INFO "Mksck_WspInitialize: Cannot allocate vmId\n");
+
+ } else {
+ /*
+ * Now that the mksckPage is all initialized, let others see it.
+ */
+ mksckPages[MKSCK_VMID2IDX(vmId)] = mksckPage;
+ mksckPage->vmId = vmId;
+ mksckPage->isGuest = true;
+ mksckPage->vmHKVA = (HKVA)vm;
+ /* mksckPage->tgid is undefined when isGuest is true */
+
+ wsp->guestId = vmId;
+
+ printk(KERN_DEBUG "New guest mksck page is allocated: idx %x, vmId %x\n",
+ MKSCK_VMID2IDX(vmId), vmId);
+
+ err = 0;
+ }
+
+ /*
+ * All stable, ie, mksckPages[] written, ok to unlock now.
+ */
+ spin_unlock(&mksckPageListLock);
+ }
+
+ return err;
+}
+
+/**
+ * @brief Release the wsp. Clean up after the monitor. Free the
+ * associated communication page.
+ * @param wsp which worldswitch page (VCPU)
+ */
+void
+Mksck_WspRelease(WorldSwitchPage *wsp)
+{
+ int ii;
+ int err;
+ MksckPage *mksckPage = MksckPage_GetFromVmId(wsp->guestId);
+
+ /*
+ * The worldswitch page for a particular VCPU is about to be freed
+ * off, so we know the monitor will never execute again. But the
+ * monitor most likely left some sockets open. Those may have
+ * outbound connections to host sockets that we must close.
+ *
+ * Loop through all possibly open sockets.
+ */
+ uint32 isOpened = wsp->isOpened;
+ Mksck *mksck = mksckPage->sockets;
+ while (isOpened) {
+ if (isOpened & 1) {
+ ASSERT(ATOMIC_GETO(mksck->refCount) != 0);
+ /*
+ * The socket may be connected to a peer (host) socket, so we
+ * have to decrement that target socket's reference count.
+ * Unfortunately, Mksck_DisconnectPeer(mksck) cannot be called
+ * directly because mksck->peer is an MVA, not an HKVA. Translate
+ * the address first.
+ */
+ if (mksck->peer) {
+ MksckPage *mksckPagePeer = MksckPage_GetFromVmId(mksck->peerAddr.vmId);
+ ASSERT(mksckPagePeer);
+ mksck->peer = MksckPage_GetFromAddr(mksckPagePeer, mksck->peerAddr);
+ ASSERT(mksck->peer);
+ /* mksck->peer is now an HKVA */
+ }
+
+ Mksck_CloseCommon(mksck);
+ }
+ isOpened >>= 1;
+ mksck++;
+ }
+
+ /*
+ * A host socket may be in the process of sending to the guest. It
+ * will attempt to wake up the guest using mksckPage->vmHKVA and
+ * mksckPage->vmHKVA->wsp. To ensure that the vm and wsp structures
+ * do not disappear from under the sending thread, we lock the
+ * page here.
+ */
+ err = Mutex_Lock(&mksckPage->mutex, MutexModeEX);
+ ASSERT(!err);
+ mksckPage->vmHKVA = 0;
+ Mutex_Unlock(&mksckPage->mutex, MutexModeEX);
+ /*
+ * Decrement refcount set by MksckPageAlloc() call in
+ * Mksck_WspInitialize().
+ */
+ MksckPage_DecRefc(mksckPage);
+
+ /*
+ * Decrement refcount set by VMM:Mksck_Init() referring to the local
+ * variable guestMksckPage.
+ */
+ if (wsp->guestPageMapped) {
+ wsp->guestPageMapped = false;
+ MksckPage_DecRefc(mksckPage);
+ }
+
+ /*
+ * Another task is to decrement the reference count on the mksck
+ * pages the monitor accessed. Those pages are listed in the
+ * wsp->isPageMapped list; they were locked when the monitor
+ * called WSCALL_GET_PAGE_FROM_VMID.
+ */
+ for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) {
+ if (wsp->isPageMapped[ii]) {
+ MksckPage *mksckPageOther = MksckPage_GetFromIdx(ii);
+
+ wsp->isPageMapped[ii] = false;
+ MksckPage_DecRefc(mksckPageOther);
+ }
+ }
+}
+
+/**
+ * @brief Disconnect from the peer by decrementing the
+ * peer socket's reference count and clearing the pointer.
+ * @param mksck local socket to check for a connection
+ */
+void
+Mksck_DisconnectPeer(Mksck *mksck)
+{
+ Mksck *peerMksck = mksck->peer;
+ if (peerMksck != NULL) {
+ mksck->peer = NULL;
+ mksck->peerAddr.addr = MKSCK_ADDR_UNDEF;
+ Mksck_DecRefc(peerMksck);
+ }
+}
+
+
+/**
+ * @brief Decrement the shared page reference count and free the page if it
+ * reaches zero. A DMB is issued first to make sure all activity on the
+ * struct has finished before the reference count is decremented. (A
+ * simplified sketch of the decrement pattern follows the function body.)
+ * @param mksckPage shared page
+ */
+void
+MksckPage_DecRefc(MksckPage *mksckPage)
+{
+ uint32 oldRefc;
+
+ DMB();
+ do {
+ while ((oldRefc = ATOMIC_GETO(mksckPage->refCount)) == 1) {
+
+ /*
+ * Find corresponding entry in list of known shared pages and
+ * clear it so we can't open any new sockets on this shared
+ * page, thus preventing its refCount from being incremented.
+ */
+ spin_lock(&mksckPageListLock);
+ if (ATOMIC_SETIF(mksckPage->refCount, 0, 1)) {
+ uint32 ii = MKSCK_VMID2IDX(mksckPage->vmId);
+ ASSERT(ii < MKSCK_MAX_SHARES);
+ ASSERT(mksckPages[ii] == mksckPage);
+ mksckPages[ii] = NULL;
+ spin_unlock(&mksckPageListLock);
+ printk(KERN_DEBUG "%s mksck page is released: idx %x, vmId %x, tgid %d\n",
+ mksckPage->isGuest?"Guest":"Host",
+ ii, mksckPage->vmId, mksckPage->tgid);
+ MksckPageRelease(mksckPage);
+ return;
+ }
+ spin_unlock(&mksckPageListLock);
+ }
+ ASSERT(oldRefc != 0);
+ } while (!ATOMIC_SETIF(mksckPage->refCount, oldRefc - 1, oldRefc));
+}
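+
+/*
+ * Minimal sketch of the "free on last reference" pattern used above, written
+ * with the stock Linux atomic API purely as an analogy (assumption: the
+ * module itself uses its own ATOMIC_* wrappers, and the unlink/free steps
+ * are elided here). The fast path drops a reference without the page-list
+ * lock; only the 1 -> 0 transition takes the lock so that no concurrent
+ * lookup can hand out a new reference to a page that is about to be freed.
+ */
+#if 0
+static void ExampleDecRefc(atomic_t *refc)
+{
+   int old;
+
+   for (;;) {
+      old = atomic_read(refc);
+
+      if (old == 1) {
+         /*
+          * Possibly the last reference: take the page-list lock so no new
+          * lookup can revive the page, then try to move 1 -> 0.
+          */
+         spin_lock(&mksckPageListLock);
+         if (atomic_cmpxchg(refc, 1, 0) == 1) {
+            /* ... unlink from mksckPages[] ... */
+            spin_unlock(&mksckPageListLock);
+            /* ... free the page ... */
+            return;
+         }
+         /* Someone took a new reference meanwhile; retry. */
+         spin_unlock(&mksckPageListLock);
+         continue;
+      }
+
+      /* Fast path: drop one reference without taking the lock. */
+      if (atomic_cmpxchg(refc, old, old - 1) == old) {
+         return;
+      }
+   }
+}
+#endif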
+
+/**
+ * @brief Look up whether the provided MPN belongs to one of the Mksck pages
+ * and, if so, insert it into the given user mapping.
+ * @param vma user virtual memory area to map the page into
+ * @param address user virtual address at which to insert the page
+ * @param mpn machine page number to look up
+ * @return 0 if all OK, error value otherwise
+ */
+int
+MksckPage_LookupAndInsertPage(struct vm_area_struct *vma,
+ unsigned long address,
+ MPN mpn)
+{
+ int ii, jj;
+ MksckPage **mksckPagePtr = mksckPages;
+
+ spin_lock(&mksckPageListLock);
+ for (jj = MKSCK_MAX_SHARES; jj--; mksckPagePtr++) {
+ if (*mksckPagePtr) {
+ for (ii = 0; ii < MKSCKPAGE_TOTAL; ii++) {
+ if (vmalloc_to_pfn((void*)(((HKVA)*mksckPagePtr) + ii*PAGE_SIZE)) == mpn &&
+ vm_insert_page(vma, address, pfn_to_page(mpn)) == 0) {
+ spin_unlock(&mksckPageListLock);
+ return 0;
+ }
+ }
+ }
+ }
+ spin_unlock(&mksckPageListLock);
+ return -1;
+}
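+
+/*
+ * Illustrative caller sketch (an assumption: the real caller is the mvpkm
+ * mmap/fault path, which is not part of this file). It shows how a fault
+ * handler might use MksckPage_LookupAndInsertPage() to map an MPN into a
+ * user mapping only if it backs a registered Mksck page; the function name
+ * and return-code choices below are hypothetical.
+ */
+#if 0
+static int ExampleInsertIfShared(struct vm_area_struct *vma,
+                                 unsigned long uaddr,
+                                 MPN mpn)
+{
+   if (MksckPage_LookupAndInsertPage(vma, uaddr, mpn) == 0) {
+      return VM_FAULT_NOPAGE;   /* page table entry installed */
+   }
+   return VM_FAULT_SIGBUS;      /* mpn is not part of any Mksck page */
+}
+#endif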
+
+
+/**
+ * @brief Print information on the allocated shared pages
+ *
+ * This function reports (among many other things) on the use of locks
+ * on the mksck page (the page lock and the individual socket locks). To
+ * avoid the Heisenberg effect it takes locks only where there is a
+ * danger of dereferencing freed memory; in particular, holding
+ * mksckPageListLock ensures that a mksck page is not freed while it
+ * is being read. Under very rare conditions this function may therefore
+ * report inconsistent or garbage data.
+ */
+static int
+MksckPageInfoShow(struct seq_file *m, void *private)
+{
+ int ii, jj;
+ uint32 isPageMapped = 0;
+ int err;
+ MvpkmVM *vm;
+
+ /*
+ * The lock is needed to make the test and dereference of
+ * mksckPages[ii] atomic.
+ */
+ spin_lock(&mksckPageListLock);
+ for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) {
+ MksckPage *mksckPage = mksckPages[ii];
+ if (mksckPage != NULL && mksckPage->isGuest) {
+ /*
+ * After the refcount is incremented mksckPage will not be
+ * freed, so it can continue to be dereferenced after
+ * mksckPageListLock is unlocked.
+ */
+ ATOMIC_ADDV(mksckPage->refCount, 1);
+ spin_unlock(&mksckPageListLock);
+
+ /*
+ * To dereference mksckPage->vmHKVA, we need to have the page
+ * lock.
+ */
+ err = Mutex_Lock(&mksckPage->mutex, MutexModeEX);
+ vm = (MvpkmVM *) mksckPage->vmHKVA;
+
+ if (err == 0 && vm && vm->wsp) {
+ for (jj = 0; jj < MKSCK_MAX_SHARES; jj++) {
+ if (vm->wsp->isPageMapped[jj]) isPageMapped |= 1<<jj;
+ }
+ }
+ Mutex_Unlock(&mksckPage->mutex, MutexModeEX);
+ /*
+ * Decrement the page refcount and relock the
+ * mksckPageListLock for the next for loop.
+ */
+ MksckPage_DecRefc(mksckPage);
+ spin_lock(&mksckPageListLock);
+ break;
+ }
+ }
+
+ /* mksckPageListLock is still locked, mksckPages[ii] can be dereferenced */
+ for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) {
+ MksckPage *mksckPage = mksckPages[ii];
+ if (mksckPage != NULL) {
+ uint32 lState = ATOMIC_GETO(mksckPage->mutex.state);
+ uint32 isOpened = 0; /* guest has an implicit ref */
+
+ seq_printf(m, "MksckPage[%02d]: { vmId = %4x(%c), refC = %2d%s",
+ ii, mksckPage->vmId,
+ mksckPage->isGuest?'G':'H',
+ ATOMIC_GETO(mksckPage->refCount),
+ (isPageMapped&(1<<ii) ? "*" : ""));
+
+ if (lState) {
+ seq_printf(m, ", lock=%x locked by line %d, unlocked by %d",
+ lState, mksckPage->mutex.line, mksckPage->mutex.lineUnl);
+ }
+
+ if (!mksckPage->isGuest) {
+ struct task_struct *target;
+ seq_printf(m, ", tgid = %d", mksckPage->tgid);
+
+ rcu_read_lock();
+
+ target = pid_task(find_vpid(mksckPage->tgid), PIDTYPE_PID);
+ seq_printf(m, "(%s)", target ? target->comm : "no such process");
+
+ rcu_read_unlock();
+ } else {
+ ATOMIC_ADDV(mksckPage->refCount, 1);
+ spin_unlock(&mksckPageListLock);
+
+ err = Mutex_Lock(&mksckPage->mutex, MutexModeEX);
+ vm = (MvpkmVM *) mksckPage->vmHKVA;
+
+ if (err == 0 && vm && vm->wsp) {
+ isOpened = vm->wsp->isOpened;
+ }
+ Mutex_Unlock(&mksckPage->mutex, MutexModeEX);
+ MksckPage_DecRefc(mksckPage);
+ spin_lock(&mksckPageListLock);
+ /*
+ * As the mksckPageListLock was unlocked, nothing
+ * prevented the MksckPage_DecRefc from actually freeing
+ * the page. Let's verify that the page is still there.
+ */
+ if (mksckPage != mksckPages[ii]) {
+ seq_printf(m, " released }\n");
+ continue;
+ }
+ }
+ seq_printf(m, ", sockets[] = {");
+
+ for (jj = 0; jj < mksckPage->numAllocSocks; jj++, isOpened >>= 1) {
+ Mksck *mksck = mksckPage->sockets + jj;
+
+ if (ATOMIC_GETO(mksck->refCount)) {
+ uint32 blocked;
+ lState = ATOMIC_GETO(mksck->mutex.state);
+ seq_printf(m, "\n { addr = %8x, refC = %2d%s%s%s",
+ mksck->addr.addr,
+ ATOMIC_GETO(mksck->refCount),
+ (isOpened & 1 ? "*" : ""),
+ (mksck->shutDown & MKSCK_SHUT_RD ? " SHUTD_RD":""),
+ (mksck->shutDown & MKSCK_SHUT_WR ? " SHUTD_WR":""));
+
+ if (mksck->peer) {
+ seq_printf(m, ", peerAddr = %8x",
+ mksck->peerAddr.addr);
+ }
+
+ if (lState) {
+ seq_printf(m, ", lock=%x locked by line %d, unlocked by %d",
+ lState, mksck->mutex.line, mksck->mutex.lineUnl);
+ }
+
+ if ((blocked = ATOMIC_GETO(mksck->mutex.blocked))) {
+ seq_printf(m, ", blocked=%d", blocked);
+ }
+
+ seq_printf(m, " }");
+ }
+ }
+ seq_printf(m, " } }\n");
+ }
+ }
+ spin_unlock(&mksckPageListLock);
+
+ return 0;
+}
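+
+/*
+ * The resulting debugfs output looks roughly like the following (all values
+ * and the process name are made up for illustration; the file is typically
+ * read with "cat /sys/kernel/debug/mksckPage"):
+ *
+ *   MksckPage[00]: { vmId =  450(H), refC =  2, tgid = 1234(mvpd), sockets[] = {
+ *    { addr = 12340450, refC =  1 } } }
+ *   MksckPage[01]: { vmId =  8a1(G), refC =  3, sockets[] = {
+ *    { addr = 567808a1, refC =  2, peerAddr = 12340450 } } }
+ */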
+
+
+static int
+MksckPageInfoOpen(struct inode *inode, struct file *file)
+{
+ return single_open(file, MksckPageInfoShow, inode->i_private);
+}
+
+static const struct file_operations mksckPageInfoFops = {
+ .open = MksckPageInfoOpen,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *mksckPageDentry = NULL;
+
+void
+MksckPageInfo_Init(void)
+{
+ mksckPageDentry = debugfs_create_file("mksckPage",
+ S_IROTH,
+ NULL,
+ NULL,
+ &mksckPageInfoFops);
+}
+
+void
+MksckPageInfo_Exit(void)
+{
+ if (mksckPageDentry) {
+ debugfs_remove(mksckPageDentry);
+ }
+}
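+
+/*
+ * Illustrative wiring sketch (assumption: the real calls are made from the
+ * mvpkm module init/exit paths, which are not part of this file). Once
+ * registered, the page dump shown by MksckPageInfoShow() is available from
+ * user space at /sys/kernel/debug/mksckPage.
+ */
+#if 0
+static int __init ExampleModuleInit(void)
+{
+   MksckPageInfo_Init();    /* create /sys/kernel/debug/mksckPage */
+   return 0;
+}
+
+static void __exit ExampleModuleExit(void)
+{
+   MksckPageInfo_Exit();    /* remove the debugfs entry */
+}
+#endif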