diff options
Diffstat (limited to 'arch/arm/mvp/mvpkm/mvpkm_main.c')
-rw-r--r-- | arch/arm/mvp/mvpkm/mvpkm_main.c | 2691 |
1 files changed, 0 insertions, 2691 deletions
diff --git a/arch/arm/mvp/mvpkm/mvpkm_main.c b/arch/arm/mvp/mvpkm/mvpkm_main.c deleted file mode 100644 index d32a4c1..0000000 --- a/arch/arm/mvp/mvpkm/mvpkm_main.c +++ /dev/null @@ -1,2691 +0,0 @@ -/* - * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support - * - * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; see the file COPYING. If not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#line 5 - -/** - * @file - * - * @brief The kernel level driver. 
- */ - -#define __KERNEL_SYSCALLS__ -#include <linux/version.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/proc_fs.h> -#include <linux/fcntl.h> -#include <linux/syscalls.h> -#include <linux/kmod.h> -#include <linux/socket.h> -#include <linux/net.h> -#include <linux/skbuff.h> -#include <linux/miscdevice.h> -#include <linux/poll.h> -#include <linux/smp.h> -#include <linux/capability.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> -#include <linux/sysfs.h> -#include <linux/pid.h> -#include <linux/highmem.h> -#include <linux/syscalls.h> - -#ifdef CONFIG_HAS_WAKELOCK -#include <linux/wakelock.h> -#endif - -#include <net/sock.h> - -#include <asm/cacheflush.h> -#include <asm/memory.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/uaccess.h> - -#include "mvp.h" -#include "mvp_version.h" -#include "mvpkm_types.h" -#include "mvpkm_private.h" -#include "mvpkm_kernel.h" -#include "actions.h" -#include "wscalls.h" -#include "arm_inline.h" -#include "tsc.h" -#include "mksck_kernel.h" -#include "mmu_types.h" -#include "mvp_timer.h" -#include "qp.h" -#include "qp_host_kernel.h" -#include "cpufreq_kernel.h" -#include "mvpkm_comm_ev.h" -#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER -#include "mvp_balloon.h" -#endif - - -/********************************************************************* - * - * Definition of the file operations - * - *********************************************************************/ -static _Bool LockedListAdd(MvpkmVM *vm, - __u32 mpn, - __u32 order, - PhysMem_RegionType forRegion); -static _Bool LockedListDel(MvpkmVM *vm, __u32 mpn); -static void LockedListUnlockAll(MvpkmVM *vm); -static _Bool LockedListLookup(MvpkmVM *vm, __u32 mpn); -static int SetupMonitor(MvpkmVM *vm); -static int RunMonitor(MvpkmVM *vm); -static MPN AllocZeroedFreePages(MvpkmVM *vm, - uint32 order, - _Bool highmem, - PhysMem_RegionType 
forRegion, - HKVA *hkvaRet); -static HKVA MapWSPHKVA(MvpkmVM *vm, HkvaMapInfo *mapInfo); -static void UnmapWSPHKVA(MvpkmVM *vm); -static int MvpkmWaitForInt(MvpkmVM *vm, _Bool suspend); -static void ReleaseVM(MvpkmVM *vm); - -/* - * Mksck open request must come from this uid. It must be root until - * it is set via an ioctl from mvpd. - */ -uid_t Mvpkm_vmwareUid = 0; -EXPORT_SYMBOL(Mvpkm_vmwareUid); - -/* - * Minimum hidden app oom_adj, provided by mvpd, since we can't get it directly - * from the lowmemorykiller module. - */ -static int minHiddenAppOOMAdj; - -/* - * vCPU cpu affinity to let monitor/guest run on some CPUs only (when possible) - */ -static DECLARE_BITMAP(vcpuAffinity, NR_CPUS); - -/********************************************************************* - * - * Sysfs nodes - * - *********************************************************************/ -/* - * kobject for our sysfs representation, used for global nodes. - */ -static struct kobject *mvpkmKObj; - -/* - * kobject for the balloon exports. - */ -static struct kobject *balloonKObj; - -/** - * @brief sysfs show function for global version attribute. - * - * @param kobj reference to kobj nested in MvpkmVM struct. - * @param attr kobj_attribute reference, not used. - * @param buf PAGE_SIZEd buffer to write to. - * - * @return number of characters printed (not including trailing null character). - */ -static ssize_t -version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, MVP_VERSION_FORMATSTR "\n", MVP_VERSION_FORMATARGS); -} - -static struct kobj_attribute versionAttr = __ATTR_RO(version); - -/** - * @brief sysfs show function for global background_pages attribute. - * - * Used by vmx balloon policy controller to gauge the amount of freeable - * anonymous memory. - * - * @param kobj reference to kobj nested in MvpkmVM struct. - * @param attr kobj_attribute reference, not used. - * @param buf PAGE_SIZEd buffer to write to. 
- * - * @return number of characters printed (not including trailing null character). - */ -static ssize_t -background_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -{ -#ifndef CONFIG_ANDROID_LOW_MEMORY_KILLER - return snprintf(buf, PAGE_SIZE, "0\n"); -#else - return snprintf(buf, PAGE_SIZE, "%d\n", Balloon_AndroidBackgroundPages(minHiddenAppOOMAdj)); -#endif -} - -static struct kobj_attribute backgroundAttr = __ATTR_RO(background); - -/** - * @brief sysfs show function to export the other_file calculation in - * lowmemorykiller. - * - * It's helpful, in the balloon controller, to know what the lowmemorykiller - * module is using to know when the system has crossed a minfree threshold. - * Since there exists a number of different other_file calculations in various - * lowmemorykiller patches (@see{MVP-1674}), and the module itself doesn't - * provide a clean export of this figure, we provide it on a case-by-case basis - * for the various supported hosts here. - * - * @param kobj reference to kobj nested in MvpkmVM struct. - * @param attr kobj_attribute reference, not used. - * @param buf PAGE_SIZEd buffer to write to. - * - * @return number of characters printed (not including trailing null character). - */ -static ssize_t -other_file_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -{ - int32 other_file = 0; - -#ifndef LOWMEMKILLER_VARIANT -#define LOWMEMKILLER_VARIANT 0 -#endif - -#ifndef LOWMEMKILLER_MD5 -#define LOWMEMKILLER_MD5 0 -#endif - -#ifndef LOWMEMKILLER_SHRINK_MD5 -#define LOWMEMKILLER_SHRINK_MD5 0 -#endif - - /* - * The build system hashes the lowmemorykiller section related to the - * other_file calculation in the kernel source for us, here we have to - * provide the code. - */ -#if LOWMEMKILLER_VARIANT == 1 - /* - * This is the same as the non-exported global_reclaimable_pages() when there - * is no swap. 
- */ - other_file = global_page_state(NR_ACTIVE_FILE) + - global_page_state(NR_INACTIVE_FILE); -#elif LOWMEMKILLER_VARIANT == 2 - other_file = global_page_state(NR_FILE_PAGES); -#elif LOWMEMKILLER_VARIANT == 3 - other_file = global_page_state(NR_FILE_PAGES) - global_page_state(NR_SHMEM); -#elif LOWMEMKILLER_VARIANT == 4 - /* - * Here free/file pages are fungible and max(free, file) isn't used, but we - * can continue to use max(free, file) since max(free, file) = other_file in - * this case. - */ - other_file = global_page_state(NR_FREE_PAGES) + global_page_state(NR_FILE_PAGES); -#elif defined(NONANDROID) - /* - * Non-Android host platforms don't have ballooning enabled. - */ -#else - /* - * If you get this message, you need to run 'make lowmem-info' and inspect - * lowmemorykiller.c. If the "other_file = ..." calculation in lowmem_shrink - * appears above, simply add the "Shrink#" to an existing entry in - * lowmemkiller-variant.sh, pointing to the variant number above. Otherwise, - * provide a new entry above and variant number, with the appropriate - * other_file calculation and update lowmemkiller-variant.sh accordingly. - */ -//#warning "Unknown lowmemorykiller variant in hosted/module/mvpkm_main.c, falling back on default (see other_file_show for the remedy)" - /* - * Fall back on default - this may bias strangely for/against the host, but - * nothing catastrophic should result. - */ - other_file = global_page_state(NR_FILE_PAGES); -#endif - -#define _STRINGIFY(x) #x -#define STRINGIFY(x) _STRINGIFY(x) - return snprintf(buf, - PAGE_SIZE, - "%d %d %s %s\n", - other_file, - LOWMEMKILLER_VARIANT, - STRINGIFY(LOWMEMKILLER_MD5), - STRINGIFY(LOWMEMKILLER_SHRINK_MD5)); -#undef _STRINGIFY -#undef STRINGIFY -} - -static struct kobj_attribute otherFileAttr = __ATTR_RO(other_file); - -/* - * kset for our sysfs representation, used for per-VM nodes. 
- */ -static struct kset *mvpkmKSet; - -static ssize_t MvpkmAttrShow(struct kobject *kobj, - struct attribute *attr, - char *buf); -static ssize_t MvpkmAttrStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t count); - -static void MvpkmKObjRelease(struct kobject *kobj) - __attribute__ ((optimize ("-fomit-frame-pointer"))); - - -/** - * @brief Releases the vm structure containing the kobject. - * - * @param kobj the vm's kobject. - */ - -static void -MvpkmKObjRelease(struct kobject *kobj) -{ - MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj); - - ReleaseVM(vm); - - module_put(THIS_MODULE); -} - - -/** - * @name mvpkm ktype attribute structures for locked_pages. - * - * @{ - */ -static struct sysfs_ops mvpkmSysfsOps = { - .show = MvpkmAttrShow, - .store = MvpkmAttrStore -}; - -static struct attribute mvpkmLockedPagesAttr = { - .name = "locked_pages", - .mode = 0444, -}; - -static struct attribute mvpkmBalloonWatchdogAttr = { - .name = "balloon_watchdog", - .mode = 0666 -}; - -static struct attribute mvpkmMonitorAttr = { - .name = "monitor", - .mode = 0400, -}; - -static struct attribute *mvpkmDefaultAttrs[] = { - &mvpkmLockedPagesAttr, - &mvpkmBalloonWatchdogAttr, - &mvpkmMonitorAttr, - NULL, -}; - -static struct kobj_type mvpkmKType = { - .sysfs_ops = &mvpkmSysfsOps, - .release = MvpkmKObjRelease, - .default_attrs = mvpkmDefaultAttrs, -}; -/*@}*/ - -/* - * As it is not very common for host kernels to have SYS_HYPERVISOR enabled and - * you have to "hack" a Kconfig file to enable it, just include the - * functionality inline if it is not enabled. - */ -#ifndef CONFIG_SYS_HYPERVISOR -struct kobject *hypervisor_kobj; -EXPORT_SYMBOL_GPL(hypervisor_kobj); -#endif - - -/* - * kobject and kset utilities. - */ - -extern struct kobject *kset_find_obj(struct kset *, const char *) - __attribute__((weak)); - - -/** - * @brief Finds a kobject in a kset. The actual implementation is copied from - * kernel source in lib/kobject.c. 
Although the symbol is extern-declared, - * it is not EXPORT_SYMBOL-ed. We use a weak reference in case the symbol - * might be exported in future kernel versions. - * - * @param kset set to search. - * @param name object name. - * - * @return retained kobject if found, NULL otherwise. - */ - -struct kobject * -kset_find_obj(struct kset *kset, - const char *name) -{ - struct kobject *k; - struct kobject *ret = NULL; - - spin_lock(&kset->list_lock); - list_for_each_entry(k, &kset->list, entry) { - if (kobject_name(k) && !strcmp(kobject_name(k), name)) { - ret = kobject_get(k); - break; - } - } - spin_unlock(&kset->list_lock); - return ret; -} - - -/** - * @brief Finds one of the VM's pre-defined ksets. - * - * @param vmID a VM ID. - * @param name name of one of the VM's pre-defined ksets. - * - * @return retained kset if found, NULL otherwise. - */ - -struct kset * -Mvpkm_FindVMNamedKSet(int vmID, - const char *name) -{ - MvpkmVM *vm; - struct kobject *kobj; - char vmName[32] = {}; /* Large enough to hold externally-formatted int32. */ - struct kset *res = NULL; - - if (!mvpkmKSet) { - return NULL; - } - - snprintf(vmName, sizeof vmName, "%d", vmID); - vmName[sizeof vmName - 1] = '\0'; /* Always null-terminate, no overflow. 
*/ - - kobj = kset_find_obj(mvpkmKSet, vmName); - if (!kobj) { - return NULL; - } - - vm = container_of(kobj, MvpkmVM, kobj); - - if (!strcmp(name, "devices")) { - res = kset_get(vm->devicesKSet); - } else if (!strcmp(name, "misc")) { - res = kset_get(vm->miscKSet); - } - - kobject_put(kobj); - return res; -} - -EXPORT_SYMBOL(Mvpkm_FindVMNamedKSet); - - - -/********************************************************************* - * - * Standard Linux miscellaneous device registration - * - *********************************************************************/ - -MODULE_LICENSE("GPL"); // for kallsyms_lookup_name - -static int MvpkmFault(struct vm_area_struct *vma, struct vm_fault *vmf); - - -/** - * @brief Linux vma operations for /dev/mem-like kernel module mmap. We - * enforce the restriction that only MPNs that have been allocated - * to the opened VM may be mapped and also increment the reference - * count (via vm_insert_page), so that even if the memory is later - * freed by the VM, host process vma's containing the MPN can't - * compromise the system. - * - * However, only trusted host processes (e.g. the vmx) should be allowed - * to use this interface, since you can mmap the monitor's code/data/ - * page tables etc. with it. Untrusted host processes are limited to - * typed messages for sharing memory with the monitor. Unix file system - * access permissions are the intended method of restricting access. - * Unfortunately, today _any_ host process utilizing Mksck requires - * access to mvpkm to setup its Mksck pages and obtain socket info via - * ioctls - we probably should be exporting two devices, one for trusted - * and one for arbitrary host processes to avoid this confusion of - * concerns. - */ -static struct vm_operations_struct mvpkmVMOps = { - .fault = MvpkmFault -}; - -/* - * Generic kernel module file ops. These functions will be registered - * at the time the kernel module is loaded. 
- */ -static long MvpkmUnlockedIoctl(struct file *filep, - unsigned int cmd, - unsigned long arg); -static int MvpkmOpen(struct inode *inode, struct file *filp); -static int MvpkmRelease(struct inode *inode, struct file *filp); -static int MvpkmMMap(struct file *file, struct vm_area_struct *vma); - -/** - * @brief the file_operation structure contains the callback functions - * that are registered with Linux to handle file operations on - * the mvpkm device. - * - * The structure contains other members that the mvpkm device - * does not use. Those members are auto-initialized to NULL. - * - * WARNING, this structure has changed after Linux kernel 2.6.19: - * readv/writev are changed to aio_read/aio_write (neither is used here). - */ -static const struct file_operations mvpkmFileOps = { - .owner = THIS_MODULE, - .unlocked_ioctl = MvpkmUnlockedIoctl, - .open = MvpkmOpen, - .release = MvpkmRelease, - .mmap = MvpkmMMap -}; - -/** - * @brief The mvpkm device identifying information to be used to register - * the device with the Linux kernel. - */ -static struct miscdevice mvpkmDev = { - .minor = 165, - .name = "mvpkm", - .fops = &mvpkmFileOps -}; - -/** - * Mvpkm is loaded by mvpd and only mvpd will be allowed to open - * it. There is a very simple way to verify that: record the process - * id (thread group id) at the time the module is loaded and test it - * at the time the module is opened. - */ -static struct pid *initTgid; - - -#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER -/** - * @name Slab shrinker for triggering balloon adjustment. - * - * @note shrinker us used as a trigger for guest balloon. 
- * - * @{ - */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) -static int MvpkmShrink(struct shrinker *this, struct shrink_control *sc); -#else -static int MvpkmShrink(struct shrinker *this, int nrToScan, gfp_t gfpMask); -#endif - -static struct shrinker mvpkmShrinker = { - .shrink = MvpkmShrink, - .seeks = DEFAULT_SEEKS -}; -/*@}*/ -#endif - -module_param_array(vcpuAffinity, ulong, NULL, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(vcpuAffinity, "vCPU affinity"); - - -/** - * @brief Initialize the mvpkm device, register it with the Linux kernel. - * - * @return A zero is returned on success and a negative errno code for failure. - * (Same as the return policy of misc_register(9).) - */ - -static int __init -MvpkmInit(void) -{ - int err = 0; - _Bool mksckInited = false; - _Bool cpuFreqInited = false; - - printk(KERN_INFO "Mvpkm: " MVP_VERSION_FORMATSTR "\n", MVP_VERSION_FORMATARGS); - printk(KERN_INFO "Mvpkm: loaded from process %s tgid=%d, pid=%d\n", - current->comm, - task_tgid_vnr(current), - task_pid_vnr(current)); - - if (bitmap_empty(vcpuAffinity, NR_CPUS)) { - bitmap_copy(vcpuAffinity, cpumask_bits(cpu_possible_mask), NR_CPUS); - } - - if ((err = misc_register(&mvpkmDev))) { - return -ENOENT; - } - - if ((err = Mksck_Init())) { - goto error; - } else { - mksckInited = true; - } - - QP_HostInit(); - - CpuFreq_Init(); - cpuFreqInited = true; - - /* - * Reference mvpd (module loader) tgid struct, so that we can avoid - * attacks based on pid number wraparound. 
- */ - initTgid = get_pid(task_tgid(current)); - -#ifndef CONFIG_SYS_HYPERVISOR - hypervisor_kobj = kobject_create_and_add("hypervisor", NULL); - if (!hypervisor_kobj) { - err = -ENOMEM; - goto error; - } -#endif - - if (!(mvpkmKObj = kobject_create_and_add("mvp", hypervisor_kobj)) || - !(balloonKObj = kobject_create_and_add("lowmem", mvpkmKObj)) || - !(mvpkmKSet = kset_create_and_add("vm", NULL, mvpkmKObj))) { - err = -ENOMEM; - goto error; - } - - if ((err = sysfs_create_file(mvpkmKObj, &versionAttr.attr))) { - goto error; - } - - if ((err = sysfs_create_file(balloonKObj, &backgroundAttr.attr))) { - goto error; - } - - if ((err = sysfs_create_file(balloonKObj, &otherFileAttr.attr))) { - goto error; - } - -#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER - register_shrinker(&mvpkmShrinker); -#endif - - MksckPageInfo_Init(); - - return 0; - -error: - if (mvpkmKSet) { - kset_unregister(mvpkmKSet); - } - - if (balloonKObj) { - kobject_del(balloonKObj); - kobject_put(balloonKObj); - } - - if (mvpkmKObj) { - kobject_del(mvpkmKObj); - kobject_put(mvpkmKObj); - } - -#ifndef CONFIG_SYS_HYPERVISOR - if (hypervisor_kobj) { - kobject_del(hypervisor_kobj); - kobject_put(hypervisor_kobj); - } -#endif - - if (cpuFreqInited) { - CpuFreq_Exit(); - } - - if (mksckInited) { - Mksck_Exit(); - } - - if (initTgid) { - put_pid(initTgid); - } - - misc_deregister(&mvpkmDev); - return err; -} - -/** - * @brief De-register the mvpkm device with the Linux kernel. 
- */ -void -MvpkmExit(void) -{ - PRINTK(KERN_INFO "MvpkmExit called !\n"); - - MksckPageInfo_Exit(); - -#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER - unregister_shrinker(&mvpkmShrinker); -#endif - - kset_unregister(mvpkmKSet); - kobject_del(balloonKObj); - kobject_put(balloonKObj); - kobject_del(mvpkmKObj); - kobject_put(mvpkmKObj); -#ifndef CONFIG_SYS_HYPERVISOR - kobject_del(hypervisor_kobj); - kobject_put(hypervisor_kobj); -#endif - - CpuFreq_Exit(); - - Mksck_Exit(); - - put_pid(initTgid); - - misc_deregister(&mvpkmDev); -} - -/* - * The standard module registration macros of Linux. - */ -module_init(MvpkmInit); -module_exit(MvpkmExit); - -module_param_named(minHiddenAppOOMAdj, minHiddenAppOOMAdj, int, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(minHiddenAppOOMAdj, "minimum hidden app oom_adj, as per lowmemorykiller"); - -#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER -/** - * @brief Balloon watchdog timeout callback. - * - * Terminate the VM since it's not responsive. - * - * @param data vm reference representation. - */ -static void -WatchdogCB(unsigned long data) -{ - MvpkmVM *vm = (MvpkmVM *)data; - - printk("Balloon watchdog expired (%d s)!\n", BALLOON_WATCHDOG_TIMEOUT_SECS); - - Mvpkm_WakeGuest(vm, ACTION_ABORT); -} - -/** - * @brief Slab shrinker. - * - * Called by Linux kernel when we're under memory pressure. We treat all locked - * pages as a slab for this purpose, similar to the Android low memory killer. - * - * @param this reference to registered shrinker for callback context. - * @param nrToScan number of entries to scan. If 0 then just return the number - * of present entries. We ignore the value of nrToScan when > 1 - * since the shrinker is a trigger to readjust guest balloons, - * where the actual balloon size is determined in conjunction - * with the guest. - * @param gfpMask ignored. - * - * @return number of locked pages. 
- */ -static int -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) -MvpkmShrink(struct shrinker *this, struct shrink_control *sc) -#else -MvpkmShrink(struct shrinker *this, int nrToScan, gfp_t gfpMask) -#endif -{ - uint32 locked = 0; - struct kobject *k; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) - int nrToScan = sc->nr_to_scan; -#endif - - spin_lock(&mvpkmKSet->list_lock); - - list_for_each_entry(k, &mvpkmKSet->list, entry) { - MvpkmVM *vm = container_of(k, MvpkmVM, kobj); - - locked += ATOMIC_GETO(vm->usedPages); - - /* - * Try and grab the WSP semaphore - if we fail, we must be VM setup or - * teardown, no point trying to wake the guest. - */ - if (nrToScan > 0 && - down_read_trylock(&vm->wspSem)) { - - if (vm->wsp) { - Mvpkm_WakeGuest(vm, ACTION_BALLOON); - - /* - * Balloon watchdog. - */ - if (vm->balloonWDEnabled) { - struct timer_list *t = &vm->balloonWDTimer; - - if (!timer_pending(t)) { - t->data = (unsigned long)vm; - t->function = WatchdogCB; - t->expires = jiffies + BALLOON_WATCHDOG_TIMEOUT_SECS * HZ; - add_timer(t); - } - } - } - - up_read(&vm->wspSem); - } - } - - spin_unlock(&mvpkmKSet->list_lock); - - return locked; -} -#endif - - -/** - * @brief The open file operation. Initializes the vm specific structure. 
- */ -int -MvpkmOpen(struct inode *inode, struct file *filp) -{ - MvpkmVM *vm; - - if (initTgid != task_tgid(current)) { - printk(KERN_ERR "%s: MVPKM can be opened only from MVPD (process %d).\n", - __FUNCTION__, pid_vnr(initTgid)); - return -EPERM; - } - printk(KERN_DEBUG "%s: Allocating an MvpkmVM structure from process %s tgid=%d, pid=%d\n", - __FUNCTION__, - current->comm, - task_tgid_vnr(current), - task_pid_vnr(current)); - - vm = kmalloc(sizeof(MvpkmVM), GFP_KERNEL); - if (!vm) { - return -ENOMEM; - } - - memset(vm, 0, sizeof *vm); - - init_timer(&vm->balloonWDTimer); - init_rwsem(&vm->lockedSem); - init_rwsem(&vm->wspSem); - init_rwsem(&vm->monThreadTaskSem); - vm->monThreadTask = NULL; - vm->isMonitorInited = false; - - filp->private_data = vm; - - if (!Mvpkm_vmwareUid) { - Mvpkm_vmwareUid = current_euid(); - } - - return 0; -} - -/** - * @brief Releases a VMs resources - * @param vm vm to release - */ -static void -ReleaseVM(MvpkmVM *vm) -{ - del_timer_sync(&vm->balloonWDTimer); - - down_write(&vm->wspSem); - - if (vm->isMonitorInited) { - MonitorTimer_Request(&vm->monTimer, 0); -#ifdef CONFIG_HAS_WAKELOCK - wake_lock_destroy(&vm->wakeLock); -#endif - Mksck_WspRelease(vm->wsp); - vm->wsp = NULL; - } - - up_write(&vm->wspSem); - - LockedListUnlockAll(vm); - - UnmapWSPHKVA(vm); - - /* - * All sockets potentially connected to sockets of this vm's vmId will fail - * at send now. DGRAM sockets are note required to tear down connection - * explicitly. - */ - - kfree(vm); -} - -/** - * @brief The release file operation. Releases the vm specific - * structure including all the locked pages. - * - * @param inode Unused - * @param filp which VM we're dealing with - * @return 0 - */ -int -MvpkmRelease(struct inode *inode, struct file *filp) -{ - MvpkmVM *vm = filp->private_data; - - /* - * Tear down any queue pairs associated with this VM - */ - if (vm->isMonitorInited) { - ASSERT(vm->wsp); - QP_DetachAll(vm->wsp->guestId); - } - - /* - * Release the VM's ksets. 
- */ - - kset_unregister(vm->miscKSet); - kset_unregister(vm->devicesKSet); - - if (vm->haveKObj) { - /* - * Release the VM's kobject. - * 'vm' will be kfree-d in its kobject's release function. - */ - - kobject_del(&vm->kobj); - kobject_put(&vm->kobj); - } else { - ReleaseVM(vm); - } - - filp->private_data = NULL; - - printk(KERN_INFO "%s: Released MvpkmVM structure from process %s tgid=%d, pid=%d\n", - __FUNCTION__, - current->comm, - task_tgid_vnr(current), - task_pid_vnr(current)); - - return 0; -} - -/** - * @brief Page fault handler for /dev/mem-like regions (see mvpkmVMOps - * block comment). - */ -static int -MvpkmFault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - unsigned long address = (unsigned long)vmf->virtual_address; - MPN mpn = vmf->pgoff; - MvpkmVM *vm = vma->vm_file->private_data; - - - /* - * Only insert pages belonging to the VM. The check is slow, O(n) in the - * number of MPNs associated with the VM, but it doesn't matter - the mmap - * interface should only be used by trusted processes at initialization - * time and for debugging. - * - * The mpn can be either in the memory reserved the monitor or mvpd - * through the regular mechanisms or it could be a mksck page. 
- */ - if (!pfn_valid(mpn)) { - printk(KERN_ERR "MvpkmMMap: Failed to insert %x @ %lx, mpn invalid\n", - mpn, - address); - } else if (LockedListLookup(vm, mpn)) { - if (vm_insert_page(vma, address, pfn_to_page(mpn)) == 0) { - return VM_FAULT_NOPAGE; - } - - printk(KERN_ERR "MvpkmMMap: Failed to insert %x @ %lx \n", - mpn, - address); - } else if (MksckPage_LookupAndInsertPage(vma, address, mpn) == 0) { - return VM_FAULT_NOPAGE; - } - - if (vm->stubPageMPN) { - if (vm_insert_page(vma, address, pfn_to_page(vm->stubPageMPN)) == 0) { - printk(KERN_INFO "MvpkmMMap: mapped the stub page at %x @ %lx \n", - mpn, - address); - return VM_FAULT_NOPAGE; - } - - printk(KERN_ERR "MvpkmMMap: Could not insert stub page %x @ %lx \n", - mpn, - address); - - } - - return VM_FAULT_SIGBUS; -} - -/** - * @brief sysfs show function for per-VM locked_pages attribute. - * - * @param kobj reference to kobj nested in MvpkmVM struct. - * @param attr attribute reference. - * @param buf PAGE_SIZEd buffer to write to. - * - * @return number of characters printed (not including trailing null character). - */ -static ssize_t -MvpkmAttrShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - if (attr == &mvpkmLockedPagesAttr) { - MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj); - - return snprintf(buf, PAGE_SIZE, "%d\n", ATOMIC_GETO(vm->usedPages)); - } else if (attr == &mvpkmMonitorAttr) { - MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj); - - return snprintf(buf, - PAGE_SIZE, - "hostActions %x callno %d\n", - ATOMIC_GETO(vm->wsp->hostActions), - WSP_Params(vm->wsp)->callno); - } else { - return -EPERM; - } -} - -/** - * @brief sysfs store function for per-VM locked_pages attribute. - * - * @param kobj reference to kobj nested in MvpkmVM struct. - * @param attr attribute reference. - * @param buf PAGE_SIZEd buffer to write to. - * @param buf input buffer. - * @param count input buffer length. - * - * @return number of bytes consumed or negative error code. 
- */ -static ssize_t -MvpkmAttrStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t count) -{ - if (attr == &mvpkmBalloonWatchdogAttr) { - MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj); - - /* - * Enable balloon watchdog on first write. This includes all ballooning - * capable guest. - */ - vm->balloonWDEnabled = true; - del_timer_sync(&vm->balloonWDTimer); - - return 1; - } else { - return -EPERM; - } -} - -/** - * @brief Map machine address space region into host process. - * - * @param file file reference (ignored). - * @param vma Linux virtual memory area defining the region. - * - * @return 0 on success, otherwise error code. - */ -static int -MvpkmMMap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_ops = &mvpkmVMOps; - - return 0; -} - -#ifdef CONFIG_ARM_LPAE -/** - * @brief Determine host cacheability/shareability attributes. - * - * Used to ensure monitor/guest shared mappings are consistent with - * those of host user/kernel. - * - * @param[out] attribMAN when setting up the HW monitor this provides the - * attributes in the generic ARM_MemAttrNormal form, - * suitable for configuring the monitor and guest's - * [H]MAIR0 and setting the shareability attributes of - * the LPAE descriptors. - */ -static void -DetermineMemAttrLPAE(ARM_MemAttrNormal *attribMAN) -{ - /* - * We use set_pte_ext to sample what {S,TEX,CB} bits Linux is using for - * normal kernel/user L2D mappings. These bits should be consistent both - * with each other and what we use in the monitor since we share various - * pages with both host processes, the kernel module and monitor, and the - * ARM ARM requires that synonyms have the same cacheability attributes, - * see end of A3.5.{4,7} ARM DDI 0406A. 
- */ - HKVA hkva = __get_free_pages(GFP_KERNEL, 0); - - ARM_LPAE_L3D *pt = (ARM_LPAE_L3D *)hkva; - ARM_LPAE_L3D *kernL3D = &pt[0], *userL3D = &pt[1]; - uint32 attr, mair0, mair1; - - set_pte_ext((pte_t *)kernL3D, pfn_pte(0, PAGE_KERNEL), 0); - set_pte_ext((pte_t *)userL3D, pfn_pte(0, PAGE_NONE), 0); - - printk(KERN_INFO - "DetermineMemAttr: Kernel L3D AttrIndx=%x SH=%x\n", - kernL3D->blockS1.attrIndx, - kernL3D->blockS1.sh); - - printk(KERN_INFO - "DetermineMemAttr: User L3D AttrIndx=%x SH=%x\n", - userL3D->blockS1.attrIndx, - userL3D->blockS1.sh); - - ASSERT(kernL3D->blockS1.attrIndx == userL3D->blockS1.attrIndx); - ASSERT(kernL3D->blockS1.sh == userL3D->blockS1.sh); - - switch (kernL3D->blockS1.sh) { - case 0: { - attribMAN->share = ARM_SHARE_ATTR_NONE; - break; - } - case 2: { - attribMAN->share = ARM_SHARE_ATTR_OUTER; - break; - } - case 3: { - attribMAN->share = ARM_SHARE_ATTR_INNER; - break; - } - default: { - FATAL(); - } - } - - ARM_MRC_CP15(MAIR0, mair0); - ARM_MRC_CP15(MAIR1, mair1); - - attr = MVP_EXTRACT_FIELD(kernL3D->blockS1.attrIndx >= 4 ? mair1 : mair0, - 8 * (kernL3D->blockS1.attrIndx % 4), - 8); - - /* - * See B4-1615 ARM DDI 0406C-2c for magic. - */ -#define MAIR_ATTR_2_CACHE_ATTR(x, y) \ - switch (x) { \ - case 2: { \ - (y) = ARM_CACHE_ATTR_NORMAL_WT; \ - break; \ - } \ - case 3: { \ - (y) = ARM_CACHE_ATTR_NORMAL_WB; \ - break; \ - } \ - default: { \ - FATAL(); \ - } \ - } - - MAIR_ATTR_2_CACHE_ATTR(MVP_EXTRACT_FIELD(attr, 2, 2), attribMAN->innerCache); - MAIR_ATTR_2_CACHE_ATTR(MVP_EXTRACT_FIELD(attr, 6, 2), attribMAN->outerCache); - -#undef MAIR_ATTR_2_CACHE_ATTR - - printk(KERN_INFO - "DetermineMemAttr: innerCache %x outerCache %x share %x\n", - attribMAN->innerCache, - attribMAN->outerCache, - attribMAN->share); - - free_pages(hkva, 0); -} - -#else - -/** - * @brief Determine host cacheability/shareability attributes. - * - * Used to ensure monitor/guest shared mappings are consistent with - * those of host user/kernel. 
- * - * @param[out] attribL2D when setting up the LPV monitor a template L2D - * containing cacheability attributes {S, TEX,CB} used by - * host kernel for normal memory mappings. These may be - * used directly for monitor/guest mappings, since both - * worlds share a common {TRE, PRRR, NMRR}. - * @param[out] attribMAN when setting up TTBR0 in the LPV monitor and the page - * tables for the HW monitor this provides the attributes - * in the generic ARM_MemAttrNormal form, suitable for - * configuring TTBR0 + the monitor and guest's [H]MAIR0 - * and setting the shareability attributes of the LPAE - * descriptors. - */ -static void -DetermineMemAttrNonLPAE(ARM_L2D *attribL2D, ARM_MemAttrNormal *attribMAN) -{ - /* - * We use set_pte_ext to sample what {S,TEX,CB} bits Linux is using for - * normal kernel/user L2D mappings. These bits should be consistent both - * with each other and what we use in the monitor since we share various - * pages with both host processes, the kernel module and monitor, and the - * ARM ARM requires that synonyms have the same cacheability attributes, - * see end of A3.5.{4,7} ARM DDI 0406A. - */ - HKVA hkva = __get_free_pages(GFP_KERNEL, 0); - uint32 sctlr; - ARM_L2D *pt = (ARM_L2D *)hkva; - ARM_L2D *kernL2D = &pt[0], *userL2D = &pt[1]; - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 38) - /* - * Linux uses the magic 2048 offset in set_pte_ext. See include/asm/pgtable.h - * for PAGE_NONE and PAGE_KERNEL semantics. - */ - const uint32 set_pte_ext_offset = 2048; -#else - /* - * Linux 2.6.38 switched the order of Linux vs hardware page tables. - * See mainline d30e45eeabefadc6039d7f876a59e5f5f6cb11c6. 
- */ - const uint32 set_pte_ext_offset = 0; -#endif - - set_pte_ext((pte_t *)(kernL2D + set_pte_ext_offset/sizeof(ARM_L2D)), - pfn_pte(0, PAGE_KERNEL), - 0); - set_pte_ext((pte_t *)(userL2D + set_pte_ext_offset/sizeof(ARM_L2D)), - pfn_pte(0, PAGE_NONE), - 0); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 38) - /* - * Linux 2.6.38 switched the order of Linux vs hardware page tables. - * See mainline d30e45eeabefadc6039d7f876a59e5f5f6cb11c6. - */ - kernL2D += 2048/sizeof(ARM_L2D); - userL2D += 2048/sizeof(ARM_L2D); -#endif - - printk(KERN_INFO - "DetermineMemAttr: Kernel L2D TEX=%x CB=%x S=%x\n", - kernL2D->small.tex, - kernL2D->small.cb, - kernL2D->small.s); - - printk(KERN_INFO - "DetermineMemAttr: User L2D TEX=%x CB=%x S=%x\n", - userL2D->small.tex, - userL2D->small.cb, - userL2D->small.s); - - ASSERT((kernL2D->small.tex & 1) == (userL2D->small.tex & 1)); - ASSERT(kernL2D->small.cb == userL2D->small.cb); - ASSERT(kernL2D->small.s == userL2D->small.s); - - *attribL2D = *kernL2D; - - /* - * We now decode TEX remap and obtain the more generic form for use in - * the LPV monitor's TTBR0 initialization and the HW monitor. - */ - - ARM_MRC_CP15(CONTROL_REGISTER, sctlr); - - if (sctlr & ARM_CP15_CNTL_TRE) { - uint32 prrr, nmrr, indx, type, innerCache, outerCache, outerShare, - share; - - printk(KERN_INFO - "DetermineMemAttr: TEX remapping enabled\n"); - - ARM_MRC_CP15(PRIMARY_REGION_REMAP, prrr); - ARM_MRC_CP15(NORMAL_MEMORY_REMAP, nmrr); - - printk(KERN_INFO - "DetermineMemAttr: PRRR=%x NMRR=%x\n", - prrr, - nmrr); - - /* - * Decode PRRR/NMRR below. See B3.7 ARM DDI 0406B for register - * encodings, tables and magic numbers. - */ - - indx = (MVP_BIT(kernL2D->small.tex, 0) << 2) | kernL2D->small.cb; - - /* - * Only normal memory makes sense here. 
- */ - type = MVP_EXTRACT_FIELD(prrr, 2 * indx, 2); - ASSERT(type == 2); - - innerCache = MVP_EXTRACT_FIELD(nmrr, 2 * indx, 2); - outerCache = MVP_EXTRACT_FIELD(nmrr, 16 + 2 * indx, 2); - outerShare = !MVP_BIT(prrr, 24 + indx); - share = MVP_BIT(prrr, 18 + kernL2D->small.s); - - printk(KERN_INFO - "DetermineMemAttr: type %x innerCache %x outerCache %x" - " share %x outerShare %x\n", - type, - innerCache, - outerCache, - share, - outerShare); - - if (share) { - if (outerShare) { - attribMAN->share = ARM_SHARE_ATTR_OUTER; - } else { - attribMAN->share = ARM_SHARE_ATTR_INNER; - } - } else { - attribMAN->share = ARM_SHARE_ATTR_NONE; - } - - attribMAN->innerCache = innerCache; - attribMAN->outerCache = outerCache; - } else { - NOT_IMPLEMENTED_JIRA(1849); - } - - free_pages(hkva, 0); -} -#endif - -/** - * @brief The ioctl file operation. - * - * The ioctl command is the main communication method between the - * vmx and the mvpkm kernel module. - * - * @param filp which VM we're dealing with - * @param cmd select which cmd function needs to be performed - * @param arg argument for command - * @return error code, 0 on success - */ -long -MvpkmUnlockedIoctl(struct file *filp, - unsigned int cmd, - unsigned long arg) -{ - MvpkmVM *vm = filp->private_data; - int retval = 0; - - switch (cmd) { - - - case MVPKM_DISABLE_FAULT: { - if (!vm->stubPageMPN) { - uint32 *ptr; - - vm->stubPageMPN = - AllocZeroedFreePages(vm, 0, false, MEMREGION_MAINMEM, (HKVA*)&ptr); - if (!vm->stubPageMPN) { - break; - } - ptr[0] = MVPKM_STUBPAGE_BEG; - ptr[PAGE_SIZE/sizeof(uint32) - 1] = MVPKM_STUBPAGE_END; - } - break; - } - - /* - * Allocate some pinned pages from kernel. - * Returns -ENOMEM if no host pages available for allocation. 
- */ - case MVPKM_LOCK_MPN: { - struct MvpkmLockMPN buf; - - if (copy_from_user(&buf, (void *)arg, sizeof buf)) { - return -EFAULT; - } - - buf.mpn = AllocZeroedFreePages(vm, - buf.order, - false, - buf.forRegion, - NULL); - if (buf.mpn == 0) { - return -ENOMEM; - } - - if (copy_to_user((void *)arg, &buf, sizeof buf)) { - return -EFAULT; - } - break; - } - - case MVPKM_UNLOCK_MPN: { - struct MvpkmLockMPN buf; - - if (copy_from_user(&buf, (void *)arg, sizeof buf)) { - return -EFAULT; - } - - if (!LockedListDel(vm, buf.mpn)) { - return -EINVAL; - } - break; - } - - case MVPKM_MAP_WSPHKVA: { - MvpkmMapHKVA mvpkmMapInfo; - HkvaMapInfo mapInfo[WSP_PAGE_COUNT]; - - if (copy_from_user(&mvpkmMapInfo, (void *)arg, sizeof mvpkmMapInfo)) { - return -EFAULT; - } - - if (copy_from_user(mapInfo, (void *)mvpkmMapInfo.mapInfo, sizeof mapInfo)) { - return -EFAULT; - } - - mvpkmMapInfo.hkva = MapWSPHKVA(vm, mapInfo); - BUG_ON(mvpkmMapInfo.hkva == 0); - - if (mvpkmMapInfo.forRegion == MEMREGION_WSP) { - vm->wsp = (WorldSwitchPage *) mvpkmMapInfo.hkva; - } - - if (copy_to_user((void *)arg, &mvpkmMapInfo, sizeof mvpkmMapInfo)) { - return -EFAULT; - } - break; - } - - case MVPKM_RUN_MONITOR: { - if (!vm->isMonitorInited) { - vm->isMonitorInited = ((retval = SetupMonitor(vm)) == 0); - } - - if (vm->isMonitorInited) { - retval = RunMonitor(vm); - } - - break; - } - - case MVPKM_ABORT_MONITOR: { - if (!vm->isMonitorInited) { - return -EINVAL; - } - - ASSERT(vm->wsp != NULL); - - Mvpkm_WakeGuest(vm, ACTION_ABORT); - break; - } - - case MVPKM_CPU_INFO: { - struct MvpkmCpuInfo buf; - uint32 mpidr; - -#ifdef CONFIG_ARM_LPAE - DetermineMemAttrLPAE(&buf.attribMAN); - /** - * We need to add support to the LPV monitor for LPAE page tables if we - * want to use it on a LPAE host, due to the costs involved in - * transitioning between LPAE and non-LPAE page tables without Hyp - * assistance. 
- * - * @knownjira{MVP-2184} - */ - buf.attribL2D.u = 0; -#else - DetermineMemAttrNonLPAE(&buf.attribL2D, &buf.attribMAN); -#endif - /* - * Are MP extensions implemented? See B4-1618 ARM DDI 0406C-2c for - * magic. - */ - ARM_MRC_CP15(MPIDR, mpidr); - - buf.mpExt = mpidr & ARM_CP15_MPIDR_MP; - - if (copy_to_user((int *)arg, &buf, sizeof(struct MvpkmCpuInfo))) { - retval = -EFAULT; - } - break; - } - - default: { - retval = -EINVAL; - break; - } - } - - PRINTK(KERN_INFO "returning from IOCTL(%d) retval = %d %s\n", - cmd, retval, signal_pending(current)?"(pending signal)":"" ); - - return retval; -} - - - -/********************************************************************* - * - * Locked page management - * - *********************************************************************/ - -/* - * Pages locked by the kernel module are remembered so an unlockAll - * operation can be performed when the vmm is closed. The locked page - * identifiers are stored in a red-black tree to support O(log n) - * removal and search (required for /dev/mem-like mmap). - */ - -/** - * @brief Descriptor of a locked page range - */ -typedef struct { - struct { - __u32 mpn : 20; ///< MPN. - __u32 order : 6; ///< Size/alignment exponent for page. - __u32 forRegion : 6; ///< Annotation to identify guest page allocation - } page; - struct rb_node rb; -} LockedPage; - -static void FreeLockedPages(LockedPage *lp); - -/** - * @brief Search for an mpn inside a RB tree of LockedPages. The mpn - * will match a LockedPage as long as it is covered by the - * entry, i.e. in a non-zero order entry it doesn't have to be - * the base MPN. - * - * This must be called with the relevant vm->lockedSem held. - * - * @param root RB tree root. - * @param mpn MPN to search for. - * - * @return reference to LockedPage entry if found, otherwise NULL. 
- */ -static LockedPage * -LockedListSearch(struct rb_root *root, __u32 mpn) -{ - struct rb_node *n = root->rb_node; - - while (n) { - LockedPage *lp = rb_entry(n, LockedPage, rb); - - if (lp->page.mpn == (mpn & (~0UL << lp->page.order))) { - return lp; - } - - if (mpn < lp->page.mpn) { - n = n->rb_left; - } else { - n = n->rb_right; - } - } - - return NULL; -} - -/** - * @brief Delete an mpn from the list of locked pages. - * - * @param vm Mvpkm module control structure pointer - * @param mpn MPN to be unlocked and freed for reuse - * @return true if list contained MPN and it was deleted from list - */ - -static _Bool -LockedListDel(MvpkmVM *vm, __u32 mpn) -{ - LockedPage *lp; - - down_write(&vm->lockedSem); - - lp = LockedListSearch(&vm->lockedRoot, mpn); - - /* - * The MPN should be in the locked pages RB tree and it should be the - * base of an entry, i.e. we can't fragment existing allocations for - * a VM. - */ - if (lp == NULL || lp->page.mpn != mpn) { - up_write(&vm->lockedSem); - return false; - } - - FreeLockedPages(lp); - - if (lp->page.forRegion == MEMREGION_MAINMEM) { - ATOMIC_SUBV(vm->usedPages, 1U << lp->page.order); - } - - rb_erase(&lp->rb, &vm->lockedRoot); - kfree(lp); - - up_write(&vm->lockedSem); - - return true; -} - -/** - * @brief Scan the list of locked pages to see if an MPN matches. - * - * @param vm Mvpkm module control structure pointer - * @param mpn MPN to check - * - * @return true iff list contains MPN. - */ -static _Bool -LockedListLookup(MvpkmVM *vm, __u32 mpn) -{ - LockedPage *lp; - - down_read(&vm->lockedSem); - - lp = LockedListSearch(&vm->lockedRoot, mpn); - - up_read(&vm->lockedSem); - - return lp != NULL; -} - -/** - * @brief Add a new mpn to the locked pages RB tree. - * - * @param vm control structure pointer - * - * @param mpn mpn of page that was locked with get_user_pages or some sort of - * get that is undone by put_page. 
- * The mpn is assumed to be non-zero - * @param order size/alignment exponent for page - * @param forRegion Annotation for Page pool to identify guest page allocations - * - * @return false: couldn't allocate internal memory to record mpn in<br> - * true: successful. - */ -static _Bool -LockedListAdd(MvpkmVM *vm, - __u32 mpn, - __u32 order, - PhysMem_RegionType forRegion) -{ - struct rb_node *parent, **p; - LockedPage *tp, *lp = kmalloc(sizeof *lp, GFP_KERNEL); - - if (!lp) { - return false; - } - - lp->page.mpn = mpn; - lp->page.order = order; - lp->page.forRegion = forRegion; - - down_write(&vm->lockedSem); - - if (forRegion == MEMREGION_MAINMEM) { - ATOMIC_ADDV(vm->usedPages, 1U << order); - } - - /* - * Insert as a red leaf in the tree (see include/linux/rbtree.h). - */ - p = &vm->lockedRoot.rb_node; - parent = NULL; - - while (*p) { - parent = *p; - tp = rb_entry(parent, LockedPage, rb); - - /* - * MPN should not already exist in the tree. - */ - ASSERT(tp->page.mpn != (mpn & (~0UL << tp->page.order))); - - if (mpn < tp->page.mpn) { - p = &(*p)->rb_left; - } else { - p = &(*p)->rb_right; - } - } - - rb_link_node(&lp->rb, parent, p); - - /* - * Restructure tree if necessary (see include/linux/rbtree.h). - */ - rb_insert_color(&lp->rb, &vm->lockedRoot); - - up_write(&vm->lockedSem); - - return true; -} - -/** - * @brief Traverse RB locked tree, freeing every entry. - * - * This must be called with the relevant vm->lockedSem held. - * - * @param node reference to RB node at root of subtree. - */ -static void -LockedListNuke(struct rb_node *node) -{ - while (node) { - if (node->rb_left) { - node = node->rb_left; - } else if (node->rb_right) { - node = node->rb_right; - } else { - /* - * We found a leaf, free it and go back to parent. 
- */ - LockedPage *lp = rb_entry(node, LockedPage, rb); - - if ((node = rb_parent(node))) { - if (node->rb_left) { - node->rb_left = NULL; - } else { - node->rb_right = NULL; - } - } - - FreeLockedPages(lp); - kfree(lp); - } - } -} - -/** - * @brief Unlock all pages at vm close time. - * - * @param vm control structure pointer - */ -static void -LockedListUnlockAll(MvpkmVM *vm) -{ - - down_write(&vm->lockedSem); - - LockedListNuke(vm->lockedRoot.rb_node); - - ATOMIC_SETV(vm->usedPages, 0); - - up_write(&vm->lockedSem); -} - - -/** - * @brief Allocate zeroed free pages - * - * @param[in] vm which VM the pages are for so they will be freed when the vm - * closes - * @param[in] order log2(number of contiguous pages to allocate) - * @param[in] highmem is it OK to allocate this page in ZONE_HIGHMEM? This - * option should only be specified for pages the host kernel - * will not need to address directly. - * @param[out] hkvaRet where to return host kernel virtual address of the - * allocated pages, if non-NULL, and ONLY IF !highmem. - * @param forRegion Annotation for Page pool to identify guest page allocations - * @return 0: no host memory available<br> - * else: starting MPN<br> - * *hkvaRet = filled in - */ -static MPN -AllocZeroedFreePages(MvpkmVM *vm, - uint32 order, - _Bool highmem, - PhysMem_RegionType forRegion, - HKVA *hkvaRet) -{ - MPN mpn; - struct page *page; - - if (order > PAGE_ALLOC_COSTLY_ORDER) { - printk(KERN_WARNING "Order %d allocation for region %d exceeds the safe " - "maximum order %d\n", - order, - forRegion, - PAGE_ALLOC_COSTLY_ORDER); - } - - /* - * Get some pages for the requested range. They will be physically - * contiguous and have the requested alignment. They will also - * have a kernel virtual mapping if !highmem. - * - * We allocate out of ZONE_MOVABLE even though we can't just pick up our - * bags. 
We do this to support platforms that explicitly configure - * ZONE_MOVABLE, such as the Qualcomm MSM8960, to enable deep power down of - * memory banks. When the kernel attempts to take a memory bank offline, it - * will try and place the pages on the isolate LRU - only pages already on an - * LRU, such as anon/file, can get there, so it will not be able to - * migrate/move our pages (and hence the bank will not be offlined). The - * other alternative is to live withing ZONE_NORMAL, and only have available - * a small fraction of system memory. Long term we plan on hooking the - * offlining callback in mvpkm and perform our own migration with the - * cooperation of the monitor, but we don't have dev board to support this - * today. - * - * @knownjira{MVP-3477} - */ - page = alloc_pages(GFP_USER | __GFP_COMP | __GFP_ZERO | - (highmem ? __GFP_HIGHMEM | __GFP_MOVABLE : 0), - order); - - if (page == NULL) { - return 0; - } - - /* - * Return the corresponding page number. - */ - mpn = page_to_pfn(page); - ASSERT(mpn != 0); - - /* - * Remember to unlock the pages when the FD is closed. - */ - if (!LockedListAdd(vm, mpn, order, forRegion)) { - __free_pages(page, order); - return 0; - } - - if (hkvaRet) { - *hkvaRet = highmem ? 0 : __phys_to_virt(page_to_phys(page)); - } - - return mpn; -} - -/** - * @brief Map already-pinned WSP memory in host kernel virtual address(HKVA) - * space. Assumes 2 world switch pages on an 8k boundary. 
- * - * @param[in] vm which VM the HKVA Area is to be mapped for - * @param[in] mapInfo array of MPNs and execute permission flags to be used in - inserting a new contiguous map in HKVA space - * @return 0: HKVA space could not be mapped - else: HKVA where mapping was inserted - */ -static HKVA -MapWSPHKVA(MvpkmVM *vm, HkvaMapInfo *mapInfo) -{ - unsigned int i; - struct page **pages = NULL; - struct page **pagesPtr; - pgprot_t prot; - int retval; - int allocateCount = WSP_PAGE_COUNT + 1; // Reserve one page for alignment - int pageIndex = 0; - HKVA dummyPage = (HKVA)NULL; - HKVA start; - HKVA startSegment; - HKVA endSegment; - - /* - * Add one page for alignment purposes in case __get_vm_area returns an - * unaligned address. - */ - ASSERT(allocateCount == 3); - ASSERT_ON_COMPILE(WSP_PAGE_COUNT == 2); - - /* - * NOT_IMPLEMENTED if MapHKVA is called more than once. - */ - BUG_ON(vm->wspHkvaArea); - - /* - * Reserve virtual address space. - */ - vm->wspHkvaArea = __get_vm_area((allocateCount * PAGE_SIZE), VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!vm->wspHkvaArea) { - return 0; - } - - pages = kmalloc(allocateCount * sizeof(struct page *), GFP_TEMPORARY); - if (!pages) { - goto err; - } - pagesPtr = pages; - - /* - * Use a dummy page to boundary align the section, if needed. - */ - dummyPage = __get_free_pages(GFP_KERNEL, 0); - if (!dummyPage) { - goto err; - } - vm->wspHKVADummyPage = dummyPage; - - /* - * Back every entry with the dummy page. - */ - for (i = 0; i < allocateCount; i++) { - pages[i] = virt_to_page(dummyPage); - } - - /* - * World switch pages must not span a 1MB boundary in order to maintain only - * a single L2 page table. - */ - start = (HKVA)vm->wspHkvaArea->addr; - startSegment = start & ~(ARM_L1D_SECTION_SIZE - 1); - endSegment = (start + PAGE_SIZE) & ~(ARM_L1D_SECTION_SIZE - 1); - /* - * Insert dummy page at pageIndex, if needed. 
- */ - pageIndex = (startSegment != endSegment); - - /* - * Back the rest with the actual world switch pages - */ - for (i = pageIndex; i < pageIndex + WSP_PAGE_COUNT; i++) { - pages[i] = pfn_to_page(mapInfo[i - pageIndex].mpn); - } - - /* - * Given the lack of functionality in the kernel for being able to mark - * mappings for a given vm area with different sets of protection bits, - * we simply mark the entire vm area as PAGE_KERNEL_EXEC for now - * (i.e., union of all the protection bits). Given that the kernel - * itself does something similar while loading modules, this should be a - * reasonable workaround for now. In the future, we should set the - * protection bits to strictly adhere to what has been requested in the - * mapInfo parameter. - */ - prot = PAGE_KERNEL_EXEC; - - retval = map_vm_area(vm->wspHkvaArea, prot, &pagesPtr); - if (retval < 0) { - goto err; - } - - kfree(pages); - - return (HKVA)(vm->wspHkvaArea->addr) + pageIndex * PAGE_SIZE; - -err: - if (dummyPage) { - free_pages(dummyPage, 0); - vm->wspHKVADummyPage = (HKVA)NULL; - } - - if (pages) { - kfree(pages); - } - - free_vm_area(vm->wspHkvaArea); - vm->wspHkvaArea = (HKVA)NULL; - - return 0; -} - -static void -UnmapWSPHKVA(MvpkmVM *vm) -{ - if (vm->wspHkvaArea) { - free_vm_area(vm->wspHkvaArea); - } - - if (vm->wspHKVADummyPage) { - free_pages(vm->wspHKVADummyPage, 0); - vm->wspHKVADummyPage = (HKVA)NULL; - } -} - -/** - * @brief Clean and release locked pages - * - * @param lp Reference to the locked pages - */ -static void -FreeLockedPages(LockedPage *lp) -{ - struct page *page; - int count; - - page = pfn_to_page(lp->page.mpn); - count = page_count(page); - - if (count == 0) { - printk(KERN_ERR "%s: found locked page with 0 reference (mpn %05x)\n", - __func__, lp->page.mpn); - return; - } - - if (count == 1) { - int i; - - /* - * There is no other user for this page, clean it. - * - * We don't bother checking if the page was highmem or not, clear_highmem - * works for both. 
- * We clear the content of the page, and rely on the fact that the previous - * worldswitch has cleaned the potential VIVT I-CACHE. - */ - for (i = 0; i < (1 << lp->page.order); i++) { - clear_highpage(page + i); - } - } else if (lp->page.forRegion != MEMREGION_MAINMEM) { - printk(KERN_WARNING "%s: mpn 0x%05x for region %d is still in use\n", - __func__, lp->page.mpn, lp->page.forRegion); - } - - __free_pages(page, lp->page.order); -} - -/********************************************************************* - * - * Communicate with monitor - * - *********************************************************************/ - -/** - * @brief Register a new monitor page. - * - * @param vm which virtual machine we're running - * @return 0: successful<br> - * else: -errno - */ -static int -SetupMonitor(MvpkmVM *vm) -{ - int retval; - WorldSwitchPage *wsp = vm->wsp; - - if (!wsp || - wsp->wspHKVA != (HKVA)wsp) { - return -EINVAL; - } - - if ((retval = Mksck_WspInitialize(vm))) { - return retval; - } - - vm->kobj.kset = mvpkmKSet; - retval = kobject_init_and_add(&vm->kobj, &mvpkmKType, NULL, "%d", wsp->guestId); - if (retval) { - goto error; - } - - /* - * Get a reference to this module such that it cannot be unloaded until - * our kobject's release function completes. - */ - - __module_get(THIS_MODULE); - vm->haveKObj = true; - - /* - * Caution: From here on, if we fail, we must not call kobject_put() - * on vm->kobj since that may / will deallocate 'vm'. Unregistering VM - * ksets on failures, is fine and should be done for proper ref counting. - */ - - vm->devicesKSet = kset_create_and_add("devices", NULL, &vm->kobj); - if (!vm->devicesKSet) { - retval = -ENOMEM; - goto error; - } - - vm->miscKSet = kset_create_and_add("misc", NULL, &vm->kobj); - if (!vm->miscKSet) { - kset_unregister(vm->devicesKSet); - vm->devicesKSet = NULL; - retval = -ENOMEM; - goto error; - } - - down_write(&vm->wspSem); - - /* - * The VE monitor needs to issue a SMC to bootstrap Hyp mode. 
- */ - if (wsp->monType == MONITOR_TYPE_VE) { - /* - * Here we assemble the monitor's HMAIR0 based on wsp->memAttr. We map - * from the inner/outer normal page cacheability attributes obtained - * from DetermineCacheabilityAttribs to the format required in 4.2.8 - * ARM PRD03-GENC-008469 13.0 (see this document for the magic numbers). - * - * Where a choice is available, we opt for read and/or write allocation. - */ - static const uint32 normalCacheAttr2MAIR[4] = { 0x4, 0xf, 0xa, 0xe }; - - uint32 hmair0 = - ((normalCacheAttr2MAIR[wsp->memAttr.innerCache] | - (normalCacheAttr2MAIR[wsp->memAttr.outerCache] << 4)) - << 8 * MVA_MEMORY) | - (0x4 << 8 * MVA_DEVICE); - - /* - * See B4.1.74 ARM DDI 0406C-2c for the HTCR magic. - */ - uint32 htcr = - 0x80000000 | - (wsp->memAttr.innerCache << 8) | - (wsp->memAttr.outerCache << 10) | - (wsp->memAttr.share << 12); - - /** - * @knownjira{MVP-377} - * Set HSCTLR to enable MMU and caches. We should really run the - * monitor WXN, in non-MVP_DEVEL builds. See - * 13.18 ARM PRD03-GENC-008353 11.0 for the magic. - */ - static const uint32 hsctlr = 0x30c5187d; - - register uint32 r0 asm("r0") = wsp->monVA.excVec; - register uint32 r1 asm("r1") = wsp->regSave.ve.mHTTBR; - register uint32 r2 asm("r2") = htcr; - register uint32 r3 asm("r3") = hmair0; - register uint32 r4 asm("r4") = hsctlr; - - asm volatile ( - ".arch_extension sec\n" - "smc 0" - : - : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4) - : "memory" - ); - } - - /* - * Initialize guest wait-for-interrupt waitqueue. - */ - init_waitqueue_head(&vm->wfiWaitQ); - - MonitorTimer_Setup(vm); - -#ifdef CONFIG_HAS_WAKELOCK - wake_lock_init(&vm->wakeLock, WAKE_LOCK_SUSPEND, "mvpkm"); -#endif - - wsp->mvpkmVersion = MVP_VERSION_CODE; - up_write(&vm->wspSem); - /* - * Ensure coherence of monitor loading and page tables. 
- */ - flush_cache_all(); - return 0; - -error: - Mksck_WspRelease(wsp); - vm->wsp = NULL; - return retval; -} - -/** - * @brief dummy function to drop the info parameter - * @param info ignored - */ -static -void FlushAllCpuCaches(void *info) -{ - flush_cache_all(); -} - -/** - * @brief return to where monitor called worldswitch - * - * @param vm which virtual machine we're running - * @return 0: successful, just call back when ready<br> - * 1: successful, process code in WSP_Params(wsp)->callno<br> - * else: -errno - */ -static int -RunMonitor(MvpkmVM *vm) -{ - int ii; - unsigned long flags; - WorldSwitchPage *wsp = vm->wsp; - int retval = 0; - - ASSERT(wsp); - -#ifdef CONFIG_HAS_WAKELOCK - wake_lock(&vm->wakeLock); -#endif - - /* - * Set VCPUThread affinity - */ - if (cpumask_intersects(to_cpumask(vcpuAffinity), cpu_active_mask)) { - set_cpus_allowed_ptr(current, to_cpumask(vcpuAffinity)); - } - - /* - * Record the the current task structure, so an ABORT will know, - * who to wake. - */ - down_write(&vm->monThreadTaskSem); - vm->monThreadTask = get_current(); - up_write(&vm->monThreadTaskSem); - - /* - * Keep going as long as the monitor is in critical section or - * there are no pending signals such as SIGINT or SIGKILL. Block - * interrupts before checking so any IPI sent will remain pending - * if our check just misses detecting the signal. - */ - local_irq_save(flags); - while (wsp->critSecCount > 0 || - (!signal_pending(current) && - !(ATOMIC_GETO(wsp->hostActions) & ACTION_ABORT))) { - /* - * ARMv7 Performance counters are per CPU core and might be disabled over - * CPU core sleep if there is nothing else in the system to re-enable - * them, so now that we have been allocated a CPU core to run the guest, - * enable them and in particular the TSC (CCNT) which is used for monitor - * timing between world switches. 
- */ - { - uint32 pmnc; - uint32 pmcnt; - - /* make sure that the Performance Counters are enabled */ - ARM_MRC_CP15(PERF_MON_CONTROL_REGISTER, pmnc); - if ((pmnc & (ARM_PMNC_E | ARM_PMNC_D)) != (ARM_PMNC_E)) { - pmnc |= ARM_PMNC_E; // Enable TSC - pmnc &= ~ARM_PMNC_D; // Disable cycle count divider - ARM_MCR_CP15(PERF_MON_CONTROL_REGISTER, pmnc); - } - - /* make sure that the CCNT is enabled */ - ARM_MRC_CP15(PERF_MON_COUNT_SET, pmcnt); - if ((pmcnt & ARM_PMCNT_C) != ARM_PMCNT_C) { - pmcnt |= ARM_PMCNT_C; - ARM_MCR_CP15(PERF_MON_COUNT_SET, pmcnt); - } - } - - /* - * Update TSC to RATE64 ratio - */ - { - struct TscToRate64Cb *ttr = &__get_cpu_var(tscToRate64); - wsp->tscToRate64Mult = ttr->mult; - wsp->tscToRate64Shift = ttr->shift; - } - - /* - * Save the time of day for the monitor's timer facility. The timing - * facility in the vmm needs to compute current time in the host linux's - * time representation. It uses the formula: - * now = wsp->switchedAt64 + (uint32)(TSC_READ() - wsp->lowerTSC) - * - * Read the timestamp counter *immediately after* ktime_get() as that - * will give the most consistent offset between reading the hardware - * clock register in ktime_get() and reading the hardware timestamp - * counter with TSC_READ(). - */ - ASSERT_ON_COMPILE(MVP_TIMER_RATE64 == NSEC_PER_SEC); - { - ktime_t now = ktime_get(); - TSC_READ(wsp->switchedAtTSC); - wsp->switchedAt64 = ktime_to_ns(now); - } - - /* - * Save host FPU contents and load monitor contents. - */ - SWITCH_VFP_TO_MONITOR; - - /* - * Call into the monitor to run guest instructions until it wants us to - * do something for it. Note that any hardware interrupt request will - * cause it to volunteer. 
- */ - switch (wsp->monType) { - case MONITOR_TYPE_LPV: { - uint32 hostVBAR; - - ARM_MRC_CP15(VECTOR_BASE, hostVBAR); - (*wsp->switchToMonitor)(&wsp->regSave); - ARM_MCR_CP15(VECTOR_BASE, hostVBAR); - break; - } - case MONITOR_TYPE_VE: { - register uint32 r1 asm("r1") = wsp->regSave.ve.mHTTBR; - - asm volatile ( - ".word " MVP_STRINGIFY(ARM_INSTR_HVC_A1_ENC(0)) - : "=r" (r1) : "r" (r1) : "r0", "r2", "memory" - ); - break; - } - default: FATAL(); - } - - /* - * Save monitor FPU contents and load host contents. - */ - SWITCH_VFP_TO_HOST; - - /* - * Re-enable local interrupts now that we are back in the host world - */ - local_irq_restore(flags); - - - /* - * Maybe the monitor wrote some messages to monitor->host sockets. - * This will wake the corresponding host threads to receive them. - */ - /** - * @todo This lousy loop is in the critical path. It should be changed - * to some faster algorithm to wake blocked host sockets. - */ - for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) { - if (wsp->isPageMapped[ii]) { - Mksck_WakeBlockedSockets(MksckPage_GetFromIdx(ii)); - } - } - - switch (WSP_Params(wsp)->callno) { - case WSCALL_ACQUIRE_PAGE: { - uint32 i; - - for (i = 0; i < WSP_Params(wsp)->pages.pages; ++i) { - MPN mpn = AllocZeroedFreePages(vm, - WSP_Params(wsp)->pages.order, - true, - WSP_Params(wsp)->pages.forRegion, - NULL); - if (mpn == 0) { - printk(KERN_WARNING "WSCALL_ACQUIRE_PAGE: no order %u pages available\n", - WSP_Params(wsp)->pages.order); - WSP_Params(wsp)->pages.pages = i; - break; - } - - WSP_Params(wsp)->pages.mpns[i] = mpn; - } - - break; - } - case WSCALL_RELEASE_PAGE: { - uint32 i; - - for (i = 0; i < WSP_Params(wsp)->pages.pages; ++i) { - if (!LockedListDel(vm, WSP_Params(wsp)->pages.mpns[i])) { - WSP_Params(wsp)->pages.pages = i; - break; - } - } - - break; - } - case WSCALL_MUTEXLOCK: { - retval = Mutex_Lock((void *)WSP_Params(wsp)->mutex.mtxHKVA, - WSP_Params(wsp)->mutex.mode); - - if (retval < 0) { - WSP_Params(wsp)->mutex.ok = false; - goto 
monitorExit; - } - - /* - * The locking succeeded. From this point on the monitor - * is in critical section. Even if an interrupt comes - * right here, it must return to the monitor to unlock the - * mutex. - */ - wsp->critSecCount++; - WSP_Params(wsp)->mutex.ok = true; - break; - } - case WSCALL_MUTEXUNLOCK: { - Mutex_Unlock((void *)WSP_Params(wsp)->mutex.mtxHKVA, - WSP_Params(wsp)->mutex.mode); - break; - } - case WSCALL_MUTEXUNLSLEEP: { - /* - * The vcpu has just come back from the monitor. During - * the transition interrupts were disabled. Above, - * however, interrupts were enabled again and it is - * possible that a context switch happened into a thread - * (serve_vmx) that instructed the vcpu thread to - * abort. After returning to this thread the vcpu may - * enter a sleep below never to return from it. To avoid - * this deadlock we need to test the abort flag in - * Mutex_UnlSleepTest. - */ - retval = - Mutex_UnlSleepTest((void *)WSP_Params(wsp)->mutex.mtxHKVA, - WSP_Params(wsp)->mutex.mode, - WSP_Params(wsp)->mutex.cvi, - &wsp->hostActions, - ACTION_ABORT); - if (retval < 0) { - goto monitorExit; - } - break; - } - case WSCALL_MUTEXUNLWAKE: { - Mutex_UnlWake((void *)WSP_Params(wsp)->mutex.mtxHKVA, - WSP_Params(wsp)->mutex.mode, - WSP_Params(wsp)->mutex.cvi, - WSP_Params(wsp)->mutex.all); - break; - } - - /* - * The monitor wants us to block (allowing other host threads to run) - * until an async message is waiting for the monitor to process. - * - * If MvpkmWaitForInt() returns an error, it should only be if there - * is another signal pending (such as SIGINT). So we pretend it - * completed normally, as the monitor is ready to be called again (it - * will see no messages to process and wait again), and return to user - * mode so the signals can be processed. 
- */ - case WSCALL_WAIT: { -#ifdef CONFIG_HAS_WAKELOCK - if (WSP_Params(wsp)->wait.suspendMode) { - /* guest has ok'ed suspend mode, so release SUSPEND wakelock */ - wake_unlock(&vm->wakeLock); - retval = MvpkmWaitForInt(vm, true); - wake_lock(&vm->wakeLock); - WSP_Params(wsp)->wait.suspendMode = 0; - } else { - /* guest has asked for WFI not suspend so keep holding SUSPEND - * wakelock */ - retval = MvpkmWaitForInt(vm, false); - } -#else - retval = MvpkmWaitForInt(vm, WSP_Params(wsp)->wait.suspendMode); -#endif - if (retval < 0) { - goto monitorExit; - } - break; - } - - /* - * The only reason the monitor returned was because there was a - * pending hardware interrupt. The host serviced and cleared that - * interrupt when we enabled interrupts above. Now we call the - * scheduler in case that interrupt woke another thread, we want to - * allow that thread to run before returning to do more guest code. - */ - case WSCALL_IRQ: { - break; - } - - case WSCALL_GET_PAGE_FROM_VMID: { - MksckPage *mksckPage; - mksckPage = MksckPage_GetFromVmIdIncRefc(WSP_Params(wsp)->pageMgmnt.vmId); - - if (mksckPage) { - int ii; - - WSP_Params(wsp)->pageMgmnt.found = true; - for (ii = 0; ii < MKSCKPAGE_TOTAL; ii++) { - WSP_Params(wsp)->pageMgmnt.mpn[ii] = - vmalloc_to_pfn( (void*)(((HKVA)mksckPage) + ii*PAGE_SIZE) ); - } - - ASSERT(!wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)]); - wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)] = true; - } else { - WSP_Params(wsp)->pageMgmnt.found = false; - } - break; - } - - case WSCALL_REMOVE_PAGE_FROM_VMID: { - MksckPage *mksckPage; - mksckPage = MksckPage_GetFromVmId(WSP_Params(wsp)->pageMgmnt.vmId); - ASSERT(wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)]); - wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)] = false; - MksckPage_DecRefc(mksckPage); - break; - } - - /* - * Read current wallclock time. 
- */ - case WSCALL_READTOD: { - struct timeval nowTV; - do_gettimeofday(&nowTV); - WSP_Params(wsp)->tod.now = nowTV.tv_sec; - WSP_Params(wsp)->tod.nowusec = nowTV.tv_usec; - break; - } - - case WSCALL_LOG: { - int len = strlen(WSP_Params(wsp)->log.messg); - printk(KERN_INFO - "VMM: %s%s", - WSP_Params(wsp)->log.messg, - (WSP_Params(wsp)->log.messg[len-1] == '\n') ? "" : "\n"); - break; - } - - case WSCALL_ABORT: { - retval = WSP_Params(wsp)->abort.status; - goto monitorExit; - } - - case WSCALL_QP_GUEST_ATTACH: { - int32 rc; - QPInitArgs args; - uint32 base; - uint32 nrPages; - - args.id = WSP_Params(wsp)->qp.id; - args.capacity = WSP_Params(wsp)->qp.capacity; - args.type = WSP_Params(wsp)->qp.type; - base = WSP_Params(wsp)->qp.base; - nrPages = WSP_Params(wsp)->qp.nrPages; - - rc = QP_GuestAttachRequest(vm, &args, base, nrPages); - - WSP_Params(wsp)->qp.rc = rc; - WSP_Params(wsp)->qp.id = args.id; - break; - } - - case WSCALL_QP_NOTIFY: { - QPInitArgs args; - - args.id = WSP_Params(wsp)->qp.id; - args.capacity = WSP_Params(wsp)->qp.capacity; - args.type = WSP_Params(wsp)->qp.type; - - WSP_Params(wsp)->qp.rc = QP_NotifyListener(&args); - break; - } - - case WSCALL_MONITOR_TIMER: { - MonitorTimer_Request(&vm->monTimer, WSP_Params(wsp)->timer.when64); - break; - } - - case WSCALL_COMM_SIGNAL: { - Mvpkm_CommEvSignal(&WSP_Params(wsp)->commEvent.transpID, - WSP_Params(wsp)->commEvent.event); - break; - } - - case WSCALL_FLUSH_ALL_DCACHES: { - /* - * Broadcast Flush DCache request to all cores. - * Block while waiting for all of them to get done. - */ - on_each_cpu(FlushAllCpuCaches, NULL, 1); - break; - } - default: { - retval = -EPIPE; - goto monitorExit; - } - } - - /* - * The params.callno callback was handled in kernel mode and completed - * successfully. Repeat for another call without returning to user mode, - * unless there are signals pending. - * - * But first, call the Linux scheduler to switch threads if there is - * some other thread Linux wants to run now. 
- */ - if (need_resched()) { - schedule(); - } - - /* - * Check if cpus allowed mask has to be updated. - * Updating it must be done outside of an atomic context. - */ - if (cpumask_intersects(to_cpumask(vcpuAffinity), cpu_active_mask) && - !cpumask_equal(to_cpumask(vcpuAffinity), ¤t->cpus_allowed)) { - set_cpus_allowed_ptr(current, to_cpumask(vcpuAffinity)); - } - - local_irq_save(flags); - } - - /* - * There are signals pending so don't try to do any more monitor/guest - * stuff. But since we were at the point of just about to run the monitor, - * return success status as user mode can simply call us back to run the - * monitor again. - */ - local_irq_restore(flags); - -monitorExit: - ASSERT(wsp->critSecCount == 0); - - if (ATOMIC_GETO(wsp->hostActions) & ACTION_ABORT) { - PRINTK(KERN_INFO "Monitor has ABORT flag set.\n"); - retval = ExitStatusHostRequest; - } - -#ifdef CONFIG_HAS_WAKELOCK - wake_unlock(&vm->wakeLock); -#endif - - down_write(&vm->monThreadTaskSem); - vm->monThreadTask = NULL; - up_write(&vm->monThreadTaskSem); - - return retval; -} - -/** - * @brief Guest is waiting for interrupts, sleep if necessary - * - * @param vm which virtual machine we're running - * @param suspend is the guest entering suspend or just WFI? When set, the - * wait has no timeout; otherwise it is bounded by 10*HZ. - * @return 0: woken up, hostActions should have pending events; also - * returned when the 10*HZ WFI timeout expires, after a message - * has been logged - * -ERESTARTSYS: broke out because other signals are pending - * - * This function is called in the VCPU context after the world switch to wait - * for an incoming message. If any message gets queued to this VCPU, the - * sender will wake us up. Both waits are interruptible and sleep on - * vm->wfiWaitQ until wsp->hostActions is non-zero. - */ -int -MvpkmWaitForInt(MvpkmVM *vm, _Bool suspend) -{ - WorldSwitchPage *wsp = vm->wsp; - wait_queue_head_t *q = &vm->wfiWaitQ; - - if (suspend) { - return wait_event_interruptible(*q, ATOMIC_GETO(wsp->hostActions) != 0); - } else { - int ret; - ret = wait_event_interruptible_timeout(*q, ATOMIC_GETO(wsp->hostActions) != 0, 10*HZ); - if (ret == 0) { - printk("MvpkmWaitForInt: guest stuck for 10s in WFI! 
(hostActions %08x)\n", - ATOMIC_GETO(wsp->hostActions)); - } - return ret > 0 ? 0 : ret; - } -} - - -/** - * @brief Force the guest to evaluate its hostActions flag field - * - * @param vm which guest needs waking - * @param why why should the guest be woken up? These bits are ORed into - * hostActions and must be non-zero. - * - * This function updates the hostActions flag field and wakes up the guest as - * required so that it can evaluate it. The guest could be executing guest - * code in an SMP system, in which case send an IPI; or it could be sleeping, - * in which case wake it up. - */ -void -Mvpkm_WakeGuest(MvpkmVM *vm, int why) -{ - ASSERT(why != 0); - - /* set the host action */ - if (ATOMIC_ORO(vm->wsp->hostActions, why) & why) { - /* guest has already been woken up so no need to do it again */ - return; - } - - /* - * VCPU is certainly in 'wait for interrupt' wait. Wake it up ! - */ -#ifdef CONFIG_HAS_WAKELOCK - /* - * To prevent the system from going into suspend mode before the monitor - * has had a chance to be scheduled, we hold the VM wakelock from now on. - * As wakelocks are not managed as reference counts, it is not an issue - * to take a wake_lock twice in a row. - */ - wake_lock(&vm->wakeLock); -#endif - - /* - * On a UP system, we ensure the monitor thread isn't blocked. - * - * On an MP system the other CPU might be running the guest. This - * is a no-op on UP. - * - * When the guest is running, it is an invariant that monThreadTaskSem is not - * held as a write lock, so we should not fail to acquire the lock. - * Mvpkm_WakeGuest may be called from an atomic context, so we can't sleep - * here. - */ - if (down_read_trylock(&vm->monThreadTaskSem)) { - if (vm->monThreadTask) { - wake_up_process(vm->monThreadTask); - kick_process(vm->monThreadTask); - } - up_read(&vm->monThreadTaskSem); - } else { - printk("Unexpected failure to acquire monThreadTaskSem!\n"); - } -} |