/*
 * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING. If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief The kernel level driver.
 */

#define __KERNEL_SYSCALLS__
#include <linux/version.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/timer.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/pid.h>
#include <linux/cpumask.h>
#include <linux/bitmap.h>
#include <linux/smp.h>

#ifdef CONFIG_HAS_WAKELOCK
#include <linux/wakelock.h>
#endif

#include <linux/ktime.h>
#include <linux/time.h>
#include <linux/gfp.h>
#include <asm/cacheflush.h>
#include <asm/memory.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>

#include "mvp.h"
#include "mvp_version.h"
#include "mvpkm_types.h"
#include "mvpkm_private.h"
#include "mvpkm_kernel.h"
#include "actions.h"
#include "wscalls.h"
#include "arm_inline.h"
#include "tsc.h"
#include "mksck_kernel.h"
#include "mmu_types.h"
#include "mvp_timer.h"
#include "qp.h"
#include "qp_host_kernel.h"
#include "cpufreq_kernel.h"
#include "mvpkm_comm_ev.h"

#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
#include "mvp_balloon.h"
#endif


/*********************************************************************
 *
 * Definition of the file operations
 *
 *********************************************************************/
static _Bool LockedListAdd(MvpkmVM *vm, __u32 mpn, __u32 order,
                           PhysMem_RegionType forRegion);
static _Bool LockedListDel(MvpkmVM *vm, __u32 mpn);
static void LockedListUnlockAll(MvpkmVM *vm);
static _Bool LockedListLookup(MvpkmVM *vm, __u32 mpn);
static int SetupMonitor(MvpkmVM *vm);
static int RunMonitor(MvpkmVM *vm);
static MPN AllocZeroedFreePages(MvpkmVM *vm, uint32 order, _Bool highmem,
                                PhysMem_RegionType forRegion, HKVA *hkvaRet);
static HKVA MapWSPHKVA(MvpkmVM *vm, HkvaMapInfo *mapInfo);
static void UnmapWSPHKVA(MvpkmVM *vm);
static int MvpkmWaitForInt(MvpkmVM *vm, _Bool suspend);
static void ReleaseVM(MvpkmVM *vm);

/*
 * Mksck open request must come from this uid. It must be root until
 * it is set via an ioctl from mvpd.
 */
uid_t Mvpkm_vmwareUid = 0;
EXPORT_SYMBOL(Mvpkm_vmwareUid);

/*
 * Minimum hidden app oom_adj, provided by mvpd, since we can't get it directly
 * from the lowmemorykiller module.
 */
static int minHiddenAppOOMAdj;

/*
 * vCPU cpu affinity to let monitor/guest run on some CPUs only (when possible).
 */
static DECLARE_BITMAP(vcpuAffinity, NR_CPUS);


/*********************************************************************
 *
 * Sysfs nodes
 *
 *********************************************************************/

/*
 * kobject for our sysfs representation, used for global nodes.
 */
static struct kobject *mvpkmKObj;

/*
 * kobject for the balloon exports.
 */
static struct kobject *balloonKObj;

/**
 * @brief sysfs show function for the global version attribute.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr kobj_attribute reference, not used.
 * @param buf PAGE_SIZEd buffer to write to.
 *
 * @return number of characters printed (not including trailing null
 *         character).
 */
static ssize_t
version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
   return snprintf(buf, PAGE_SIZE, MVP_VERSION_FORMATSTR "\n",
                   MVP_VERSION_FORMATARGS);
}

static struct kobj_attribute versionAttr = __ATTR_RO(version);

/**
 * @brief sysfs show function for the global background_pages attribute.
 *
 * Used by the vmx balloon policy controller to gauge the amount of freeable
 * anonymous memory.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr kobj_attribute reference, not used.
 * @param buf PAGE_SIZEd buffer to write to.
 *
 * @return number of characters printed (not including trailing null
 *         character).
 */
static ssize_t
background_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
#ifndef CONFIG_ANDROID_LOW_MEMORY_KILLER
   return snprintf(buf, PAGE_SIZE, "0\n");
#else
   return snprintf(buf, PAGE_SIZE, "%d\n",
                   Balloon_AndroidBackgroundPages(minHiddenAppOOMAdj));
#endif
}

static struct kobj_attribute backgroundAttr = __ATTR_RO(background);

/**
 * @brief sysfs show function to export the other_file calculation in
 *        lowmemorykiller.
 *
 * In the balloon controller it's helpful to know the figure the
 * lowmemorykiller module uses to decide when the system has crossed a
 * minfree threshold. Since a number of different other_file calculations
 * exist in various lowmemorykiller patches (@see{MVP-1674}) and the module
 * itself doesn't provide a clean export of this figure, we provide it on a
 * case-by-case basis for the various supported hosts here.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr kobj_attribute reference, not used.
 * @param buf PAGE_SIZEd buffer to write to.
 *
 * @return number of characters printed (not including trailing null
 *         character).
 */
static ssize_t
other_file_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
   int32 other_file = 0;

#ifndef LOWMEMKILLER_VARIANT
#define LOWMEMKILLER_VARIANT 0
#endif
#ifndef LOWMEMKILLER_MD5
#define LOWMEMKILLER_MD5 0
#endif
#ifndef LOWMEMKILLER_SHRINK_MD5
#define LOWMEMKILLER_SHRINK_MD5 0
#endif

   /*
    * The build system hashes the lowmemorykiller section related to the
    * other_file calculation in the kernel source for us, here we have to
    * provide the code.
    */
#if LOWMEMKILLER_VARIANT == 1
   /*
    * This is the same as the non-exported global_reclaimable_pages() when
    * there is no swap.
    */
   other_file = global_page_state(NR_ACTIVE_FILE) +
                global_page_state(NR_INACTIVE_FILE);
#elif LOWMEMKILLER_VARIANT == 2
   other_file = global_page_state(NR_FILE_PAGES);
#elif LOWMEMKILLER_VARIANT == 3
   other_file = global_page_state(NR_FILE_PAGES) -
                global_page_state(NR_SHMEM);
#elif LOWMEMKILLER_VARIANT == 4
   /*
    * Here free/file pages are fungible and max(free, file) isn't used, but
    * we can continue to use max(free, file) since max(free, file) =
    * other_file in this case.
    */
   other_file = global_page_state(NR_FREE_PAGES) +
                global_page_state(NR_FILE_PAGES);
#elif defined(NONANDROID)
   /*
    * Non-Android host platforms don't have ballooning enabled.
    */
#else
   /*
    * If you get this message, you need to run 'make lowmem-info' and inspect
    * lowmemorykiller.c. If the "other_file = ..." calculation in
    * lowmem_shrink appears above, simply add the "Shrink#" to an existing
    * entry in lowmemkiller-variant.sh, pointing to the variant number above.
    * Otherwise, provide a new entry above and variant number, with the
    * appropriate other_file calculation, and update lowmemkiller-variant.sh
    * accordingly.
    */
//#warning "Unknown lowmemorykiller variant in hosted/module/mvpkm_main.c, falling back on default (see other_file_show for the remedy)"
   /*
    * Fall back on the default - this may bias strangely for/against the
    * host, but nothing catastrophic should result.
    */
   other_file = global_page_state(NR_FILE_PAGES);
#endif

#define _STRINGIFY(x) #x
#define STRINGIFY(x) _STRINGIFY(x)

   return snprintf(buf, PAGE_SIZE, "%d %d %s %s\n", other_file,
                   LOWMEMKILLER_VARIANT, STRINGIFY(LOWMEMKILLER_MD5),
                   STRINGIFY(LOWMEMKILLER_SHRINK_MD5));

#undef _STRINGIFY
#undef STRINGIFY
}

static struct kobj_attribute otherFileAttr = __ATTR_RO(other_file);
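/*
 * The nodes above appear under /sys/hypervisor/mvp (and .../mvp/lowmem) once
 * MvpkmInit below has created the "mvp" and "lowmem" kobjects. What follows
 * is a minimal userspace sketch of how a balloon policy daemon might poll
 * them - an illustration only, compiled out; the paths are assumptions
 * derived from the kobject hierarchy set up in this file.
 */
#if 0 /* illustrative userspace sketch */
#include <stdio.h>

int
main(void)
{
   int otherFile, variant;
   FILE *f = fopen("/sys/hypervisor/mvp/lowmem/other_file", "r");

   if (!f) {
      perror("other_file");
      return 1;
   }

   /* Format written by other_file_show: value, variant, two MD5 strings. */
   if (fscanf(f, "%d %d", &otherFile, &variant) == 2) {
      printf("other_file=%d pages (lowmemkiller variant %d)\n",
             otherFile, variant);
   }

   fclose(f);
   return 0;
}
#endif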
/*
 * kset for our sysfs representation, used for per-VM nodes.
 */
static struct kset *mvpkmKSet;

static ssize_t MvpkmAttrShow(struct kobject *kobj, struct attribute *attr,
                             char *buf);
static ssize_t MvpkmAttrStore(struct kobject *kobj, struct attribute *attr,
                              const char *buf, size_t count);

static void MvpkmKObjRelease(struct kobject *kobj)
   __attribute__((optimize("-fomit-frame-pointer")));

/**
 * @brief Releases the vm structure containing the kobject.
 *
 * @param kobj the vm's kobject.
 */
static void
MvpkmKObjRelease(struct kobject *kobj)
{
   MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj);

   ReleaseVM(vm);
   module_put(THIS_MODULE);
}

/**
 * @name mvpkm ktype attribute structures for locked_pages.
 *
 * @{
 */
static struct sysfs_ops mvpkmSysfsOps = {
   .show = MvpkmAttrShow,
   .store = MvpkmAttrStore
};

static struct attribute mvpkmLockedPagesAttr = {
   .name = "locked_pages",
   .mode = 0444,
};

static struct attribute mvpkmBalloonWatchdogAttr = {
   .name = "balloon_watchdog",
   .mode = 0666
};

static struct attribute mvpkmMonitorAttr = {
   .name = "monitor",
   .mode = 0400,
};

static struct attribute *mvpkmDefaultAttrs[] = {
   &mvpkmLockedPagesAttr,
   &mvpkmBalloonWatchdogAttr,
   &mvpkmMonitorAttr,
   NULL,
};

static struct kobj_type mvpkmKType = {
   .sysfs_ops = &mvpkmSysfsOps,
   .release = MvpkmKObjRelease,
   .default_attrs = mvpkmDefaultAttrs,
};
/*@}*/

/*
 * As it is not very common for host kernels to have SYS_HYPERVISOR enabled,
 * and you have to "hack" a Kconfig file to enable it, just include the
 * functionality inline if it is not enabled.
 */
#ifndef CONFIG_SYS_HYPERVISOR
struct kobject *hypervisor_kobj;
EXPORT_SYMBOL_GPL(hypervisor_kobj);
#endif

/*
 * kobject and kset utilities.
 */
extern struct kobject *kset_find_obj(struct kset *, const char *)
   __attribute__((weak));

/**
 * @brief Finds a kobject in a kset. The actual implementation is copied from
 *        kernel source in lib/kobject.c. Although the symbol is
 *        extern-declared, it is not EXPORT_SYMBOL-ed. We use a weak reference
 *        in case the symbol might be exported in future kernel versions.
 *
 * @param kset set to search.
 * @param name object name.
 *
 * @return retained kobject if found, NULL otherwise.
 */
struct kobject *
kset_find_obj(struct kset *kset, const char *name)
{
   struct kobject *k;
   struct kobject *ret = NULL;

   spin_lock(&kset->list_lock);
   list_for_each_entry(k, &kset->list, entry) {
      if (kobject_name(k) && !strcmp(kobject_name(k), name)) {
         ret = kobject_get(k);
         break;
      }
   }
   spin_unlock(&kset->list_lock);

   return ret;
}

/**
 * @brief Finds one of the VM's pre-defined ksets.
 *
 * @param vmID a VM ID.
 * @param name name of one of the VM's pre-defined ksets.
 *
 * @return retained kset if found, NULL otherwise.
 */
struct kset *
Mvpkm_FindVMNamedKSet(int vmID, const char *name)
{
   MvpkmVM *vm;
   struct kobject *kobj;
   char vmName[32] = {}; /* Large enough to hold externally-formatted int32. */
   struct kset *res = NULL;

   if (!mvpkmKSet) {
      return NULL;
   }

   snprintf(vmName, sizeof vmName, "%d", vmID);
   vmName[sizeof vmName - 1] = '\0'; /* Always null-terminate, no overflow. */

   kobj = kset_find_obj(mvpkmKSet, vmName);
   if (!kobj) {
      return NULL;
   }

   vm = container_of(kobj, MvpkmVM, kobj);

   if (!strcmp(name, "devices")) {
      res = kset_get(vm->devicesKSet);
   } else if (!strcmp(name, "misc")) {
      res = kset_get(vm->miscKSet);
   }

   kobject_put(kobj);
   return res;
}
EXPORT_SYMBOL(Mvpkm_FindVMNamedKSet);
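/*
 * Sketch of the intended use of Mvpkm_FindVMNamedKSet from another MVP host
 * kernel module: a per-VM device module can parent its kobject under the
 * VM's "devices" kset. Illustration only, compiled out; AttachDeviceNode,
 * myDevKObj, myDevKType and the "mydev" name are hypothetical.
 */
#if 0 /* illustrative client sketch */
static int
AttachDeviceNode(int vmID, struct kobject *myDevKObj,
                 struct kobj_type *myDevKType)
{
   int retval;
   struct kset *devices = Mvpkm_FindVMNamedKSet(vmID, "devices");

   if (!devices) {
      return -ENOENT;          /* VM not (yet) registered in sysfs */
   }

   myDevKObj->kset = devices;  /* hand our kset reference to the kobject */

   retval = kobject_init_and_add(myDevKObj, myDevKType, NULL, "mydev");
   if (retval) {
      kobject_put(myDevKObj);  /* also drops the kset reference */
   }

   return retval;
}
#endif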
/*********************************************************************
 *
 * Standard Linux miscellaneous device registration
 *
 *********************************************************************/

MODULE_LICENSE("GPL"); // for kallsyms_lookup_name

static int MvpkmFault(struct vm_area_struct *vma, struct vm_fault *vmf);

/**
 * @brief Linux vma operations for /dev/mem-like kernel module mmap. We
 *        enforce the restriction that only MPNs that have been allocated
 *        to the opened VM may be mapped, and also increment the reference
 *        count (via vm_insert_page), so that even if the memory is later
 *        freed by the VM, host process vma's containing the MPN can't
 *        compromise the system.
 *
 * However, only trusted host processes (e.g. the vmx) should be allowed
 * to use this interface, since you can mmap the monitor's code/data/
 * page tables etc. with it. Untrusted host processes are limited to
 * typed messages for sharing memory with the monitor. Unix file system
 * access permissions are the intended method of restricting access.
 * Unfortunately, today _any_ host process utilizing Mksck requires
 * access to mvpkm to set up its Mksck pages and obtain socket info via
 * ioctls - we probably should be exporting two devices, one for trusted
 * and one for arbitrary host processes, to avoid this confusion of
 * concerns.
 */
static struct vm_operations_struct mvpkmVMOps = {
   .fault = MvpkmFault
};

/*
 * Generic kernel module file ops. These functions will be registered
 * at the time the kernel module is loaded.
 */
static long MvpkmUnlockedIoctl(struct file *filep, unsigned int cmd,
                               unsigned long arg);
static int MvpkmOpen(struct inode *inode, struct file *filp);
static int MvpkmRelease(struct inode *inode, struct file *filp);
static int MvpkmMMap(struct file *file, struct vm_area_struct *vma);

/**
 * @brief The file_operations structure contains the callback functions
 *        that are registered with Linux to handle file operations on
 *        the mvpkm device.
 *
 * The structure contains other members that the mvpkm device
 * does not use. Those members are auto-initialized to NULL.
 *
 * WARNING: this structure changed after Linux kernel 2.6.19:
 * readv/writev were replaced by aio_read/aio_write (neither is used here).
 */
static const struct file_operations mvpkmFileOps = {
   .owner          = THIS_MODULE,
   .unlocked_ioctl = MvpkmUnlockedIoctl,
   .open           = MvpkmOpen,
   .release        = MvpkmRelease,
   .mmap           = MvpkmMMap
};

/**
 * @brief The mvpkm device identifying information to be used to register
 *        the device with the Linux kernel.
 */
static struct miscdevice mvpkmDev = {
   .minor = 165,
   .name = "mvpkm",
   .fops = &mvpkmFileOps
};

/**
 * Mvpkm is loaded by mvpd and only mvpd will be allowed to open it. There is
 * a very simple way to verify that: record the process id (thread group id)
 * at the time the module is loaded and test it at the time the module is
 * opened.
 */
static struct pid *initTgid;
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
/**
 * @name Slab shrinker for triggering balloon adjustment.
 *
 * @note The shrinker is used as a trigger for the guest balloon.
 *
 * @{
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
static int MvpkmShrink(struct shrinker *this, struct shrink_control *sc);
#else
static int MvpkmShrink(struct shrinker *this, int nrToScan, gfp_t gfpMask);
#endif

static struct shrinker mvpkmShrinker = {
   .shrink = MvpkmShrink,
   .seeks = DEFAULT_SEEKS
};
/*@}*/
#endif

module_param_array(vcpuAffinity, ulong, NULL, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(vcpuAffinity, "vCPU affinity");

/**
 * @brief Initialize the mvpkm device, register it with the Linux kernel.
 *
 * @return A zero is returned on success and a negative errno code for
 *         failure. (Same as the return policy of misc_register(9).)
 */
static int __init
MvpkmInit(void)
{
   int err = 0;
   _Bool mksckInited = false;
   _Bool cpuFreqInited = false;

   printk(KERN_INFO "Mvpkm: " MVP_VERSION_FORMATSTR "\n",
          MVP_VERSION_FORMATARGS);
   printk(KERN_INFO "Mvpkm: loaded from process %s tgid=%d, pid=%d\n",
          current->comm, task_tgid_vnr(current), task_pid_vnr(current));

   if (bitmap_empty(vcpuAffinity, NR_CPUS)) {
      bitmap_copy(vcpuAffinity, cpumask_bits(cpu_possible_mask), NR_CPUS);
   }

   if ((err = misc_register(&mvpkmDev))) {
      return -ENOENT;
   }

   if ((err = Mksck_Init())) {
      goto error;
   } else {
      mksckInited = true;
   }

   QP_HostInit();

   CpuFreq_Init();
   cpuFreqInited = true;

   /*
    * Reference mvpd (module loader) tgid struct, so that we can avoid
    * attacks based on pid number wraparound.
    */
   initTgid = get_pid(task_tgid(current));

#ifndef CONFIG_SYS_HYPERVISOR
   hypervisor_kobj = kobject_create_and_add("hypervisor", NULL);
   if (!hypervisor_kobj) {
      err = -ENOMEM;
      goto error;
   }
#endif

   if (!(mvpkmKObj = kobject_create_and_add("mvp", hypervisor_kobj)) ||
       !(balloonKObj = kobject_create_and_add("lowmem", mvpkmKObj)) ||
       !(mvpkmKSet = kset_create_and_add("vm", NULL, mvpkmKObj))) {
      err = -ENOMEM;
      goto error;
   }

   if ((err = sysfs_create_file(mvpkmKObj, &versionAttr.attr))) {
      goto error;
   }

   if ((err = sysfs_create_file(balloonKObj, &backgroundAttr.attr))) {
      goto error;
   }

   if ((err = sysfs_create_file(balloonKObj, &otherFileAttr.attr))) {
      goto error;
   }

#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
   register_shrinker(&mvpkmShrinker);
#endif

   MksckPageInfo_Init();

   return 0;

error:
   if (mvpkmKSet) {
      kset_unregister(mvpkmKSet);
   }

   if (balloonKObj) {
      kobject_del(balloonKObj);
      kobject_put(balloonKObj);
   }

   if (mvpkmKObj) {
      kobject_del(mvpkmKObj);
      kobject_put(mvpkmKObj);
   }

#ifndef CONFIG_SYS_HYPERVISOR
   if (hypervisor_kobj) {
      kobject_del(hypervisor_kobj);
      kobject_put(hypervisor_kobj);
   }
#endif

   if (cpuFreqInited) {
      CpuFreq_Exit();
   }

   if (mksckInited) {
      Mksck_Exit();
   }

   if (initTgid) {
      put_pid(initTgid);
   }

   misc_deregister(&mvpkmDev);

   return err;
}

/**
 * @brief De-register the mvpkm device with the Linux kernel.
 */
void
MvpkmExit(void)
{
   PRINTK(KERN_INFO "MvpkmExit called!\n");

   MksckPageInfo_Exit();

#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
   unregister_shrinker(&mvpkmShrinker);
#endif

   kset_unregister(mvpkmKSet);
   kobject_del(balloonKObj);
   kobject_put(balloonKObj);
   kobject_del(mvpkmKObj);
   kobject_put(mvpkmKObj);

#ifndef CONFIG_SYS_HYPERVISOR
   kobject_del(hypervisor_kobj);
   kobject_put(hypervisor_kobj);
#endif

   CpuFreq_Exit();
   Mksck_Exit();
   put_pid(initTgid);
   misc_deregister(&mvpkmDev);
}

/*
 * The standard module registration macros of Linux.
 */
module_init(MvpkmInit);
module_exit(MvpkmExit);

module_param_named(minHiddenAppOOMAdj, minHiddenAppOOMAdj, int,
                   S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(minHiddenAppOOMAdj,
                 "minimum hidden app oom_adj, as per lowmemorykiller");
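/*
 * Both module parameters can be given at load time or adjusted later through
 * sysfs, e.g. (illustrative values only):
 *
 *    insmod mvpkm.ko vcpuAffinity=0x3 minHiddenAppOOMAdj=9
 *    echo 9 > /sys/module/mvpkm/parameters/minHiddenAppOOMAdj
 */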
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
/**
 * @brief Balloon watchdog timeout callback.
 *
 * Terminate the VM since it's not responsive.
 *
 * @param data vm reference representation.
 */
static void
WatchdogCB(unsigned long data)
{
   MvpkmVM *vm = (MvpkmVM *)data;

   printk("Balloon watchdog expired (%d s)!\n",
          BALLOON_WATCHDOG_TIMEOUT_SECS);
   Mvpkm_WakeGuest(vm, ACTION_ABORT);
}

/**
 * @brief Slab shrinker.
 *
 * Called by the Linux kernel when we're under memory pressure. We treat all
 * locked pages as a slab for this purpose, similar to the Android low memory
 * killer.
 *
 * @param this reference to registered shrinker for callback context.
 * @param nrToScan number of entries to scan. If 0 then just return the
 *                 number of present entries. We otherwise ignore the
 *                 magnitude of nrToScan, since the shrinker is only a
 *                 trigger to readjust guest balloons, where the actual
 *                 balloon size is determined in conjunction with the guest.
 * @param gfpMask ignored.
 *
 * @return number of locked pages.
 */
static int
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
MvpkmShrink(struct shrinker *this, struct shrink_control *sc)
#else
MvpkmShrink(struct shrinker *this, int nrToScan, gfp_t gfpMask)
#endif
{
   uint32 locked = 0;
   struct kobject *k;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
   int nrToScan = sc->nr_to_scan;
#endif

   spin_lock(&mvpkmKSet->list_lock);
   list_for_each_entry(k, &mvpkmKSet->list, entry) {
      MvpkmVM *vm = container_of(k, MvpkmVM, kobj);

      locked += ATOMIC_GETO(vm->usedPages);

      /*
       * Try and grab the WSP semaphore - if we fail, we must be in VM setup
       * or teardown, no point trying to wake the guest.
       */
      if (nrToScan > 0 && down_read_trylock(&vm->wspSem)) {
         if (vm->wsp) {
            Mvpkm_WakeGuest(vm, ACTION_BALLOON);

            /*
             * Balloon watchdog.
             */
            if (vm->balloonWDEnabled) {
               struct timer_list *t = &vm->balloonWDTimer;

               if (!timer_pending(t)) {
                  t->data = (unsigned long)vm;
                  t->function = WatchdogCB;
                  t->expires = jiffies + BALLOON_WATCHDOG_TIMEOUT_SECS * HZ;
                  add_timer(t);
               }
            }
         }

         up_read(&vm->wspSem);
      }
   }
   spin_unlock(&mvpkmKSet->list_lock);

   return locked;
}
#endif
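/*
 * For reference, the pre-3.0 shrinker protocol MvpkmShrink implements, shown
 * in isolation (illustration only, compiled out): the kernel first calls
 * with nr_to_scan == 0 to ask how large the "slab" is, then calls again with
 * a positive count to request reclaim. MvpkmShrink reports locked pages as
 * the slab size and treats any positive count purely as a balloon trigger.
 * DemoShrink and its count are hypothetical.
 */
#if 0
static int
DemoShrink(struct shrinker *this, int nrToScan, gfp_t gfpMask)
{
   if (nrToScan > 0) {
      /* Kick off asynchronous reclaim here, as MvpkmShrink does with
         Mvpkm_WakeGuest(vm, ACTION_BALLOON). */
   }

   return 42;   /* hypothetical count of reclaimable entries */
}

static struct shrinker demoShrinker = {
   .shrink = DemoShrink,
   .seeks = DEFAULT_SEEKS
};
/* Paired with register_shrinker(&demoShrinker) / unregister_shrinker(). */
#endif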
/**
 * @brief The open file operation. Initializes the vm specific structure.
 */
int
MvpkmOpen(struct inode *inode, struct file *filp)
{
   MvpkmVM *vm;

   if (initTgid != task_tgid(current)) {
      printk(KERN_ERR
             "%s: MVPKM can be opened only from MVPD (process %d).\n",
             __FUNCTION__, pid_vnr(initTgid));
      return -EPERM;
   }

   printk(KERN_DEBUG
          "%s: Allocating an MvpkmVM structure from process %s tgid=%d, "
          "pid=%d\n",
          __FUNCTION__, current->comm, task_tgid_vnr(current),
          task_pid_vnr(current));

   vm = kmalloc(sizeof(MvpkmVM), GFP_KERNEL);
   if (!vm) {
      return -ENOMEM;
   }

   memset(vm, 0, sizeof *vm);

   init_timer(&vm->balloonWDTimer);
   init_rwsem(&vm->lockedSem);
   init_rwsem(&vm->wspSem);
   init_rwsem(&vm->monThreadTaskSem);
   vm->monThreadTask = NULL;
   vm->isMonitorInited = false;

   filp->private_data = vm;

   if (!Mvpkm_vmwareUid) {
      Mvpkm_vmwareUid = current_euid();
   }

   return 0;
}

/**
 * @brief Releases a VM's resources.
 *
 * @param vm vm to release.
 */
static void
ReleaseVM(MvpkmVM *vm)
{
   del_timer_sync(&vm->balloonWDTimer);

   down_write(&vm->wspSem);

   if (vm->isMonitorInited) {
      MonitorTimer_Request(&vm->monTimer, 0);

#ifdef CONFIG_HAS_WAKELOCK
      wake_lock_destroy(&vm->wakeLock);
#endif

      Mksck_WspRelease(vm->wsp);
      vm->wsp = NULL;
   }

   up_write(&vm->wspSem);

   LockedListUnlockAll(vm);
   UnmapWSPHKVA(vm);

   /*
    * All sockets potentially connected to sockets of this vm's vmId will
    * fail at send now. DGRAM sockets are not required to tear down the
    * connection explicitly.
    */

   kfree(vm);
}

/**
 * @brief The release file operation. Releases the vm specific
 *        structure including all the locked pages.
 *
 * @param inode Unused.
 * @param filp which VM we're dealing with.
 *
 * @return 0.
 */
int
MvpkmRelease(struct inode *inode, struct file *filp)
{
   MvpkmVM *vm = filp->private_data;

   /*
    * Tear down any queue pairs associated with this VM.
    */
   if (vm->isMonitorInited) {
      ASSERT(vm->wsp);
      QP_DetachAll(vm->wsp->guestId);
   }

   /*
    * Release the VM's ksets.
    */
   kset_unregister(vm->miscKSet);
   kset_unregister(vm->devicesKSet);

   if (vm->haveKObj) {
      /*
       * Release the VM's kobject.
       * 'vm' will be kfree-d in its kobject's release function.
       */
      kobject_del(&vm->kobj);
      kobject_put(&vm->kobj);
   } else {
      ReleaseVM(vm);
   }

   filp->private_data = NULL;

   printk(KERN_INFO
          "%s: Released MvpkmVM structure from process %s tgid=%d, pid=%d\n",
          __FUNCTION__, current->comm, task_tgid_vnr(current),
          task_pid_vnr(current));

   return 0;
}

/**
 * @brief Page fault handler for /dev/mem-like regions (see mvpkmVMOps
 *        block comment).
 */
static int
MvpkmFault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
   unsigned long address = (unsigned long)vmf->virtual_address;
   MPN mpn = vmf->pgoff;
   MvpkmVM *vm = vma->vm_file->private_data;

   /*
    * Only insert pages belonging to the VM. The check is relatively slow,
    * O(log n) in the number of page ranges locked by the VM, but it doesn't
    * matter - the mmap interface should only be used by trusted processes
    * at initialization time and for debugging.
    *
    * The mpn can either be in the memory reserved for the monitor or mvpd
    * through the regular mechanisms, or it can be a mksck page.
    */
   if (!pfn_valid(mpn)) {
      printk(KERN_ERR "MvpkmMMap: Failed to insert %x @ %lx, mpn invalid\n",
             mpn, address);
   } else if (LockedListLookup(vm, mpn)) {
      if (vm_insert_page(vma, address, pfn_to_page(mpn)) == 0) {
         return VM_FAULT_NOPAGE;
      }

      printk(KERN_ERR "MvpkmMMap: Failed to insert %x @ %lx \n",
             mpn, address);
   } else if (MksckPage_LookupAndInsertPage(vma, address, mpn) == 0) {
      return VM_FAULT_NOPAGE;
   }

   if (vm->stubPageMPN) {
      if (vm_insert_page(vma, address, pfn_to_page(vm->stubPageMPN)) == 0) {
         printk(KERN_INFO "MvpkmMMap: mapped the stub page at %x @ %lx \n",
                mpn, address);
         return VM_FAULT_NOPAGE;
      }

      printk(KERN_ERR "MvpkmMMap: Could not insert stub page %x @ %lx \n",
             mpn, address);
   }

   return VM_FAULT_SIGBUS;
}

/**
 * @brief sysfs show function for the per-VM attributes.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr attribute reference.
 * @param buf PAGE_SIZEd buffer to write to.
 *
 * @return number of characters printed (not including trailing null
 *         character).
 */
static ssize_t
MvpkmAttrShow(struct kobject *kobj, struct attribute *attr, char *buf)
{
   if (attr == &mvpkmLockedPagesAttr) {
      MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj);

      return snprintf(buf, PAGE_SIZE, "%d\n", ATOMIC_GETO(vm->usedPages));
   } else if (attr == &mvpkmMonitorAttr) {
      MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj);

      return snprintf(buf, PAGE_SIZE, "hostActions %x callno %d\n",
                      ATOMIC_GETO(vm->wsp->hostActions),
                      WSP_Params(vm->wsp)->callno);
   } else {
      return -EPERM;
   }
}
/**
 * @brief sysfs store function for the per-VM attributes.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr attribute reference.
 * @param buf input buffer.
 * @param count input buffer length.
 *
 * @return number of bytes consumed or negative error code.
 */
static ssize_t
MvpkmAttrStore(struct kobject *kobj, struct attribute *attr, const char *buf,
               size_t count)
{
   if (attr == &mvpkmBalloonWatchdogAttr) {
      MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj);

      /*
       * Enable the balloon watchdog on first write; this covers all
       * ballooning-capable guests.
       */
      vm->balloonWDEnabled = true;
      del_timer_sync(&vm->balloonWDTimer);

      return 1;
   } else {
      return -EPERM;
   }
}

/**
 * @brief Map machine address space region into host process.
 *
 * @param file file reference (ignored).
 * @param vma Linux virtual memory area defining the region.
 *
 * @return 0 on success, otherwise error code.
 */
static int
MvpkmMMap(struct file *file, struct vm_area_struct *vma)
{
   vma->vm_ops = &mvpkmVMOps;

   return 0;
}
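/*
 * The fault handler above backs a /dev/mem-like mapping: a trusted host
 * process mmaps /dev/mvpkm with the machine page number encoded in the file
 * offset (it arrives as vmf->pgoff in MvpkmFault). A userspace sketch,
 * compiled out; MapMachinePage and 'mpn' are hypothetical, and the page is
 * assumed to belong to the opened VM.
 */
#if 0 /* illustrative userspace sketch */
#include <sys/mman.h>
#include <unistd.h>

static void *
MapMachinePage(int mvpkmFd, unsigned long mpn)
{
   long pageSize = sysconf(_SC_PAGESIZE);

   /* offset = mpn * page size, so that vmf->pgoff == mpn in MvpkmFault. */
   void *p = mmap(NULL, pageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
                  mvpkmFd, (off_t)mpn * pageSize);

   return p == MAP_FAILED ? NULL : p;
}
#endif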
#ifdef CONFIG_ARM_LPAE
/**
 * @brief Determine host cacheability/shareability attributes.
 *
 * Used to ensure monitor/guest shared mappings are consistent with
 * those of host user/kernel.
 *
 * @param[out] attribMAN when setting up the HW monitor this provides the
 *                       attributes in the generic ARM_MemAttrNormal form,
 *                       suitable for configuring the monitor and guest's
 *                       [H]MAIR0 and setting the shareability attributes of
 *                       the LPAE descriptors.
 */
static void
DetermineMemAttrLPAE(ARM_MemAttrNormal *attribMAN)
{
   /*
    * We use set_pte_ext to sample what {S,TEX,CB} bits Linux is using for
    * normal kernel/user L2D mappings. These bits should be consistent both
    * with each other and with what we use in the monitor, since we share
    * various pages with host processes, the kernel module and the monitor,
    * and the ARM ARM requires that synonyms have the same cacheability
    * attributes, see end of A3.5.{4,7} ARM DDI 0406A.
    */
   HKVA hkva = __get_free_pages(GFP_KERNEL, 0);
   ARM_LPAE_L3D *pt = (ARM_LPAE_L3D *)hkva;
   ARM_LPAE_L3D *kernL3D = &pt[0], *userL3D = &pt[1];
   uint32 attr, mair0, mair1;

   set_pte_ext((pte_t *)kernL3D, pfn_pte(0, PAGE_KERNEL), 0);
   set_pte_ext((pte_t *)userL3D, pfn_pte(0, PAGE_NONE), 0);

   printk(KERN_INFO "DetermineMemAttr: Kernel L3D AttrIndx=%x SH=%x\n",
          kernL3D->blockS1.attrIndx, kernL3D->blockS1.sh);
   printk(KERN_INFO "DetermineMemAttr: User L3D AttrIndx=%x SH=%x\n",
          userL3D->blockS1.attrIndx, userL3D->blockS1.sh);

   ASSERT(kernL3D->blockS1.attrIndx == userL3D->blockS1.attrIndx);
   ASSERT(kernL3D->blockS1.sh == userL3D->blockS1.sh);

   switch (kernL3D->blockS1.sh) {
      case 0: {
         attribMAN->share = ARM_SHARE_ATTR_NONE;
         break;
      }
      case 2: {
         attribMAN->share = ARM_SHARE_ATTR_OUTER;
         break;
      }
      case 3: {
         attribMAN->share = ARM_SHARE_ATTR_INNER;
         break;
      }
      default: {
         FATAL();
      }
   }

   ARM_MRC_CP15(MAIR0, mair0);
   ARM_MRC_CP15(MAIR1, mair1);

   attr = MVP_EXTRACT_FIELD(kernL3D->blockS1.attrIndx >= 4 ? mair1 : mair0,
                            8 * (kernL3D->blockS1.attrIndx % 4), 8);

   /*
    * See B4-1615 ARM DDI 0406C-2c for magic.
    */
#define MAIR_ATTR_2_CACHE_ATTR(x, y) \
   switch (x) { \
      case 2: { \
         (y) = ARM_CACHE_ATTR_NORMAL_WT; \
         break; \
      } \
      case 3: { \
         (y) = ARM_CACHE_ATTR_NORMAL_WB; \
         break; \
      } \
      default: { \
         FATAL(); \
      } \
   }

   MAIR_ATTR_2_CACHE_ATTR(MVP_EXTRACT_FIELD(attr, 2, 2),
                          attribMAN->innerCache);
   MAIR_ATTR_2_CACHE_ATTR(MVP_EXTRACT_FIELD(attr, 6, 2),
                          attribMAN->outerCache);
#undef MAIR_ATTR_2_CACHE_ATTR

   printk(KERN_INFO
          "DetermineMemAttr: innerCache %x outerCache %x share %x\n",
          attribMAN->innerCache, attribMAN->outerCache, attribMAN->share);

   free_pages(hkva, 0);
}
#else
/**
 * @brief Determine host cacheability/shareability attributes.
 *
 * Used to ensure monitor/guest shared mappings are consistent with
 * those of host user/kernel.
 *
 * @param[out] attribL2D when setting up the LPV monitor, a template L2D
 *                       containing the cacheability attributes {S,TEX,CB}
 *                       used by the host kernel for normal memory mappings.
 *                       These may be used directly for monitor/guest
 *                       mappings, since both worlds share a common
 *                       {TRE, PRRR, NMRR}.
 * @param[out] attribMAN when setting up TTBR0 in the LPV monitor and the
 *                       page tables for the HW monitor this provides the
 *                       attributes in the generic ARM_MemAttrNormal form,
 *                       suitable for configuring TTBR0 + the monitor and
 *                       guest's [H]MAIR0 and setting the shareability
 *                       attributes of the LPAE descriptors.
 */
static void
DetermineMemAttrNonLPAE(ARM_L2D *attribL2D, ARM_MemAttrNormal *attribMAN)
{
   /*
    * We use set_pte_ext to sample what {S,TEX,CB} bits Linux is using for
    * normal kernel/user L2D mappings. These bits should be consistent both
    * with each other and with what we use in the monitor, since we share
    * various pages with host processes, the kernel module and the monitor,
    * and the ARM ARM requires that synonyms have the same cacheability
    * attributes, see end of A3.5.{4,7} ARM DDI 0406A.
    */
   HKVA hkva = __get_free_pages(GFP_KERNEL, 0);
   uint32 sctlr;
   ARM_L2D *pt = (ARM_L2D *)hkva;
   ARM_L2D *kernL2D = &pt[0], *userL2D = &pt[1];

#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 38)
   /*
    * Linux uses the magic 2048 offset in set_pte_ext. See
    * include/asm/pgtable.h for PAGE_NONE and PAGE_KERNEL semantics.
    */
   const uint32 set_pte_ext_offset = 2048;
#else
   /*
    * Linux 2.6.38 switched the order of Linux vs hardware page tables.
    * See mainline d30e45eeabefadc6039d7f876a59e5f5f6cb11c6.
    */
   const uint32 set_pte_ext_offset = 0;
#endif

   set_pte_ext((pte_t *)(kernL2D + set_pte_ext_offset/sizeof(ARM_L2D)),
               pfn_pte(0, PAGE_KERNEL), 0);
   set_pte_ext((pte_t *)(userL2D + set_pte_ext_offset/sizeof(ARM_L2D)),
               pfn_pte(0, PAGE_NONE), 0);

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 38)
   /*
    * Linux 2.6.38 switched the order of Linux vs hardware page tables.
    * See mainline d30e45eeabefadc6039d7f876a59e5f5f6cb11c6.
    */
   kernL2D += 2048/sizeof(ARM_L2D);
   userL2D += 2048/sizeof(ARM_L2D);
#endif

   printk(KERN_INFO "DetermineMemAttr: Kernel L2D TEX=%x CB=%x S=%x\n",
          kernL2D->small.tex, kernL2D->small.cb, kernL2D->small.s);
   printk(KERN_INFO "DetermineMemAttr: User L2D TEX=%x CB=%x S=%x\n",
          userL2D->small.tex, userL2D->small.cb, userL2D->small.s);

   ASSERT((kernL2D->small.tex & 1) == (userL2D->small.tex & 1));
   ASSERT(kernL2D->small.cb == userL2D->small.cb);
   ASSERT(kernL2D->small.s == userL2D->small.s);

   *attribL2D = *kernL2D;

   /*
    * We now decode TEX remap and obtain the more generic form for use in
    * the LPV monitor's TTBR0 initialization and the HW monitor.
    */
   ARM_MRC_CP15(CONTROL_REGISTER, sctlr);

   if (sctlr & ARM_CP15_CNTL_TRE) {
      uint32 prrr, nmrr, indx, type, innerCache, outerCache, outerShare,
             share;

      printk(KERN_INFO "DetermineMemAttr: TEX remapping enabled\n");

      ARM_MRC_CP15(PRIMARY_REGION_REMAP, prrr);
      ARM_MRC_CP15(NORMAL_MEMORY_REMAP, nmrr);

      printk(KERN_INFO "DetermineMemAttr: PRRR=%x NMRR=%x\n", prrr, nmrr);

      /*
       * Decode PRRR/NMRR below. See B3.7 ARM DDI 0406B for register
       * encodings, tables and magic numbers.
       */
      indx = (MVP_BIT(kernL2D->small.tex, 0) << 2) | kernL2D->small.cb;

      /*
       * Only normal memory makes sense here.
       */
      type = MVP_EXTRACT_FIELD(prrr, 2 * indx, 2);
      ASSERT(type == 2);

      innerCache = MVP_EXTRACT_FIELD(nmrr, 2 * indx, 2);
      outerCache = MVP_EXTRACT_FIELD(nmrr, 16 + 2 * indx, 2);

      outerShare = !MVP_BIT(prrr, 24 + indx);
      share = MVP_BIT(prrr, 18 + kernL2D->small.s);

      printk(KERN_INFO
             "DetermineMemAttr: type %x innerCache %x outerCache %x"
             " share %x outerShare %x\n",
             type, innerCache, outerCache, share, outerShare);

      if (share) {
         if (outerShare) {
            attribMAN->share = ARM_SHARE_ATTR_OUTER;
         } else {
            attribMAN->share = ARM_SHARE_ATTR_INNER;
         }
      } else {
         attribMAN->share = ARM_SHARE_ATTR_NONE;
      }

      attribMAN->innerCache = innerCache;
      attribMAN->outerCache = outerCache;
   } else {
      NOT_IMPLEMENTED_JIRA(1849);
   }

   free_pages(hkva, 0);
}
#endif
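/*
 * A worked instance of the TEX-remap decode above, using hypothetical
 * sampled bits: with TEX[0] = 1 and CB = 3 the remap index is
 * indx = (1 << 2) | 3 = 7, so the region type is PRRR[15:14] (which must be
 * 2, normal memory), the inner cache policy is NMRR[15:14], the outer cache
 * policy is NMRR[31:30], outer shareability comes from PRRR bit 31
 * (24 + indx), and PRRR bit 18 or 19 (18 + S) maps the sampled S bit to the
 * shareability attribute.
 */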
/**
 * @brief The ioctl file operation.
 *
 * The ioctl command is the main communication method between the
 * vmx and the mvpkm kernel module.
 *
 * @param filp which VM we're dealing with.
 * @param cmd select which cmd function needs to be performed.
 * @param arg argument for command.
 *
 * @return error code, 0 on success.
 */
long
MvpkmUnlockedIoctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
   MvpkmVM *vm = filp->private_data;
   int retval = 0;

   switch (cmd) {
      case MVPKM_DISABLE_FAULT: {
         if (!vm->stubPageMPN) {
            uint32 *ptr;

            vm->stubPageMPN =
               AllocZeroedFreePages(vm, 0, false, MEMREGION_MAINMEM,
                                    (HKVA *)&ptr);
            if (!vm->stubPageMPN) {
               break;
            }

            ptr[0] = MVPKM_STUBPAGE_BEG;
            ptr[PAGE_SIZE/sizeof(uint32) - 1] = MVPKM_STUBPAGE_END;
         }

         break;
      }

      /*
       * Allocate some pinned pages from the kernel.
       * Returns -ENOMEM if no host pages are available for allocation.
       */
      case MVPKM_LOCK_MPN: {
         struct MvpkmLockMPN buf;

         if (copy_from_user(&buf, (void *)arg, sizeof buf)) {
            return -EFAULT;
         }

         buf.mpn = AllocZeroedFreePages(vm, buf.order, false, buf.forRegion,
                                        NULL);
         if (buf.mpn == 0) {
            return -ENOMEM;
         }

         if (copy_to_user((void *)arg, &buf, sizeof buf)) {
            return -EFAULT;
         }

         break;
      }

      case MVPKM_UNLOCK_MPN: {
         struct MvpkmLockMPN buf;

         if (copy_from_user(&buf, (void *)arg, sizeof buf)) {
            return -EFAULT;
         }

         if (!LockedListDel(vm, buf.mpn)) {
            return -EINVAL;
         }

         break;
      }

      case MVPKM_MAP_WSPHKVA: {
         MvpkmMapHKVA mvpkmMapInfo;
         HkvaMapInfo mapInfo[WSP_PAGE_COUNT];

         if (copy_from_user(&mvpkmMapInfo, (void *)arg,
                            sizeof mvpkmMapInfo)) {
            return -EFAULT;
         }

         if (copy_from_user(mapInfo, (void *)mvpkmMapInfo.mapInfo,
                            sizeof mapInfo)) {
            return -EFAULT;
         }

         mvpkmMapInfo.hkva = MapWSPHKVA(vm, mapInfo);
         BUG_ON(mvpkmMapInfo.hkva == 0);

         if (mvpkmMapInfo.forRegion == MEMREGION_WSP) {
            vm->wsp = (WorldSwitchPage *)mvpkmMapInfo.hkva;
         }

         if (copy_to_user((void *)arg, &mvpkmMapInfo, sizeof mvpkmMapInfo)) {
            return -EFAULT;
         }

         break;
      }

      case MVPKM_RUN_MONITOR: {
         if (!vm->isMonitorInited) {
            vm->isMonitorInited = ((retval = SetupMonitor(vm)) == 0);
         }

         if (vm->isMonitorInited) {
            retval = RunMonitor(vm);
         }

         break;
      }

      case MVPKM_ABORT_MONITOR: {
         if (!vm->isMonitorInited) {
            return -EINVAL;
         }

         ASSERT(vm->wsp != NULL);
         Mvpkm_WakeGuest(vm, ACTION_ABORT);

         break;
      }

      case MVPKM_CPU_INFO: {
         struct MvpkmCpuInfo buf;
         uint32 mpidr;

#ifdef CONFIG_ARM_LPAE
         DetermineMemAttrLPAE(&buf.attribMAN);

         /**
          * We need to add support to the LPV monitor for LPAE page tables
          * if we want to use it on an LPAE host, due to the costs involved
          * in transitioning between LPAE and non-LPAE page tables without
          * Hyp assistance.
          *
          * @knownjira{MVP-2184}
          */
         buf.attribL2D.u = 0;
#else
         DetermineMemAttrNonLPAE(&buf.attribL2D, &buf.attribMAN);
#endif

         /*
          * Are MP extensions implemented? See B4-1618 ARM DDI 0406C-2c for
          * magic.
          */
         ARM_MRC_CP15(MPIDR, mpidr);
         buf.mpExt = mpidr & ARM_CP15_MPIDR_MP;

         if (copy_to_user((int *)arg, &buf, sizeof(struct MvpkmCpuInfo))) {
            retval = -EFAULT;
         }

         break;
      }

      default: {
         retval = -EINVAL;
         break;
      }
   }

   PRINTK(KERN_INFO "returning from IOCTL(%d) retval = %d %s\n", cmd, retval,
          signal_pending(current) ? "(pending signal)" : "");

   return retval;
}
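/*
 * How the vmx (the trusted host process) drives the interface above - a
 * compressed userspace sketch, compiled out. It assumes the MVPKM_* ioctl
 * numbers and struct MvpkmLockMPN from the mvpkm headers; error handling is
 * elided and LockOnePage is hypothetical.
 */
#if 0 /* illustrative userspace sketch */
#include <fcntl.h>
#include <sys/ioctl.h>

static long
LockOnePage(void)
{
   struct MvpkmLockMPN buf = { .order = 0, .forRegion = MEMREGION_MAINMEM };
   int fd = open("/dev/mvpkm", O_RDWR);   /* only mvpd may open this */

   if (fd < 0 || ioctl(fd, MVPKM_LOCK_MPN, &buf) < 0) {
      return -1;
   }

   /* buf.mpn now holds a pinned, zeroed machine page owned by this VM. */
   return buf.mpn;
}
#endif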
/*********************************************************************
 *
 * Locked page management
 *
 *********************************************************************/

/*
 * Pages locked by the kernel module are remembered so an unlockAll
 * operation can be performed when the vmm is closed. The locked page
 * identifiers are stored in a red-black tree to support O(log n)
 * removal and search (required for /dev/mem-like mmap).
 */

/**
 * @brief Descriptor of a locked page range.
 */
typedef struct {
   struct {
      __u32 mpn : 20;       ///< MPN.
      __u32 order : 6;      ///< Size/alignment exponent for page.
      __u32 forRegion : 6;  ///< Annotation to identify guest page allocations.
   } page;

   struct rb_node rb;
} LockedPage;

static void FreeLockedPages(LockedPage *lp);

/**
 * @brief Search for an mpn inside a RB tree of LockedPages. The mpn
 *        will match a LockedPage as long as it is covered by the
 *        entry, i.e. in a non-zero order entry it doesn't have to be
 *        the base MPN.
 *
 * This must be called with the relevant vm->lockedSem held.
 *
 * @param root RB tree root.
 * @param mpn MPN to search for.
 *
 * @return reference to LockedPage entry if found, otherwise NULL.
 */
static LockedPage *
LockedListSearch(struct rb_root *root, __u32 mpn)
{
   struct rb_node *n = root->rb_node;

   while (n) {
      LockedPage *lp = rb_entry(n, LockedPage, rb);

      if (lp->page.mpn == (mpn & (~0UL << lp->page.order))) {
         return lp;
      }

      if (mpn < lp->page.mpn) {
         n = n->rb_left;
      } else {
         n = n->rb_right;
      }
   }

   return NULL;
}
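/*
 * The masking test above relies on allocations being naturally aligned to
 * their order (which alloc_pages guarantees): clearing the low 'order' bits
 * of any covered MPN recovers the base MPN of the range. A standalone
 * illustration of the arithmetic with hypothetical values, compiled out:
 */
#if 0
#include <assert.h>

int
main(void)
{
   unsigned long base = 0x12340;   /* hypothetical order-2 range: 4 MPNs */
   unsigned int order = 2;

   /* Every MPN in [base, base + 4) maps back to base... */
   assert((0x12342UL & (~0UL << order)) == base);
   /* ...and the first MPN past the range does not. */
   assert((0x12344UL & (~0UL << order)) != base);

   return 0;
}
#endif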
/**
 * @brief Delete an mpn from the list of locked pages.
 *
 * @param vm Mvpkm module control structure pointer.
 * @param mpn MPN to be unlocked and freed for reuse.
 *
 * @return true if the list contained the MPN and it was deleted from the
 *         list.
 */
static _Bool
LockedListDel(MvpkmVM *vm, __u32 mpn)
{
   LockedPage *lp;

   down_write(&vm->lockedSem);

   lp = LockedListSearch(&vm->lockedRoot, mpn);

   /*
    * The MPN should be in the locked pages RB tree and it should be the
    * base of an entry, i.e. we can't fragment existing allocations for
    * a VM.
    */
   if (lp == NULL || lp->page.mpn != mpn) {
      up_write(&vm->lockedSem);
      return false;
   }

   FreeLockedPages(lp);

   if (lp->page.forRegion == MEMREGION_MAINMEM) {
      ATOMIC_SUBV(vm->usedPages, 1U << lp->page.order);
   }

   rb_erase(&lp->rb, &vm->lockedRoot);
   kfree(lp);

   up_write(&vm->lockedSem);

   return true;
}

/**
 * @brief Scan the list of locked pages to see if an MPN matches.
 *
 * @param vm Mvpkm module control structure pointer.
 * @param mpn MPN to check.
 *
 * @return true iff the list contains the MPN.
 */
static _Bool
LockedListLookup(MvpkmVM *vm, __u32 mpn)
{
   LockedPage *lp;

   down_read(&vm->lockedSem);

   lp = LockedListSearch(&vm->lockedRoot, mpn);

   up_read(&vm->lockedSem);

   return lp != NULL;
}

/**
 * @brief Add a new mpn to the locked pages RB tree.
 *
 * @param vm control structure pointer.
 * @param mpn mpn of a page that was locked with get_user_pages or some sort
 *            of get that is undone by put_page. The mpn is assumed to be
 *            non-zero.
 * @param order size/alignment exponent for the page.
 * @param forRegion Annotation for Page pool to identify guest page
 *                  allocations.
 *
 * @return false: couldn't allocate internal memory to record the mpn in,
 *         true: successful.
 */
static _Bool
LockedListAdd(MvpkmVM *vm, __u32 mpn, __u32 order,
              PhysMem_RegionType forRegion)
{
   struct rb_node *parent, **p;
   LockedPage *tp, *lp = kmalloc(sizeof *lp, GFP_KERNEL);

   if (!lp) {
      return false;
   }

   lp->page.mpn = mpn;
   lp->page.order = order;
   lp->page.forRegion = forRegion;

   down_write(&vm->lockedSem);

   if (forRegion == MEMREGION_MAINMEM) {
      ATOMIC_ADDV(vm->usedPages, 1U << order);
   }

   /*
    * Insert as a red leaf in the tree (see include/linux/rbtree.h).
    */
   p = &vm->lockedRoot.rb_node;
   parent = NULL;

   while (*p) {
      parent = *p;
      tp = rb_entry(parent, LockedPage, rb);

      /*
       * The MPN should not already exist in the tree.
       */
      ASSERT(tp->page.mpn != (mpn & (~0UL << tp->page.order)));

      if (mpn < tp->page.mpn) {
         p = &(*p)->rb_left;
      } else {
         p = &(*p)->rb_right;
      }
   }

   rb_link_node(&lp->rb, parent, p);

   /*
    * Restructure the tree if necessary (see include/linux/rbtree.h).
    */
   rb_insert_color(&lp->rb, &vm->lockedRoot);

   up_write(&vm->lockedSem);

   return true;
}

/**
 * @brief Traverse the locked RB tree, freeing every entry.
 *
 * This must be called with the relevant vm->lockedSem held.
 *
 * @param node reference to RB node at root of subtree.
 */
static void
LockedListNuke(struct rb_node *node)
{
   while (node) {
      if (node->rb_left) {
         node = node->rb_left;
      } else if (node->rb_right) {
         node = node->rb_right;
      } else {
         /*
          * We found a leaf, free it and go back to the parent.
          */
         LockedPage *lp = rb_entry(node, LockedPage, rb);

         if ((node = rb_parent(node))) {
            if (node->rb_left) {
               node->rb_left = NULL;
            } else {
               node->rb_right = NULL;
            }
         }

         FreeLockedPages(lp);
         kfree(lp);
      }
   }
}

/**
 * @brief Unlock all pages at vm close time.
 *
 * @param vm control structure pointer.
 */
static void
LockedListUnlockAll(MvpkmVM *vm)
{
   down_write(&vm->lockedSem);

   LockedListNuke(vm->lockedRoot.rb_node);

   ATOMIC_SETV(vm->usedPages, 0);

   up_write(&vm->lockedSem);
}

/**
 * @brief Allocate zeroed free pages.
 *
 * @param[in] vm which VM the pages are for, so they will be freed when the
 *               vm closes.
 * @param[in] order log2(number of contiguous pages to allocate).
 * @param[in] highmem is it OK to allocate this page in ZONE_HIGHMEM? This
 *                    option should only be specified for pages the host
 *                    kernel will not need to address directly.
 * @param[out] hkvaRet where to return the host kernel virtual address of
 *                     the allocated pages, if non-NULL, and ONLY IF
 *                     !highmem.
 * @param forRegion Annotation for Page pool to identify guest page
 *                  allocations.
 *
 * @return 0: no host memory available,
 *         else: starting MPN, *hkvaRet filled in.
 */
static MPN
AllocZeroedFreePages(MvpkmVM *vm, uint32 order, _Bool highmem,
                     PhysMem_RegionType forRegion, HKVA *hkvaRet)
{
   MPN mpn;
   struct page *page;

   if (order > PAGE_ALLOC_COSTLY_ORDER) {
      printk(KERN_WARNING
             "Order %d allocation for region %d exceeds the safe "
             "maximum order %d\n",
             order, forRegion, PAGE_ALLOC_COSTLY_ORDER);
   }

   /*
    * Get some pages for the requested range. They will be physically
    * contiguous and have the requested alignment. They will also
    * have a kernel virtual mapping if !highmem.
    *
    * We allocate out of ZONE_MOVABLE even though we can't just pick up our
    * bags. We do this to support platforms that explicitly configure
    * ZONE_MOVABLE, such as the Qualcomm MSM8960, to enable deep power down
    * of memory banks. When the kernel attempts to take a memory bank
    * offline, it will try and place the pages on the isolate LRU - only
    * pages already on an LRU, such as anon/file, can get there, so it will
    * not be able to migrate/move our pages (and hence the bank will not be
    * offlined). The other alternative is to live within ZONE_NORMAL, and
    * only have available a small fraction of system memory. Long term we
    * plan on hooking the offlining callback in mvpkm and performing our own
    * migration with the cooperation of the monitor, but we don't have a dev
    * board to support this today.
    *
    * @knownjira{MVP-3477}
    */
   page = alloc_pages(GFP_USER | __GFP_COMP | __GFP_ZERO |
                      (highmem ? __GFP_HIGHMEM | __GFP_MOVABLE : 0),
                      order);
   if (page == NULL) {
      return 0;
   }

   /*
    * Return the corresponding page number.
    */
   mpn = page_to_pfn(page);
   ASSERT(mpn != 0);

   /*
    * Remember to unlock the pages when the FD is closed.
    */
   if (!LockedListAdd(vm, mpn, order, forRegion)) {
      __free_pages(page, order);
      return 0;
   }

   if (hkvaRet) {
      *hkvaRet = highmem ? 0 : __phys_to_virt(page_to_phys(page));
   }

   return mpn;
}
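/*
 * Order semantics above follow alloc_pages: an order-n request returns
 * 1 << n physically contiguous, naturally aligned pages. A hypothetical
 * in-module use, compiled out - here, 4 contiguous zeroed main-memory pages
 * (16 KB, assuming 4 KB pages) with the kernel mapping returned in hkva;
 * GrabSixteenK is not part of the module:
 */
#if 0
static int
GrabSixteenK(MvpkmVM *vm)
{
   HKVA hkva;
   MPN mpn = AllocZeroedFreePages(vm, 2, false, MEMREGION_MAINMEM, &hkva);

   if (mpn == 0) {
      return -ENOMEM;
   }

   /* ...use the pages; LockedListDel(vm, mpn) releases them early,
      otherwise LockedListUnlockAll frees them at VM teardown. */
   return 0;
}
#endif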
/**
 * @brief Map already-pinned WSP memory into host kernel virtual address
 *        (HKVA) space. Assumes 2 world switch pages on an 8k boundary.
 *
 * @param[in] vm which VM the HKVA area is to be mapped for.
 * @param[in] mapInfo array of MPNs and execute permission flags to be used
 *                    in inserting a new contiguous map in HKVA space.
 *
 * @return 0: HKVA space could not be mapped,
 *         else: HKVA where the mapping was inserted.
 */
static HKVA
MapWSPHKVA(MvpkmVM *vm, HkvaMapInfo *mapInfo)
{
   unsigned int i;
   struct page **pages = NULL;
   struct page **pagesPtr;
   pgprot_t prot;
   int retval;
   int allocateCount = WSP_PAGE_COUNT + 1; // Reserve one page for alignment
   int pageIndex = 0;
   HKVA dummyPage = (HKVA)NULL;
   HKVA start;
   HKVA startSegment;
   HKVA endSegment;

   /*
    * Add one page for alignment purposes in case __get_vm_area returns an
    * unaligned address.
    */
   ASSERT(allocateCount == 3);
   ASSERT_ON_COMPILE(WSP_PAGE_COUNT == 2);

   /*
    * NOT_IMPLEMENTED if MapHKVA is called more than once.
    */
   BUG_ON(vm->wspHkvaArea);

   /*
    * Reserve virtual address space.
    */
   vm->wspHkvaArea = __get_vm_area((allocateCount * PAGE_SIZE), VM_ALLOC,
                                   MODULES_VADDR, MODULES_END);
   if (!vm->wspHkvaArea) {
      return 0;
   }

   pages = kmalloc(allocateCount * sizeof(struct page *), GFP_TEMPORARY);
   if (!pages) {
      goto err;
   }

   pagesPtr = pages;

   /*
    * Use a dummy page to boundary align the section, if needed.
    */
   dummyPage = __get_free_pages(GFP_KERNEL, 0);
   if (!dummyPage) {
      goto err;
   }

   vm->wspHKVADummyPage = dummyPage;

   /*
    * Back every entry with the dummy page.
    */
   for (i = 0; i < allocateCount; i++) {
      pages[i] = virt_to_page(dummyPage);
   }

   /*
    * World switch pages must not span a 1MB boundary in order to maintain
    * only a single L2 page table.
    */
   start = (HKVA)vm->wspHkvaArea->addr;
   startSegment = start & ~(ARM_L1D_SECTION_SIZE - 1);
   endSegment = (start + PAGE_SIZE) & ~(ARM_L1D_SECTION_SIZE - 1);

   /*
    * Insert the dummy page at pageIndex, if needed.
    */
   pageIndex = (startSegment != endSegment);

   /*
    * Back the rest with the actual world switch pages.
    */
   for (i = pageIndex; i < pageIndex + WSP_PAGE_COUNT; i++) {
      pages[i] = pfn_to_page(mapInfo[i - pageIndex].mpn);
   }

   /*
    * Given the lack of functionality in the kernel for being able to mark
    * mappings for a given vm area with different sets of protection bits,
    * we simply mark the entire vm area as PAGE_KERNEL_EXEC for now
    * (i.e., the union of all the protection bits). Given that the kernel
    * itself does something similar while loading modules, this should be a
    * reasonable workaround for now. In the future, we should set the
    * protection bits to strictly adhere to what has been requested in the
    * mapInfo parameter.
    */
   prot = PAGE_KERNEL_EXEC;

   retval = map_vm_area(vm->wspHkvaArea, prot, &pagesPtr);
   if (retval < 0) {
      goto err;
   }

   kfree(pages);

   return (HKVA)(vm->wspHkvaArea->addr) + pageIndex * PAGE_SIZE;

err:
   if (dummyPage) {
      free_pages(dummyPage, 0);
      vm->wspHKVADummyPage = (HKVA)NULL;
   }

   if (pages) {
      kfree(pages);
   }

   free_vm_area(vm->wspHkvaArea);
   vm->wspHkvaArea = NULL;

   return 0;
}

static void
UnmapWSPHKVA(MvpkmVM *vm)
{
   if (vm->wspHkvaArea) {
      free_vm_area(vm->wspHkvaArea);
   }

   if (vm->wspHKVADummyPage) {
      free_pages(vm->wspHKVADummyPage, 0);
      vm->wspHKVADummyPage = (HKVA)NULL;
   }
}
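/*
 * A worked instance of the alignment logic in MapWSPHKVA, with hypothetical
 * addresses (assuming 4 KB pages and ARM_L1D_SECTION_SIZE = 1 MB): with
 * start = 0xbf0ff000, the first slot sits in segment 0xbf000000 while
 * start + PAGE_SIZE = 0xbf100000 falls in the next one, so pageIndex
 * becomes 1 and the dummy page absorbs the straddling slot. The two world
 * switch pages then land at 0xbf100000 and 0xbf101000, inside a single 1 MB
 * section and hence under a single L2 page table.
 */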
/**
 * @brief Clean and release locked pages.
 *
 * @param lp Reference to the locked pages.
 */
static void
FreeLockedPages(LockedPage *lp)
{
   struct page *page;
   int count;

   page = pfn_to_page(lp->page.mpn);
   count = page_count(page);

   if (count == 0) {
      printk(KERN_ERR "%s: found locked page with 0 reference (mpn %05x)\n",
             __func__, lp->page.mpn);
      return;
   }

   if (count == 1) {
      int i;

      /*
       * There is no other user for this page, clean it.
       *
       * We don't bother checking if the page was highmem or not,
       * clear_highpage works for both.
       * We clear the content of the page, and rely on the fact that the
       * previous worldswitch has cleaned the potential VIVT I-CACHE.
       */
      for (i = 0; i < (1 << lp->page.order); i++) {
         clear_highpage(page + i);
      }
   } else if (lp->page.forRegion != MEMREGION_MAINMEM) {
      printk(KERN_WARNING "%s: mpn 0x%05x for region %d is still in use\n",
             __func__, lp->page.mpn, lp->page.forRegion);
   }

   __free_pages(page, lp->page.order);
}


/*********************************************************************
 *
 * Communicate with monitor
 *
 *********************************************************************/

/**
 * @brief Register a new monitor page.
 *
 * @param vm which virtual machine we're running.
 *
 * @return 0: successful,
 *         else: -errno.
 */
static int
SetupMonitor(MvpkmVM *vm)
{
   int retval;
   WorldSwitchPage *wsp = vm->wsp;

   if (!wsp || wsp->wspHKVA != (HKVA)wsp) {
      return -EINVAL;
   }

   if ((retval = Mksck_WspInitialize(vm))) {
      return retval;
   }

   vm->kobj.kset = mvpkmKSet;
   retval = kobject_init_and_add(&vm->kobj, &mvpkmKType, NULL, "%d",
                                 wsp->guestId);
   if (retval) {
      goto error;
   }

   /*
    * Get a reference to this module such that it cannot be unloaded until
    * our kobject's release function completes.
    */
   __module_get(THIS_MODULE);

   vm->haveKObj = true;

   /*
    * Caution: From here on, if we fail, we must not call kobject_put()
    * on vm->kobj since that may / will deallocate 'vm'. Unregistering VM
    * ksets on failures is fine and should be done for proper ref counting.
    */
   vm->devicesKSet = kset_create_and_add("devices", NULL, &vm->kobj);
   if (!vm->devicesKSet) {
      retval = -ENOMEM;
      goto error;
   }

   vm->miscKSet = kset_create_and_add("misc", NULL, &vm->kobj);
   if (!vm->miscKSet) {
      kset_unregister(vm->devicesKSet);
      vm->devicesKSet = NULL;
      retval = -ENOMEM;
      goto error;
   }

   down_write(&vm->wspSem);

   /*
    * The VE monitor needs to issue an SMC to bootstrap Hyp mode.
    */
   if (wsp->monType == MONITOR_TYPE_VE) {
      /*
       * Here we assemble the monitor's HMAIR0 based on wsp->memAttr. We map
       * from the inner/outer normal page cacheability attributes obtained
       * from DetermineCacheabilityAttribs to the format required in 4.2.8
       * ARM PRD03-GENC-008469 13.0 (see this document for the magic
       * numbers).
       *
       * Where a choice is available, we opt for read and/or write
       * allocation.
       */
      static const uint32 normalCacheAttr2MAIR[4] = { 0x4, 0xf, 0xa, 0xe };
      uint32 hmair0 =
         ((normalCacheAttr2MAIR[wsp->memAttr.innerCache] |
           (normalCacheAttr2MAIR[wsp->memAttr.outerCache] << 4))
             << 8 * MVA_MEMORY) |
         (0x4 << 8 * MVA_DEVICE);
      /*
       * See B4.1.74 ARM DDI 0406C-2c for the HTCR magic.
       */
      uint32 htcr = 0x80000000 |
                    (wsp->memAttr.innerCache << 8) |
                    (wsp->memAttr.outerCache << 10) |
                    (wsp->memAttr.share << 12);
      /**
       * @knownjira{MVP-377}
       * Set HSCTLR to enable MMU and caches. We should really run the
       * monitor WXN, in non-MVP_DEVEL builds. See
       * 13.18 ARM PRD03-GENC-008353 11.0 for the magic.
       */
      static const uint32 hsctlr = 0x30c5187d;
      register uint32 r0 asm("r0") = wsp->monVA.excVec;
      register uint32 r1 asm("r1") = wsp->regSave.ve.mHTTBR;
      register uint32 r2 asm("r2") = htcr;
      register uint32 r3 asm("r3") = hmair0;
      register uint32 r4 asm("r4") = hsctlr;

      asm volatile (
         ".arch_extension sec\n"
         "smc 0"
         :
         : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4)
         : "memory"
      );
   }

   /*
    * Initialize the guest wait-for-interrupt waitqueue.
    */
   init_waitqueue_head(&vm->wfiWaitQ);

   MonitorTimer_Setup(vm);

#ifdef CONFIG_HAS_WAKELOCK
   wake_lock_init(&vm->wakeLock, WAKE_LOCK_SUSPEND, "mvpkm");
#endif

   wsp->mvpkmVersion = MVP_VERSION_CODE;

   up_write(&vm->wspSem);

   /*
    * Ensure coherence of monitor loading and page tables.
    */
   flush_cache_all();

   return 0;

error:
   Mksck_WspRelease(wsp);
   vm->wsp = NULL;

   return retval;
}

/**
 * @brief Dummy function to drop the info parameter.
 *
 * @param info ignored.
 */
static void
FlushAllCpuCaches(void *info)
{
   flush_cache_all();
}

/**
 * @brief Return to where the monitor called worldswitch.
 *
 * @param vm which virtual machine we're running.
 *
 * @return 0: successful, just call back when ready,
 *         1: successful, process code in WSP_Params(wsp)->callno,
 *         else: -errno.
 */
static int
RunMonitor(MvpkmVM *vm)
{
   int ii;
   unsigned long flags;
   WorldSwitchPage *wsp = vm->wsp;
   int retval = 0;

   ASSERT(wsp);

#ifdef CONFIG_HAS_WAKELOCK
   wake_lock(&vm->wakeLock);
#endif

   /*
    * Set VCPUThread affinity.
    */
   if (cpumask_intersects(to_cpumask(vcpuAffinity), cpu_active_mask)) {
      set_cpus_allowed_ptr(current, to_cpumask(vcpuAffinity));
   }

   /*
    * Record the current task structure so an ABORT will know whom to wake.
    */
   down_write(&vm->monThreadTaskSem);
   vm->monThreadTask = get_current();
   up_write(&vm->monThreadTaskSem);

   /*
    * Keep going as long as the monitor is in a critical section or
    * there are no pending signals such as SIGINT or SIGKILL. Block
    * interrupts before checking so any IPI sent will remain pending
    * if our check just misses detecting the signal.
    */
   local_irq_save(flags);

   while (wsp->critSecCount > 0 ||
          (!signal_pending(current) &&
           !(ATOMIC_GETO(wsp->hostActions) & ACTION_ABORT))) {
      /*
       * ARMv7 performance counters are per CPU core and might be disabled
       * over CPU core sleep if there is nothing else in the system to
       * re-enable them, so now that we have been allocated a CPU core to
       * run the guest, enable them, in particular the TSC (CCNT), which is
       * used for monitor timing between world switches.
       */
      {
         uint32 pmnc;
         uint32 pmcnt;

         /* Make sure that the performance counters are enabled. */
         ARM_MRC_CP15(PERF_MON_CONTROL_REGISTER, pmnc);
         if ((pmnc & (ARM_PMNC_E | ARM_PMNC_D)) != (ARM_PMNC_E)) {
            pmnc |= ARM_PMNC_E;   // Enable TSC
            pmnc &= ~ARM_PMNC_D;  // Disable cycle count divider

            ARM_MCR_CP15(PERF_MON_CONTROL_REGISTER, pmnc);
         }

         /* Make sure that the CCNT is enabled. */
         ARM_MRC_CP15(PERF_MON_COUNT_SET, pmcnt);
         if ((pmcnt & ARM_PMCNT_C) != ARM_PMCNT_C) {
            pmcnt |= ARM_PMCNT_C;

            ARM_MCR_CP15(PERF_MON_COUNT_SET, pmcnt);
         }
      }

      /*
       * Update the TSC to RATE64 ratio.
       */
      {
         struct TscToRate64Cb *ttr = &__get_cpu_var(tscToRate64);

         wsp->tscToRate64Mult = ttr->mult;
         wsp->tscToRate64Shift = ttr->shift;
      }

      /*
       * Save the time of day for the monitor's timer facility. The timing
       * facility in the vmm needs to compute current time in the host
       * Linux's time representation. It uses the formula:
       *
       *    now = wsp->switchedAt64 + (uint32)(TSC_READ() - wsp->lowerTSC)
       *
       * Read the timestamp counter *immediately after* ktime_get() as that
       * will give the most consistent offset between reading the hardware
       * clock register in ktime_get() and reading the hardware timestamp
       * counter with TSC_READ().
       */
      ASSERT_ON_COMPILE(MVP_TIMER_RATE64 == NSEC_PER_SEC);
      {
         ktime_t now = ktime_get();

         TSC_READ(wsp->switchedAtTSC);
         wsp->switchedAt64 = ktime_to_ns(now);
      }

      /*
       * Save host FPU contents and load monitor contents.
       */
      SWITCH_VFP_TO_MONITOR;

      /*
       * Call into the monitor to run guest instructions until it wants us
       * to do something for it. Note that any hardware interrupt request
       * will cause it to volunteer.
       */
      switch (wsp->monType) {
         case MONITOR_TYPE_LPV: {
            uint32 hostVBAR;

            ARM_MRC_CP15(VECTOR_BASE, hostVBAR);
            (*wsp->switchToMonitor)(&wsp->regSave);
            ARM_MCR_CP15(VECTOR_BASE, hostVBAR);
            break;
         }
         case MONITOR_TYPE_VE: {
            register uint32 r1 asm("r1") = wsp->regSave.ve.mHTTBR;

            asm volatile (
               ".word " MVP_STRINGIFY(ARM_INSTR_HVC_A1_ENC(0))
               : "=r" (r1)
               : "r" (r1)
               : "r0", "r2", "memory"
            );
            break;
         }
         default:
            FATAL();
      }

      /*
       * Save monitor FPU contents and load host contents.
       */
      SWITCH_VFP_TO_HOST;

      /*
       * Re-enable local interrupts now that we are back in the host world.
       */
      local_irq_restore(flags);
      /*
       * Maybe the monitor wrote some messages to monitor->host sockets.
       * This will wake the corresponding host threads to receive them.
       */
      /**
       * @todo This lousy loop is in the critical path. It should be changed
       *       to some faster algorithm to wake blocked host sockets.
       */
      for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) {
         if (wsp->isPageMapped[ii]) {
            Mksck_WakeBlockedSockets(MksckPage_GetFromIdx(ii));
         }
      }

      switch (WSP_Params(wsp)->callno) {
         case WSCALL_ACQUIRE_PAGE: {
            uint32 i;

            for (i = 0; i < WSP_Params(wsp)->pages.pages; ++i) {
               MPN mpn =
                  AllocZeroedFreePages(vm,
                                       WSP_Params(wsp)->pages.order,
                                       true,
                                       WSP_Params(wsp)->pages.forRegion,
                                       NULL);
               if (mpn == 0) {
                  printk(KERN_WARNING
                         "WSCALL_ACQUIRE_PAGE: no order %u pages available\n",
                         WSP_Params(wsp)->pages.order);
                  WSP_Params(wsp)->pages.pages = i;
                  break;
               }

               WSP_Params(wsp)->pages.mpns[i] = mpn;
            }

            break;
         }

         case WSCALL_RELEASE_PAGE: {
            uint32 i;

            for (i = 0; i < WSP_Params(wsp)->pages.pages; ++i) {
               if (!LockedListDel(vm, WSP_Params(wsp)->pages.mpns[i])) {
                  WSP_Params(wsp)->pages.pages = i;
                  break;
               }
            }

            break;
         }

         case WSCALL_MUTEXLOCK: {
            retval = Mutex_Lock((void *)WSP_Params(wsp)->mutex.mtxHKVA,
                                WSP_Params(wsp)->mutex.mode);
            if (retval < 0) {
               WSP_Params(wsp)->mutex.ok = false;
               goto monitorExit;
            }

            /*
             * The locking succeeded. From this point on the monitor
             * is in a critical section. Even if an interrupt comes
             * right here, it must return to the monitor to unlock the
             * mutex.
             */
            wsp->critSecCount++;
            WSP_Params(wsp)->mutex.ok = true;
            break;
         }

         case WSCALL_MUTEXUNLOCK: {
            Mutex_Unlock((void *)WSP_Params(wsp)->mutex.mtxHKVA,
                         WSP_Params(wsp)->mutex.mode);
            break;
         }

         case WSCALL_MUTEXUNLSLEEP: {
            /*
             * The vcpu has just come back from the monitor. During
             * the transition interrupts were disabled. Above,
             * however, interrupts were enabled again and it is
             * possible that a context switch happened into a thread
             * (serve_vmx) that instructed the vcpu thread to
             * abort. After returning to this thread the vcpu may
             * enter a sleep below never to return from it. To avoid
             * this deadlock we need to test the abort flag in
             * Mutex_UnlSleepTest.
             */
            retval =
               Mutex_UnlSleepTest((void *)WSP_Params(wsp)->mutex.mtxHKVA,
                                  WSP_Params(wsp)->mutex.mode,
                                  WSP_Params(wsp)->mutex.cvi,
                                  &wsp->hostActions,
                                  ACTION_ABORT);
            if (retval < 0) {
               goto monitorExit;
            }

            break;
         }

         case WSCALL_MUTEXUNLWAKE: {
            Mutex_UnlWake((void *)WSP_Params(wsp)->mutex.mtxHKVA,
                          WSP_Params(wsp)->mutex.mode,
                          WSP_Params(wsp)->mutex.cvi,
                          WSP_Params(wsp)->mutex.all);
            break;
         }

         /*
          * The monitor wants us to block (allowing other host threads to
          * run) until an async message is waiting for the monitor to
          * process.
          *
          * If MvpkmWaitForInt() returns an error, it should only be if
          * there is another signal pending (such as SIGINT). So we pretend
          * it completed normally, as the monitor is ready to be called
          * again (it will see no messages to process and wait again), and
          * return to user mode so the signals can be processed.
          */
         case WSCALL_WAIT: {
#ifdef CONFIG_HAS_WAKELOCK
            if (WSP_Params(wsp)->wait.suspendMode) {
               /*
                * The guest has ok'ed suspend mode, so release the SUSPEND
                * wakelock.
                */
               wake_unlock(&vm->wakeLock);
               retval = MvpkmWaitForInt(vm, true);
               wake_lock(&vm->wakeLock);
               WSP_Params(wsp)->wait.suspendMode = 0;
            } else {
               /*
                * The guest has asked for WFI, not suspend, so keep holding
                * the SUSPEND wakelock.
                */
               retval = MvpkmWaitForInt(vm, false);
            }
#else
            retval = MvpkmWaitForInt(vm, WSP_Params(wsp)->wait.suspendMode);
#endif
            if (retval < 0) {
               goto monitorExit;
            }

            break;
         }
         /*
          * The only reason the monitor returned was because there was a
          * pending hardware interrupt. The host serviced and cleared that
          * interrupt when we enabled interrupts above. Now we call the
          * scheduler in case that interrupt woke another thread; we want to
          * allow that thread to run before returning to do more guest code.
          */
         case WSCALL_IRQ: {
            break;
         }

         case WSCALL_GET_PAGE_FROM_VMID: {
            MksckPage *mksckPage;

            mksckPage =
               MksckPage_GetFromVmIdIncRefc(WSP_Params(wsp)->pageMgmnt.vmId);

            if (mksckPage) {
               int ii;

               WSP_Params(wsp)->pageMgmnt.found = true;

               for (ii = 0; ii < MKSCKPAGE_TOTAL; ii++) {
                  WSP_Params(wsp)->pageMgmnt.mpn[ii] =
                     vmalloc_to_pfn((void *)(((HKVA)mksckPage) +
                                             ii*PAGE_SIZE));
               }

               ASSERT(!wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)]);
               wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)] = true;
            } else {
               WSP_Params(wsp)->pageMgmnt.found = false;
            }

            break;
         }

         case WSCALL_REMOVE_PAGE_FROM_VMID: {
            MksckPage *mksckPage;

            mksckPage =
               MksckPage_GetFromVmId(WSP_Params(wsp)->pageMgmnt.vmId);

            ASSERT(wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)]);
            wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)] = false;

            MksckPage_DecRefc(mksckPage);
            break;
         }

         /*
          * Read current wallclock time.
          */
         case WSCALL_READTOD: {
            struct timeval nowTV;

            do_gettimeofday(&nowTV);
            WSP_Params(wsp)->tod.now = nowTV.tv_sec;
            WSP_Params(wsp)->tod.nowusec = nowTV.tv_usec;
            break;
         }

         case WSCALL_LOG: {
            int len = strlen(WSP_Params(wsp)->log.messg);

            printk(KERN_INFO "VMM: %s%s",
                   WSP_Params(wsp)->log.messg,
                   (WSP_Params(wsp)->log.messg[len-1] == '\n') ? "" : "\n");
            break;
         }

         case WSCALL_ABORT: {
            retval = WSP_Params(wsp)->abort.status;
            goto monitorExit;
         }

         case WSCALL_QP_GUEST_ATTACH: {
            int32 rc;
            QPInitArgs args;
            uint32 base;
            uint32 nrPages;

            args.id       = WSP_Params(wsp)->qp.id;
            args.capacity = WSP_Params(wsp)->qp.capacity;
            args.type     = WSP_Params(wsp)->qp.type;
            base          = WSP_Params(wsp)->qp.base;
            nrPages       = WSP_Params(wsp)->qp.nrPages;

            rc = QP_GuestAttachRequest(vm, &args, base, nrPages);

            WSP_Params(wsp)->qp.rc = rc;
            WSP_Params(wsp)->qp.id = args.id;
            break;
         }

         case WSCALL_QP_NOTIFY: {
            QPInitArgs args;

            args.id       = WSP_Params(wsp)->qp.id;
            args.capacity = WSP_Params(wsp)->qp.capacity;
            args.type     = WSP_Params(wsp)->qp.type;

            WSP_Params(wsp)->qp.rc = QP_NotifyListener(&args);
            break;
         }

         case WSCALL_MONITOR_TIMER: {
            MonitorTimer_Request(&vm->monTimer,
                                 WSP_Params(wsp)->timer.when64);
            break;
         }

         case WSCALL_COMM_SIGNAL: {
            Mvpkm_CommEvSignal(&WSP_Params(wsp)->commEvent.transpID,
                               WSP_Params(wsp)->commEvent.event);
            break;
         }

         case WSCALL_FLUSH_ALL_DCACHES: {
            /*
             * Broadcast the flush DCache request to all cores.
             * Block while waiting for all of them to get done.
             */
            on_each_cpu(FlushAllCpuCaches, NULL, 1);
            break;
         }

         default: {
            retval = -EPIPE;
            goto monitorExit;
         }
      }

      /*
       * The params.callno callback was handled in kernel mode and completed
       * successfully. Repeat for another call without returning to user
       * mode, unless there are signals pending.
       *
       * But first, call the Linux scheduler to switch threads if there is
       * some other thread Linux wants to run now.
       */
      if (need_resched()) {
         schedule();
      }

      /*
       * Check if the cpus allowed mask has to be updated.
       * Updating it must be done outside of an atomic context.
       */
      if (cpumask_intersects(to_cpumask(vcpuAffinity), cpu_active_mask) &&
          !cpumask_equal(to_cpumask(vcpuAffinity), &current->cpus_allowed)) {
         set_cpus_allowed_ptr(current, to_cpumask(vcpuAffinity));
      }

      local_irq_save(flags);
   }

   /*
    * There are signals pending so don't try to do any more monitor/guest
    * stuff. But since we were at the point of just about to run the
    * monitor, return success status, as user mode can simply call us back
    * to run the monitor again.
    */
   local_irq_restore(flags);

monitorExit:
   ASSERT(wsp->critSecCount == 0);

   if (ATOMIC_GETO(wsp->hostActions) & ACTION_ABORT) {
      PRINTK(KERN_INFO "Monitor has ABORT flag set.\n");
      retval = ExitStatusHostRequest;
   }

#ifdef CONFIG_HAS_WAKELOCK
   wake_unlock(&vm->wakeLock);
#endif

   down_write(&vm->monThreadTaskSem);
   vm->monThreadTask = NULL;
   up_write(&vm->monThreadTaskSem);

   return retval;
}

/**
 * @brief Guest is waiting for interrupts, sleep if necessary.
 *
 * This function is called in the VCPU context after the world switch to
 * wait for an incoming message. If any message gets queued to this VCPU,
 * the sender will wake us up.
 *
 * @param vm which virtual machine we're running.
 * @param suspend is the guest entering suspend or just WFI?
 *
 * @return 0: woken up, hostActions should have pending events,
 *         -ERESTARTSYS: broke out because other signals are pending.
 */
int
MvpkmWaitForInt(MvpkmVM *vm, _Bool suspend)
{
   WorldSwitchPage *wsp = vm->wsp;
   wait_queue_head_t *q = &vm->wfiWaitQ;

   if (suspend) {
      return wait_event_interruptible(*q,
                                      ATOMIC_GETO(wsp->hostActions) != 0);
   } else {
      int ret;

      ret = wait_event_interruptible_timeout(*q,
                                             ATOMIC_GETO(wsp->hostActions) != 0,
                                             10*HZ);
      if (ret == 0) {
         printk("MvpkmWaitForInt: guest stuck for 10s in WFI! "
                "(hostActions %08x)\n",
                ATOMIC_GETO(wsp->hostActions));
      }

      return ret > 0 ? 0 : ret;
   }
}

/**
 * @brief Force the guest to evaluate its hostActions flag field.
 *
 * This function updates the hostActions flag field and wakes up the guest
 * as required so that it can evaluate it. The guest could be executing
 * guest code in an SMP system, in which case we send an IPI; or it could be
 * sleeping, in which case we wake it up.
 *
 * @param vm which guest needs waking.
 * @param why why should the guest be woken up?
 */
void
Mvpkm_WakeGuest(MvpkmVM *vm, int why)
{
   ASSERT(why != 0);

   /* Set the host action. */
   if (ATOMIC_ORO(vm->wsp->hostActions, why) & why) {
      /* The guest has already been woken up, so no need to do it again. */
      return;
   }

   /*
    * The VCPU is certainly in 'wait for interrupt' wait. Wake it up!
    */
#ifdef CONFIG_HAS_WAKELOCK
   /*
    * To prevent the system from going into suspend mode before the monitor
    * has had a chance to be scheduled, we hold the VM wakelock from now on.
    * As wakelocks are not reference counted, taking a wake_lock twice in a
    * row is not an issue.
    */
   wake_lock(&vm->wakeLock);
#endif

   /*
    * On a UP system, we ensure the monitor thread isn't blocked.
    *
    * On an MP system the other CPU might be running the guest;
    * kick_process sends an IPI in that case and is a noop on UP.
    *
    * When the guest is running, it is an invariant that monThreadTaskSem is
    * not held as a write lock, so we should not fail to acquire the lock.
    * Mvpkm_WakeGuest may be called from an atomic context, so we can't
    * sleep here.
    */
   if (down_read_trylock(&vm->monThreadTaskSem)) {
      if (vm->monThreadTask) {
         wake_up_process(vm->monThreadTask);
         kick_process(vm->monThreadTask);
      }

      up_read(&vm->monThreadTaskSem);
   } else {
      printk("Unexpected failure to acquire monThreadTaskSem!\n");
   }
}
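/*
 * The wake-up protocol above pairs with MvpkmWaitForInt: the waker ORs a
 * bit into hostActions first and only then wakes the thread, while the
 * sleeper's wait_event_interruptible condition re-checks hostActions, so a
 * wake that races with going to sleep is never lost. A minimal userspace
 * analogue of the same flag-then-wake pattern using pthreads - illustration
 * only, compiled out; all names here are hypothetical:
 */
#if 0
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static unsigned hostActions;             /* analogue of wsp->hostActions */

static void *
sleeper(void *arg)
{
   pthread_mutex_lock(&lock);
   while (hostActions == 0) {            /* condition re-checked, as in  */
      pthread_cond_wait(&cond, &lock);   /* wait_event_interruptible     */
   }
   printf("woken, hostActions=%x\n", hostActions);
   pthread_mutex_unlock(&lock);
   return NULL;
}

static void
wake(unsigned why)                       /* analogue of Mvpkm_WakeGuest */
{
   pthread_mutex_lock(&lock);
   hostActions |= why;                   /* set the flag first...       */
   pthread_cond_signal(&cond);           /* ...then wake the sleeper    */
   pthread_mutex_unlock(&lock);
}
#endif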